|
4 | 4 |
|
5 | 5 | use crate::encoding::Decoder;
|
6 | 6 | use crate::errors::Result as XmlResult;
|
7 |
| -use crate::escape::{escape, resolve_predefined_entity, unescape_with}; |
| 7 | +#[cfg(any(doc, not(feature = "encoding")))] |
| 8 | +use crate::escape::EscapeError; |
| 9 | +use crate::escape::{escape, normalize_attribute_value, resolve_predefined_entity, unescape_with}; |
8 | 10 | use crate::name::{LocalName, Namespace, QName};
|
9 | 11 | use crate::reader::NsReader;
|
10 | 12 | use crate::utils::{is_whitespace, Bytes};
|
@@ -32,7 +34,119 @@ pub struct Attribute<'a> {
|
32 | 34 | }
|
33 | 35 |
|
34 | 36 | impl<'a> Attribute<'a> {
|
35 |
| - /// Decodes using UTF-8 then unescapes the value. |
| 37 | + /// Returns the attribute value normalized as per [the XML specification]. |
| 38 | + /// |
| 39 | + /// Do not use this method with HTML attributes. |
| 40 | + /// |
| 41 | + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` |
| 42 | + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. |
| 43 | + /// |
| 44 | + /// This will allocate unless the raw attribute value does not require normalization. |
| 45 | + /// |
| 46 | + /// See also [`normalized_value_with()`](#method.normalized_value_with) |
| 47 | + /// |
| 48 | + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize |
| 49 | + #[cfg(any(doc, not(feature = "encoding")))] |
| 50 | + pub fn normalized_value(&self, depth: usize) -> Result<Cow<'a, str>, EscapeError> { |
| 51 | + self.normalized_value_with(depth, |_| None) |
| 52 | + } |
| 53 | + |
| 54 | + /// Returns the attribute value normalized as per [the XML specification], |
| 55 | + /// using a custom entity resolver. |
| 56 | + /// |
| 57 | + /// Do not use this method with HTML attributes. |
| 58 | + /// |
| 59 | + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` |
| 60 | + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function |
| 61 | + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still |
| 62 | + /// take precedence. |
| 63 | + /// |
| 64 | + /// This will allocate unless the raw attribute value does not require normalization. |
| 65 | + /// |
| 66 | + /// See also [`normalized_value()`](#method.normalized_value) |
| 67 | + /// |
| 68 | + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize |
| 69 | + #[cfg(any(doc, not(feature = "encoding")))] |
| 70 | + pub fn normalized_value_with<'entity>( |
| 71 | + &self, |
| 72 | + depth: usize, |
| 73 | + resolve_entity: impl Fn(&str) -> Option<&'entity str>, |
| 74 | + ) -> Result<Cow<'a, str>, EscapeError> { |
| 75 | + let decoded = match &self.value { |
| 76 | + Cow::Borrowed(bytes) => { |
| 77 | + Cow::Borrowed(std::str::from_utf8(bytes).expect("unable to decode as utf-8")) |
| 78 | + } |
| 79 | + // Convert to owned, because otherwise Cow will be bound with wrong lifetime |
| 80 | + Cow::Owned(bytes) => Cow::Owned( |
| 81 | + std::str::from_utf8(bytes) |
| 82 | + .expect("unable to decode as utf-8") |
| 83 | + .to_owned(), |
| 84 | + ), |
| 85 | + }; |
| 86 | + |
| 87 | + match normalize_attribute_value(&decoded, depth, resolve_entity)? { |
| 88 | + // Because result is borrowed, no replacements was done and we can use original string |
| 89 | + Cow::Borrowed(_) => Ok(decoded), |
| 90 | + Cow::Owned(s) => Ok(s.into()), |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + /// Decodes using a provided reader and returns the attribute value normalized |
| 95 | + /// as per [the XML specification]. |
| 96 | + /// |
| 97 | + /// Do not use this method with HTML attributes. |
| 98 | + /// |
| 99 | + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` |
| 100 | + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. |
| 101 | + /// |
| 102 | + /// This will allocate unless the raw attribute value does not require normalization. |
| 103 | + /// |
| 104 | + /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with) |
| 105 | + /// |
| 106 | + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize |
| 107 | + pub fn decode_and_normalize_value( |
| 108 | + &self, |
| 109 | + decoder: Decoder, |
| 110 | + depth: usize, |
| 111 | + ) -> XmlResult<Cow<'a, str>> { |
| 112 | + self.decode_and_normalize_value_with(decoder, depth, |_| None) |
| 113 | + } |
| 114 | + |
| 115 | + /// Decodes using a provided reader and returns the attribute value normalized |
| 116 | + /// as per [the XML specification], using a custom entity resolver. |
| 117 | + /// |
| 118 | + /// Do not use this method with HTML attributes. |
| 119 | + /// |
| 120 | + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` |
| 121 | + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function |
| 122 | + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still |
| 123 | + /// take precedence. |
| 124 | + /// |
| 125 | + /// This will allocate unless the raw attribute value does not require normalization. |
| 126 | + /// |
| 127 | + /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value) |
| 128 | + /// |
| 129 | + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize |
| 130 | + pub fn decode_and_normalize_value_with<'entity>( |
| 131 | + &self, |
| 132 | + decoder: Decoder, |
| 133 | + depth: usize, |
| 134 | + resolve_entity: impl Fn(&str) -> Option<&'entity str>, |
| 135 | + ) -> XmlResult<Cow<'a, str>> { |
| 136 | + let decoded = match &self.value { |
| 137 | + Cow::Borrowed(bytes) => decoder.decode(bytes)?, |
| 138 | + // Convert to owned, because otherwise Cow will be bound with wrong lifetime |
| 139 | + Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(), |
| 140 | + }; |
| 141 | + |
| 142 | + match normalize_attribute_value(&decoded, depth, resolve_entity)? { |
| 143 | + // Because result is borrowed, no replacements was done and we can use original string |
| 144 | + Cow::Borrowed(_) => Ok(decoded), |
| 145 | + Cow::Owned(s) => Ok(s.into()), |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + /// Returns the unescaped value. |
36 | 150 | ///
|
37 | 151 | /// This is normally the value you are interested in. Escape sequences such as `>` are
|
38 | 152 | /// replaced with their unescaped equivalents such as `>`.
|
@@ -1011,6 +1125,91 @@ mod xml {
|
1011 | 1125 | use super::*;
|
1012 | 1126 | use pretty_assertions::assert_eq;
|
1013 | 1127 |
|
| 1128 | + #[cfg(any(doc, not(feature = "encoding")))] |
| 1129 | + mod attribute_value_normalization { |
| 1130 | + |
| 1131 | + use super::*; |
| 1132 | + use pretty_assertions::assert_eq; |
| 1133 | + |
| 1134 | + /// Empty values returned are unchanged |
| 1135 | + #[test] |
| 1136 | + fn test_empty() { |
| 1137 | + let raw_value = "".as_bytes(); |
| 1138 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1139 | + assert_eq!(attr.normalized_value(5), Ok(Cow::Borrowed(""))); |
| 1140 | + } |
| 1141 | + |
| 1142 | + /// Already normalized values are returned unchanged |
| 1143 | + #[test] |
| 1144 | + fn test_already_normalized() { |
| 1145 | + let raw_value = "foobar123".as_bytes(); |
| 1146 | + let output = "foobar123"; |
| 1147 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1148 | + assert_eq!(attr.normalized_value(5), Ok(Cow::Borrowed(output))); |
| 1149 | + } |
| 1150 | + |
| 1151 | + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with |
| 1152 | + /// a space character |
| 1153 | + #[test] |
| 1154 | + fn test_space_replacement() { |
| 1155 | + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); |
| 1156 | + let output = " foo bar baz delta ".to_owned(); |
| 1157 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1158 | + assert_eq!(attr.normalized_value(5), Ok(Cow::Owned(output))); |
| 1159 | + } |
| 1160 | + |
| 1161 | + /// Entities must be terminated |
| 1162 | + #[test] |
| 1163 | + fn test_unterminated_entity() { |
| 1164 | + let raw_value = "abc"def".as_bytes(); |
| 1165 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1166 | + assert_eq!( |
| 1167 | + attr.normalized_value(5), |
| 1168 | + Err(EscapeError::UnterminatedEntity(3..11)) |
| 1169 | + ); |
| 1170 | + } |
| 1171 | + |
| 1172 | + /// Unknown entities raise error |
| 1173 | + #[test] |
| 1174 | + fn test_unrecognized_entity() { |
| 1175 | + let raw_value = "abc&unkn;def".as_bytes(); |
| 1176 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1177 | + assert_eq!( |
| 1178 | + attr.normalized_value(5), |
| 1179 | + Err(EscapeError::UnrecognizedEntity(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedEntity appropriate? existing unescape code behaves the same. (see: start index) |
| 1180 | + ); |
| 1181 | + } |
| 1182 | + |
| 1183 | + /// custom entity replacement works, entity replacement text processed recursively |
| 1184 | + #[test] |
| 1185 | + fn test_entity_replacement() { |
| 1186 | + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); |
| 1187 | + let output = "\r\rA\n \nB\r\n".to_owned(); |
| 1188 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1189 | + fn custom_resolver(ent: &str) -> Option<&'static str> { |
| 1190 | + match ent { |
| 1191 | + "d" => Some("
"), |
| 1192 | + "a" => Some("
"), |
| 1193 | + "da" => Some("
"), |
| 1194 | + _ => None, |
| 1195 | + } |
| 1196 | + } |
| 1197 | + assert_eq!( |
| 1198 | + attr.normalized_value_with(5, &custom_resolver), |
| 1199 | + Ok(Cow::Owned(output)) |
| 1200 | + ); |
| 1201 | + } |
| 1202 | + |
| 1203 | + #[test] |
| 1204 | + fn test_char_references() { |
| 1205 | + // character literal references are substituted without being replaced by spaces |
| 1206 | + let raw_value = "

A

B
".as_bytes(); |
| 1207 | + let output = "\r\rA\n\nB\r\n".to_owned(); |
| 1208 | + let attr = Attribute::from(("foo".as_bytes(), raw_value)); |
| 1209 | + assert_eq!(attr.normalized_value(5), Ok(Cow::Owned(output))); |
| 1210 | + } |
| 1211 | + } |
| 1212 | + |
1014 | 1213 | /// Checked attribute is the single attribute
|
1015 | 1214 | mod single {
|
1016 | 1215 | use super::*;
|
|
0 commit comments