|
4 | 4 |
|
5 | 5 | use crate::errors::{Error, Result as XmlResult};
|
6 | 6 | use crate::escape::{do_unescape, escape};
|
| 7 | +use crate::escapei::EscapeError; |
7 | 8 | use crate::name::QName;
|
8 | 9 | use crate::reader::{is_whitespace, Reader};
|
9 | 10 | use crate::utils::{write_byte_string, write_cow_string, Bytes};
|
@@ -32,6 +33,13 @@ pub struct Attribute<'a> {
|
32 | 33 | }
|
33 | 34 |
|
34 | 35 | impl<'a> Attribute<'a> {
|
| 36 | + /// |
| 37 | + pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> { |
| 38 | + let normalized = normalize_attribute_value(self.value.as_ref()); |
| 39 | + let escaped = do_unescape(&*normalized, None)?; |
| 40 | + Ok(Cow::Owned(escaped.into_owned())) |
| 41 | + } |
| 42 | + |
35 | 43 | /// Returns the unescaped value.
|
36 | 44 | ///
|
37 | 45 | /// This is normally the value you are interested in. Escape sequences such as `>` are
|
@@ -182,6 +190,90 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
|
182 | 190 | }
|
183 | 191 | }
|
184 | 192 |
|
/// Normalizes the white space of an attribute value, following section 3.3.3 of
/// the XML specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * White space bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every remaining run of white space bytes is replaced with a single `' '`.
///
/// Character and entity references are **not** substituted yet; see the TODO in
/// the body.
///
/// Returns `Cow::Borrowed` whenever trimming alone produces the result (the
/// borrowed slice is a sub-slice of `attr`, or an empty slice when `attr` holds
/// nothing but white space). `Cow::Owned` is returned only when a byte inside the
/// trimmed value has to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    // The bytes treated as white space. One predicate is used for both trimming
    // and collapsing so the two steps cannot disagree.
    fn is_space(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t' | b'\n' | b'\r')
    }

    // Locate the first and last non-space bytes; if there are none the whole
    // value is white space and normalizes to the empty slice.
    let bounds = (
        attr.iter().position(|&b| !is_space(b)),
        attr.iter().rposition(|&b| !is_space(b)),
    );
    let trimmed = match bounds {
        (Some(first), Some(last)) => &attr[first..=last],
        _ => return Cow::Borrowed(&b""[..]),
    };

    // Starts the replacement buffer from the part of `trimmed` that is already
    // known to be in normalized form. The output is never longer than `trimmed`.
    let start_buffer = |prefix: &[u8]| {
        let mut buf = Vec::with_capacity(trimmed.len());
        buf.extend_from_slice(prefix);
        buf
    };

    // Stays `None` for as long as the output is byte-identical to `trimmed`.
    let mut rebuilt: Option<Vec<u8>> = None;
    // `trimmed` starts with a non-space byte, so we never begin inside a run.
    let mut in_space_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_space(byte) {
            in_space_run = false;
            if let Some(buf) = rebuilt.as_mut() {
                buf.push(byte);
            }
            continue;
        }

        if in_space_run {
            // Second or later byte of a run: it is dropped. If nothing has been
            // rewritten so far, the run began with a literal `' '` that was kept
            // as-is, so `trimmed[..idx]` (which ends with that space) is exactly
            // the output produced up to this point.
            if rebuilt.is_none() {
                rebuilt = Some(start_buffer(&trimmed[..idx]));
            }
            continue;
        }

        // First byte of a run: it becomes a single `' '`.
        in_space_run = true;
        match rebuilt.as_mut() {
            Some(buf) => buf.push(b' '),
            // A literal space is already what the output needs; keep borrowing.
            None if byte == b' ' => {}
            None => {
                let mut buf = start_buffer(&trimmed[..idx]);
                buf.push(b' ');
                rebuilt = Some(buf);
            }
        }
    }

    match rebuilt {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
| 276 | + |
185 | 277 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
186 | 278 |
|
187 | 279 | /// Iterator over XML attributes.
|
@@ -798,6 +890,43 @@ mod xml {
|
798 | 890 | use super::*;
|
799 | 891 | use pretty_assertions::assert_eq;
|
800 | 892 |
|
| 893 | + #[test] |
| 894 | + fn attribute_value_normalization() { |
| 895 | + // An empty value normalizes to an empty value. |
| 896 | + assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b"")); |
| 897 | + // Return, tab, and newline characters (0xD, 0x9, 0xA) inside the value must each be replaced with a space character; the ones at either end are trimmed. |
| 898 | + assert_eq!( |
| 899 | + normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"), |
| 900 | + Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec()) |
| 901 | + ); |
| 902 | + // Leading and trailing spaces must both be stripped. |
| 903 | + assert_eq!(normalize_attribute_value(b" foo "), Cow::Borrowed(b"foo")); |
| 904 | + // Only a leading space. |
| 905 | + assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar")); |
| 906 | + // Only a trailing space. |
| 907 | + assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz")); |
| 908 | + // Sequences of spaces must be replaced with a single space. |
| 909 | + assert_eq!( |
| 910 | + normalize_attribute_value(b" foo bar baz "), |
| 911 | + Cow::Owned::<[u8]>(b"foo bar baz".to_vec()) |
| 912 | + ); |
| 913 | + // Sequence replacement when spaces are mixed with the other characters treated as whitespace (\t \r \n). |
| 914 | + assert_eq!( |
| 915 | + normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"), |
| 916 | + Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec()) |
| 917 | + ); |
| 918 | + // Character references for whitespace-like characters are not combined after substitution. NOTE(review): the input literal below contains a non-ASCII character, which looks like damaged character references; normalize_attribute_value still lists reference substitution as a TODO - confirm the intended input. |
| 919 | + assert_eq!( |
| 920 | + normalize_attribute_value(b" Р"), |
| 921 | + Cow::Owned::<[u8]>(b" \r\t\n".to_vec()) |
| 922 | + ); |
| 923 | + // Literal whitespace sequences mixed with what are presumably character references for whitespace; judging by the expected value the references survive as-is while literal runs collapse. NOTE(review): the input literal below also contains a non-ASCII character - confirm the intended input. |
| 924 | + assert_eq!( |
| 925 | + normalize_attribute_value(b"  foo\tbar baz  delta\n\r echo foxtrotÐ"), |
| 926 | + Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec()) |
| 927 | + ); |
| 928 | + } |
| 929 | + |
801 | 930 | /// Checked attribute is the single attribute
|
802 | 931 | mod single {
|
803 | 932 | use super::*;
|
|
0 commit comments