temp

dralley · dralley · commit 1a138d66d2fa · 2022-06-22T21:41:12.000-04:00
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -33,9 +33,88 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
+    /// Normalize the attribute value according to the XML specification, section 3.3.3
     ///
+    /// <https://www.w3.org/TR/xml/#AVNormalize>
+    ///
+    /// * Whitespace-like characters (`\r`, `\n`, `\t`, `' '`) are trimmed from the ends of the value
+    /// * Sequences of whitespace-like characters are replaced with a single whitespace character
+    /// * Character and entity references are substituted as defined by the spec
     pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
-        let normalized = normalize_attribute_value(self.value.as_ref());
+        // TODO: character references, entity references, error handling associated with those
+
+        #[derive(PartialEq)]
+        enum ParseState {
+            Space,
+            CDATA,
+        }
+
+        // Trim whitespace-like characters from the beginning and end of the attribute value - this can't fail.
+        fn trim_value(attr: &[u8]) -> &[u8] {
+            let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c));
+
+            if first_non_space_char.is_none() {
+                // The entire value was whitespace-like characters
+                return b"";
+            }
+
+            let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c));
+
+            // Trim all whitespace-like characters away from the beginning and end of the attribute value.
+            let begin = first_non_space_char.unwrap();
+            let end = last_non_space_char.unwrap_or(attr.len());
+            &attr[begin..=end]
+        }
+
+        let trimmed_attr = trim_value(self.value.as_ref());
+
+        // A new buffer is only created when we encounter a situation that requires it.
+        let mut normalized: Option<Vec<u8>> = None;
+        // We start on character data because all whitespace-like characters are already trimmed away.
+        let mut current_state = ParseState::CDATA;
+
+        // Perform a single pass over the trimmed attribute value. The first time we encounter a whitespace-like
+        // character, copy everything processed thus far to a new buffer and continue using this buffer.
+        // Character / entity references are not handled in this pass yet - see the TODO above.
+        for (idx, ch) in trimmed_attr.iter().enumerate() {
+            match ch {
+                b'\n' | b'\r' | b'\t' | b' ' => match current_state {
+                    ParseState::Space => match normalized {
+                        Some(_) => continue,
+                        None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
+                    },
+                    ParseState::CDATA => {
+                        current_state = ParseState::Space;
+                        match normalized.as_mut() {
+                            Some(buf) => buf.push(b' '),
+                            None => {
+                                let mut buf = Vec::from(&trimmed_attr[..idx]);
+                                buf.push(b' ');
+                                normalized = Some(buf);
+                            }
+                        }
+                    }
+                },
+                c @ _ => match current_state {
+                    ParseState::Space => {
+                        current_state = ParseState::CDATA;
+                        if let Some(normalized) = normalized.as_mut() {
+                            normalized.push(*c);
+                        }
+                    }
+                    ParseState::CDATA => {
+                        if let Some(normalized) = normalized.as_mut() {
+                            normalized.push(*c);
+                        }
+                    }
+                },
+            }
+        }
+
+        let normalized = match normalized {
+            Some(normalized) => Cow::Owned(normalized),
+            None => Cow::Borrowed(trimmed_attr),
+        };
         let escaped = do_unescape(&*normalized, None)?;
         Ok(Cow::Owned(escaped.into_owned()))
     }
@@ -190,90 +269,6 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
     }
 }
 
-/// Normalize the attribute value according to xml specification section 3.3.3
-///
-/// https://www.w3.org/TR/xml/#AVNormalize
-///
-/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
-/// * Sequences of whitespace-like characters are replaced with a single whitespace character
-/// * Character and entity references are substituted as defined by the spec
-fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
-    // TODO: character references, entity references, error handling associated with those
-
-    #[derive(PartialEq)]
-    enum ParseState {
-        Space,
-        CDATA,
-    }
-
-    // Trim characters from the beginning and end of the attribute value - this can't fail.
-    fn trim_value(attr: &[u8]) -> &[u8] {
-        let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c));
-
-        if first_non_space_char.is_none() {
-            // The entire value was whitespace-like characters
-            return b"";
-        }
-
-        let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c));
-
-        // Trim all whitespace-like characters away from the beginning and end of the attribute value.
-        let begin = first_non_space_char.unwrap();
-        let end = last_non_space_char.unwrap_or(attr.len());
-        &attr[begin..=end]
-    }
-
-    let trimmed_attr = trim_value(attr);
-
-    // A new buffer is only created when we encounter a situation that requires it.
-    let mut normalized: Option<Vec<u8>> = None;
-    // We start on character data because all whitespace-like characters are already trimmed away.
-    let mut current_state = ParseState::CDATA;
-
-    // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
-    // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
-    // buffer and continue using this buffer.
-    for (idx, ch) in trimmed_attr.iter().enumerate() {
-        match ch {
-            b'\n' | b'\r' | b'\t' | b' ' => match current_state {
-                ParseState::Space => match normalized {
-                    Some(_) => continue,
-                    None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
-                },
-                ParseState::CDATA => {
-                    current_state = ParseState::Space;
-                    match normalized.as_mut() {
-                        Some(buf) => buf.push(b' '),
-                        None => {
-                            let mut buf = Vec::from(&trimmed_attr[..idx]);
-                            buf.push(b' ');
-                            normalized = Some(buf);
-                        }
-                    }
-                }
-            },
-            c @ _ => match current_state {
-                ParseState::Space => {
-                    current_state = ParseState::CDATA;
-                    if let Some(normalized) = normalized.as_mut() {
-                        normalized.push(*c);
-                    }
-                }
-                ParseState::CDATA => {
-                    if let Some(normalized) = normalized.as_mut() {
-                        normalized.push(*c);
-                    }
-                }
-            },
-        }
-    }
-
-    match normalized {
-        Some(normalized) => Cow::Owned(normalized),
-        None => Cow::Borrowed(trimmed_attr),
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Iterator over XML attributes.
@@ -893,36 +888,56 @@ mod xml {
     #[test]
     fn attribute_value_normalization() {
         // empty value
-        assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b""));
+        let attr = Attribute::from(("foo", ""));
+        assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b""));
+
         // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
+        let attr = Attribute::from(("foo", "\rfoo\rbar\tbaz\ndelta\n"));
         assert_eq!(
-            normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"),
+            attr.normalized_value().unwrap(),
             Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec())
         );
+
         // leading and trailing spaces must be stripped
-        assert_eq!(normalize_attribute_value(b"  foo "), Cow::Borrowed(b"foo"));
+        let attr = Attribute::from(("foo", "  foo "));
+        assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"foo"));
+
         // leading space
-        assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar"));
+        let attr = Attribute::from(("foo", " bar"));
+        assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"bar"));
+
         // trailing space
-        assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz"));
+        let attr = Attribute::from(("foo", "baz "));
+        assert_eq!(attr.normalized_value().unwrap(), Cow::Borrowed(b"baz"));
+
         // sequences of spaces must be replaced with a single space
+        let attr = Attribute::from(("foo", "   foo bar   baz "));
         assert_eq!(
-            normalize_attribute_value(b"   foo bar   baz "),
+            attr.normalized_value().unwrap(),
             Cow::Owned::<[u8]>(b"foo bar baz".to_vec())
         );
+
         // sequence replacement mixed with characters treated as whitespace (\t \r \n)
+        let attr = Attribute::from(("foo", " \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"));
         assert_eq!(
-            normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"),
+            attr.normalized_value().unwrap(),
             Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec())
         );
+
         // character references for whitespace-like characters are not combined after substitution
+        let attr = Attribute::from(("foo", "&#x20;&#xD0;&#xA0;&#x90;"));
         assert_eq!(
-            normalize_attribute_value(b"&#x20;&#xD0;&#xA0;&#x90;"),
+            attr.normalized_value().unwrap(),
             Cow::Owned::<[u8]>(b" \r\t\n".to_vec())
         );
+
         // sequence replacement mixed with characters treated as whitespace (\t \r \n)
+        let attr = Attribute::from((
+            "foo",
+            " &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;",
+        ));
         assert_eq!(
-            normalize_attribute_value(b" &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;"),
+            attr.normalized_value().unwrap(),
             Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec())
         );
     }