Merge pull request #895 from Mingun/fix-eol-normalization

Mingun · web-flow · commit e7fa0cec38b5 · 2025-08-24T13:30:53.000+05:00
Fix EOL normalization of `\r` followed by any character that is encoded as `c2 XX` in UTF-8, except `c2 85`
diff --git a/Changelog.md b/Changelog.md
@@ -18,8 +18,16 @@
 
 ### Bug Fixes
 
+- [#895]: Fix incorrect normalization of `\rX` EOL sequences where `X` is a character whose
+  UTF-8 encoding is [c2 xx], other than [c2 85].
+
 ### Misc Changes
 
+- [#895]: Add new `xml10_content()` and `xml11_content()` methods which behave the same as the
+  `html_content()` and `xml_content()` methods, but express the intent more clearly.
+
+[#895]: https://github.com/tafia/quick-xml/pull/895
+
 
 ## 0.38.2 -- 2025-08-19
 
diff --git a/src/escape.rs b/src/escape.rs
@@ -305,7 +305,7 @@ where
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 // TODO: It would be better to reuse buffer after decoding if possible
-pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> {
+pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> {
     let bytes = text.as_bytes();
 
     // The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
         // we are sure that index within string
         normalized.push_str(&text[0..i]);
 
-        let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n');
+        let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n');
         while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
             let index = pos + i;
             // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
             // we are sure that index within string
             normalized.push_str(&text[pos..index]);
-            pos = normalize_xml_eol_step(&mut normalized, text, index, '\n');
+            pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n');
         }
         if let Some(rest) = text.get(pos..) {
             normalized.push_str(rest);
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
 ///
 /// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
 /// [only for]: https://html.spec.whatwg.org/#normalize-newlines
-fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
+fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
     let input = text.as_bytes();
     match input[index] {
         b'\r' => {
@@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
                     normalized.push(ch);
                     return index + 2; // skip \r\n
                 }
-                // Because input is correct UTF-8 and in UTF-8 every character has
-                // an unique prefix, byte C2 means only start of #x85 character
                 if next == 0xC2 {
+                    // UTF-8 encoding of #x85 character is [c2 85]
                     if index + 2 < input.len() && input[index + 2] == 0x85 {
                         normalized.push(ch);
                     } else {
+                        normalized.push(ch);
                         // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
                         // we are sure that index within string
-                        normalized.push_str(&text[index..index + 3]);
+                        normalized.push_str(&text[index + 1..index + 3]);
                     }
                     return index + 3; // skip \r + UTF-8 encoding of character (c2 xx)
                 }
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 // TODO: It would be better to reuse buffer after decoding if possible
-pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> {
+pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> {
     let bytes = text.as_bytes();
 
     // The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
         // we are sure that index within string
         normalized.push_str(&text[0..i]);
 
-        let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n');
+        let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n');
         while let Some(i) = memchr(b'\r', &bytes[pos..]) {
             let index = pos + i;
             // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
             // we are sure that index within string
             normalized.push_str(&text[pos..index]);
-            pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n');
+            pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n');
         }
         if let Some(rest) = text.get(pos..) {
             normalized.push_str(rest);
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
 /// - `ch`: a character that should be put to the string instead of newline sequence
 ///
 /// [only for]: https://html.spec.whatwg.org/#normalize-newlines
-fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
+fn normalize_xml10_eol_step(
+    normalized: &mut String,
+    input: &[u8],
+    index: usize,
+    ch: char,
+) -> usize {
     match input[index] {
         b'\r' => {
             normalized.push(ch);
@@ -2062,56 +2067,59 @@ mod normalization {
     mod eol {
         use super::*;
 
-        mod xml {
+        mod xml11 {
             use super::*;
             use pretty_assertions::assert_eq;
 
             #[test]
             fn empty() {
-                assert_eq!(normalize_xml_eols(""), "");
+                assert_eq!(normalize_xml11_eols(""), "");
             }
 
             #[test]
             fn already_normalized() {
                 assert_eq!(
-                    normalize_xml_eols("\nalready \n\n normalized\n"),
+                    normalize_xml11_eols("\nalready \n\n normalized\n"),
                     "\nalready \n\n normalized\n",
                 );
             }
 
             #[test]
             fn cr_lf() {
-                assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext");
+                assert_eq!(
+                    normalize_xml11_eols("\r\nsome\r\n\r\ntext"),
+                    "\nsome\n\ntext"
+                );
             }
 
             #[test]
             fn cr_u0085() {
                 assert_eq!(
-                    normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
+                    normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
                     "\nsome\n\ntext",
                 );
             }
 
             #[test]
             fn u0085() {
                 assert_eq!(
-                    normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"),
+                    normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"),
                     "\nsome\n\ntext",
                 );
             }
 
             #[test]
             fn u2028() {
                 assert_eq!(
-                    normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"),
+                    normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"),
                     "\nsome\n\ntext",
                 );
             }
 
             #[test]
             fn mixed() {
                 assert_eq!(
-                    normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
+                    normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
                     "\n\n\n\n\n\nsome\n\n\ntext",
                 );
             }
@@ -2138,9 +2146,9 @@ mod normalization {
 
                     dbg!((input, &description));
                     if ch == '\u{0085}' {
-                        assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
+                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
                     } else {
-                        assert_eq!(normalize_xml_eols(input), input, "{}", description);
+                        assert_eq!(normalize_xml11_eols(input), input, "{}", description);
                     }
                 }
                 assert_eq!((first..=last).count(), 64);
@@ -2171,9 +2179,12 @@ mod normalization {
 
                     dbg!((input, &description));
                     if ch == '\u{0085}' {
-                        assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
+                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
                     } else {
-                        assert_eq!(normalize_xml_eols(input), input, "{}", description);
+                        let mut expected = utf8.clone();
+                        expected[0] = b'\n';
+                        let expected = std::str::from_utf8(&expected).expect(&description);
+                        assert_eq!(normalize_xml11_eols(input), expected, "{}", description);
                     }
                 }
                 assert_eq!((first..=last).count(), 64);
@@ -2204,68 +2215,68 @@ mod normalization {
 
                     dbg!((input, &description));
                     if ch == '\u{2028}' {
-                        assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
+                        assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
                     } else {
-                        assert_eq!(normalize_xml_eols(input), input, "{}", description);
+                        assert_eq!(normalize_xml11_eols(input), input, "{}", description);
                     }
                 }
                 assert_eq!((first..=last).count(), 4096);
             }
         }
 
-        mod html {
+        mod xml10 {
             use super::*;
             use pretty_assertions::assert_eq;
 
             #[test]
             fn empty() {
-                assert_eq!(normalize_html_eols(""), "");
+                assert_eq!(normalize_xml10_eols(""), "");
             }
 
             #[test]
             fn already_normalized() {
                 assert_eq!(
-                    normalize_html_eols("\nalready \n\n normalized\n"),
+                    normalize_xml10_eols("\nalready \n\n normalized\n"),
                     "\nalready \n\n normalized\n",
                 );
             }
 
             #[test]
             fn cr_lf() {
                 assert_eq!(
-                    normalize_html_eols("\r\nsome\r\n\r\ntext"),
+                    normalize_xml10_eols("\r\nsome\r\n\r\ntext"),
                     "\nsome\n\ntext"
                 );
             }
 
             #[test]
             fn cr_u0085() {
                 assert_eq!(
-                    normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
+                    normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
                     "\n\u{0085}some\n\u{0085}\n\u{0085}text",
                 );
             }
 
             #[test]
             fn u0085() {
                 assert_eq!(
-                    normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"),
+                    normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"),
                     "\u{0085}some\u{0085}\u{0085}text",
                 );
             }
 
             #[test]
             fn u2028() {
                 assert_eq!(
-                    normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"),
+                    normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"),
                     "\u{2028}some\u{2028}\u{2028}text",
                 );
             }
 
             #[test]
             fn mixed() {
                 assert_eq!(
-                    normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
+                    normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
                     "\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text",
                 );
             }
diff --git a/src/events/mod.rs b/src/events/mod.rs