tafia · dralley · Jul 26, 2025 · Jul 26, 2025 · Jul 26, 2025 · Jan 29, 2023
diff --git a/Changelog.md b/Changelog.md
@@ -22,11 +22,34 @@
   - `Deserializer::buffering_with_resolver`
 - [#878]: Add ability to serialize structs in `$value` fields. The struct name will
   be used as a tag name. Previously only enums was allowed there.
+- [#806]: Add `BytesText::xml_content`, `BytesCData::xml_content` and `BytesRef::xml_content`
+  methods which return XML EOL-normalized strings.
+- [#806]: Add `BytesText::html_content`, `BytesCData::html_content` and `BytesRef::html_content`
+  methods which return HTML EOL-normalized strings.
+- [#371]: Improved compliance with the XML attribute value normalization process by adding
+  - `Attribute::normalized_value()`
+  - `Attribute::normalized_value_with()`
+  - `Attribute::decoded_and_normalized_value()`
+  - `Attribute::decoded_and_normalized_value_with()`
+
+  which ought to be used in place of deprecated
+  - `Attribute::unescape_value()`
+  - `Attribute::unescape_value_with()`
+  - `Attribute::decode_and_unescape_value()`
+  - `Attribute::decode_and_unescape_value_with()`
+
+  The deprecated functions now behave the same as the newly added ones.
 
 ### Bug Fixes
 
+- [#806]: Properly normalize EOL characters in `Deserializer`.
+
 ### Misc Changes
 
+- [#371]: New error variant `EscapeError::TooManyNestedEntities` was added.
+
+[#371]: https://github.com/tafia/quick-xml/issues/371
+[#806]: https://github.com/tafia/quick-xml/issues/806
 [#878]: https://github.com/tafia/quick-xml/pull/878
 [#882]: https://github.com/tafia/quick-xml/pull/882
 

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
@@ -44,18 +44,17 @@ static INPUTS: &[(&str, &str)] = &[
     ("players.xml", PLAYERS),
 ];
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     let mut r = Reader::from_str(doc);
     loop {
         match black_box(r.read_event()?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    black_box(attr?.decode_and_unescape_value(r.decoder())?);
+                    black_box(attr?.decoded_and_normalized_value(r.decoder())?);
                 }
             }
             Event::Text(e) => {
-                black_box(e.decode()?);
+                black_box(e.xml_content()?);
             }
             Event::CData(e) => {
                 black_box(e.into_inner());
@@ -68,19 +67,18 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
     let mut r = Reader::from_reader(doc);
     let mut buf = Vec::new();
     loop {
         match black_box(r.read_event_into(&mut buf)?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    black_box(attr?.decode_and_unescape_value(r.decoder())?);
+                    black_box(attr?.decoded_and_normalized_value(r.decoder())?);
                 }
             }
             Event::Text(e) => {
-                black_box(e.decode()?);
+                black_box(e.xml_content()?);
             }
             Event::CData(e) => {
                 black_box(e.into_inner());
@@ -94,19 +92,18 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
     let mut r = NsReader::from_str(doc);
     loop {
         match black_box(r.read_resolved_event()?) {
             (resolved_ns, Event::Start(e) | Event::Empty(e)) => {
                 black_box(resolved_ns);
                 for attr in e.attributes() {
-                    black_box(attr?.decode_and_unescape_value(r.decoder())?);
+                    black_box(attr?.decoded_and_normalized_value(r.decoder())?);
                 }
             }
             (resolved_ns, Event::Text(e)) => {
-                black_box(e.decode()?);
+                black_box(e.xml_content()?);
                 black_box(resolved_ns);
             }
             (resolved_ns, Event::CData(e)) => {
@@ -121,7 +118,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
     let mut r = NsReader::from_reader(doc);
     let mut buf = Vec::new();
@@ -130,11 +126,11 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
             (resolved_ns, Event::Start(e) | Event::Empty(e)) => {
                 black_box(resolved_ns);
                 for attr in e.attributes() {
-                    black_box(attr?.decode_and_unescape_value(r.decoder())?);
+                    black_box(attr?.decoded_and_normalized_value(r.decoder())?);
                 }
             }
             (resolved_ns, Event::Text(e)) => {
-                black_box(e.decode()?);
+                black_box(e.xml_content()?);
                 black_box(resolved_ns);
             }
             (resolved_ns, Event::CData(e)) => {

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -146,7 +146,7 @@ fn one_event(c: &mut Criterion) {
             config.trim_text(true);
             config.check_end_names = false;
             match r.read_event() {
-                Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(),
+                Ok(Event::Comment(e)) => nbtxt += e.xml_content().unwrap().len(),
                 something_else => panic!("Did not expect {:?}", something_else),
             };
 
@@ -243,6 +243,50 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks attribute value normalization; every case below currently goes through `unescape`
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        b.iter(|| {
+            black_box(unescape("foobar")).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        b.iter(|| {
+            black_box(unescape("just a bit of text without any entities")).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        b.iter(|| {
+            black_box(unescape("just a bit\n of text without\tany entities")).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        b.iter(|| {
+            let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            black_box(unescape(text)).unwrap();
+            let text = "&#38;&#60;";
+            black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        b.iter(|| {
+            let text = "age &gt; 72 &amp;&amp; age &lt; 21";
+            black_box(unescape(text)).unwrap();
+            let text = "&quot;what&apos;s that?&quot;";
+            black_box(unescape(text)).unwrap();
+        })
+    });
+
     group.finish();
 }
 
@@ -355,6 +399,7 @@ criterion_group!(
     read_resolved_event_into,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );

diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs
@@ -154,7 +154,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         let label = attrs.next().unwrap()?;
         assert_eq!(label.key, QName(b"label"));
         assert_eq!(
-            label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
+            label
+                .decoded_and_normalized_value_with(reader.decoder(), 9, |e| reader.get_entity(e))?,
             "Message: hello world"
         );
 
@@ -185,7 +186,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         let attr = attrs.next().unwrap()?;
         assert_eq!(attr.key, QName(b"attr"));
         assert_eq!(
-            attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?,
+            attr.decoded_and_normalized_value_with(reader.decoder(), 9, |e| reader.get_entity(e))?,
             "Message: hello world"
         );
 

diff --git a/examples/read_nodes.rs b/examples/read_nodes.rs
@@ -70,8 +70,8 @@ impl Translation {
         for attr_result in element.attributes() {
             let a = attr_result?;
             match a.key.as_ref() {
-                b"Language" => lang = a.decode_and_unescape_value(reader.decoder())?,
-                b"Tag" => tag = a.decode_and_unescape_value(reader.decoder())?,
+                b"Language" => lang = a.decoded_and_normalized_value(reader.decoder())?,
+                b"Tag" => tag = a.decoded_and_normalized_value(reader.decoder())?,
                 _ => (),
             }
         }
@@ -141,7 +141,7 @@ fn main() -> Result<(), AppError> {
                                             Ok::<Cow<'_, str>, Infallible>(std::borrow::Cow::from(""))
                                         })
                                         .unwrap().to_string();
-                                    let value = a.decode_and_unescape_value(reader.decoder()).or_else(|err| {
+                                    let value = a.decoded_and_normalized_value(reader.decoder()).or_else(|err| {
                                             dbg!("unable to read key in DefaultSettings attribute {:?}, utf8 error {:?}", &a, err);
                                             Ok::<Cow<'_, str>, Infallible>(std::borrow::Cow::from(""))
                                     }).unwrap().to_string();

diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs
@@ -34,7 +34,7 @@ where
                 debug_format!(e.name());
                 for a in e.attributes() {
                     debug_format!(a);
-                    if a.ok().map_or(false, |a| a.unescape_value().is_err()) {
+                    if a.ok().map_or(false, |a| a.normalized_value().is_err()) {
                         break;
                     }
                 }

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -2439,8 +2439,8 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
             }
 
             match self.next_impl()? {
-                PayloadEvent::Text(e) => result.to_mut().push_str(&e.decode()?),
-                PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?),
+                PayloadEvent::Text(e) => result.to_mut().push_str(&e.xml_content()?),
+                PayloadEvent::CData(e) => result.to_mut().push_str(&e.xml_content()?),
                 PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?,
 
                 // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef
@@ -2456,8 +2456,8 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
             return match self.next_impl()? {
                 PayloadEvent::Start(e) => Ok(DeEvent::Start(e)),
                 PayloadEvent::End(e) => Ok(DeEvent::End(e)),
-                PayloadEvent::Text(e) => self.drain_text(e.decode()?),
-                PayloadEvent::CData(e) => self.drain_text(e.decode()?),
+                PayloadEvent::Text(e) => self.drain_text(e.xml_content()?),
+                PayloadEvent::CData(e) => self.drain_text(e.xml_content()?),
                 PayloadEvent::DocType(e) => {
                     self.entity_resolver
                         .capture(e)

diff --git a/src/encoding.rs b/src/encoding.rs
@@ -6,6 +6,8 @@ use std::str::Utf8Error;
 #[cfg(feature = "encoding")]
 use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
 
+use crate::escape::{normalize_html_eols, normalize_xml_eols};
+
 /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8.
 /// See <https://unicode.org/faq/utf_bom.html#bom1>
 pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
@@ -150,6 +152,52 @@ impl Decoder {
             Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
         }
     }
+
+    /// Decodes the `Cow` buffer and normalizes XML EOLs, returning a `Cow` bound to the input lifetime `'b`
+    pub(crate) fn xml_content<'b>(
+        &self,
+        bytes: &Cow<'b, [u8]>,
+    ) -> Result<Cow<'b, str>, EncodingError> {
+        match bytes {
+            Cow::Borrowed(bytes) => {
+                let text = self.decode(bytes)?;
+                match normalize_xml_eols(&text) {
+                    // A borrowed result means normalization changed nothing, so return the decoded text as is
+                    Cow::Borrowed(_) => Ok(text),
+                    Cow::Owned(s) => Ok(Cow::Owned(s)),
+                }
+            }
+            Cow::Owned(bytes) => {
+                let text = self.decode(bytes)?;
+                let text = normalize_xml_eols(&text);
+                // Convert to owned: a borrowed `Cow` would borrow from the `&Cow` argument instead of living for `'b`
+                Ok(text.into_owned().into())
+            }
+        }
+    }
+
+    /// Decodes the `Cow` buffer and normalizes HTML5 EOLs, returning a `Cow` bound to the input lifetime `'b`
+    pub(crate) fn html_content<'b>(
+        &self,
+        bytes: &Cow<'b, [u8]>,
+    ) -> Result<Cow<'b, str>, EncodingError> {
+        match bytes {
+            Cow::Borrowed(bytes) => {
+                let text = self.decode(bytes)?;
+                match normalize_html_eols(&text) {
+                    // A borrowed result means normalization changed nothing, so return the decoded text as is
+                    Cow::Borrowed(_) => Ok(text),
+                    Cow::Owned(s) => Ok(Cow::Owned(s)),
+                }
+            }
+            Cow::Owned(bytes) => {
+                let text = self.decode(bytes)?;
+                let text = normalize_html_eols(&text);
+                // Convert to owned: a borrowed `Cow` would borrow from the `&Cow` argument instead of living for `'b`
+                Ok(text.into_owned().into())
+            }
+        }
+    }
 }
 
 /// Decodes the provided bytes using the specified encoding.