diff --git a/Changelog.md b/Changelog.md index 280b1b59..29d46164 100644 --- a/Changelog.md +++ b/Changelog.md @@ -22,11 +22,34 @@ - `Deserializer::buffering_with_resolver` - [#878]: Add ability to serialize structs in `$value` fields. The struct name will be used as a tag name. Previously only enums was allowed there. +- [#806]: Add `BytesText::xml_content`, `BytesCData::xml_content` and `BytesRef::xml_content` + methods which returns XML EOL normalized strings. +- [#806]: Add `BytesText::html_content`, `BytesCData::html_content` and `BytesRef::html_content` + methods which returns HTML EOL normalized strings. +- [#371]: Improved compliance with the XML attribute value normalization process by adding + - `Attribute::normalized_value()` + - `Attribute::normalized_value_with()` + - `Attribute::decoded_and_normalized_value()` + - `Attribute::decoded_and_normalized_value_with()` + + which ought to be used in place of deprecated + - `Attribute::unescape_value()` + - `Attribute::unescape_value_with()` + - `Attribute::decode_and_unescape_value()` + - `Attribute::decode_and_unescape_value_with()` + + Deprecated functions now behaves the same as newly added. ### Bug Fixes +- [#806]: Properly normalize EOL characters in `Deserializer`. + ### Misc Changes +- [#371]: New error variant `EscapeError::TooManyNestedEntities` was added. + +[#371]: https://github.com/tafia/quick-xml/issues/371 +[#806]: https://github.com/tafia/quick-xml/issues/806 [#878]: https://github.com/tafia/quick-xml/pull/878 [#882]: https://github.com/tafia/quick-xml/pull/882 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 5c5b9353..bd307fbc 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -44,18 +44,17 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); loop { match black_box(r.read_event()?) 
{ Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(r.decoder())?); } } Event::Text(e) => { - black_box(e.decode()?); + black_box(e.xml_content()?); } Event::CData(e) => { black_box(e.into_inner()); @@ -68,7 +67,6 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); let mut buf = Vec::new(); @@ -76,11 +74,11 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { match black_box(r.read_event_into(&mut buf)?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(r.decoder())?); } } Event::Text(e) => { - black_box(e.decode()?); + black_box(e.xml_content()?); } Event::CData(e) => { black_box(e.into_inner()); @@ -94,7 +92,6 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { let mut r = NsReader::from_str(doc); loop { @@ -102,11 +99,11 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { black_box(resolved_ns); for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(r.decoder())?); } } (resolved_ns, Event::Text(e)) => { - black_box(e.decode()?); + black_box(e.xml_content()?); black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { @@ -121,7 +118,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { let mut r 
= NsReader::from_reader(doc); let mut buf = Vec::new(); @@ -130,11 +126,11 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { black_box(resolved_ns); for attr in e.attributes() { - black_box(attr?.decode_and_unescape_value(r.decoder())?); + black_box(attr?.decoded_and_normalized_value(r.decoder())?); } } (resolved_ns, Event::Text(e)) => { - black_box(e.decode()?); + black_box(e.xml_content()?); black_box(resolved_ns); } (resolved_ns, Event::CData(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 02adc343..63e1cc55 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -146,7 +146,7 @@ fn one_event(c: &mut Criterion) { config.trim_text(true); config.check_end_names = false; match r.read_event() { - Ok(Event::Comment(e)) => nbtxt += e.decode().unwrap().len(), + Ok(Event::Comment(e)) => nbtxt += e.xml_content().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; @@ -243,6 +243,50 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + b.iter(|| { + black_box(unescape("foobar")).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + b.iter(|| { + black_box(unescape("just a bit of text without any entities")).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + b.iter(|| { + black_box(unescape("just a bit\n of text without\tany entities")).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + b.iter(|| { + let text = "prefix "some stuff","more stuff""; + black_box(unescape(text)).unwrap(); + let text = "&<"; + black_box(unescape(text)).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + 
b.iter(|| { + let text = "age > 72 && age < 21"; + black_box(unescape(text)).unwrap(); + let text = ""what's that?""; + black_box(unescape(text)).unwrap(); + }) + }); + group.finish(); } @@ -355,6 +399,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index be9d11ea..46e2da06 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -154,7 +154,8 @@ fn main() -> Result<(), Box> { let label = attrs.next().unwrap()?; assert_eq!(label.key, QName(b"label")); assert_eq!( - label.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + label + .decoded_and_normalized_value_with(reader.decoder(), 9, |e| reader.get_entity(e))?, "Message: hello world" ); @@ -185,7 +186,7 @@ fn main() -> Result<(), Box> { let attr = attrs.next().unwrap()?; assert_eq!(attr.key, QName(b"attr")); assert_eq!( - attr.decode_and_unescape_value_with(reader.decoder(), |ent| reader.get_entity(ent))?, + attr.decoded_and_normalized_value_with(reader.decoder(), 9, |e| reader.get_entity(e))?, "Message: hello world" ); diff --git a/examples/read_nodes.rs b/examples/read_nodes.rs index 50a5f90d..cf845dba 100644 --- a/examples/read_nodes.rs +++ b/examples/read_nodes.rs @@ -70,8 +70,8 @@ impl Translation { for attr_result in element.attributes() { let a = attr_result?; match a.key.as_ref() { - b"Language" => lang = a.decode_and_unescape_value(reader.decoder())?, - b"Tag" => tag = a.decode_and_unescape_value(reader.decoder())?, + b"Language" => lang = a.decoded_and_normalized_value(reader.decoder())?, + b"Tag" => tag = a.decoded_and_normalized_value(reader.decoder())?, _ => (), } } @@ -141,7 +141,7 @@ fn main() -> Result<(), AppError> { Ok::, Infallible>(std::borrow::Cow::from("")) }) .unwrap().to_string(); - let value = a.decode_and_unescape_value(reader.decoder()).or_else(|err| { + let value = 
a.decoded_and_normalized_value(reader.decoder()).or_else(|err| { dbg!("unable to read key in DefaultSettings attribute {:?}, utf8 error {:?}", &a, err); Ok::, Infallible>(std::borrow::Cow::from("")) }).unwrap().to_string(); diff --git a/fuzz/fuzz_targets/fuzz_target_1.rs b/fuzz/fuzz_targets/fuzz_target_1.rs index dbadfe2f..9691b813 100644 --- a/fuzz/fuzz_targets/fuzz_target_1.rs +++ b/fuzz/fuzz_targets/fuzz_target_1.rs @@ -34,7 +34,7 @@ where debug_format!(e.name()); for a in e.attributes() { debug_format!(a); - if a.ok().map_or(false, |a| a.unescape_value().is_err()) { + if a.ok().map_or(false, |a| a.normalized_value().is_err()) { break; } } diff --git a/src/de/mod.rs b/src/de/mod.rs index 794be9d0..7671985b 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -2439,8 +2439,8 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { } match self.next_impl()? { - PayloadEvent::Text(e) => result.to_mut().push_str(&e.decode()?), - PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?), + PayloadEvent::Text(e) => result.to_mut().push_str(&e.xml_content()?), + PayloadEvent::CData(e) => result.to_mut().push_str(&e.xml_content()?), PayloadEvent::GeneralRef(e) => self.resolve_reference(result.to_mut(), e)?, // SAFETY: current_event_is_last_text checks that event is Text, CData or GeneralRef @@ -2456,8 +2456,8 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> { return match self.next_impl()? 
{ PayloadEvent::Start(e) => Ok(DeEvent::Start(e)), PayloadEvent::End(e) => Ok(DeEvent::End(e)), - PayloadEvent::Text(e) => self.drain_text(e.decode()?), - PayloadEvent::CData(e) => self.drain_text(e.decode()?), + PayloadEvent::Text(e) => self.drain_text(e.xml_content()?), + PayloadEvent::CData(e) => self.drain_text(e.xml_content()?), PayloadEvent::DocType(e) => { self.entity_resolver .capture(e) diff --git a/src/encoding.rs b/src/encoding.rs index 7378db39..1e51e2f8 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -6,6 +6,8 @@ use std::str::Utf8Error; #[cfg(feature = "encoding")] use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8}; +use crate::escape::{normalize_html_eols, normalize_xml_eols}; + /// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8. /// See pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF]; @@ -150,6 +152,52 @@ impl Decoder { Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()), } } + + /// Decodes the `Cow` buffer, normalizes XML EOLs, preserves the lifetime + pub(crate) fn xml_content<'b>( + &self, + bytes: &Cow<'b, [u8]>, + ) -> Result, EncodingError> { + match bytes { + Cow::Borrowed(bytes) => { + let text = self.decode(bytes)?; + match normalize_xml_eols(&text) { + // If text borrowed after normalization that means that it's not changed + Cow::Borrowed(_) => Ok(text), + Cow::Owned(s) => Ok(Cow::Owned(s)), + } + } + Cow::Owned(bytes) => { + let text = self.decode(bytes)?; + let text = normalize_xml_eols(&text); + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Ok(text.into_owned().into()) + } + } + } + + /// Decodes the `Cow` buffer, normalizes HTML5 EOLs, preserves the lifetime + pub(crate) fn html_content<'b>( + &self, + bytes: &Cow<'b, [u8]>, + ) -> Result, EncodingError> { + match bytes { + Cow::Borrowed(bytes) => { + let text = self.decode(bytes)?; + match normalize_html_eols(&text) { + // If text borrowed after normalization that means that it's not changed + 
Cow::Borrowed(_) => Ok(text), + Cow::Owned(s) => Ok(Cow::Owned(s)), + } + } + Cow::Owned(bytes) => { + let text = self.decode(bytes)?; + let text = normalize_html_eols(&text); + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Ok(text.into_owned().into()) + } + } + } } /// Decodes the provided bytes using the specified encoding. diff --git a/src/escape.rs b/src/escape.rs index dd0f5f47..78a4cdca 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,9 +1,10 @@ //! Manage xml character escapes -use memchr::memchr2_iter; +use memchr::{memchr, memchr2_iter, memchr3}; use std::borrow::Cow; use std::num::ParseIntError; use std::ops::Range; +use std::slice::Iter; /// Error of parsing character reference (`&#;` or `&#x;`). #[derive(Clone, Debug, PartialEq)] @@ -50,6 +51,12 @@ pub enum EscapeError { /// Attempt to parse character reference (`&#;` or `&#x;`) /// was unsuccessful, not all characters are decimal or hexadecimal numbers. InvalidCharRef(ParseCharRefError), + /// Expanded more than maximum possible entities during attribute normalization. + /// + /// Attribute normalization includes expanding of general entities (`&entity;`) + /// which replacement text also could contain entities, which is also must be expanded. + /// If more than 128 entities would be expanded, this error is returned. 
+ TooManyNestedEntities, } impl std::fmt::Display for EscapeError { @@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError { Self::InvalidCharRef(e) => { write!(f, "invalid character reference: {}", e) } + Self::TooManyNestedEntities => { + f.write_str("too many nested entities in an attribute value") + } } } } @@ -302,6 +312,406 @@ where } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO: It would be better to reuse buffer after decoding if possible +pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> { + let bytes = text.as_bytes(); + + // The following sequences of UTF-8 encoded input should be translated into + // a single `\n` (U+000a) character to normalize EOLs: + // + // |UTF-8 |String| + // |--------|------| + // |0d 0a |\r\n | + // |0d c2 85|\r\x85| + // |0d |\r | + // |c2 85 |\x85 | + // |e2 80 a8|\u2028| + if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) { + // We found a character that requires normalization, so create new normalized + // string, put the prefix as is and then put normalized character + let mut normalized = String::with_capacity(text.len()); + // NOTE: unsafe { text.get_unchecked(0..i) } could be used because + // we are sure that index within string + normalized.push_str(&text[0..i]); + + let mut pos = normalize_xml_eol_step(&mut normalized, bytes, i, '\n'); + while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) { + let index = pos + i; + // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because + // we are sure that index within string + normalized.push_str(&text[pos..index]); + pos = normalize_xml_eol_step(&mut normalized, bytes, index, '\n'); + } + if let Some(rest) = text.get(pos..) 
{ + normalized.push_str(rest); + } + return normalized.into(); + } + Cow::Borrowed(text) +} + +/// All line breaks MUST have been normalized on input to #xA as described +/// in [2.11 End-of-Line Handling][eof], so the rest of this algorithm operates +/// on text normalized in this way. +/// +/// To simplify the tasks of applications, the XML processor MUST behave +/// as if it normalized all line breaks in external parsed entities +/// (including the document entity) on input, before parsing, by translating +/// all of the following to a single #xA character (_which attribute normalization +/// routine will replace by #x20 character_): +/// +/// 1. the two-character sequence #xD #xA +/// 2. the two-character sequence #xD #x85 +/// 3. the single character #x85 +/// 4. the single character #x2028 +/// 5. any #xD character that is not immediately followed by #xA or #x85. +/// +/// The characters #x85 and #x2028 cannot be reliably recognized and translated +/// until an entity's encoding declaration (if present) has been read. +/// Therefore, it is a fatal error to use them within the XML declaration or text declaration. +/// +/// Note, that this function cannot be used to normalize HTML values. The text in HTML +/// normally is not normalized in any way; normalization is performed only in limited +/// contexts and [only for] `\r\n` and `\r`. +/// +/// # Parameters +/// +/// - `normalized`: the string with the result of normalization +/// - `input`: UTF-8 bytes of the string to be normalized +/// - `index`: a byte index into `input` of character which is processed right now. +/// It always points to the first byte of character in UTF-8 encoding +/// - `ch`: a character that should be put to the string instead of newline sequence +/// +/// Returns the index of next unprocessed byte in the `input`. 
/// Performs one step of XML 1.1 [end-of-line normalization][eof] at byte `index`
/// of `input`.
///
/// The following sequences are replaced by a single `ch`:
///
/// 1. the two-character sequence #xD #xA (`0d 0a`)
/// 2. the two-character sequence #xD #x85 (`0d c2 85`)
/// 3. the single character #x85 (`c2 85`)
/// 4. the single character #x2028 (`e2 80 a8`)
/// 5. any #xD character that is not immediately followed by #xA or #x85
/// 6. a lone #xA (`0a`)
///
/// Bytes `c2` and `e2` also start many characters that are *not* line ends
/// (for example `£` is `c2 a3` and `€` is `e2 82 ac`). Such characters are
/// copied to `normalized` unchanged; only the exact sequences listed above are
/// replaced.
///
/// Note, that this function cannot be used to normalize HTML values. The text in
/// HTML normally is not normalized in any way; normalization is performed only in
/// limited contexts and [only for] `\r\n` and `\r`.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `input`: UTF-8 bytes of the string to be normalized
/// - `index`: a byte index into `input` of character which is processed right now.
///   It must point to a `\r`, `\n`, `0xC2` or `0xE2` byte which is the first byte
///   of a character in UTF-8 encoding
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of next unprocessed byte in the `input`.
///
/// # Panics
///
/// Panics if `index` is out of bounds or if `input[index]` is not one of
/// `\r`, `\n`, `0xC2`, `0xE2`.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_xml_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    /// Copies the `len`-byte character starting at `index` to `normalized`
    /// unchanged and returns the index of the byte that follows it.
    fn push_verbatim(normalized: &mut String, input: &[u8], index: usize, len: usize) -> usize {
        // Clamp so that a truncated sequence cannot cause an out-of-bounds slice
        let end = input.len().min(index + len);
        // `input` is expected to be valid UTF-8, in which case nothing is replaced
        // and nothing is allocated here
        normalized.push_str(&String::from_utf8_lossy(&input[index..end]));
        end
    }

    // `index` is in bounds (checked by the indexing below), so `index + 1 <= len`
    // and slicing the tail never panics
    match input[index] {
        b'\r' => {
            normalized.push(ch);
            let tail = &input[index + 1..];
            if tail.starts_with(b"\n") {
                index + 2 // skip \r\n
            } else if tail.starts_with(&[0xC2, 0x85]) {
                index + 3 // skip UTF-8 encoding of #xD #x85 characters (0d c2 85)
            } else {
                // Lone \r. If it is followed by another `c2 xx` character, that
                // character is left for the next step
                index + 1
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Possible start of UTF-8 encoding of #x85 character (c2 85)
        0xC2 => {
            if input[index + 1..].starts_with(&[0x85]) {
                normalized.push(ch);
                index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
            } else {
                // Some other character from U+0080..=U+00BF, always 2 bytes long
                push_verbatim(normalized, input, index, 2)
            }
        }
        // Possible start of UTF-8 encoding of #x2028 character (e2 80 a8)
        0xE2 => {
            if input[index + 1..].starts_with(&[0x80, 0xA8]) {
                normalized.push(ch);
                index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
            } else {
                // Some other character from U+2000..=U+2FFF, always 3 bytes long
                push_verbatim(normalized, input, index, 3)
            }
        }

        x => unreachable!(
            "at {}: expected '\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}
text.get_unchecked(0..i) } could be used because + // we are sure that index within string + normalized.push_str(&text[0..i]); + + let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n'); + while let Some(i) = memchr(b'\r', &bytes[pos..]) { + let index = pos + i; + // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because + // we are sure that index within string + normalized.push_str(&text[pos..index]); + pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n'); + } + if let Some(rest) = text.get(pos..) { + normalized.push_str(rest); + } + return normalized.into(); + } + Cow::Borrowed(text) +} + +/// The text in HTML normally is not normalized in any way; normalization is +/// performed only in limited contexts and [only for] `\r\n` and `\r`. +/// +/// # Parameters +/// +/// - `normalized`: the string with the result of normalization +/// - `input`: UTF-8 bytes of the string to be normalized +/// - `index`: a byte index into `input` of character which is processed right now. 
/// Handles a single HTML line break located at byte `index` of `input`.
///
/// HTML text is normally left as is; newline normalization is applied only in
/// limited contexts and [only for] `\r\n` and `\r`. This step appends `ch` in
/// place of the line break and tells the caller where to continue.
///
/// # Parameters
///
/// - `normalized`: the string with the result of normalization
/// - `input`: UTF-8 bytes of the string to be normalized
/// - `index`: a byte index into `input` of the `\r` or `\n` byte which is
///   processed right now
/// - `ch`: a character that should be put to the string instead of newline sequence
///
/// Returns the index of the next unprocessed byte in `input`: `index + 2` for
/// `\r\n`, otherwise `index + 1`.
///
/// # Panics
///
/// Panics if `index` is out of bounds or does not point to `\r` or `\n`.
///
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    let current = input[index];
    if current != b'\r' && current != b'\n' {
        unreachable!(
            "at {}: expected ''\\n' or '\\r', found '{}' / {} / `0x{:X}`",
            index, current as char, current, current
        );
    }

    normalized.push(ch);
    // Only `\r` can open a two-byte line break; a lone `\n` is always one byte
    let is_cr_lf = current == b'\r' && input.get(index + 1) == Some(&b'\n');
    index + if is_cr_lf { 2 } else { 1 }
}
If expansion +/// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`] +/// - `resolve_entity`: a function to resolve entity. This function could be called +/// multiple times on the same input and can return different values in each case +/// for the same input, although it is not recommended +/// +/// # Lifetimes +/// +/// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred, +/// the input returned unchanged with the same lifetime +/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine +/// +/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize +pub(crate) fn normalize_attribute_value<'input, 'entity, F>( + value: &'input str, + depth: usize, + mut resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + let mut iter = value.as_bytes().iter(); + + // If we found the charater that requires normalization, create a normalized + // version of the attribute, otherwise return the value unchanged + if let Some(i) = iter.position(is_normalization_char) { + let mut normalized = String::with_capacity(value.len()); + let pos = normalize_attribute_step( + &mut normalized, + &mut iter, + value, + 0, + i, + depth, + &mut resolve_entity, + )?; + + normalize_attribute_steps( + &mut normalized, + &mut iter, + value, + pos, + depth, + &mut resolve_entity, + )?; + return Ok(normalized.into()); + } + Ok(Cow::Borrowed(value)) +} + +fn normalize_attribute_steps<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + mut pos: usize, + depth: usize, + resolve_entity: &mut F, +) -> Result<(), EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + while let Some(i) = iter.position(is_normalization_char) { + pos = + 
normalize_attribute_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?; + } + if let Some(rest) = input.get(pos..) { + normalized.push_str(rest); + } + Ok(()) +} + +/// Performs one step of the [normalization algorithm] (but with recursive part): +/// +/// 1. For a character reference, append the referenced character +/// to the normalized value. +/// 2. For an entity reference, recursively apply this algorithm +/// to the replacement text of the entity. +/// 3. For a white space character (#x20, #xD, #xA, #x9), append +/// a space character (#x20) to the normalized value. +/// 4. For another character, append the character to the normalized value. +/// +/// Because [according to the specification], XML parser should parse line-of-end +/// normalized input, but quick-xml does not do that, this function also performs +/// normalization of EOL characters. That should be done before expanding entities +/// and character references, so cannot be processed later. +/// +/// This function could be used also just to normalize line ends if the iterator +/// won't be stop on `&` characters. +/// +/// # Parameters +/// +/// - `normalized`: Output of the algorithm. Normalized value will be placed here +/// - `iter`: Iterator over bytes of `input` +/// - `input`: Original non-normalized value +/// - `last_pos`: Index of the last byte in `input` that was processed +/// - `index`: Index of the byte in `input` that should be processed now +/// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space +/// so this parameter tracks if we seen the `\r` before processing the current byte +/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm +/// - `resolve_entity`: Resolver of entities. 
Returns `None` for unknown entities +/// +/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize +/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends +fn normalize_attribute_step<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + last_pos: usize, + index: usize, + depth: usize, + resolve_entity: &mut F, +) -> Result +where + // the lifetime of the output comes from a capture or is `'static` + F: FnMut(&str) -> Option<&'entity str>, +{ + if depth == 0 { + return Err(EscapeError::TooManyNestedEntities); + } + // 4. For another character, append the character to the normalized value. + normalized.push_str(&input[last_pos..index]); + + match input.as_bytes()[index] { + b'&' => { + let start = index + 1; // +1 - skip `&` + let end = start + + match iter.position(|&b| b == b';') { + Some(end) => end, + None => return Err(EscapeError::UnterminatedEntity(index..input.len())), + }; + + // Content between & and ; - &pat; + // Note, that this content have non-normalized EOLs as required by the specification, + // but because numbers in any case cannot have spaces inside, this is not the problem. + // Normalization of spaces in entity references and checking that they corresponds to + // [`Name`] production on conscience `resolve_entity`. + // + // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name + let pat = &input[start..end]; + // 1. For a character reference, append the referenced character + // to the normalized value. + if pat.starts_with('#') { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + } else + // 2. For an entity reference, recursively apply this algorithm + // to the replacement text of the entity. 
+ if let Some(value) = resolve_entity(pat) { + normalize_attribute_steps( + normalized, + &mut value.as_bytes().iter(), + value, + 0, + depth.saturating_sub(1), + resolve_entity, + )?; + } else { + return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string())); + } + Ok(end + 1) // +1 - skip `;` + } + // 3. For a white space character (#x20, #xD, #xA, #x9), append + // a space character (#x20) to the normalized value. + // Space character has no special meaning, so it is handled on step 4 + b'\t' => { + normalized.push(' '); + Ok(index + 1) // +1 - skip \t + } + _ => { + let pos = normalize_xml_eol_step(normalized, input.as_bytes(), index, ' '); + // We should advance iterator because we may skip several characters + for _ in 0..pos - index - 1 { + iter.next(); + } + Ok(pos) + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Resolves predefined XML entities or all HTML5 entities depending on the feature /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html). 
/// @@ -1844,3 +2254,242 @@ fn from_str_radix(src: &str, radix: u32) -> Result { _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber), } } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#[cfg(test)] +mod normalization { + use super::*; + + mod eol { + use super::*; + + mod xml { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!(normalize_xml_eols(""), ""); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_xml_eols("\nalready \n\n normalized\n"), + "\nalready \n\n normalized\n", + ); + } + + #[test] + fn cr_lf() { + assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext"); + } + + #[test] + fn cr_u0085() { + assert_eq!( + normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), + "\nsome\n\ntext", + ); + } + + #[test] + fn u0085() { + assert_eq!( + normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"), + "\nsome\n\ntext", + ); + } + + #[test] + fn u2028() { + assert_eq!( + normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"), + "\nsome\n\ntext", + ); + } + + #[test] + fn mixed() { + assert_eq!( + normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), + "\n\n\n\n\n\nsome\n\n\ntext", + ); + } + } + + mod html { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!(normalize_html_eols(""), ""); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_html_eols("\nalready \n\n normalized\n"), + "\nalready \n\n normalized\n", + ); + } + + #[test] + fn cr_lf() { + assert_eq!( + normalize_html_eols("\r\nsome\r\n\r\ntext"), + "\nsome\n\ntext" + ); + } + + #[test] + fn cr_u0085() { + assert_eq!( + normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"), + "\n\u{0085}some\n\u{0085}\n\u{0085}text", + ); + } + + #[test] + fn u0085() { + assert_eq!( + normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"), + 
"\u{0085}some\u{0085}\u{0085}text", + ); + } + + #[test] + fn u2028() { + assert_eq!( + normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"), + "\u{2028}some\u{2028}\u{2028}text", + ); + } + + #[test] + fn mixed() { + assert_eq!( + normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"), + "\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text", + ); + } + } + } + + mod attribute { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!( + normalize_attribute_value("", 5, |_| { None }), + Ok("".into()) + ); + } + + #[test] + fn only_spaces() { + assert_eq!( + normalize_attribute_value(" ", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\t\t\t", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\r\r\r", 5, |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\n\n\n", 5, |_| { None }), + Ok(" ".into()) + ); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_attribute_value("already normalized", 5, |_| { None }), + Ok("already normalized".into()) + ); + } + + #[test] + fn characters() { + assert_eq!( + normalize_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + assert_eq!( + normalize_attribute_value("string with character", 5, |_| { None }), + Ok("string with character".into()) + ); + } + + #[test] + fn entities() { + assert_eq!( + normalize_attribute_value("string with &entity; reference", 5, |_| { + Some("replacement") + }), + Ok("string with replacement reference".into()) + ); + assert_eq!( + normalize_attribute_value("string with &entity-1; reference", 5, |entity| { + match entity { + "entity-1" => Some("recursive &entity-2;"), + "entity-2" => Some("entity 2"), + _ => None, + } + }), + Ok("string with recursive entity 2 reference".into()) + ); + } + + #[test] + fn unclosed_entity() { + assert_eq!( + normalize_attribute_value("string with unclosed 
&entity reference", 5, |_| { + // 0 ^ = 21 ^ = 38 + Some("replacement") + }), + Err(EscapeError::UnterminatedEntity(21..38)) + ); + assert_eq!( + normalize_attribute_value( + "string with unclosed (character) reference", + // ^ = 21 ^ = 47 + 5, + |_| { None } + ), + Err(EscapeError::UnterminatedEntity(21..47)) + ); + } + + #[test] + fn unknown_entity() { + assert_eq!( + normalize_attribute_value("string with unknown &entity; reference", 5, |_| { + // 0 ^ ^ = 21..27 + None + }), + Err(EscapeError::UnrecognizedEntity( + 21..27, + "entity".to_string(), + )) + ); + } + + #[test] + fn recursive_entity() { + assert_eq!( + normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")), + Err(EscapeError::TooManyNestedEntities), + ); + } + } +} diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 5fbb1cb1..b1bb1943 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,7 +4,9 @@ use crate::encoding::Decoder; use crate::errors::Result as XmlResult; -use crate::escape::{escape, resolve_predefined_entity, unescape_with}; +use crate::escape::{ + escape, normalize_attribute_value, resolve_predefined_entity, resolve_xml_entity, +}; use crate::name::{LocalName, Namespace, QName}; use crate::reader::NsReader; use crate::utils::{is_whitespace, Bytes}; @@ -16,11 +18,11 @@ use std::{borrow::Cow, ops::Range}; /// A struct representing a key/value XML attribute. /// /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely -/// want to access the value using one of the [`unescape_value`] and [`decode_and_unescape_value`] +/// want to access the value using one of the [`normalized_value`] and [`decoded_and_normalized_value`] /// functions. 
/// -/// [`unescape_value`]: Self::unescape_value -/// [`decode_and_unescape_value`]: Self::decode_and_unescape_value +/// [`normalized_value`]: Self::normalized_value +/// [`decoded_and_normalized_value`]: Self::decoded_and_normalized_value #[derive(Clone, Eq, PartialEq)] pub struct Attribute<'a> { /// The key to uniquely define the attribute. @@ -32,7 +34,240 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per [the XML specification]. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// The following escape sequences are replaced with their unescaped equivalents: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + /// | `'` | `'` + /// | `"` | `"` + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, althougth you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` + /// - `\x2028` + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`normalized_value_with()`](Self::normalized_value_with). + /// + ///
+ /// + /// NOTE: Because this method is available only if the [`encoding`] feature is **not** enabled, + /// it should only be used by applications. + /// Libraries should use [`decoded_and_normalized_value()`](Self::decoded_and_normalized_value) + /// instead, because if a library is used in a project which depends on quick_xml with the + /// [`encoding`] feature enabled, the library will fail to compile due to [feature unification]. + /// + ///
+ /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`encoding`]: ../../index.html#encoding + /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value(&self) -> XmlResult> { + // resolve_xml_entity returns only non-recursive replacements, so depth=1 is enougth + self.normalized_value_with(1, resolve_xml_entity) + } + + /// Returns the attribute value normalized as per [the XML specification], + /// using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// A function for resolving entities can be provided as `resolve_entity`. + /// This method does not resolve any predefined entities, but you can use + /// [`resolve_xml_entity`] in your function. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, althougth you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` + /// - `\x2028` + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`normalized_value()`](Self::normalized_value). + /// + ///
+ /// + /// NOTE: Because this method is available only if the [`encoding`] feature is **not** enabled, + /// it should only be used by applications. + /// Libraries should use [`decoded_and_normalized_value_with()`](Self::decoded_and_normalized_value_with) + /// instead, because if a library is used in a project which depends on quick_xml with the + /// [`encoding`] feature enabled, the library will fail to compile due to [feature unification]. + /// + ///
+ /// + /// # Parameters + /// + /// - `depth`: maximum number of nested entities that can be expanded. If expansion + /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`] + /// - `resolve_entity`: a function to resolve entity. This function could be called + /// multiple times on the same input and can return different values in each case + /// for the same input, although it is not recommended + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`encoding`]: ../../index.html#encoding + /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification + /// [`EscapeError::TooManyNestedEntities`]: crate::escape::EscapeError::TooManyNestedEntities + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value_with<'entity>( + &self, + depth: usize, + resolve_entity: impl FnMut(&str) -> Option<&'entity str>, + ) -> XmlResult> { + use crate::encoding::EncodingError; + use std::str::from_utf8; + + let decoded = match &self.value { + Cow::Borrowed(bytes) => Cow::Borrowed(from_utf8(bytes).map_err(EncodingError::Utf8)?), + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => { + Cow::Owned(from_utf8(bytes).map_err(EncodingError::Utf8)?.to_owned()) + } + }; + + match normalize_attribute_value(&decoded, depth, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification]. + /// + /// Do not use this method with HTML attributes. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). 
+ /// + /// The following escape sequences are replaced with their unescaped equivalents: + /// + /// | Character | Replacement + /// |-----------|------------ + /// | `<` | `<` + /// | `>` | `>` + /// | `&` | `&` + /// | `'` | `'` + /// | `"` | `"` + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// Note, althougth you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` + /// - `\x2028` + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`decoded_and_normalized_value_with()`](#method.decoded_and_normalized_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + pub fn decoded_and_normalized_value(&self, decoder: Decoder) -> XmlResult> { + // resolve_xml_entity returns only non-recursive replacements, so depth=1 is enougth + self.decoded_and_normalized_value_with(decoder, 1, resolve_xml_entity) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification], using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// The characters `\t`, `\r`, `\n` are replaced with whitespace characters (`0x20`). + /// + /// A function for resolving entities can be provided as `resolve_entity`. + /// This method does not resolve any predefined entities, but you can use + /// [`resolve_xml_entity`] in your function. + /// + /// This will allocate unless the raw attribute value does not require normalization. 
+ /// + /// Note, althougth you may use this library to parse HTML, you cannot use this + /// method to get HTML content, because its returns normalized value: the following + /// sequences are translated into a single space (U+0020) character: + /// + /// - `\r\n` + /// - `\r\x85` + /// - `\r` + /// - `\n` + /// - `\t` + /// - `\x85` + /// - `\x2028` + /// + /// The text in HTML normally is not normalized in any way; normalization is + /// performed only in limited contexts and [only for] `\r\n` and `\r`. + /// + /// See also [`decoded_and_normalized_value()`](#method.decoded_and_normalized_value) + /// + /// # Parameters + /// + /// - `depth`: maximum number of nested entities that can be expanded. If expansion + /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`] + /// - `resolve_entity`: a function to resolve entity. This function could be called + /// multiple times on the same input and can return different values in each case + /// for the same input, although it is not recommended + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + /// [only for]: https://html.spec.whatwg.org/#normalize-newlines + /// [`EscapeError::TooManyNestedEntities`]: crate::escape::EscapeError::TooManyNestedEntities + pub fn decoded_and_normalized_value_with<'entity>( + &self, + decoder: Decoder, + depth: usize, + resolve_entity: impl FnMut(&str) -> Option<&'entity str>, + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => decoder.decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(), + }; + + match normalize_attribute_value(&decoded, depth, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Returns the unescaped value. 
/// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -45,7 +280,7 @@ impl<'a> Attribute<'a> { /// /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, /// should only be used by applications. - /// Libs should use [`decode_and_unescape_value()`](Self::decode_and_unescape_value) + /// Libs should use [`decoded_and_normalized_value()`](Self::decoded_and_normalized_value) /// instead, because if lib will be used in a project which depends on quick_xml with /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. /// @@ -54,8 +289,10 @@ impl<'a> Attribute<'a> { /// [`encoding`]: ../../index.html#encoding /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification #[cfg(any(doc, not(feature = "encoding")))] + #[deprecated = "use `Self::normalized_value()`"] pub fn unescape_value(&self) -> XmlResult> { - self.unescape_value_with(resolve_predefined_entity) + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enougth + self.normalized_value_with(1, resolve_predefined_entity) } /// Decodes using UTF-8 then unescapes the value, using custom entities. @@ -73,7 +310,7 @@ impl<'a> Attribute<'a> { /// /// NOTE: Because this method is available only if [`encoding`] feature is **not** enabled, /// should only be used by applications. - /// Libs should use [`decode_and_unescape_value_with()`](Self::decode_and_unescape_value_with) + /// Libs should use [`decoded_and_normalized_value_with()`](Self::decoded_and_normalized_value_with) /// instead, because if lib will be used in a project which depends on quick_xml with /// [`encoding`] feature enabled, the lib will fail to compile due to [feature unification]. 
/// @@ -82,38 +319,36 @@ impl<'a> Attribute<'a> { /// [`encoding`]: ../../index.html#encoding /// [feature unification]: https://doc.rust-lang.org/cargo/reference/features.html#feature-unification #[cfg(any(doc, not(feature = "encoding")))] + #[deprecated = "use `Self::normalized_value_with()`"] #[inline] pub fn unescape_value_with<'entity>( &self, resolve_entity: impl FnMut(&str) -> Option<&'entity str>, ) -> XmlResult> { - self.decode_and_unescape_value_with(Decoder::utf8(), resolve_entity) + self.normalized_value_with(128, resolve_entity) } /// Decodes then unescapes the value. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. + #[deprecated = "use `Self::decoded_and_normalized_value()`"] pub fn decode_and_unescape_value(&self, decoder: Decoder) -> XmlResult> { - self.decode_and_unescape_value_with(decoder, resolve_predefined_entity) + // resolve_predefined_entity returns only non-recursive replacements, so depth=1 is enougth + self.decoded_and_normalized_value_with(decoder, 1, resolve_predefined_entity) } /// Decodes then unescapes the value with custom entities. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. + #[deprecated = "use `Self::decoded_and_normalized_value_with()`"] pub fn decode_and_unescape_value_with<'entity>( &self, decoder: Decoder, resolve_entity: impl FnMut(&str) -> Option<&'entity str>, ) -> XmlResult> { - let decoded = decoder.decode_cow(&self.value)?; - - match unescape_with(&decoded, resolve_entity)? { - // Because result is borrowed, no replacements was done and we can use original string - Cow::Borrowed(_) => Ok(decoded), - Cow::Owned(s) => Ok(s.into()), - } + self.decoded_and_normalized_value_with(decoder, 128, resolve_entity) } /// If attribute value [represents] valid boolean values, returns `Some`, otherwise returns `None`. 
@@ -1011,6 +1246,104 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + mod attribute_value_normalization { + use super::*; + use crate::errors::Error; + use crate::escape::EscapeError::*; + use pretty_assertions::assert_eq; + + /// Empty values returned are unchanged + #[test] + fn empty() { + let raw_value = "".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.decoded_and_normalized_value(Decoder::utf8()).unwrap(), + Cow::Borrowed("") + ); + } + + /// Already normalized values are returned unchanged + #[test] + fn already_normalized() { + let raw_value = "foobar123".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.decoded_and_normalized_value(Decoder::utf8()).unwrap(), + Cow::Borrowed("foobar123") + ); + } + + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with + /// a space character, \r\n and \r\u{85} should be replaced by one space + #[test] + fn space_replacement() { + let raw_value = "\r\nfoo\u{85}\u{2028}\rbar\tbaz\n\ndelta\n\r\u{85}".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.decoded_and_normalized_value(Decoder::utf8()).unwrap(), + Cow::::Owned(" foo bar baz delta ".to_string()) + ); + } + + /// Entities must be terminated + #[test] + fn unterminated_entity() { + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + match attr.decoded_and_normalized_value(Decoder::utf8()) { + Err(Error::Escape(err)) => assert_eq!(err, UnterminatedEntity(3..11)), + x => panic!("Expected Err(Escape(err)), got {:?}", x), + } + } + + /// Unknown entities raise error + #[test] + fn unrecognized_entity() { + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + match attr.decoded_and_normalized_value(Decoder::utf8()) { + // TODO: is this divergence between range behavior of UnterminatedEntity + // and 
UnrecognizedEntity appropriate? existing unescape code behaves the same. (see: start index) + Err(Error::Escape(err)) => { + assert_eq!(err, UnrecognizedEntity(4..8, "unkn".to_owned())) + } + x => panic!("Expected Err(Escape(err)), got {:?}", x), + } + } + + /// custom entity replacement works, entity replacement text processed recursively + #[test] + fn entity_replacement() { + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some(" "), + "a" => Some(" "), + "da" => Some(" "), + _ => None, + } + } + assert_eq!( + attr.decoded_and_normalized_value_with(Decoder::utf8(), 5, &custom_resolver) + .unwrap(), + Cow::::Owned("\r\rA\n \nB\r\n".to_string()) + ); + } + + #[test] + fn char_references() { + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.decoded_and_normalized_value(Decoder::utf8()).unwrap(), + Cow::::Owned("\r\rA\n\nB\r\n".to_string()) + ); + } + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/events/mod.rs b/src/events/mod.rs index fc24e2d9..18b90ee4 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -583,6 +583,46 @@ impl<'a> BytesText<'a> { self.decoder.decode_cow(&self.content) } + /// Decodes the content of the XML event. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this text event. + /// + /// This will allocate if the value contains any escape sequences or in non-UTF-8 + /// encoding, or EOL normalization is required. + /// + /// Note, that this method should be used only if event represents XML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. 
+ /// + /// To get HTML content use [`html_content()`](Self::html_content). + /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml_content(&self) -> Result, EncodingError> { + self.decoder.xml_content(&self.content) + } + + /// Decodes the content of the HTML event. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this text event. + /// + /// This will allocate if the value contains any escape sequences or in non-UTF-8 + /// encoding, or EOL normalization is required. + /// + /// Note, that this method should be used only if event represents HTML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// + /// To get XML content use [`xml_content()`](Self::xml_content). + /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn html_content(&self) -> Result, EncodingError> { + self.decoder.html_content(&self.content) + } + /// Removes leading XML whitespace bytes from text content. /// /// Returns `true` if content is empty after that @@ -828,7 +868,49 @@ impl<'a> BytesCData<'a> { /// associated with that reader to interpret the raw bytes contained within this /// CDATA event. pub fn decode(&self) -> Result, EncodingError> { - Ok(self.decoder.decode_cow(&self.content)?) + self.decoder.decode_cow(&self.content) + } + + /// Decodes the raw input byte content of the CDATA section of the XML event + /// into a string. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this CDATA event. + /// + /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization + /// is required. 
+ /// + /// Note, that this method should be used only if event represents XML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// + /// To get HTML content use [`html_content()`](Self::html_content). + /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml_content(&self) -> Result, EncodingError> { + self.decoder.xml_content(&self.content) + } + + /// Decodes the raw input byte content of the CDATA section of the HTML event + /// into a string. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this CDATA event. + /// + /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization + /// is required. + /// + /// Note, that this method should be used only if event represents HTML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// + /// To get XML content use [`xml_content()`](Self::xml_content). + /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn html_content(&self) -> Result, EncodingError> { + self.decoder.html_content(&self.content) } } @@ -1443,6 +1525,46 @@ impl<'a> BytesRef<'a> { self.decoder.decode_cow(&self.content) } + /// Decodes the content of the XML event. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this general reference event. + /// + /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization + /// is required. + /// + /// Note, that this method should be used only if event represents XML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// + /// To get HTML content use [`html_content()`](Self::html_content). 
+ /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn xml_content(&self) -> Result, EncodingError> { + self.decoder.xml_content(&self.content) + } + + /// Decodes the content of the HTML event. + /// + /// When this event produced by the reader, it uses the encoding information + /// associated with that reader to interpret the raw bytes contained within + /// this general reference event. + /// + /// This will allocate if the value in non-UTF-8 encoding, or EOL normalization + /// is required. + /// + /// Note, that this method should be used only if event represents HTML content, + /// because rules for normalizing EOLs for [XML] and [HTML] differs. + /// + /// To get XML content use [`xml_content()`](Self::xml_content). + /// + /// [XML]: https://www.w3.org/TR/xml11/#sec-line-ends + /// [HTML]: https://html.spec.whatwg.org/#normalize-newlines + pub fn html_content(&self) -> Result, EncodingError> { + self.decoder.html_content(&self.content) + } + /// Returns `true` if the specified reference represents the character reference /// (`&#;`). 
/// diff --git a/tests/encodings.rs b/tests/encodings.rs index 7b64e167..30f132b3 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -37,7 +37,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event_into(&mut buf) { Ok(Text(e)) => { - e.decode().unwrap(); + e.xml_content().unwrap(); } Ok(Eof) => break, _ => (), diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index 25cf6989..a158d570 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -31,14 +31,14 @@ fn fuzz_101() { Ok(Event::Start(e)) | Ok(Event::Empty(e)) => { for a in e.attributes() { if a.ok().map_or(true, |a| { - a.decode_and_unescape_value(reader.decoder()).is_err() + a.decoded_and_normalized_value(reader.decoder()).is_err() }) { break; } } } Ok(Event::Text(e)) => { - if e.decode().is_err() { + if e.xml_content().is_err() { break; } } diff --git a/tests/reader.rs b/tests/reader.rs index fecdeabc..15ce8eaf 100644 --- a/tests/reader.rs +++ b/tests/reader.rs @@ -172,7 +172,7 @@ fn test_escaped_content() { "content unexpected: expecting 'test', got '{:?}'", from_utf8(&e) ); - match e.decode() { + match e.xml_content() { Ok(c) => assert_eq!(c, "test"), Err(e) => panic!( "cannot escape content at position {}: {:?}", diff --git a/tests/roundtrip.rs b/tests/roundtrip.rs index 4fb9ec53..c99e6f64 100644 --- a/tests/roundtrip.rs +++ b/tests/roundtrip.rs @@ -236,7 +236,7 @@ fn reescape_text() { match reader.read_event().unwrap() { Eof => break, Text(e) => { - let t = e.decode().unwrap(); + let t = e.xml_content().unwrap(); assert!(writer.write_event(Text(BytesText::new(&t))).is_ok()); } e => assert!(writer.write_event(e).is_ok()), diff --git a/tests/serde-se.rs b/tests/serde-se.rs index ec73b4ee..051e47c9 100644 --- a/tests/serde-se.rs +++ b/tests/serde-se.rs @@ -1897,9 +1897,11 @@ mod with_root { 3"); serialize_as!(tuple: // Use to_string() to get owned type that is required for deserialization - ("<\"&'>".to_string(), "with\t\r\n spaces", 3usize) + // NOTE: do not use \r, because it normalized to \n 
during deserialization + // but is written as is during serialization + ("<\"&'>".to_string(), "with\t\n spaces", 3usize) => "<\"&'>\ - with\t\r\n spaces\ + with\t\n spaces\ 3"); serialize_as!(tuple_struct: Tuple(42.0, "answer")