Skip to content

Commit f7add85

Browse files
committed
Implement an attribute normalization routine
1 parent 166376d commit f7add85

File tree

1 file changed

+298
-0
lines changed

1 file changed

+298
-0
lines changed

src/escape.rs

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use memchr::memchr2_iter;
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
7+
use std::slice::Iter;
78

89
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910
#[derive(Clone, Debug, PartialEq)]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253
InvalidCharRef(ParseCharRefError),
54+
/// Expanded more than maximum possible entities during attribute normalization.
55+
///
56+
/// Attribute normalization includes expanding of general entities (`&entity;`)
57+
/// whose replacement text could also contain entities, which must be expanded as well.
58+
/// If more than 128 entities would be expanded, this error is returned.
59+
TooManyNestedEntities,
5360
}
5461

5562
impl std::fmt::Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673
Self::InvalidCharRef(e) => {
6774
write!(f, "invalid character reference: {}", e)
6875
}
76+
Self::TooManyNestedEntities => {
77+
f.write_str("too many nested entities in an attribute value")
78+
}
6979
}
7080
}
7181
}
@@ -302,6 +312,182 @@ where
302312
}
303313
}
304314

315+
/// Returns `true` if the byte needs special handling during attribute value
/// normalization: a white space character (`\t`, `\r`, `\n`, ` `) or the `&`
/// that starts a character or entity reference.
///
/// Takes the byte by reference so the function can be passed directly as a
/// predicate to iterators over `&u8`.
const fn is_normalization_char(b: &u8) -> bool {
    matches!(*b, b'\t' | b'\r' | b'\n' | b' ' | b'&')
}
318+
319+
/// Returns the attribute value normalized as per [the XML specification],
320+
/// using a custom entity resolver.
321+
///
322+
/// Do not use this method with HTML attributes.
323+
///
324+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
325+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
326+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
327+
/// take precedence.
328+
///
329+
/// This will allocate unless the raw attribute value does not require normalization.
330+
///
331+
/// # Parameters
332+
///
333+
/// - `value`: unnormalized attribute value
334+
/// - `depth`: maximum number of nested entities that can be expanded. If expansion
335+
/// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
336+
/// - `resolve_entity`: a function to resolve entity. This function could be called
337+
/// multiple times on the same input and can return different values in each case
338+
/// for the same input, although it is not recommended
339+
///
340+
/// # Lifetimes
341+
///
342+
/// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
343+
/// the input returned unchanged with the same lifetime
344+
/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
345+
///
346+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
347+
pub(crate) fn normalize_attribute_value<'input, 'entity, F>(
348+
value: &'input str,
349+
depth: usize,
350+
resolve_entity: F,
351+
) -> Result<Cow<'input, str>, EscapeError>
352+
where
353+
// the lifetime of the output comes from a capture or is `'static`
354+
F: Fn(&str) -> Option<&'entity str>,
355+
{
356+
let mut iter = value.as_bytes().iter();
357+
358+
// If we found the charater that requires normalization, create a normalized
359+
// version of the attribute, otherwise return the value unchanged
360+
if let Some(i) = iter.position(is_normalization_char) {
361+
let mut normalized = String::with_capacity(value.len());
362+
let pos = normalize_step(
363+
&mut normalized,
364+
&mut iter,
365+
value,
366+
0,
367+
i,
368+
depth,
369+
&resolve_entity,
370+
)?;
371+
372+
normalize_steps(
373+
&mut normalized,
374+
&mut iter,
375+
value,
376+
pos,
377+
depth,
378+
&resolve_entity,
379+
)?;
380+
return Ok(normalized.into());
381+
}
382+
Ok(Cow::Borrowed(value))
383+
}
384+
385+
fn normalize_steps<'entity, F>(
386+
normalized: &mut String,
387+
iter: &mut Iter<u8>,
388+
input: &str,
389+
mut pos: usize,
390+
depth: usize,
391+
resolve_entity: &F,
392+
) -> Result<(), EscapeError>
393+
where
394+
// the lifetime of the output comes from a capture or is `'static`
395+
F: Fn(&str) -> Option<&'entity str>,
396+
{
397+
while let Some(i) = iter.position(is_normalization_char) {
398+
pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?;
399+
}
400+
if let Some(rest) = input.get(pos..) {
401+
normalized.push_str(rest);
402+
}
403+
Ok(())
404+
}
405+
406+
/// Performs one step of the [normalization algorithm] (but with recursive part):
407+
///
408+
/// 1. For a character reference, append the referenced character
409+
/// to the normalized value.
410+
/// 2. For an entity reference, recursively apply this algorithm
411+
/// to the replacement text of the entity.
412+
/// 3. For a white space character (#x20, #xD, #xA, #x9), append
413+
/// a space character (#x20) to the normalized value.
414+
/// 4. For another character, append the character to the normalized value.
415+
///
416+
/// # Parameters
417+
///
418+
/// - `normalized`: Output of the algorithm. Normalized value will be placed here
419+
/// - `iter`: Iterator over bytes of `input`
420+
/// - `input`: Original non-normalized value
421+
/// - `last_pos`: Index of the last byte in `input` that was processed
422+
/// - `index`: Index of the byte in `input` that should be processed now
423+
/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
424+
/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
425+
///
426+
/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
427+
fn normalize_step<'entity, F>(
428+
normalized: &mut String,
429+
iter: &mut Iter<u8>,
430+
input: &str,
431+
last_pos: usize,
432+
index: usize,
433+
depth: usize,
434+
resolve_entity: &F,
435+
) -> Result<usize, EscapeError>
436+
where
437+
// the lifetime of the output comes from a capture or is `'static`
438+
F: Fn(&str) -> Option<&'entity str>,
439+
{
440+
if depth == 0 {
441+
return Err(EscapeError::TooManyNestedEntities);
442+
}
443+
// 4. For another character, append the character to the normalized value.
444+
normalized.push_str(&input[last_pos..index]);
445+
446+
match input.as_bytes()[index] {
447+
b'&' => {
448+
let start = index + 1; // +1 - skip `&`
449+
let end = start
450+
+ match iter.position(|&b| b == b';') {
451+
Some(end) => end,
452+
None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
453+
};
454+
455+
// Content between & and ; - &pat;
456+
let pat = &input[start..end];
457+
// 1. For a character reference, append the referenced character
458+
// to the normalized value.
459+
if pat.starts_with('#') {
460+
let entity = &pat[1..]; // starts after the #
461+
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
462+
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
463+
} else
464+
// 2. For an entity reference, recursively apply this algorithm
465+
// to the replacement text of the entity.
466+
if let Some(value) = resolve_entity(pat) {
467+
normalize_steps(
468+
normalized,
469+
&mut value.as_bytes().iter(),
470+
value,
471+
0,
472+
depth.saturating_sub(1),
473+
resolve_entity,
474+
)?;
475+
} else {
476+
return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string()));
477+
}
478+
Ok(end + 1) // +1 - skip `;`
479+
}
480+
// 3. For a white space character (#x20, #xD, #xA, #x9), append
481+
// a space character (#x20) to the normalized value.
482+
b'\t' | b'\n' | b'\r' | b' ' => {
483+
normalized.push(' ');
484+
Ok(index + 1) // +1 - skip character
485+
}
486+
487+
_ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"),
488+
}
489+
}
490+
305491
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
306492
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
307493
///
@@ -1844,3 +2030,115 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
18442030
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
18452031
}
18462032
}
2033+
2034+
#[cfg(test)]
mod normalization {
    //! Tests of the attribute value normalization routine.
    use super::*;
    use pretty_assertions::assert_eq;

    /// An empty value does not require normalization
    #[test]
    fn empty() {
        assert_eq!(
            normalize_attribute_value("", 5, |_| None),
            Ok("".into())
        );
    }

    /// Each white space character is replaced with exactly one space
    #[test]
    fn only_spaces() {
        assert_eq!(
            normalize_attribute_value(" ", 5, |_| None),
            Ok(" ".into())
        );
        assert_eq!(
            normalize_attribute_value("\t\t\t", 5, |_| None),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\r\r\r", 5, |_| None),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\n\n\n", 5, |_| None),
            Ok("   ".into())
        );
    }

    #[test]
    fn already_normalized() {
        assert_eq!(
            normalize_attribute_value("already normalized", 5, |_| None),
            Ok("already normalized".into())
        );
    }

    /// Character references are replaced with the referenced character.
    /// The spaces around the reference are kept, which gives three spaces in a row
    #[test]
    fn characters() {
        assert_eq!(
            normalize_attribute_value("string with &#32; character", 5, |_| None),
            Ok("string with   character".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &#x20; character", 5, |_| None),
            Ok("string with   character".into())
        );
    }

    /// Entity references are replaced with their recursively normalized replacement text
    #[test]
    fn entities() {
        assert_eq!(
            normalize_attribute_value("string with &entity; reference", 5, |_| {
                Some("replacement")
            }),
            Ok("string with replacement reference".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
                match entity {
                    "entity-1" => Some("recursive &entity-2;"),
                    "entity-2" => Some("entity&#32;2"),
                    _ => None,
                }
            }),
            Ok("string with recursive entity 2 reference".into())
        );
    }

    #[test]
    fn unclosed_entity() {
        assert_eq!(
            // `&` is at byte 21, the input is 38 bytes long
            normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
                Some("replacement")
            }),
            Err(EscapeError::UnterminatedEntity(21..38))
        );
        assert_eq!(
            // `&` is at byte 21, the input is 47 bytes long
            normalize_attribute_value(
                "string with unclosed &#32 (character) reference",
                5,
                |_| None
            ),
            Err(EscapeError::UnterminatedEntity(21..47))
        );
    }

    #[test]
    fn unknown_entity() {
        assert_eq!(
            // The name `entity` occupies bytes 21..27 (after `&`, before `;`)
            normalize_attribute_value("string with unknown &entity; reference", 5, |_| None),
            Err(EscapeError::UnrecognizedEntity(
                21..27,
                "entity".to_string(),
            ))
        );
    }

    /// An entity that references itself exhausts the nesting limit
    #[test]
    fn recursive_entity() {
        assert_eq!(
            normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
            Err(EscapeError::TooManyNestedEntities),
        );
    }
}

0 commit comments

Comments
 (0)