Skip to content

Commit 21687c7

Browse files
committed
Properly normalize attribute values
closes #371
1 parent 46b4d1d commit 21687c7

File tree

4 files changed

+133
-6
lines changed

4 files changed

+133
-6
lines changed

src/errors.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
7272
}
7373

7474
impl From<AttrError> for Error {
75+
/// Creates a new `Error::InvalidAttr` from the given error
7576
#[inline]
7677
fn from(error: AttrError) -> Self {
7778
Error::InvalidAttr(error)

src/escapei.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::ops::Range;
99
use pretty_assertions::assert_eq;
1010

1111
/// Error for XML escape/unescape.
12-
#[derive(Debug)]
12+
#[derive(Debug, PartialEq)]
1313
pub enum EscapeError {
1414
/// Entity with Null character
1515
EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
134134
}
135135

136136
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
137-
/// value, using a dictionnary of custom entities.
137+
/// value, using a dictionary of custom entities.
138138
///
139139
/// # Pre-condition
140140
///

src/events/attributes.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
55
use crate::errors::{Error, Result as XmlResult};
66
use crate::escape::{do_unescape, escape};
7+
use crate::escapei::EscapeError;
78
use crate::name::QName;
89
use crate::reader::{is_whitespace, Reader};
910
use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -32,6 +33,13 @@ pub struct Attribute<'a> {
3233
}
3334

3435
impl<'a> Attribute<'a> {
36+
/// Returns the attribute value after whitespace normalization and unescaping.
///
/// The raw value is first passed through `normalize_attribute_value`, and the result is then
/// handed to `do_unescape` with `None` as its second argument.
///
/// The returned `Cow` is always the `Owned` variant: the unescaped bytes are copied with
/// `into_owned` before being returned.
///
/// # Errors
///
/// Propagates the `EscapeError` reported by `do_unescape`.
37+
pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
38+
let normalized = normalize_attribute_value(self.value.as_ref());
39+
let escaped = do_unescape(&*normalized, None)?;
40+
// `escaped` may borrow from the local `normalized`, so it is copied into an owned buffer.
Ok(Cow::Owned(escaped.into_owned()))
41+
}
42+
3543
/// Returns the unescaped value.
3644
///
3745
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -182,6 +190,90 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
182190
}
183191
}
184192

193+
/// Normalizes the whitespace of an attribute value, following the whitespace handling described
/// in section 3.3.3 of the XML specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every remaining run of whitespace-like bytes is replaced with a single space (`0x20`).
///
/// Character and entity references are **not** substituted yet (see the TODO below); they are
/// passed through unchanged.
///
/// The result borrows from `attr` whenever trimming alone produces the normalized value. A new
/// buffer is allocated only when an interior byte has to be rewritten or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    // The same byte set that `crate::reader::is_whitespace` accepts.
    fn is_space(byte: u8) -> bool {
        matches!(byte, b' ' | b'\r' | b'\n' | b'\t')
    }

    // Starts the output buffer with the first `len` bytes of `src`, which needed no changes.
    fn copy_prefix(src: &[u8], len: usize) -> Vec<u8> {
        let mut buf = Vec::with_capacity(src.len());
        buf.extend_from_slice(&src[..len]);
        buf
    }

    let begin = match attr.iter().position(|&b| !is_space(b)) {
        Some(idx) => idx,
        // Empty input, or nothing but whitespace-like bytes.
        None => return Cow::Borrowed(&attr[..0]),
    };
    // A non-whitespace byte exists at `begin`, so the reverse search always succeeds; the
    // fallback is nevertheless a valid exclusive bound.
    let end = attr
        .iter()
        .rposition(|&b| !is_space(b))
        .map_or(attr.len(), |idx| idx + 1);
    let trimmed = &attr[begin..end];

    // Stays `None` for as long as the output is byte-for-byte identical to `trimmed`.
    let mut normalized: Option<Vec<u8>> = None;
    // `true` while the previous byte was whitespace-like, i.e. we are inside a run.
    let mut in_space_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_space(byte) {
            in_space_run = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
        } else if in_space_run {
            // Second or later byte of a run: it is dropped, so the output now differs from the
            // input and a buffer is required (the prefix already ends with the run's space).
            normalized.get_or_insert_with(|| copy_prefix(trimmed, idx));
        } else {
            in_space_run = true;
            if byte == b' ' {
                // A lone space is already normalized; keep borrowing if we still can.
                if let Some(buf) = normalized.as_mut() {
                    buf.push(b' ');
                }
            } else {
                // `\r`, `\n` and `\t` must be rewritten to a space.
                normalized
                    .get_or_insert_with(|| copy_prefix(trimmed, idx))
                    .push(b' ');
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
276+
185277
////////////////////////////////////////////////////////////////////////////////////////////////////
186278

187279
/// Iterator over XML attributes.
@@ -798,6 +890,43 @@ mod xml {
798890
use super::*;
799891
use pretty_assertions::assert_eq;
800892

893+
/// Checks `normalize_attribute_value` against empty input, trimming, whitespace replacement,
/// run collapsing and inputs that contain character references.
#[test]
894+
fn attribute_value_normalization() {
895+
// empty value
896+
assert_eq!(normalize_attribute_value(b""), Cow::Borrowed(b""));
897+
// return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
898+
assert_eq!(
899+
normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n"),
900+
Cow::Owned::<[u8]>(b"foo bar baz delta".to_vec())
901+
);
902+
// leading and trailing spaces must be stripped
903+
assert_eq!(normalize_attribute_value(b" foo "), Cow::Borrowed(b"foo"));
904+
// leading space
905+
assert_eq!(normalize_attribute_value(b" bar"), Cow::Borrowed(b"bar"));
906+
// trailing space
907+
assert_eq!(normalize_attribute_value(b"baz "), Cow::Borrowed(b"baz"));
908+
// sequences of spaces must be replaced with a single space
909+
assert_eq!(
910+
normalize_attribute_value(b" foo bar baz "),
911+
Cow::Owned::<[u8]>(b"foo bar baz".to_vec())
912+
);
913+
// sequence replacement mixed with characters treated as whitespace (\t \r \n)
914+
assert_eq!(
915+
normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r"),
916+
Cow::Owned::<[u8]>(b"foo bar baz delta echo foxtrot".to_vec())
917+
);
918+
// character references for whitespace-like characters are not combined after substitution
// NOTE(review): `normalize_attribute_value` only carries a TODO for character references and
// has no code path that substitutes them, so this expectation does not look reachable yet.
// NOTE(review): the references for CR, LF and TAB are `&#xD;`, `&#xA;` and `&#x9;`; `&#xD0;`,
// `&#xA0;` and `&#x90;` name other code points. The expected order (`\r\t\n`) also differs from
// the order of the references in the input — confirm the intended input and output.
919+
assert_eq!(
920+
normalize_attribute_value(b"&#x20;&#xD0;&#xA0;&#x90;"),
921+
Cow::Owned::<[u8]>(b" \r\t\n".to_vec())
922+
);
923+
// sequence replacement mixed with literal whitespace (\t \r \n) and character references
// NOTE(review): this case also depends on character reference substitution — see the notes on
// the previous assertion.
924+
assert_eq!(
925+
normalize_attribute_value(b" &#x20;foo\tbar baz &#xA0;delta\n&#x90;\r echo foxtrot&#xD0;"),
926+
Cow::Owned::<[u8]>(b" foo bar baz \ndelta \t echo foxtrot\r".to_vec())
927+
);
928+
}
929+
801930
/// Checked attribute is the single attribute
802931
mod single {
803932
use super::*;

src/reader.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,10 +1436,7 @@ impl ReadElementState {
14361436
/// Returns `true` if the byte is one of the four whitespace bytes recognised here: blank
/// (`b' '`), carriage return (`b'\r'`), new line (`b'\n'`) or tab (`b'\t'`); `false` for any
/// other byte.
14371437
#[inline]
14381438
pub(crate) fn is_whitespace(b: u8) -> bool {
1439-
match b {
1440-
b' ' | b'\r' | b'\n' | b'\t' => true,
1441-
_ => false,
1442-
}
1439+
matches!(b, b' ' | b'\r' | b'\n' | b'\t')
14431440
}
14441441

14451442
////////////////////////////////////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)