Skip to content

Commit 1a138d6

Browse files
committed
temp
1 parent 21687c7 commit 1a138d6

File tree

1 file changed

+109
-94
lines changed

1 file changed

+109
-94
lines changed

src/events/attributes.rs

Lines changed: 109 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,88 @@ pub struct Attribute<'a> {
3333
}
3434

3535
impl<'a> Attribute<'a> {
36+
/// Normalize the attribute value according to xml specification section 3.3.3
3637
///
38+
/// https://www.w3.org/TR/xml/#AVNormalize
39+
///
40+
/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
41+
/// * Sequences of whitespace-like characters are replaced with a single whitespace character
42+
/// * Character and entity references are substituted as defined by the spec
3743
pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
38-
let normalized = normalize_attribute_value(self.value.as_ref());
44+
// TODO: character references, entity references, error handling associated with those
45+
46+
#[derive(PartialEq)]
47+
enum ParseState {
48+
Space,
49+
CDATA,
50+
}
51+
52+
// Trim characters from the beginning and end of the attribute value - this can't fail.
53+
fn trim_value(attr: &[u8]) -> &[u8] {
54+
let first_non_space_char = attr.iter().position(|c| !is_whitespace(*c));
55+
56+
if first_non_space_char.is_none() {
57+
// The entire value was whitespace-like characters
58+
return b"";
59+
}
60+
61+
let last_non_space_char = attr.iter().rposition(|c| !is_whitespace(*c));
62+
63+
// Trim all whitespace-like characters away from the beginning and end of the attribute value.
64+
let begin = first_non_space_char.unwrap();
65+
let end = last_non_space_char.unwrap_or(attr.len());
66+
&attr[begin..=end]
67+
}
68+
69+
let trimmed_attr = trim_value(self.value.as_ref());
70+
71+
// A new buffer is only created when we encounter a situation that requires it.
72+
let mut normalized: Option<Vec<u8>> = None;
73+
// We start on character data because all whitespace-like characters are already trimmed away.
74+
let mut current_state = ParseState::CDATA;
75+
76+
// Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
77+
// or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
78+
// buffer and continue using this buffer.
79+
for (idx, ch) in trimmed_attr.iter().enumerate() {
80+
match ch {
81+
b'\n' | b'\r' | b'\t' | b' ' => match current_state {
82+
ParseState::Space => match normalized {
83+
Some(_) => continue,
84+
None => normalized = Some(Vec::from(&trimmed_attr[..idx])),
85+
},
86+
ParseState::CDATA => {
87+
current_state = ParseState::Space;
88+
match normalized.as_mut() {
89+
Some(buf) => buf.push(b' '),
90+
None => {
91+
let mut buf = Vec::from(&trimmed_attr[..idx]);
92+
buf.push(b' ');
93+
normalized = Some(buf);
94+
}
95+
}
96+
}
97+
},
98+
c @ _ => match current_state {
99+
ParseState::Space => {
100+
current_state = ParseState::CDATA;
101+
if let Some(normalized) = normalized.as_mut() {
102+
normalized.push(*c);
103+
}
104+
}
105+
ParseState::CDATA => {
106+
if let Some(normalized) = normalized.as_mut() {
107+
normalized.push(*c);
108+
}
109+
}
110+
},
111+
}
112+
}
113+
114+
let normalized = match normalized {
115+
Some(normalized) => Cow::Owned(normalized),
116+
None => Cow::Borrowed(trimmed_attr),
117+
};
39118
let escaped = do_unescape(&*normalized, None)?;
40119
Ok(Cow::Owned(escaped.into_owned()))
41120
}
@@ -190,90 +269,6 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
190269
}
191270
}
192271

/// Normalize the whitespace of an attribute value according to xml specification section 3.3.3
///
/// https://www.w3.org/TR/xml/#AVNormalize
///
/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
/// * Sequences of whitespace-like characters are replaced with a single space character
///
/// Character and entity references are not substituted here (see the TODO below).
///
/// Returns `Cow::Borrowed` (a sub-slice of `attr`) when trimming alone is sufficient and
/// `Cow::Owned` as soon as any interior whitespace-like character is encountered.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    // The whitespace-like bytes listed in the doc comment. Trimming and collapsing share this
    // single predicate so that they can never disagree about what counts as whitespace.
    fn is_space(byte: u8) -> bool {
        match byte {
            b'\n' | b'\r' | b'\t' | b' ' => true,
            _ => false,
        }
    }

    // Trim whitespace-like bytes from both ends of the value - this can't fail.
    let first = attr.iter().position(|&b| !is_space(b));
    let last = attr.iter().rposition(|&b| !is_space(b));
    let trimmed: &[u8] = match (first, last) {
        // Both searches succeed together, and `begin <= end`, so the inclusive range is
        // always in bounds.
        (Some(begin), Some(end)) => &attr[begin..=end],
        // The value was empty or consisted solely of whitespace-like characters.
        _ => &[],
    };

    // The buffer is created lazily: it stays `None` until the first whitespace-like byte
    // forces a rewrite, at which point everything processed so far is copied into it.
    let mut normalized: Option<Vec<u8>> = None;
    // The trimmed value never starts with whitespace, so we start outside of a run.
    let mut in_space_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if is_space(byte) {
            // Only the first byte of a run emits a space; the rest of the run is dropped.
            if !in_space_run {
                in_space_run = true;
                normalized
                    .get_or_insert_with(|| trimmed[..idx].to_vec())
                    .push(b' ');
            }
        } else {
            in_space_run = false;
            // While no buffer exists the output is still a prefix of `trimmed`.
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
        }
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
276-
277272
////////////////////////////////////////////////////////////////////////////////////////////////////
278273

279274
/// Iterator over XML attributes.
@@ -893,36 +888,56 @@ mod xml {
#[test]
fn attribute_value_normalization() {
    // Builds an attribute named "foo" from `raw` and compares the bytes returned by
    // `normalized_value` against `expected`.
    // NOTE(review): assumes `Attribute::from((&str, &str))` stores the value verbatim, so that
    // the character references below reach `normalized_value` unchanged - confirm.
    fn assert_normalized(raw: &str, expected: &[u8]) {
        let attr = Attribute::from(("foo", raw));
        assert_eq!(
            attr.normalized_value().unwrap(),
            Cow::Borrowed(expected),
            "input: {:?}",
            raw
        );
    }

    // empty value
    assert_normalized("", b"");

    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_normalized("\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");

    // leading and trailing spaces must be stripped
    assert_normalized(" foo ", b"foo");

    // leading space
    assert_normalized(" bar", b"bar");

    // trailing space
    assert_normalized("baz ", b"baz");

    // sequences of spaces must be replaced with a single space
    assert_normalized("   foo   bar   baz   ", b"foo bar baz");

    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_normalized(
        " \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
        b"foo bar baz delta echo foxtrot",
    );

    // character references for whitespace-like characters (space, \r, \n, \t) are not trimmed
    // or combined after substitution
    assert_normalized("&#x20;&#xD;&#xA;&#x9;", b" \r\n\t");

    // literal whitespace is trimmed and collapsed, while whitespace produced by character
    // references is preserved as-is
    assert_normalized(
        " &#x20;foo\tbar baz &#xA;delta\n&#x9;\r echo foxtrot&#xD;",
        b" foo bar baz \ndelta \t echo foxtrot\r",
    );
}

0 commit comments

Comments
 (0)