Skip to content

Commit bb16e17

Browse files
dralleyMingun
authored andcommitted
More correctly normalize attribute values
1 parent f7add85 commit bb16e17

File tree

5 files changed

+255
-4
lines changed

5 files changed

+255
-4
lines changed

Changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,17 @@
1515

1616
### New Features
1717

18+
- [#379]: Improved compliance with the XML attribute value normalization process by
19+
adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
20+
which ought to be used in place of `Attribute::unescape_value()` and
21+
`Attribute::unescape_value_with()`
22+
1823
### Bug Fixes
1924

2025
### Misc Changes
2126

27+
[#379]: https://github.com/tafia/quick-xml/pull/379
28+
2229

2330
## 0.38.0 -- 2025-06-28
2431

benches/macrobenches.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,13 @@ static INPUTS: &[(&str, &str)] = &[
4444
("players.xml", PLAYERS),
4545
];
4646

47-
// TODO: use fully normalized attribute values
4847
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
4948
let mut r = Reader::from_str(doc);
5049
loop {
5150
match black_box(r.read_event()?) {
5251
Event::Start(e) | Event::Empty(e) => {
5352
for attr in e.attributes() {
54-
black_box(attr?.decode_and_unescape_value(r.decoder())?);
53+
black_box(attr?.decode_and_normalize_value(r.decoder(), 128)?);
5554
}
5655
}
5756
Event::Text(e) => {

benches/microbenches.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,50 @@ fn attributes(c: &mut Criterion) {
243243
assert_eq!(count, 150);
244244
})
245245
});
246+
247+
group.finish();
248+
}
249+
250+
/// Benchmarks normalizing attribute values
251+
fn attribute_value_normalization(c: &mut Criterion) {
252+
let mut group = c.benchmark_group("attribute_value_normalization");
253+
254+
group.bench_function("noop_short", |b| {
255+
b.iter(|| {
256+
black_box(unescape("foobar")).unwrap();
257+
})
258+
});
259+
260+
group.bench_function("noop_long", |b| {
261+
b.iter(|| {
262+
black_box(unescape("just a bit of text without any entities")).unwrap();
263+
})
264+
});
265+
266+
group.bench_function("replacement_chars", |b| {
267+
b.iter(|| {
268+
black_box(unescape("just a bit\n of text without\tany entities")).unwrap();
269+
})
270+
});
271+
272+
group.bench_function("char_reference", |b| {
273+
b.iter(|| {
274+
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
275+
black_box(unescape(text)).unwrap();
276+
let text = "&#38;&#60;";
277+
black_box(unescape(text)).unwrap();
278+
})
279+
});
280+
281+
group.bench_function("entity_reference", |b| {
282+
b.iter(|| {
283+
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
284+
black_box(unescape(text)).unwrap();
285+
let text = "&quot;what&apos;s that?&quot;";
286+
black_box(unescape(text)).unwrap();
287+
})
288+
});
289+
246290
group.finish();
247291
}
248292

@@ -355,6 +399,7 @@ criterion_group!(
355399
read_resolved_event_into,
356400
one_event,
357401
attributes,
402+
attribute_value_normalization,
358403
escaping,
359404
unescaping,
360405
);

src/errors.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ impl From<EscapeError> for Error {
229229
}
230230

231231
impl From<AttrError> for Error {
232+
/// Creates a new `Error::InvalidAttr` from the given error
232233
#[inline]
233234
fn from(error: AttrError) -> Self {
234235
Self::InvalidAttr(error)

src/events/attributes.rs

Lines changed: 201 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
55
use crate::encoding::Decoder;
66
use crate::errors::Result as XmlResult;
7-
use crate::escape::{escape, resolve_predefined_entity, unescape_with};
7+
#[cfg(any(doc, not(feature = "encoding")))]
8+
use crate::escape::EscapeError;
9+
use crate::escape::{escape, normalize_attribute_value, resolve_predefined_entity, unescape_with};
810
use crate::name::{LocalName, Namespace, QName};
911
use crate::reader::NsReader;
1012
use crate::utils::{is_whitespace, Bytes};
@@ -32,7 +34,119 @@ pub struct Attribute<'a> {
3234
}
3335

3436
impl<'a> Attribute<'a> {
35-
/// Decodes using UTF-8 then unescapes the value.
37+
/// Returns the attribute value normalized as per [the XML specification].
38+
///
39+
/// Do not use this method with HTML attributes.
40+
///
41+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
42+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters.
43+
///
44+
/// This will allocate unless the raw attribute value does not require normalization.
45+
///
46+
/// See also [`normalized_value_with()`](#method.normalized_value_with)
47+
///
48+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
49+
#[cfg(any(doc, not(feature = "encoding")))]
50+
pub fn normalized_value(&self, depth: usize) -> Result<Cow<'a, str>, EscapeError> {
51+
self.normalized_value_with(depth, |_| None)
52+
}
53+
54+
/// Returns the attribute value normalized as per [the XML specification],
55+
/// using a custom entity resolver.
56+
///
57+
/// Do not use this method with HTML attributes.
58+
///
59+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
60+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
61+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
62+
/// take precedence.
63+
///
64+
/// This will allocate unless the raw attribute value does not require normalization.
65+
///
66+
/// See also [`normalized_value()`](#method.normalized_value)
67+
///
68+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
69+
#[cfg(any(doc, not(feature = "encoding")))]
70+
pub fn normalized_value_with<'entity>(
71+
&self,
72+
depth: usize,
73+
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
74+
) -> Result<Cow<'a, str>, EscapeError> {
75+
let decoded = match &self.value {
76+
Cow::Borrowed(bytes) => {
77+
Cow::Borrowed(std::str::from_utf8(bytes).expect("unable to decode as utf-8"))
78+
}
79+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
80+
Cow::Owned(bytes) => Cow::Owned(
81+
std::str::from_utf8(bytes)
82+
.expect("unable to decode as utf-8")
83+
.to_owned(),
84+
),
85+
};
86+
87+
match normalize_attribute_value(&decoded, depth, resolve_entity)? {
88+
// Because result is borrowed, no replacements was done and we can use original string
89+
Cow::Borrowed(_) => Ok(decoded),
90+
Cow::Owned(s) => Ok(s.into()),
91+
}
92+
}
93+
94+
/// Decodes using a provided reader and returns the attribute value normalized
95+
/// as per [the XML specification].
96+
///
97+
/// Do not use this method with HTML attributes.
98+
///
99+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
100+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters.
101+
///
102+
/// This will allocate unless the raw attribute value does not require normalization.
103+
///
104+
/// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with)
105+
///
106+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
107+
pub fn decode_and_normalize_value(
108+
&self,
109+
decoder: Decoder,
110+
depth: usize,
111+
) -> XmlResult<Cow<'a, str>> {
112+
self.decode_and_normalize_value_with(decoder, depth, |_| None)
113+
}
114+
115+
/// Decodes using a provided reader and returns the attribute value normalized
116+
/// as per [the XML specification], using a custom entity resolver.
117+
///
118+
/// Do not use this method with HTML attributes.
119+
///
120+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
121+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
122+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
123+
/// take precedence.
124+
///
125+
/// This will allocate unless the raw attribute value does not require normalization.
126+
///
127+
/// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value)
128+
///
129+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
130+
pub fn decode_and_normalize_value_with<'entity>(
131+
&self,
132+
decoder: Decoder,
133+
depth: usize,
134+
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
135+
) -> XmlResult<Cow<'a, str>> {
136+
let decoded = match &self.value {
137+
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
138+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
139+
Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(),
140+
};
141+
142+
match normalize_attribute_value(&decoded, depth, resolve_entity)? {
143+
// Because result is borrowed, no replacements was done and we can use original string
144+
Cow::Borrowed(_) => Ok(decoded),
145+
Cow::Owned(s) => Ok(s.into()),
146+
}
147+
}
148+
149+
/// Returns the unescaped value.
36150
///
37151
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
38152
/// replaced with their unescaped equivalents such as `>`.
@@ -1011,6 +1125,91 @@ mod xml {
10111125
use super::*;
10121126
use pretty_assertions::assert_eq;
10131127

1128+
#[cfg(any(doc, not(feature = "encoding")))]
1129+
mod attribute_value_normalization {
1130+
1131+
use super::*;
1132+
use pretty_assertions::assert_eq;
1133+
1134+
/// Empty values returned are unchanged
1135+
#[test]
1136+
fn test_empty() {
1137+
let raw_value = "".as_bytes();
1138+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1139+
assert_eq!(attr.normalized_value(5), Ok(Cow::Borrowed("")));
1140+
}
1141+
1142+
/// Already normalized values are returned unchanged
1143+
#[test]
1144+
fn test_already_normalized() {
1145+
let raw_value = "foobar123".as_bytes();
1146+
let output = "foobar123";
1147+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1148+
assert_eq!(attr.normalized_value(5), Ok(Cow::Borrowed(output)));
1149+
}
1150+
1151+
/// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with
1152+
/// a space character
1153+
#[test]
1154+
fn test_space_replacement() {
1155+
let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
1156+
let output = " foo bar baz delta ".to_owned();
1157+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1158+
assert_eq!(attr.normalized_value(5), Ok(Cow::Owned(output)));
1159+
}
1160+
1161+
/// Entities must be terminated
1162+
#[test]
1163+
fn test_unterminated_entity() {
1164+
let raw_value = "abc&quotdef".as_bytes();
1165+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1166+
assert_eq!(
1167+
attr.normalized_value(5),
1168+
Err(EscapeError::UnterminatedEntity(3..11))
1169+
);
1170+
}
1171+
1172+
/// Unknown entities raise error
1173+
#[test]
1174+
fn test_unrecognized_entity() {
1175+
let raw_value = "abc&unkn;def".as_bytes();
1176+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1177+
assert_eq!(
1178+
attr.normalized_value(5),
1179+
Err(EscapeError::UnrecognizedEntity(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedEntity appropriate? existing unescape code behaves the same. (see: start index)
1180+
);
1181+
}
1182+
1183+
/// custom entity replacement works, entity replacement text processed recursively
1184+
#[test]
1185+
fn test_entity_replacement() {
1186+
let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
1187+
let output = "\r\rA\n \nB\r\n".to_owned();
1188+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1189+
fn custom_resolver(ent: &str) -> Option<&'static str> {
1190+
match ent {
1191+
"d" => Some("&#xD;"),
1192+
"a" => Some("&#xA;"),
1193+
"da" => Some("&#xD;&#xA;"),
1194+
_ => None,
1195+
}
1196+
}
1197+
assert_eq!(
1198+
attr.normalized_value_with(5, &custom_resolver),
1199+
Ok(Cow::Owned(output))
1200+
);
1201+
}
1202+
1203+
#[test]
1204+
fn test_char_references() {
1205+
// character literal references are substituted without being replaced by spaces
1206+
let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
1207+
let output = "\r\rA\n\nB\r\n".to_owned();
1208+
let attr = Attribute::from(("foo".as_bytes(), raw_value));
1209+
assert_eq!(attr.normalized_value(5), Ok(Cow::Owned(output)));
1210+
}
1211+
}
1212+
10141213
/// Checked attribute is the single attribute
10151214
mod single {
10161215
use super::*;

0 commit comments

Comments
 (0)