Skip to content

Commit e7fa0ce

Browse files
authored
Merge pull request #895 from Mingun/fix-eol-normalization
Fix EOL normalization in `\r` followed by any character which is encoded as `c2 XX` in UTF-8 except `c2 85`
2 parents f8a8364 + cd6f813 commit e7fa0ce

File tree

3 files changed

+136
-75
lines changed

3 files changed

+136
-75
lines changed

Changelog.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,16 @@
1818

1919
### Bug Fixes
2020

21+
- [#895]: Fix incorrect normalization of `\rX` EOL sequences where `X` is a char which is
22+
UTF-8 encoded as [c2 xx], except [c2 85].
23+
2124
### Misc Changes
2225

26+
- [#895]: Add new `xml10_content()` and `xml11_content()` methods which behave the same as
27+
`html_content()` and `xml_content()` methods, respectively, but express the intention more clearly.
28+
29+
[#895]: https://github.com/tafia/quick-xml/pull/895
30+
2331

2432
## 0.38.2 -- 2025-08-19
2533

src/escape.rs

Lines changed: 44 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ where
305305
////////////////////////////////////////////////////////////////////////////////////////////////////
306306

307307
// TODO: It would be better to reuse buffer after decoding if possible
308-
pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> {
308+
pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> {
309309
let bytes = text.as_bytes();
310310

311311
// The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326
// we are sure that index is within the string
327327
normalized.push_str(&text[0..i]);
328328

329-
let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n');
329+
let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n');
330330
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
331331
let index = pos + i;
332332
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333333
// we are sure that index is within the string
334334
normalized.push_str(&text[pos..index]);
335-
pos = normalize_xml_eol_step(&mut normalized, text, index, '\n');
335+
pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n');
336336
}
337337
if let Some(rest) = text.get(pos..) {
338338
normalized.push_str(rest);
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378
///
379379
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380380
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381-
fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
381+
fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
382382
let input = text.as_bytes();
383383
match input[index] {
384384
b'\r' => {
@@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
388388
normalized.push(ch);
389389
return index + 2; // skip \r\n
390390
}
391-
// Because input is correct UTF-8 and in UTF-8 every character has
392-
// an unique prefix, byte C2 means only start of #x85 character
393391
if next == 0xC2 {
392+
// UTF-8 encoding of #x85 character is [c2 85]
394393
if index + 2 < input.len() && input[index + 2] == 0x85 {
395394
normalized.push(ch);
396395
} else {
396+
normalized.push(ch);
397397
// NOTE: unsafe { text.get_unchecked(index + 1..index + 3) } could be used because
398398
// we are sure that index is within the string
399-
normalized.push_str(&text[index..index + 3]);
399+
normalized.push_str(&text[index + 1..index + 3]);
400400
}
401401
return index + 3; // skip \r + UTF-8 encoding of character (c2 xx)
402402
}
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
441441
////////////////////////////////////////////////////////////////////////////////////////////////////
442442

443443
// TODO: It would be better to reuse buffer after decoding if possible
444-
pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> {
444+
pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> {
445445
let bytes = text.as_bytes();
446446

447447
// The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
459459
// we are sure that index is within the string
460460
normalized.push_str(&text[0..i]);
461461

462-
let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n');
462+
let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n');
463463
while let Some(i) = memchr(b'\r', &bytes[pos..]) {
464464
let index = pos + i;
465465
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
466466
// we are sure that index within string
467467
normalized.push_str(&text[pos..index]);
468-
pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n');
468+
pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n');
469469
}
470470
if let Some(rest) = text.get(pos..) {
471471
normalized.push_str(rest);
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
487487
/// - `ch`: a character that should be put to the string instead of newline sequence
488488
///
489489
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
490-
fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
490+
fn normalize_xml10_eol_step(
491+
normalized: &mut String,
492+
input: &[u8],
493+
index: usize,
494+
ch: char,
495+
) -> usize {
491496
match input[index] {
492497
b'\r' => {
493498
normalized.push(ch);
@@ -2062,56 +2067,59 @@ mod normalization {
20622067
mod eol {
20632068
use super::*;
20642069

2065-
mod xml {
2070+
mod xml11 {
20662071
use super::*;
20672072
use pretty_assertions::assert_eq;
20682073

20692074
#[test]
20702075
fn empty() {
2071-
assert_eq!(normalize_xml_eols(""), "");
2076+
assert_eq!(normalize_xml11_eols(""), "");
20722077
}
20732078

20742079
#[test]
20752080
fn already_normalized() {
20762081
assert_eq!(
2077-
normalize_xml_eols("\nalready \n\n normalized\n"),
2082+
normalize_xml11_eols("\nalready \n\n normalized\n"),
20782083
"\nalready \n\n normalized\n",
20792084
);
20802085
}
20812086

20822087
#[test]
20832088
fn cr_lf() {
2084-
assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext");
2089+
assert_eq!(
2090+
normalize_xml11_eols("\r\nsome\r\n\r\ntext"),
2091+
"\nsome\n\ntext"
2092+
);
20852093
}
20862094

20872095
#[test]
20882096
fn cr_u0085() {
20892097
assert_eq!(
2090-
normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2098+
normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
20912099
"\nsome\n\ntext",
20922100
);
20932101
}
20942102

20952103
#[test]
20962104
fn u0085() {
20972105
assert_eq!(
2098-
normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"),
2106+
normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"),
20992107
"\nsome\n\ntext",
21002108
);
21012109
}
21022110

21032111
#[test]
21042112
fn u2028() {
21052113
assert_eq!(
2106-
normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"),
2114+
normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"),
21072115
"\nsome\n\ntext",
21082116
);
21092117
}
21102118

21112119
#[test]
21122120
fn mixed() {
21132121
assert_eq!(
2114-
normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2122+
normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
21152123
"\n\n\n\n\n\nsome\n\n\ntext",
21162124
);
21172125
}
@@ -2138,9 +2146,9 @@ mod normalization {
21382146

21392147
dbg!((input, &description));
21402148
if ch == '\u{0085}' {
2141-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2149+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
21422150
} else {
2143-
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2151+
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
21442152
}
21452153
}
21462154
assert_eq!((first..=last).count(), 64);
@@ -2171,9 +2179,12 @@ mod normalization {
21712179

21722180
dbg!((input, &description));
21732181
if ch == '\u{0085}' {
2174-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2182+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
21752183
} else {
2176-
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2184+
let mut expected = utf8.clone();
2185+
expected[0] = b'\n';
2186+
let expected = std::str::from_utf8(&expected).expect(&description);
2187+
assert_eq!(normalize_xml11_eols(input), expected, "{}", description);
21772188
}
21782189
}
21792190
assert_eq!((first..=last).count(), 64);
@@ -2204,68 +2215,68 @@ mod normalization {
22042215

22052216
dbg!((input, &description));
22062217
if ch == '\u{2028}' {
2207-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2218+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
22082219
} else {
2209-
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2220+
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
22102221
}
22112222
}
22122223
assert_eq!((first..=last).count(), 4096);
22132224
}
22142225
}
22152226

2216-
mod html {
2227+
mod xml10 {
22172228
use super::*;
22182229
use pretty_assertions::assert_eq;
22192230

22202231
#[test]
22212232
fn empty() {
2222-
assert_eq!(normalize_html_eols(""), "");
2233+
assert_eq!(normalize_xml10_eols(""), "");
22232234
}
22242235

22252236
#[test]
22262237
fn already_normalized() {
22272238
assert_eq!(
2228-
normalize_html_eols("\nalready \n\n normalized\n"),
2239+
normalize_xml10_eols("\nalready \n\n normalized\n"),
22292240
"\nalready \n\n normalized\n",
22302241
);
22312242
}
22322243

22332244
#[test]
22342245
fn cr_lf() {
22352246
assert_eq!(
2236-
normalize_html_eols("\r\nsome\r\n\r\ntext"),
2247+
normalize_xml10_eols("\r\nsome\r\n\r\ntext"),
22372248
"\nsome\n\ntext"
22382249
);
22392250
}
22402251

22412252
#[test]
22422253
fn cr_u0085() {
22432254
assert_eq!(
2244-
normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2255+
normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
22452256
"\n\u{0085}some\n\u{0085}\n\u{0085}text",
22462257
);
22472258
}
22482259

22492260
#[test]
22502261
fn u0085() {
22512262
assert_eq!(
2252-
normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"),
2263+
normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"),
22532264
"\u{0085}some\u{0085}\u{0085}text",
22542265
);
22552266
}
22562267

22572268
#[test]
22582269
fn u2028() {
22592270
assert_eq!(
2260-
normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"),
2271+
normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"),
22612272
"\u{2028}some\u{2028}\u{2028}text",
22622273
);
22632274
}
22642275

22652276
#[test]
22662277
fn mixed() {
22672278
assert_eq!(
2268-
normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2279+
normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
22692280
"\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text",
22702281
);
22712282
}

0 commit comments

Comments
 (0)