Skip to content

Commit cd6f813

Browse files
committed
XML 1.0 rules for EOL normalization are equal to the HTML rules, so rename the corresponding methods
HTML: https://html.spec.whatwg.org/#normalize-newlines XML 1.0: https://www.w3.org/TR/xml/#sec-line-ends XML 1.1: https://www.w3.org/TR/xml11/#sec-line-ends XML 1.0 and HTML normalize only \r and \r\n to \n. XML 1.1 additionally normalizes \r\u0085, \u0085 and \u2028 to \n.
1 parent 66d8fa2 commit cd6f813

File tree

3 files changed

+125
-72
lines changed

3 files changed

+125
-72
lines changed

Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323

2424
### Misc Changes
2525

26+
- [#895]: Add new `xml10_content()` and `xml11_content()` methods which behave the same as
27+
`html_content()` and `xml_content()` methods respectively, but express the intention more clearly.
28+
2629
[#895]: https://github.com/tafia/quick-xml/pull/895
2730

2831

src/escape.rs

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ where
305305
////////////////////////////////////////////////////////////////////////////////////////////////////
306306

307307
// TODO: It would be better to reuse buffer after decoding if possible
308-
pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str> {
308+
pub(crate) fn normalize_xml11_eols<'input>(text: &'input str) -> Cow<'input, str> {
309309
let bytes = text.as_bytes();
310310

311311
// The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326
// we are sure that index within string
327327
normalized.push_str(&text[0..i]);
328328

329-
let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n');
329+
let mut pos = normalize_xml11_eol_step(&mut normalized, text, i, '\n');
330330
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
331331
let index = pos + i;
332332
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333333
// we are sure that index within string
334334
normalized.push_str(&text[pos..index]);
335-
pos = normalize_xml_eol_step(&mut normalized, text, index, '\n');
335+
pos = normalize_xml11_eol_step(&mut normalized, text, index, '\n');
336336
}
337337
if let Some(rest) = text.get(pos..) {
338338
normalized.push_str(rest);
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378
///
379379
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380380
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381-
fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
381+
fn normalize_xml11_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
382382
let input = text.as_bytes();
383383
match input[index] {
384384
b'\r' => {
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
441441
////////////////////////////////////////////////////////////////////////////////////////////////////
442442

443443
// TODO: It would be better to reuse buffer after decoding if possible
444-
pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str> {
444+
pub(crate) fn normalize_xml10_eols<'input>(text: &'input str) -> Cow<'input, str> {
445445
let bytes = text.as_bytes();
446446

447447
// The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
459459
// we are sure that index within string
460460
normalized.push_str(&text[0..i]);
461461

462-
let mut pos = normalize_html_eol_step(&mut normalized, bytes, i, '\n');
462+
let mut pos = normalize_xml10_eol_step(&mut normalized, bytes, i, '\n');
463463
while let Some(i) = memchr(b'\r', &bytes[pos..]) {
464464
let index = pos + i;
465465
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
466466
// we are sure that index within string
467467
normalized.push_str(&text[pos..index]);
468-
pos = normalize_html_eol_step(&mut normalized, bytes, index, '\n');
468+
pos = normalize_xml10_eol_step(&mut normalized, bytes, index, '\n');
469469
}
470470
if let Some(rest) = text.get(pos..) {
471471
normalized.push_str(rest);
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
487487
/// - `ch`: a character that should be put to the string instead of newline sequence
488488
///
489489
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
490-
fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
490+
fn normalize_xml10_eol_step(
491+
normalized: &mut String,
492+
input: &[u8],
493+
index: usize,
494+
ch: char,
495+
) -> usize {
491496
match input[index] {
492497
b'\r' => {
493498
normalized.push(ch);
@@ -2062,56 +2067,59 @@ mod normalization {
20622067
mod eol {
20632068
use super::*;
20642069

2065-
mod xml {
2070+
mod xml11 {
20662071
use super::*;
20672072
use pretty_assertions::assert_eq;
20682073

20692074
#[test]
20702075
fn empty() {
2071-
assert_eq!(normalize_xml_eols(""), "");
2076+
assert_eq!(normalize_xml11_eols(""), "");
20722077
}
20732078

20742079
#[test]
20752080
fn already_normalized() {
20762081
assert_eq!(
2077-
normalize_xml_eols("\nalready \n\n normalized\n"),
2082+
normalize_xml11_eols("\nalready \n\n normalized\n"),
20782083
"\nalready \n\n normalized\n",
20792084
);
20802085
}
20812086

20822087
#[test]
20832088
fn cr_lf() {
2084-
assert_eq!(normalize_xml_eols("\r\nsome\r\n\r\ntext"), "\nsome\n\ntext");
2089+
assert_eq!(
2090+
normalize_xml11_eols("\r\nsome\r\n\r\ntext"),
2091+
"\nsome\n\ntext"
2092+
);
20852093
}
20862094

20872095
#[test]
20882096
fn cr_u0085() {
20892097
assert_eq!(
2090-
normalize_xml_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2098+
normalize_xml11_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
20912099
"\nsome\n\ntext",
20922100
);
20932101
}
20942102

20952103
#[test]
20962104
fn u0085() {
20972105
assert_eq!(
2098-
normalize_xml_eols("\u{0085}some\u{0085}\u{0085}text"),
2106+
normalize_xml11_eols("\u{0085}some\u{0085}\u{0085}text"),
20992107
"\nsome\n\ntext",
21002108
);
21012109
}
21022110

21032111
#[test]
21042112
fn u2028() {
21052113
assert_eq!(
2106-
normalize_xml_eols("\u{2028}some\u{2028}\u{2028}text"),
2114+
normalize_xml11_eols("\u{2028}some\u{2028}\u{2028}text"),
21072115
"\nsome\n\ntext",
21082116
);
21092117
}
21102118

21112119
#[test]
21122120
fn mixed() {
21132121
assert_eq!(
2114-
normalize_xml_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2122+
normalize_xml11_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
21152123
"\n\n\n\n\n\nsome\n\n\ntext",
21162124
);
21172125
}
@@ -2138,9 +2146,9 @@ mod normalization {
21382146

21392147
dbg!((input, &description));
21402148
if ch == '\u{0085}' {
2141-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2149+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
21422150
} else {
2143-
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2151+
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
21442152
}
21452153
}
21462154
assert_eq!((first..=last).count(), 64);
@@ -2171,12 +2179,12 @@ mod normalization {
21712179

21722180
dbg!((input, &description));
21732181
if ch == '\u{0085}' {
2174-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2182+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
21752183
} else {
21762184
let mut expected = utf8.clone();
21772185
expected[0] = b'\n';
21782186
let expected = std::str::from_utf8(&expected).expect(&description);
2179-
assert_eq!(normalize_xml_eols(input), expected, "{}", description);
2187+
assert_eq!(normalize_xml11_eols(input), expected, "{}", description);
21802188
}
21812189
}
21822190
assert_eq!((first..=last).count(), 64);
@@ -2207,68 +2215,68 @@ mod normalization {
22072215

22082216
dbg!((input, &description));
22092217
if ch == '\u{2028}' {
2210-
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2218+
assert_eq!(normalize_xml11_eols(input), "\n", "{}", description);
22112219
} else {
2212-
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2220+
assert_eq!(normalize_xml11_eols(input), input, "{}", description);
22132221
}
22142222
}
22152223
assert_eq!((first..=last).count(), 4096);
22162224
}
22172225
}
22182226

2219-
mod html {
2227+
mod xml10 {
22202228
use super::*;
22212229
use pretty_assertions::assert_eq;
22222230

22232231
#[test]
22242232
fn empty() {
2225-
assert_eq!(normalize_html_eols(""), "");
2233+
assert_eq!(normalize_xml10_eols(""), "");
22262234
}
22272235

22282236
#[test]
22292237
fn already_normalized() {
22302238
assert_eq!(
2231-
normalize_html_eols("\nalready \n\n normalized\n"),
2239+
normalize_xml10_eols("\nalready \n\n normalized\n"),
22322240
"\nalready \n\n normalized\n",
22332241
);
22342242
}
22352243

22362244
#[test]
22372245
fn cr_lf() {
22382246
assert_eq!(
2239-
normalize_html_eols("\r\nsome\r\n\r\ntext"),
2247+
normalize_xml10_eols("\r\nsome\r\n\r\ntext"),
22402248
"\nsome\n\ntext"
22412249
);
22422250
}
22432251

22442252
#[test]
22452253
fn cr_u0085() {
22462254
assert_eq!(
2247-
normalize_html_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
2255+
normalize_xml10_eols("\r\u{0085}some\r\u{0085}\r\u{0085}text"),
22482256
"\n\u{0085}some\n\u{0085}\n\u{0085}text",
22492257
);
22502258
}
22512259

22522260
#[test]
22532261
fn u0085() {
22542262
assert_eq!(
2255-
normalize_html_eols("\u{0085}some\u{0085}\u{0085}text"),
2263+
normalize_xml10_eols("\u{0085}some\u{0085}\u{0085}text"),
22562264
"\u{0085}some\u{0085}\u{0085}text",
22572265
);
22582266
}
22592267

22602268
#[test]
22612269
fn u2028() {
22622270
assert_eq!(
2263-
normalize_html_eols("\u{2028}some\u{2028}\u{2028}text"),
2271+
normalize_xml10_eols("\u{2028}some\u{2028}\u{2028}text"),
22642272
"\u{2028}some\u{2028}\u{2028}text",
22652273
);
22662274
}
22672275

22682276
#[test]
22692277
fn mixed() {
22702278
assert_eq!(
2271-
normalize_html_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
2279+
normalize_xml10_eols("\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text"),
22722280
"\n\n\n\u{2028}\n\nsome\n\u{0085}\n\u{0085}text",
22732281
);
22742282
}

0 commit comments

Comments
 (0)