@@ -305,7 +305,7 @@ where
305
305
////////////////////////////////////////////////////////////////////////////////////////////////////
306
306
307
307
// TODO: It would be better to reuse the buffer after decoding, if possible
308
- pub ( crate ) fn normalize_xml_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
308
+ pub ( crate ) fn normalize_xml11_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
309
309
let bytes = text. as_bytes ( ) ;
310
310
311
311
// The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326
326
// we are sure that the index is within the string
327
327
normalized. push_str ( & text[ 0 ..i] ) ;
328
328
329
- let mut pos = normalize_xml_eol_step ( & mut normalized, text, i, '\n' ) ;
329
+ let mut pos = normalize_xml11_eol_step ( & mut normalized, text, i, '\n' ) ;
330
330
while let Some ( i) = memchr3 ( b'\r' , 0xC2 , 0xE2 , & bytes[ pos..] ) {
331
331
let index = pos + i;
332
332
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333
333
// we are sure that the index is within the string
334
334
normalized. push_str ( & text[ pos..index] ) ;
335
- pos = normalize_xml_eol_step ( & mut normalized, text, index, '\n' ) ;
335
+ pos = normalize_xml11_eol_step ( & mut normalized, text, index, '\n' ) ;
336
336
}
337
337
if let Some ( rest) = text. get ( pos..) {
338
338
normalized. push_str ( rest) ;
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378
378
///
379
379
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380
380
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381
- fn normalize_xml_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
381
+ fn normalize_xml11_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
382
382
let input = text. as_bytes ( ) ;
383
383
match input[ index] {
384
384
b'\r' => {
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
441
441
////////////////////////////////////////////////////////////////////////////////////////////////////
442
442
443
443
// TODO: It would be better to reuse the buffer after decoding, if possible
444
- pub ( crate ) fn normalize_html_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
444
+ pub ( crate ) fn normalize_xml10_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
445
445
let bytes = text. as_bytes ( ) ;
446
446
447
447
// The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
459
459
// we are sure that the index is within the string
460
460
normalized. push_str ( & text[ 0 ..i] ) ;
461
461
462
- let mut pos = normalize_html_eol_step ( & mut normalized, bytes, i, '\n' ) ;
462
+ let mut pos = normalize_xml10_eol_step ( & mut normalized, bytes, i, '\n' ) ;
463
463
while let Some ( i) = memchr ( b'\r' , & bytes[ pos..] ) {
464
464
let index = pos + i;
465
465
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
466
466
// we are sure that the index is within the string
467
467
normalized. push_str ( & text[ pos..index] ) ;
468
- pos = normalize_html_eol_step ( & mut normalized, bytes, index, '\n' ) ;
468
+ pos = normalize_xml10_eol_step ( & mut normalized, bytes, index, '\n' ) ;
469
469
}
470
470
if let Some ( rest) = text. get ( pos..) {
471
471
normalized. push_str ( rest) ;
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
487
487
/// - `ch`: a character that should be put to the string instead of newline sequence
488
488
///
489
489
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
490
- fn normalize_html_eol_step ( normalized : & mut String , input : & [ u8 ] , index : usize , ch : char ) -> usize {
490
+ fn normalize_xml10_eol_step (
491
+ normalized : & mut String ,
492
+ input : & [ u8 ] ,
493
+ index : usize ,
494
+ ch : char ,
495
+ ) -> usize {
491
496
match input[ index] {
492
497
b'\r' => {
493
498
normalized. push ( ch) ;
@@ -2062,56 +2067,59 @@ mod normalization {
2062
2067
mod eol {
2063
2068
use super :: * ;
2064
2069
2065
- mod xml {
2070
+ mod xml11 {
2066
2071
use super :: * ;
2067
2072
use pretty_assertions:: assert_eq;
2068
2073
2069
2074
#[ test]
2070
2075
fn empty ( ) {
2071
- assert_eq ! ( normalize_xml_eols ( "" ) , "" ) ;
2076
+ assert_eq ! ( normalize_xml11_eols ( "" ) , "" ) ;
2072
2077
}
2073
2078
2074
2079
#[ test]
2075
2080
fn already_normalized ( ) {
2076
2081
assert_eq ! (
2077
- normalize_xml_eols ( "\n already \n \n normalized\n " ) ,
2082
+ normalize_xml11_eols ( "\n already \n \n normalized\n " ) ,
2078
2083
"\n already \n \n normalized\n " ,
2079
2084
) ;
2080
2085
}
2081
2086
2082
2087
#[ test]
2083
2088
fn cr_lf ( ) {
2084
- assert_eq ! ( normalize_xml_eols( "\r \n some\r \n \r \n text" ) , "\n some\n \n text" ) ;
2089
+ assert_eq ! (
2090
+ normalize_xml11_eols( "\r \n some\r \n \r \n text" ) ,
2091
+ "\n some\n \n text"
2092
+ ) ;
2085
2093
}
2086
2094
2087
2095
#[ test]
2088
2096
fn cr_u0085 ( ) {
2089
2097
assert_eq ! (
2090
- normalize_xml_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2098
+ normalize_xml11_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2091
2099
"\n some\n \n text" ,
2092
2100
) ;
2093
2101
}
2094
2102
2095
2103
#[ test]
2096
2104
fn u0085 ( ) {
2097
2105
assert_eq ! (
2098
- normalize_xml_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2106
+ normalize_xml11_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2099
2107
"\n some\n \n text" ,
2100
2108
) ;
2101
2109
}
2102
2110
2103
2111
#[ test]
2104
2112
fn u2028 ( ) {
2105
2113
assert_eq ! (
2106
- normalize_xml_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2114
+ normalize_xml11_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2107
2115
"\n some\n \n text" ,
2108
2116
) ;
2109
2117
}
2110
2118
2111
2119
#[ test]
2112
2120
fn mixed ( ) {
2113
2121
assert_eq ! (
2114
- normalize_xml_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2122
+ normalize_xml11_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2115
2123
"\n \n \n \n \n \n some\n \n \n text" ,
2116
2124
) ;
2117
2125
}
@@ -2138,9 +2146,9 @@ mod normalization {
2138
2146
2139
2147
dbg ! ( ( input, & description) ) ;
2140
2148
if ch == '\u{0085}' {
2141
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2149
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2142
2150
} else {
2143
- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2151
+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
2144
2152
}
2145
2153
}
2146
2154
assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2171,12 +2179,12 @@ mod normalization {
2171
2179
2172
2180
dbg ! ( ( input, & description) ) ;
2173
2181
if ch == '\u{0085}' {
2174
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2182
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2175
2183
} else {
2176
2184
let mut expected = utf8. clone ( ) ;
2177
2185
expected[ 0 ] = b'\n' ;
2178
2186
let expected = std:: str:: from_utf8 ( & expected) . expect ( & description) ;
2179
- assert_eq ! ( normalize_xml_eols ( input) , expected, "{}" , description) ;
2187
+ assert_eq ! ( normalize_xml11_eols ( input) , expected, "{}" , description) ;
2180
2188
}
2181
2189
}
2182
2190
assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2207,68 +2215,68 @@ mod normalization {
2207
2215
2208
2216
dbg ! ( ( input, & description) ) ;
2209
2217
if ch == '\u{2028}' {
2210
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2218
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2211
2219
} else {
2212
- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2220
+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
2213
2221
}
2214
2222
}
2215
2223
assert_eq ! ( ( first..=last) . count( ) , 4096 ) ;
2216
2224
}
2217
2225
}
2218
2226
2219
- mod html {
2227
+ mod xml10 {
2220
2228
use super :: * ;
2221
2229
use pretty_assertions:: assert_eq;
2222
2230
2223
2231
#[ test]
2224
2232
fn empty ( ) {
2225
- assert_eq ! ( normalize_html_eols ( "" ) , "" ) ;
2233
+ assert_eq ! ( normalize_xml10_eols ( "" ) , "" ) ;
2226
2234
}
2227
2235
2228
2236
#[ test]
2229
2237
fn already_normalized ( ) {
2230
2238
assert_eq ! (
2231
- normalize_html_eols ( "\n already \n \n normalized\n " ) ,
2239
+ normalize_xml10_eols ( "\n already \n \n normalized\n " ) ,
2232
2240
"\n already \n \n normalized\n " ,
2233
2241
) ;
2234
2242
}
2235
2243
2236
2244
#[ test]
2237
2245
fn cr_lf ( ) {
2238
2246
assert_eq ! (
2239
- normalize_html_eols ( "\r \n some\r \n \r \n text" ) ,
2247
+ normalize_xml10_eols ( "\r \n some\r \n \r \n text" ) ,
2240
2248
"\n some\n \n text"
2241
2249
) ;
2242
2250
}
2243
2251
2244
2252
#[ test]
2245
2253
fn cr_u0085 ( ) {
2246
2254
assert_eq ! (
2247
- normalize_html_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2255
+ normalize_xml10_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2248
2256
"\n \u{0085} some\n \u{0085} \n \u{0085} text" ,
2249
2257
) ;
2250
2258
}
2251
2259
2252
2260
#[ test]
2253
2261
fn u0085 ( ) {
2254
2262
assert_eq ! (
2255
- normalize_html_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2263
+ normalize_xml10_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2256
2264
"\u{0085} some\u{0085} \u{0085} text" ,
2257
2265
) ;
2258
2266
}
2259
2267
2260
2268
#[ test]
2261
2269
fn u2028 ( ) {
2262
2270
assert_eq ! (
2263
- normalize_html_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2271
+ normalize_xml10_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2264
2272
"\u{2028} some\u{2028} \u{2028} text" ,
2265
2273
) ;
2266
2274
}
2267
2275
2268
2276
#[ test]
2269
2277
fn mixed ( ) {
2270
2278
assert_eq ! (
2271
- normalize_html_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2279
+ normalize_xml10_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2272
2280
"\n \n \n \u{2028} \n \n some\n \u{0085} \n \u{0085} text" ,
2273
2281
) ;
2274
2282
}
0 commit comments