@@ -305,7 +305,7 @@ where
305
305
////////////////////////////////////////////////////////////////////////////////////////////////////
306
306
307
307
// TODO: It would be better to reuse buffer after decoding if possible
308
- pub ( crate ) fn normalize_xml_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
308
+ pub ( crate ) fn normalize_xml11_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
309
309
let bytes = text. as_bytes ( ) ;
310
310
311
311
// The following sequences of UTF-8 encoded input should be translated into
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326
326
// we are sure that index is within the string
327
327
normalized. push_str ( & text[ 0 ..i] ) ;
328
328
329
- let mut pos = normalize_xml_eol_step ( & mut normalized, text, i, '\n' ) ;
329
+ let mut pos = normalize_xml11_eol_step ( & mut normalized, text, i, '\n' ) ;
330
330
while let Some ( i) = memchr3 ( b'\r' , 0xC2 , 0xE2 , & bytes[ pos..] ) {
331
331
let index = pos + i;
332
332
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333
333
// we are sure that index is within the string
334
334
normalized. push_str ( & text[ pos..index] ) ;
335
- pos = normalize_xml_eol_step ( & mut normalized, text, index, '\n' ) ;
335
+ pos = normalize_xml11_eol_step ( & mut normalized, text, index, '\n' ) ;
336
336
}
337
337
if let Some ( rest) = text. get ( pos..) {
338
338
normalized. push_str ( rest) ;
@@ -378,7 +378,7 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378
378
///
379
379
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380
380
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381
- fn normalize_xml_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
381
+ fn normalize_xml11_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
382
382
let input = text. as_bytes ( ) ;
383
383
match input[ index] {
384
384
b'\r' => {
@@ -388,15 +388,15 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
388
388
normalized. push ( ch) ;
389
389
return index + 2 ; // skip \r\n
390
390
}
391
- // Because input is correct UTF-8 and in UTF-8 every character has
392
- // an unique prefix, byte C2 means only start of #x85 character
393
391
if next == 0xC2 {
392
+ // UTF-8 encoding of #x85 character is [c2 85]
394
393
if index + 2 < input. len ( ) && input[ index + 2 ] == 0x85 {
395
394
normalized. push ( ch) ;
396
395
} else {
396
+ normalized. push ( ch) ;
397
397
// NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
398
398
// we are sure that index is within the string
399
- normalized. push_str ( & text[ index..index + 3 ] ) ;
399
+ normalized. push_str ( & text[ index + 1 ..index + 3 ] ) ;
400
400
}
401
401
return index + 3 ; // skip \r + UTF-8 encoding of character (c2 xx)
402
402
}
@@ -441,7 +441,7 @@ fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch:
441
441
////////////////////////////////////////////////////////////////////////////////////////////////////
442
442
443
443
// TODO: It would be better to reuse buffer after decoding if possible
444
- pub ( crate ) fn normalize_html_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
444
+ pub ( crate ) fn normalize_xml10_eols < ' input > ( text : & ' input str ) -> Cow < ' input , str > {
445
445
let bytes = text. as_bytes ( ) ;
446
446
447
447
// The following sequences of UTF-8 encoded input should be translated into
@@ -459,13 +459,13 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
459
459
// we are sure that index is within the string
460
460
normalized. push_str ( & text[ 0 ..i] ) ;
461
461
462
- let mut pos = normalize_html_eol_step ( & mut normalized, bytes, i, '\n' ) ;
462
+ let mut pos = normalize_xml10_eol_step ( & mut normalized, bytes, i, '\n' ) ;
463
463
while let Some ( i) = memchr ( b'\r' , & bytes[ pos..] ) {
464
464
let index = pos + i;
465
465
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
466
466
// we are sure that index is within the string
467
467
normalized. push_str ( & text[ pos..index] ) ;
468
- pos = normalize_html_eol_step ( & mut normalized, bytes, index, '\n' ) ;
468
+ pos = normalize_xml10_eol_step ( & mut normalized, bytes, index, '\n' ) ;
469
469
}
470
470
if let Some ( rest) = text. get ( pos..) {
471
471
normalized. push_str ( rest) ;
@@ -487,7 +487,12 @@ pub(crate) fn normalize_html_eols<'input>(text: &'input str) -> Cow<'input, str>
487
487
/// - `ch`: the character that should be pushed to the string in place of a newline sequence
488
488
///
489
489
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
490
- fn normalize_html_eol_step ( normalized : & mut String , input : & [ u8 ] , index : usize , ch : char ) -> usize {
490
+ fn normalize_xml10_eol_step (
491
+ normalized : & mut String ,
492
+ input : & [ u8 ] ,
493
+ index : usize ,
494
+ ch : char ,
495
+ ) -> usize {
491
496
match input[ index] {
492
497
b'\r' => {
493
498
normalized. push ( ch) ;
@@ -2062,56 +2067,59 @@ mod normalization {
2062
2067
mod eol {
2063
2068
use super :: * ;
2064
2069
2065
- mod xml {
2070
+ mod xml11 {
2066
2071
use super :: * ;
2067
2072
use pretty_assertions:: assert_eq;
2068
2073
2069
2074
// XML 1.1 mode: an empty input has nothing to normalize, so the result is
// expected to compare equal to the empty string.
#[ test]
2070
2075
fn empty ( ) {
2071
- assert_eq ! ( normalize_xml_eols ( "" ) , "" ) ;
2076
+ assert_eq ! ( normalize_xml11_eols ( "" ) , "" ) ;
2072
2077
}
2073
2078
2074
2079
// XML 1.1 mode: text whose only line breaks are `\n` is expected to come
// back unchanged.
#[ test]
2075
2080
fn already_normalized ( ) {
2076
2081
assert_eq ! (
2077
- normalize_xml_eols ( "\n already \n \n normalized\n " ) ,
2082
+ normalize_xml11_eols ( "\n already \n \n normalized\n " ) ,
2078
2083
"\n already \n \n normalized\n " ,
2079
2084
) ;
2080
2085
}
2081
2086
2082
2087
// XML 1.1 mode: every `\r\n` pair is expected to collapse into a single `\n`,
// including pairs that directly follow one another.
#[ test]
2083
2088
fn cr_lf ( ) {
2084
- assert_eq ! ( normalize_xml_eols( "\r \n some\r \n \r \n text" ) , "\n some\n \n text" ) ;
2089
+ assert_eq ! (
2090
+ normalize_xml11_eols( "\r \n some\r \n \r \n text" ) ,
2091
+ "\n some\n \n text"
2092
+ ) ;
2085
2093
}
2086
2094
2087
2095
// XML 1.1 mode: a `\r` immediately followed by U+0085 is treated as one
// end-of-line sequence, so each such pair is expected to become a single `\n`.
#[ test]
2088
2096
fn cr_u0085 ( ) {
2089
2097
assert_eq ! (
2090
- normalize_xml_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2098
+ normalize_xml11_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2091
2099
"\n some\n \n text" ,
2092
2100
) ;
2093
2101
}
2094
2102
2095
2103
// XML 1.1 mode: a standalone U+0085 is expected to be replaced by `\n`.
#[ test]
2096
2104
fn u0085 ( ) {
2097
2105
assert_eq ! (
2098
- normalize_xml_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2106
+ normalize_xml11_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2099
2107
"\n some\n \n text" ,
2100
2108
) ;
2101
2109
}
2102
2110
2103
2111
// XML 1.1 mode: a standalone U+2028 is expected to be replaced by `\n`.
#[ test]
2104
2112
fn u2028 ( ) {
2105
2113
assert_eq ! (
2106
- normalize_xml_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2114
+ normalize_xml11_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2107
2115
"\n some\n \n text" ,
2108
2116
) ;
2109
2117
}
2110
2118
2111
2119
// XML 1.1 mode: a run that mixes lone `\r`, U+2028, `\n`, `\r\n`, U+0085 and
// `\r` + U+0085 is expected to yield exactly one `\n` per end-of-line
// sequence (six before "some", three after it).
#[ test]
2112
2120
fn mixed ( ) {
2113
2121
assert_eq ! (
2114
- normalize_xml_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2122
+ normalize_xml11_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2115
2123
"\n \n \n \n \n \n some\n \n \n text" ,
2116
2124
) ;
2117
2125
}
@@ -2138,9 +2146,9 @@ mod normalization {
2138
2146
2139
2147
dbg ! ( ( input, & description) ) ;
2140
2148
if ch == '\u{0085}' {
2141
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2149
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2142
2150
} else {
2143
- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2151
+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
2144
2152
}
2145
2153
}
2146
2154
assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2171,9 +2179,12 @@ mod normalization {
2171
2179
2172
2180
dbg ! ( ( input, & description) ) ;
2173
2181
if ch == '\u{0085}' {
2174
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2182
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2175
2183
} else {
2176
- assert_eq ! ( normalize_xml_eols( input) , input, "{}" , description) ;
2184
+ let mut expected = utf8. clone ( ) ;
2185
+ expected[ 0 ] = b'\n' ;
2186
+ let expected = std:: str:: from_utf8 ( & expected) . expect ( & description) ;
2187
+ assert_eq ! ( normalize_xml11_eols( input) , expected, "{}" , description) ;
2177
2188
}
2178
2189
}
2179
2190
assert_eq ! ( ( first..=last) . count( ) , 64 ) ;
@@ -2204,68 +2215,68 @@ mod normalization {
2204
2215
2205
2216
dbg ! ( ( input, & description) ) ;
2206
2217
if ch == '\u{2028}' {
2207
- assert_eq ! ( normalize_xml_eols ( input) , "\n " , "{}" , description) ;
2218
+ assert_eq ! ( normalize_xml11_eols ( input) , "\n " , "{}" , description) ;
2208
2219
} else {
2209
- assert_eq ! ( normalize_xml_eols ( input) , input, "{}" , description) ;
2220
+ assert_eq ! ( normalize_xml11_eols ( input) , input, "{}" , description) ;
2210
2221
}
2211
2222
}
2212
2223
assert_eq ! ( ( first..=last) . count( ) , 4096 ) ;
2213
2224
}
2214
2225
}
2215
2226
2216
- mod html {
2227
+ mod xml10 {
2217
2228
use super :: * ;
2218
2229
use pretty_assertions:: assert_eq;
2219
2230
2220
2231
// XML 1.0 mode: an empty input has nothing to normalize, so the result is
// expected to compare equal to the empty string.
#[ test]
2221
2232
fn empty ( ) {
2222
- assert_eq ! ( normalize_html_eols ( "" ) , "" ) ;
2233
+ assert_eq ! ( normalize_xml10_eols ( "" ) , "" ) ;
2223
2234
}
2224
2235
2225
2236
// XML 1.0 mode: text whose only line breaks are `\n` is expected to come
// back unchanged.
#[ test]
2226
2237
fn already_normalized ( ) {
2227
2238
assert_eq ! (
2228
- normalize_html_eols ( "\n already \n \n normalized\n " ) ,
2239
+ normalize_xml10_eols ( "\n already \n \n normalized\n " ) ,
2229
2240
"\n already \n \n normalized\n " ,
2230
2241
) ;
2231
2242
}
2232
2243
2233
2244
// XML 1.0 mode: every `\r\n` pair is expected to collapse into a single `\n`,
// including pairs that directly follow one another.
#[ test]
2234
2245
fn cr_lf ( ) {
2235
2246
assert_eq ! (
2236
- normalize_html_eols ( "\r \n some\r \n \r \n text" ) ,
2247
+ normalize_xml10_eols ( "\r \n some\r \n \r \n text" ) ,
2237
2248
"\n some\n \n text"
2238
2249
) ;
2239
2250
}
2240
2251
2241
2252
// XML 1.0 mode: only the `\r` is an end-of-line character here; it is
// expected to become `\n` while the U+0085 that follows it is kept as-is.
#[ test]
2242
2253
fn cr_u0085 ( ) {
2243
2254
assert_eq ! (
2244
- normalize_html_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2255
+ normalize_xml10_eols ( "\r \u{0085} some\r \u{0085} \r \u{0085} text" ) ,
2245
2256
"\n \u{0085} some\n \u{0085} \n \u{0085} text" ,
2246
2257
) ;
2247
2258
}
2248
2259
2249
2260
// XML 1.0 mode: a standalone U+0085 is not an end-of-line character, so the
// input is expected to come back unchanged.
#[ test]
2250
2261
fn u0085 ( ) {
2251
2262
assert_eq ! (
2252
- normalize_html_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2263
+ normalize_xml10_eols ( "\u{0085} some\u{0085} \u{0085} text" ) ,
2253
2264
"\u{0085} some\u{0085} \u{0085} text" ,
2254
2265
) ;
2255
2266
}
2256
2267
2257
2268
// XML 1.0 mode: a standalone U+2028 is not an end-of-line character, so the
// input is expected to come back unchanged.
#[ test]
2258
2269
fn u2028 ( ) {
2259
2270
assert_eq ! (
2260
- normalize_html_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2271
+ normalize_xml10_eols ( "\u{2028} some\u{2028} \u{2028} text" ) ,
2261
2272
"\u{2028} some\u{2028} \u{2028} text" ,
2262
2273
) ;
2263
2274
}
2264
2275
2265
2276
// XML 1.0 mode: in a mixed run, lone `\r` and `\r\n` are each expected to
// become one `\n`, existing `\n` stays, and U+2028 / U+0085 pass through
// untouched (even when U+0085 directly follows a `\r`).
#[ test]
2266
2277
fn mixed ( ) {
2267
2278
assert_eq ! (
2268
- normalize_html_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2279
+ normalize_xml10_eols ( "\r \r \r \u{2028} \n \r \n some\n \u{0085} \r \u{0085} text" ) ,
2269
2280
"\n \n \n \u{2028} \n \n some\n \u{0085} \n \u{0085} text" ,
2270
2281
) ;
2271
2282
}
0 commit comments