@@ -4,6 +4,7 @@ use memchr::memchr2_iter;
4
4
use std:: borrow:: Cow ;
5
5
use std:: num:: ParseIntError ;
6
6
use std:: ops:: Range ;
7
+ use std:: slice:: Iter ;
7
8
8
9
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
9
10
#[ derive( Clone , Debug , PartialEq ) ]
@@ -50,6 +51,12 @@ pub enum EscapeError {
50
51
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
51
52
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
52
53
InvalidCharRef ( ParseCharRefError ) ,
54
+ /// Expanded more than maximum possible entities during attribute normalization.
55
+ ///
56
+ /// Attribute normalization includes expanding of general entities (`&entity;`)
57
+ /// whose replacement text could itself contain entities, which must also be expanded.
58
+ /// If more than 128 entities would be expanded, this error is returned.
59
+ TooManyNestedEntities ,
53
60
}
54
61
55
62
impl std:: fmt:: Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
66
73
Self :: InvalidCharRef ( e) => {
67
74
write ! ( f, "invalid character reference: {}" , e)
68
75
}
76
+ Self :: TooManyNestedEntities => {
77
+ f. write_str ( "too many nested entities in an attribute value" )
78
+ }
69
79
}
70
80
}
71
81
}
@@ -302,6 +312,182 @@ where
302
312
}
303
313
}
304
314
315
/// Tells whether the byte `b` needs special handling during attribute value
/// normalization: it is either one of the white space bytes (`\t`, `\r`, `\n`,
/// space) or `&`, which opens a character or entity reference.
///
/// Takes the byte by reference so that it can be passed directly as a predicate
/// to iterators over `&u8`.
const fn is_normalization_char(b: &u8) -> bool {
    let byte = *b;
    matches!(byte, b'&' | b' ' | b'\n' | b'\r' | b'\t')
}
318
+
319
+ /// Returns the attribute value normalized as per [the XML specification],
320
+ /// using a custom entity resolver.
321
+ ///
322
+ /// Do not use this method with HTML attributes.
323
+ ///
324
+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
325
+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
326
+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
327
+ /// take precedence.
328
+ ///
329
+ /// This will allocate unless the raw attribute value does not require normalization.
330
+ ///
331
+ /// # Parameters
332
+ ///
333
+ /// - `value`: unnormalized attribute value
334
+ /// - `depth`: maximum number of nested entities that can be expanded. If expansion
335
+ /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
336
+ /// - `resolve_entity`: a function to resolve entity. This function could be called
337
+ /// multiple times on the same input and can return different values in each case
338
+ /// for the same input, although it is not recommended
339
+ ///
340
+ /// # Lifetimes
341
+ ///
342
+ /// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
343
+ /// the input returned unchanged with the same lifetime
344
+ /// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
345
+ ///
346
+ /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
347
+ pub ( crate ) fn normalize_attribute_value < ' input , ' entity , F > (
348
+ value : & ' input str ,
349
+ depth : usize ,
350
+ resolve_entity : F ,
351
+ ) -> Result < Cow < ' input , str > , EscapeError >
352
+ where
353
+ // the lifetime of the output comes from a capture or is `'static`
354
+ F : Fn ( & str ) -> Option < & ' entity str > ,
355
+ {
356
+ let mut iter = value. as_bytes ( ) . iter ( ) ;
357
+
358
+ // If we found the charater that requires normalization, create a normalized
359
+ // version of the attribute, otherwise return the value unchanged
360
+ if let Some ( i) = iter. position ( is_normalization_char) {
361
+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
362
+ let pos = normalize_step (
363
+ & mut normalized,
364
+ & mut iter,
365
+ value,
366
+ 0 ,
367
+ i,
368
+ depth,
369
+ & resolve_entity,
370
+ ) ?;
371
+
372
+ normalize_steps (
373
+ & mut normalized,
374
+ & mut iter,
375
+ value,
376
+ pos,
377
+ depth,
378
+ & resolve_entity,
379
+ ) ?;
380
+ return Ok ( normalized. into ( ) ) ;
381
+ }
382
+ Ok ( Cow :: Borrowed ( value) )
383
+ }
384
+
385
+ fn normalize_steps < ' entity , F > (
386
+ normalized : & mut String ,
387
+ iter : & mut Iter < u8 > ,
388
+ input : & str ,
389
+ mut pos : usize ,
390
+ depth : usize ,
391
+ resolve_entity : & F ,
392
+ ) -> Result < ( ) , EscapeError >
393
+ where
394
+ // the lifetime of the output comes from a capture or is `'static`
395
+ F : Fn ( & str ) -> Option < & ' entity str > ,
396
+ {
397
+ while let Some ( i) = iter. position ( is_normalization_char) {
398
+ pos = normalize_step ( normalized, iter, input, pos, pos + i, depth, resolve_entity) ?;
399
+ }
400
+ if let Some ( rest) = input. get ( pos..) {
401
+ normalized. push_str ( rest) ;
402
+ }
403
+ Ok ( ( ) )
404
+ }
405
+
406
+ /// Performs one step of the [normalization algorithm] (but with recursive part):
407
+ ///
408
+ /// 1. For a character reference, append the referenced character
409
+ /// to the normalized value.
410
+ /// 2. For an entity reference, recursively apply this algorithm
411
+ /// to the replacement text of the entity.
412
+ /// 3. For a white space character (#x20, #xD, #xA, #x9), append
413
+ /// a space character (#x20) to the normalized value.
414
+ /// 4. For another character, append the character to the normalized value.
415
+ ///
416
+ /// # Parameters
417
+ ///
418
+ /// - `normalized`: Output of the algorithm. Normalized value will be placed here
419
+ /// - `iter`: Iterator over bytes of `input`
420
+ /// - `input`: Original non-normalized value
421
+ /// - `last_pos`: Index of the last byte in `input` that was processed
422
+ /// - `index`: Index of the byte in `input` that should be processed now
423
+ /// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
424
+ /// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
425
+ ///
426
+ /// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
427
+ fn normalize_step < ' entity , F > (
428
+ normalized : & mut String ,
429
+ iter : & mut Iter < u8 > ,
430
+ input : & str ,
431
+ last_pos : usize ,
432
+ index : usize ,
433
+ depth : usize ,
434
+ resolve_entity : & F ,
435
+ ) -> Result < usize , EscapeError >
436
+ where
437
+ // the lifetime of the output comes from a capture or is `'static`
438
+ F : Fn ( & str ) -> Option < & ' entity str > ,
439
+ {
440
+ if depth == 0 {
441
+ return Err ( EscapeError :: TooManyNestedEntities ) ;
442
+ }
443
+ // 4. For another character, append the character to the normalized value.
444
+ normalized. push_str ( & input[ last_pos..index] ) ;
445
+
446
+ match input. as_bytes ( ) [ index] {
447
+ b'&' => {
448
+ let start = index + 1 ; // +1 - skip `&`
449
+ let end = start
450
+ + match iter. position ( |& b| b == b';' ) {
451
+ Some ( end) => end,
452
+ None => return Err ( EscapeError :: UnterminatedEntity ( index..input. len ( ) ) ) ,
453
+ } ;
454
+
455
+ // Content between & and ; - &pat;
456
+ let pat = & input[ start..end] ;
457
+ // 1. For a character reference, append the referenced character
458
+ // to the normalized value.
459
+ if pat. starts_with ( '#' ) {
460
+ let entity = & pat[ 1 ..] ; // starts after the #
461
+ let codepoint = parse_number ( entity) . map_err ( EscapeError :: InvalidCharRef ) ?;
462
+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
463
+ } else
464
+ // 2. For an entity reference, recursively apply this algorithm
465
+ // to the replacement text of the entity.
466
+ if let Some ( value) = resolve_entity ( pat) {
467
+ normalize_steps (
468
+ normalized,
469
+ & mut value. as_bytes ( ) . iter ( ) ,
470
+ value,
471
+ 0 ,
472
+ depth. saturating_sub ( 1 ) ,
473
+ resolve_entity,
474
+ ) ?;
475
+ } else {
476
+ return Err ( EscapeError :: UnrecognizedEntity ( start..end, pat. to_string ( ) ) ) ;
477
+ }
478
+ Ok ( end + 1 ) // +1 - skip `;`
479
+ }
480
+ // 3. For a white space character (#x20, #xD, #xA, #x9), append
481
+ // a space character (#x20) to the normalized value.
482
+ b'\t' | b'\n' | b'\r' | b' ' => {
483
+ normalized. push ( ' ' ) ;
484
+ Ok ( index + 1 ) // +1 - skip character
485
+ }
486
+
487
+ _ => unreachable ! ( "Only '\\ t', '\\ n', '\\ r', ' ', and '&' are possible here" ) ,
488
+ }
489
+ }
490
+
305
491
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
306
492
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
307
493
///
@@ -1844,3 +2030,115 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
1844
2030
_ => u32:: from_str_radix ( src, radix) . map_err ( ParseCharRefError :: InvalidNumber ) ,
1845
2031
}
1846
2032
}
2033
+
2034
/// Tests of attribute value normalization (`normalize_attribute_value`).
#[cfg(test)]
mod normalization {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Empty input needs no normalization and is returned as is.
    #[test]
    fn empty() {
        assert_eq!(
            normalize_attribute_value("", 5, |_| { None }),
            Ok("".into())
        );
    }

    /// Every white space character is replaced by exactly one space.
    #[test]
    fn only_spaces() {
        assert_eq!(
            normalize_attribute_value("   ", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\t\t\t", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\r\r\r", 5, |_| { None }),
            Ok("   ".into())
        );
        assert_eq!(
            normalize_attribute_value("\n\n\n", 5, |_| { None }),
            Ok("   ".into())
        );
    }

    #[test]
    fn already_normalized() {
        assert_eq!(
            normalize_attribute_value("already normalized", 5, |_| { None }),
            Ok("already normalized".into())
        );
    }

    /// Decimal and hexadecimal character references are replaced by the
    /// character they denote (here: a space between two literal spaces).
    #[test]
    fn characters() {
        assert_eq!(
            normalize_attribute_value("string with &#32; character", 5, |_| { None }),
            Ok("string with   character".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &#x20; character", 5, |_| { None }),
            Ok("string with   character".into())
        );
    }

    /// Entity references are replaced by their (recursively normalized)
    /// replacement text.
    #[test]
    fn entities() {
        assert_eq!(
            normalize_attribute_value("string with &entity; reference", 5, |_| {
                Some("replacement")
            }),
            Ok("string with replacement reference".into())
        );
        assert_eq!(
            normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
                match entity {
                    "entity-1" => Some("recursive &entity-2;"),
                    "entity-2" => Some("entity 2"),
                    _ => None,
                }
            }),
            Ok("string with recursive entity 2 reference".into())
        );
    }

    /// A chain of nested entities is expanded only while its length does not
    /// exceed `depth`.
    #[test]
    fn nesting_limit() {
        let resolve = |entity: &str| match entity {
            "a" => Some("&b;"),
            "b" => Some("x"),
            _ => None,
        };
        assert_eq!(
            normalize_attribute_value("&a;", 2, resolve),
            Ok("x".into())
        );
        assert_eq!(
            normalize_attribute_value("&a;", 1, resolve),
            Err(EscapeError::TooManyNestedEntities)
        );
    }

    #[test]
    fn unclosed_entity() {
        assert_eq!(
            // `&` is at byte 21, the input is 38 bytes long
            normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
                Some("replacement")
            }),
            Err(EscapeError::UnterminatedEntity(21..38))
        );
        assert_eq!(
            // `&` is at byte 21, the input is 47 bytes long
            normalize_attribute_value(
                "string with unclosed &#32 (character) reference",
                5,
                |_| { None }
            ),
            Err(EscapeError::UnterminatedEntity(21..47))
        );
    }

    #[test]
    fn unknown_entity() {
        assert_eq!(
            // the entity name `entity` occupies bytes 21..27
            normalize_attribute_value("string with unknown &entity; reference", 5, |_| { None }),
            Err(EscapeError::UnrecognizedEntity(
                21..27,
                "entity".to_string(),
            ))
        );
    }

    /// An entity that refers to itself is cut off by the depth limit instead
    /// of recursing forever.
    #[test]
    fn recursive_entity() {
        assert_eq!(
            normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
            Err(EscapeError::TooManyNestedEntities),
        );
    }
}
0 commit comments