@@ -33,9 +33,88 @@ pub struct Attribute<'a> {
33
33
}
34
34
35
35
impl < ' a > Attribute < ' a > {
36
+ /// Normalize the attribute value according to xml specification section 3.3.3
36
37
///
38
+ /// https://www.w3.org/TR/xml/#AVNormalize
39
+ ///
40
+ /// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
41
+ /// * Sequences of whitespace-like characters are replaced with a single whitespace character
42
+ /// * Character and entity references are substituted as defined by the spec
37
43
pub fn normalized_value ( & ' a self ) -> Result < Cow < ' a , [ u8 ] > , EscapeError > {
38
- let normalized = normalize_attribute_value ( self . value . as_ref ( ) ) ;
44
+ // TODO: character references, entity references, error handling associated with those
45
+
46
+ #[ derive( PartialEq ) ]
47
+ enum ParseState {
48
+ Space ,
49
+ CDATA ,
50
+ }
51
+
52
+ // Trim characters from the beginning and end of the attribute value - this can't fail.
53
+ fn trim_value ( attr : & [ u8 ] ) -> & [ u8 ] {
54
+ let first_non_space_char = attr. iter ( ) . position ( |c| !is_whitespace ( * c) ) ;
55
+
56
+ if first_non_space_char. is_none ( ) {
57
+ // The entire value was whitespace-like characters
58
+ return b"" ;
59
+ }
60
+
61
+ let last_non_space_char = attr. iter ( ) . rposition ( |c| !is_whitespace ( * c) ) ;
62
+
63
+ // Trim all whitespace-like characters away from the beginning and end of the attribute value.
64
+ let begin = first_non_space_char. unwrap ( ) ;
65
+ let end = last_non_space_char. unwrap_or ( attr. len ( ) ) ;
66
+ & attr[ begin..=end]
67
+ }
68
+
69
+ let trimmed_attr = trim_value ( self . value . as_ref ( ) ) ;
70
+
71
+ // A new buffer is only created when we encounter a situation that requires it.
72
+ let mut normalized: Option < Vec < u8 > > = None ;
73
+ // We start on character data because all whitespace-like characters are already trimmed away.
74
+ let mut current_state = ParseState :: CDATA ;
75
+
76
+ // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
77
+ // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
78
+ // buffer and continue using this buffer.
79
+ for ( idx, ch) in trimmed_attr. iter ( ) . enumerate ( ) {
80
+ match ch {
81
+ b'\n' | b'\r' | b'\t' | b' ' => match current_state {
82
+ ParseState :: Space => match normalized {
83
+ Some ( _) => continue ,
84
+ None => normalized = Some ( Vec :: from ( & trimmed_attr[ ..idx] ) ) ,
85
+ } ,
86
+ ParseState :: CDATA => {
87
+ current_state = ParseState :: Space ;
88
+ match normalized. as_mut ( ) {
89
+ Some ( buf) => buf. push ( b' ' ) ,
90
+ None => {
91
+ let mut buf = Vec :: from ( & trimmed_attr[ ..idx] ) ;
92
+ buf. push ( b' ' ) ;
93
+ normalized = Some ( buf) ;
94
+ }
95
+ }
96
+ }
97
+ } ,
98
+ c @ _ => match current_state {
99
+ ParseState :: Space => {
100
+ current_state = ParseState :: CDATA ;
101
+ if let Some ( normalized) = normalized. as_mut ( ) {
102
+ normalized. push ( * c) ;
103
+ }
104
+ }
105
+ ParseState :: CDATA => {
106
+ if let Some ( normalized) = normalized. as_mut ( ) {
107
+ normalized. push ( * c) ;
108
+ }
109
+ }
110
+ } ,
111
+ }
112
+ }
113
+
114
+ let normalized = match normalized {
115
+ Some ( normalized) => Cow :: Owned ( normalized) ,
116
+ None => Cow :: Borrowed ( trimmed_attr) ,
117
+ } ;
39
118
let escaped = do_unescape ( & * normalized, None ) ?;
40
119
Ok ( Cow :: Owned ( escaped. into_owned ( ) ) )
41
120
}
@@ -190,90 +269,6 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
190
269
}
191
270
}
192
271
193
- /// Normalize the attribute value according to xml specification section 3.3.3
194
- ///
195
- /// https://www.w3.org/TR/xml/#AVNormalize
196
- ///
197
- /// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
198
- /// * Sequences of whitespace-like characters are replaced with a single whitespace character
199
- /// * Character and entity references are substituted as defined by the spec
200
- fn normalize_attribute_value ( attr : & [ u8 ] ) -> Cow < [ u8 ] > {
201
- // TODO: character references, entity references, error handling associated with those
202
-
203
- #[ derive( PartialEq ) ]
204
- enum ParseState {
205
- Space ,
206
- CDATA ,
207
- }
208
-
209
- // Trim characters from the beginning and end of the attribute value - this can't fail.
210
- fn trim_value ( attr : & [ u8 ] ) -> & [ u8 ] {
211
- let first_non_space_char = attr. iter ( ) . position ( |c| !is_whitespace ( * c) ) ;
212
-
213
- if first_non_space_char. is_none ( ) {
214
- // The entire value was whitespace-like characters
215
- return b"" ;
216
- }
217
-
218
- let last_non_space_char = attr. iter ( ) . rposition ( |c| !is_whitespace ( * c) ) ;
219
-
220
- // Trim all whitespace-like characters away from the beginning and end of the attribute value.
221
- let begin = first_non_space_char. unwrap ( ) ;
222
- let end = last_non_space_char. unwrap_or ( attr. len ( ) ) ;
223
- & attr[ begin..=end]
224
- }
225
-
226
- let trimmed_attr = trim_value ( attr) ;
227
-
228
- // A new buffer is only created when we encounter a situation that requires it.
229
- let mut normalized: Option < Vec < u8 > > = None ;
230
- // We start on character data because all whitespace-like characters are already trimmed away.
231
- let mut current_state = ParseState :: CDATA ;
232
-
233
- // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
234
- // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
235
- // buffer and continue using this buffer.
236
- for ( idx, ch) in trimmed_attr. iter ( ) . enumerate ( ) {
237
- match ch {
238
- b'\n' | b'\r' | b'\t' | b' ' => match current_state {
239
- ParseState :: Space => match normalized {
240
- Some ( _) => continue ,
241
- None => normalized = Some ( Vec :: from ( & trimmed_attr[ ..idx] ) ) ,
242
- } ,
243
- ParseState :: CDATA => {
244
- current_state = ParseState :: Space ;
245
- match normalized. as_mut ( ) {
246
- Some ( buf) => buf. push ( b' ' ) ,
247
- None => {
248
- let mut buf = Vec :: from ( & trimmed_attr[ ..idx] ) ;
249
- buf. push ( b' ' ) ;
250
- normalized = Some ( buf) ;
251
- }
252
- }
253
- }
254
- } ,
255
- c @ _ => match current_state {
256
- ParseState :: Space => {
257
- current_state = ParseState :: CDATA ;
258
- if let Some ( normalized) = normalized. as_mut ( ) {
259
- normalized. push ( * c) ;
260
- }
261
- }
262
- ParseState :: CDATA => {
263
- if let Some ( normalized) = normalized. as_mut ( ) {
264
- normalized. push ( * c) ;
265
- }
266
- }
267
- } ,
268
- }
269
- }
270
-
271
- match normalized {
272
- Some ( normalized) => Cow :: Owned ( normalized) ,
273
- None => Cow :: Borrowed ( trimmed_attr) ,
274
- }
275
- }
276
-
277
272
////////////////////////////////////////////////////////////////////////////////////////////////////
278
273
279
274
/// Iterator over XML attributes.
0 commit comments