@@ -88,7 +88,7 @@ impl EscapeError {
8888///
8989/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
9090/// the callback will be called exactly once.
91- pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
91+ pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
9292where
9393 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
9494{
9797 let mut chars = src. chars ( ) ;
9898 let res = unescape_char_or_byte ( & mut chars, mode) ;
9999 callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
100+ Rfc3349 :: Unused // rfc3349 never triggered by char or byte literals
100101 }
101102 Str => unescape_non_raw_common ( src, mode, callback) ,
102103 RawStr => check_raw_common ( src, mode, callback) ,
@@ -107,7 +108,7 @@ where
107108 result = Err ( EscapeError :: NulInCStr ) ;
108109 }
109110 callback ( r, result)
110- } ) ;
111+ } )
111112 }
112113 ByteStr { .. } | CStr => unreachable ! ( ) ,
113114 }
@@ -148,7 +149,7 @@ impl From<u8> for MixedUnit {
148149/// a sequence of escaped characters or errors.
149150///
150151/// Values are returned by invoking `callback`.
151- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
152+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
152153where
153154 F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
154155{
@@ -160,7 +161,7 @@ where
160161 result = Err ( EscapeError :: NulInCStr ) ;
161162 }
162163 callback ( r, result)
163- } ) ;
164+ } )
164165 }
165166 Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
166167 }
@@ -178,6 +179,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
178179 unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
179180}
180181
182+ /// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
183+ /// literal to be valid.
184+ #[ derive( Debug , PartialEq ) ]
185+ #[ must_use]
186+ pub enum Rfc3349 {
187+ Used ,
188+ Unused ,
189+ }
190+
181191/// What kind of literal do we parse.
182192#[ derive( Debug , Clone , Copy , PartialEq ) ]
183193pub enum Mode {
@@ -214,24 +224,24 @@ impl Mode {
214224
215225 /// Are unicode (non-ASCII) chars allowed?
216226 #[ inline]
217- fn allow_unicode_chars ( self ) -> bool {
227+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
218228 match self {
219- Byte | ByteStr { rfc3349 : false } | RawByteStr { rfc3349 : false } => false ,
229+ Byte => false ,
230+ ByteStr { .. } | RawByteStr { .. } => { * rfc3349 = Rfc3349 :: Used ; true }
220231 Char
221232 | Str
222233 | RawStr
223- | ByteStr { rfc3349 : true }
224- | RawByteStr { rfc3349 : true }
225234 | CStr
226235 | RawCStr => true ,
227236 }
228237 }
229238
230239 /// Are unicode escapes (`\u`) allowed?
231- fn allow_unicode_escapes ( self ) -> bool {
240+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
232241 match self {
233- Byte | ByteStr { rfc3349 : false } => false ,
234- Char | Str | ByteStr { rfc3349 : true } | CStr => true ,
242+ Byte => false ,
243+ ByteStr { .. } => { * rfc3349 = Rfc3349 :: Used ; true }
244+ Char | Str | CStr => true ,
235245 RawByteStr { .. } | RawStr | RawCStr => unreachable ! ( ) ,
236246 }
237247 }
@@ -245,9 +255,12 @@ impl Mode {
245255 }
246256}
247257
 258+ // `rfc3349` is set to `Rfc3349::Used` if rfc3349 must be enabled for the
 259+ // escape to be accepted.
248260fn scan_escape < T : From < char > + From < u8 > > (
249261 chars : & mut Chars < ' _ > ,
250262 mode : Mode ,
263+ rfc3349 : & mut Rfc3349 ,
251264) -> Result < T , EscapeError > {
252265 // Previous character was '\\', unescape what follows.
253266 let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -277,15 +290,17 @@ fn scan_escape<T: From<char> + From<u8>>(
277290 Ok ( T :: from ( value as u8 ) )
278291 } ;
279292 }
280- // njn: gate: is it a ByteStr?
281- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
293+ 'u' => {
294+ // njn: convert all mode matches back to equality checks
295+ return scan_unicode ( chars, mode, rfc3349) . map ( T :: from) ;
296+ }
282297 _ => return Err ( EscapeError :: InvalidEscape ) ,
283298 } ;
284299 Ok ( T :: from ( res) )
285300}
286301
287302// njn: change arg to mode in precursor?
288- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
303+ fn scan_unicode ( chars : & mut Chars < ' _ > , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
289304 // We've parsed '\u', now we have to parse '{..}'.
290305
291306 if chars. next ( ) != Some ( '{' ) {
@@ -313,7 +328,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
313328
314329 // Incorrect syntax has higher priority for error reporting
315330 // than unallowed value for a literal.
316- if !allow_unicode_escapes {
331+ if !mode . allow_unicode_escapes ( rfc3349 ) {
317332 return Err ( EscapeError :: UnicodeEscapeInByte ) ;
318333 }
319334
@@ -339,19 +354,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
339354}
340355
341356#[ inline]
342- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
343- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
357+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
358+ // Note: we must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
359+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
360+ Ok ( c)
361+ } else {
362+ Err ( EscapeError :: NonAsciiCharInByte )
363+ }
344364}
345365
346366fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
347367 let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
368+ let mut rfc3349 = Rfc3349 :: Unused ;
348369 let res = match c {
349- '\\' => scan_escape ( chars, mode) ,
370+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
350371 '\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
351372 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
352- // njn: this is the only ascii_check that will remain
353- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
373+ _ => ascii_check ( c, mode, & mut rfc3349) ,
354374 } ?;
375+
376+ // rfc3349 cannot be triggered for char or byte literals.
377+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ;
378+
355379 if chars. next ( ) . is_some ( ) {
356380 return Err ( EscapeError :: MoreThanOneChar ) ;
357381 }
@@ -360,12 +384,12 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
360384
361385/// Takes a contents of a string literal (without quotes) and produces a
362386/// sequence of escaped characters or errors.
363- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
387+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
364388where
365389 F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
366390{
367391 let mut chars = src. chars ( ) ;
368- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
392+ let mut rfc3349 = Rfc3349 :: Unused ;
369393
370394 // The `start` and `end` computation here is complicated because
371395 // `skip_ascii_whitespace` makes us to skip over chars without counting
@@ -385,20 +409,17 @@ where
385409 } ) ;
386410 continue ;
387411 }
388- _ => scan_escape :: < T > ( & mut chars, mode) ,
412+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
389413 }
390414 }
391415 '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
392416 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
393-
394- // njn: gate, similar to check_raw_common, check:
395- // - is it a ByteStr AND does it contain a unicode char
396-
397- _ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
417+ _ => ascii_check ( c, mode, & mut rfc3349) . map ( T :: from) ,
398418 } ;
399419 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
400420 callback ( start..end, res) ;
401421 }
422+ rfc3349
402423}
403424
404425fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -431,12 +452,12 @@ where
431452/// sequence of characters or errors.
432453/// NOTE: Raw strings do not perform any explicit character escaping, here we
433454/// only produce errors on bare CR.
434- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
455+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
435456where
436457 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
437458{
438459 let mut chars = src. chars ( ) ;
439- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
460+ let mut rfc3349 = Rfc3349 :: Unused ;
440461
441462 // The `start` and `end` computation here matches the one in
442463 // `unescape_non_raw_common` for consistency, even though this function
@@ -445,20 +466,12 @@ where
445466 let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
446467 let res = match c {
447468 '\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
448-
449- // njn: gate: need to somehow return an indication of whether
450- // rfc3349 unicode char allowance was required for this literal,
451- // i.e. check
452- // - is it a RawByteStr AND does it contain a unicode char
453- //
454- // njn: but the ascii_check itself isn't necessary
455- // - or make it return three values? ok, ok-with-3349, bad?
456-
457- _ => ascii_check ( c, allow_unicode_chars) ,
469+ _ => ascii_check ( c, mode, & mut rfc3349) ,
458470 } ;
459471 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
460472 callback ( start..end, res) ;
461473 }
474+ rfc3349
462475}
463476
464477#[ inline]
0 commit comments