@@ -9,6 +9,9 @@ use Mode::*;
9
9
#[ cfg( test) ]
10
10
mod tests;
11
11
12
+ // njn: need to add tests in tests/ui/mixed-utf8-literals/; see
13
+ // tests/ui/try-block/ for an example to follow
14
+
12
15
/// Errors and warnings that can occur during string unescaping. They mostly
13
16
/// relate to malformed escape sequences, but there are a few that are about
14
17
/// other problems.
@@ -80,12 +83,12 @@ impl EscapeError {
80
83
}
81
84
}
82
85
83
- /// Takes a contents of a literal (without quotes) and produces a sequence of
84
- /// escaped characters or errors.
86
+ /// Takes the contents of a non-mixed-utf8 literal (without quotes) and produces
87
+ /// a sequence of escaped characters or errors.
85
88
///
86
89
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
87
90
/// the callback will be called exactly once.
88
- pub fn unescape_literal < F > ( src : & str , mode : Mode , callback : & mut F )
91
+ pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
89
92
where
90
93
F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
91
94
{
95
98
let res = unescape_char_or_byte ( & mut chars, mode) ;
96
99
callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
97
100
}
98
- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99
- RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100
- CStr | RawCStr => unreachable ! ( ) ,
101
+ Str => unescape_non_raw_common ( src, mode, callback) ,
102
+ RawStr => check_raw_common ( src, mode, callback) ,
103
+ RawByteStr { .. } => check_raw_common ( src, mode, & mut |r, result| callback ( r, result) ) ,
104
+ RawCStr => {
105
+ check_raw_common ( src, mode, & mut |r, mut result| {
106
+ if let Ok ( '\0' ) = result {
107
+ result = Err ( EscapeError :: NulInCStr ) ;
108
+ }
109
+ callback ( r, result)
110
+ } ) ;
111
+ }
112
+ ByteStr { .. } | CStr => unreachable ! ( ) ,
101
113
}
102
114
}
103
115
@@ -132,11 +144,16 @@ impl From<u8> for MixedUnit {
132
144
}
133
145
}
134
146
135
- pub fn unescape_c_string < F > ( src : & str , mode : Mode , callback : & mut F )
147
+ /// Takes the contents of a mixed-utf8 literal (without quotes) and produces
148
+ /// a sequence of escaped characters or errors.
149
+ ///
150
+ /// Values are returned by invoking `callback`.
151
+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
136
152
where
137
153
F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
138
154
{
139
155
match mode {
156
+ ByteStr { .. } => unescape_non_raw_common ( src, mode, & mut |r, result| callback ( r, result) ) ,
140
157
CStr => {
141
158
unescape_non_raw_common ( src, mode, & mut |r, mut result| {
142
159
if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
@@ -145,16 +162,7 @@ where
145
162
callback ( r, result)
146
163
} ) ;
147
164
}
148
- RawCStr => {
149
- check_raw_common ( src, mode, & mut |r, mut result| {
150
- if let Ok ( '\0' ) = result {
151
- result = Err ( EscapeError :: NulInCStr ) ;
152
- }
153
- // High bytes aren't possible in raw strings.
154
- callback ( r, result. map ( MixedUnit :: Char ) )
155
- } ) ;
156
- }
157
- Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable ! ( ) ,
165
+ Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
158
166
}
159
167
}
160
168
@@ -180,8 +188,8 @@ pub enum Mode {
180
188
Str ,
181
189
RawStr ,
182
190
183
- ByteStr ,
184
- RawByteStr ,
191
+ ByteStr { rfc3349 : bool } ,
192
+ RawByteStr { rfc3349 : bool } ,
185
193
186
194
CStr ,
187
195
RawCStr ,
@@ -190,7 +198,7 @@ pub enum Mode {
190
198
impl Mode {
191
199
pub fn in_double_quotes ( self ) -> bool {
192
200
match self {
193
- Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true ,
201
+ Str | RawStr | ByteStr { .. } | RawByteStr { .. } | CStr | RawCStr => true ,
194
202
Char | Byte => false ,
195
203
}
196
204
}
@@ -199,33 +207,39 @@ impl Mode {
199
207
fn allow_high_bytes ( self ) -> bool {
200
208
match self {
201
209
Char | Str => false ,
202
- Byte | ByteStr | CStr => true ,
203
- RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
210
+ Byte | ByteStr { .. } | CStr => true ,
211
+ RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
204
212
}
205
213
}
206
214
207
215
/// Are unicode (non-ASCII) chars allowed?
208
216
#[ inline]
209
217
fn allow_unicode_chars ( self ) -> bool {
210
218
match self {
211
- Byte | ByteStr | RawByteStr => false ,
212
- Char | Str | RawStr | CStr | RawCStr => true ,
219
+ Byte | ByteStr { rfc3349 : false } | RawByteStr { rfc3349 : false } => false ,
220
+ Char
221
+ | Str
222
+ | RawStr
223
+ | ByteStr { rfc3349 : true }
224
+ | RawByteStr { rfc3349 : true }
225
+ | CStr
226
+ | RawCStr => true ,
213
227
}
214
228
}
215
229
216
230
/// Are unicode escapes (`\u`) allowed?
217
231
fn allow_unicode_escapes ( self ) -> bool {
218
232
match self {
219
- Byte | ByteStr => false ,
220
- Char | Str | CStr => true ,
221
- RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
233
+ Byte | ByteStr { rfc3349 : false } => false ,
234
+ Char | Str | ByteStr { rfc3349 : true } | CStr => true ,
235
+ RawByteStr { .. } | RawStr | RawCStr => unreachable ! ( ) ,
222
236
}
223
237
}
224
238
225
239
pub fn prefix_noraw ( self ) -> & ' static str {
226
240
match self {
227
241
Char | Str | RawStr => "" ,
228
- Byte | ByteStr | RawByteStr => "b" ,
242
+ Byte | ByteStr { .. } | RawByteStr { .. } => "b" ,
229
243
CStr | RawCStr => "c" ,
230
244
}
231
245
}
@@ -263,12 +277,14 @@ fn scan_escape<T: From<char> + From<u8>>(
263
277
Ok ( T :: from ( value as u8 ) )
264
278
} ;
265
279
}
280
+ // njn: gate: is it a ByteStr?
266
281
'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
267
282
_ => return Err ( EscapeError :: InvalidEscape ) ,
268
283
} ;
269
284
Ok ( T :: from ( res) )
270
285
}
271
286
287
+ // njn: change arg to mode in precursor?
272
288
fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
273
289
// We've parsed '\u', now we have to parse '{..}'.
274
290
@@ -333,6 +349,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
333
349
'\\' => scan_escape ( chars, mode) ,
334
350
'\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
335
351
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
352
+ // njn: this is the only ascii_check that will remain
336
353
_ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
337
354
} ?;
338
355
if chars. next ( ) . is_some ( ) {
@@ -373,6 +390,10 @@ where
373
390
}
374
391
'"' => Err ( EscapeError :: EscapeOnlyChar ) ,
375
392
'\r' => Err ( EscapeError :: BareCarriageReturn ) ,
393
+
394
+ // njn: gate, similar to check_raw_common, check:
395
+ // - is it a ByteStr AND does it contain a unicode char
396
+
376
397
_ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
377
398
} ;
378
399
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
@@ -424,6 +445,15 @@ where
424
445
let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
425
446
let res = match c {
426
447
'\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
448
+
449
+ // njn: gate: need to somehow return an indication of whether
450
+ // rfc3349 unicode char allowance was required for this literal,
451
+ // i.e. check
452
+ // - is it a RawByteStr AND does it contain a unicode char
453
+ //
454
+ // njn: but the ascii_check itself isn't necessary
455
+ // - or make it return three values? ok, ok-with-3349, bad?
456
+
427
457
_ => ascii_check ( c, allow_unicode_chars) ,
428
458
} ;
429
459
let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
@@ -432,8 +462,8 @@ where
432
462
}
433
463
434
464
#[ inline]
435
- pub fn byte_from_char ( c : char ) -> u8 {
465
+ pub ( crate ) fn byte_from_char ( c : char ) -> u8 {
436
466
let res = c as u32 ;
437
- debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr " ) ;
467
+ debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of Byte " ) ;
438
468
res as u8
439
469
}
0 commit comments