Skip to content

Commit 73a7193

Browse files
committed
[xtk-ui, with let rfc3349 = false;] rfc3349
1 parent 071d9ce commit 73a7193

File tree

15 files changed

+195
-89
lines changed

15 files changed

+195
-89
lines changed

compiler/rustc_ast/src/util/literal.rs

+12-6
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
44
use crate::token::{self, Token};
55
use rustc_lexer::unescape::{
6-
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
7-
Mode,
6+
unescape_byte, unescape_char, unescape_mixed, unescape_non_mixed, MixedUnit, Mode,
87
};
98
use rustc_span::symbol::{kw, sym, Symbol};
109
use rustc_span::Span;
@@ -85,7 +84,7 @@ impl LitKind {
8584
// Force-inlining here is aggressive but the closure is
8685
// called on every char in the string, so it can be hot in
8786
// programs with many long strings containing escapes.
88-
unescape_literal(
87+
unescape_non_mixed(
8988
s,
9089
Mode::Str,
9190
&mut #[inline(always)]
@@ -109,8 +108,15 @@ impl LitKind {
109108
token::ByteStr => {
110109
let s = symbol.as_str();
111110
let mut buf = Vec::with_capacity(s.len());
112-
unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
113-
Ok(c) => buf.push(byte_from_char(c)),
111+
// We can just use `rfc3349 = true` here, which is more
112+
// permissive than `rfc3349 = false`, because escapes and
113+
// chars were checked by the lexer.
114+
let rfc3349 = true;
115+
unescape_mixed(s, Mode::ByteStr { rfc3349 }, &mut |_, c| match c {
116+
Ok(MixedUnit::Char(c)) => {
117+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
118+
}
119+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
114120
Err(err) => {
115121
assert!(!err.is_fatal(), "failed to unescape string literal")
116122
}
@@ -126,7 +132,7 @@ impl LitKind {
126132
token::CStr => {
127133
let s = symbol.as_str();
128134
let mut buf = Vec::with_capacity(s.len());
129-
unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
135+
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
130136
Ok(MixedUnit::Char(c)) => {
131137
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
132138
}

compiler/rustc_ast_passes/src/feature_gate.rs

+2
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,8 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
508508
}
509509
};
510510
}
511+
// njn: right wording?
512+
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"..." and br"..." literals are experimental"#);
511513
gate_all!(
512514
if_let_guard,
513515
"`if let` guards are experimental",

compiler/rustc_feature/src/unstable.rs

+2
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,8 @@ declare_features! (
520520
/// standard library until the soundness issues with specialization
521521
/// are fixed.
522522
(unstable, min_specialization, "1.7.0", Some(31844)),
523+
/// Allows mixed utf8 b"..." and br"..." literals.
524+
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
523525
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
524526
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
525527
/// Allows the `#[must_not_suspend]` attribute.

compiler/rustc_lexer/src/unescape.rs

+60-30
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ use Mode::*;
99
#[cfg(test)]
1010
mod tests;
1111

12+
// njn: need to add tests in tests/ui/mixed-utf8-literals/; see
13+
// tests/ui/try-block/ for an example to follow
14+
1215
/// Errors and warnings that can occur during string unescaping. They mostly
1316
/// relate to malformed escape sequences, but there are a few that are about
1417
/// other problems.
@@ -80,12 +83,12 @@ impl EscapeError {
8083
}
8184
}
8285

83-
/// Takes a contents of a literal (without quotes) and produces a sequence of
84-
/// escaped characters or errors.
86+
/// Takes the contents of a non-mixed-utf8 literal (without quotes) and produces
87+
/// a sequence of escaped characters or errors.
8588
///
8689
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8790
/// the callback will be called exactly once.
88-
pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
91+
pub fn unescape_non_mixed<F>(src: &str, mode: Mode, callback: &mut F)
8992
where
9093
F: FnMut(Range<usize>, Result<char, EscapeError>),
9194
{
@@ -95,9 +98,18 @@ where
9598
let res = unescape_char_or_byte(&mut chars, mode);
9699
callback(0..(src.len() - chars.as_str().len()), res);
97100
}
98-
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
99-
RawStr | RawByteStr => check_raw_common(src, mode, callback),
100-
CStr | RawCStr => unreachable!(),
101+
Str => unescape_non_raw_common(src, mode, callback),
102+
RawStr => check_raw_common(src, mode, callback),
103+
RawByteStr { .. } => check_raw_common(src, mode, &mut |r, result| callback(r, result)),
104+
RawCStr => {
105+
check_raw_common(src, mode, &mut |r, mut result| {
106+
if let Ok('\0') = result {
107+
result = Err(EscapeError::NulInCStr);
108+
}
109+
callback(r, result)
110+
});
111+
}
112+
ByteStr { .. } | CStr => unreachable!(),
101113
}
102114
}
103115

@@ -132,11 +144,16 @@ impl From<u8> for MixedUnit {
132144
}
133145
}
134146

135-
pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
147+
/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
148+
/// a sequence of escaped characters or errors.
149+
///
150+
/// Values are returned by invoking `callback`.
151+
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
136152
where
137153
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
138154
{
139155
match mode {
156+
ByteStr { .. } => unescape_non_raw_common(src, mode, &mut |r, result| callback(r, result)),
140157
CStr => {
141158
unescape_non_raw_common(src, mode, &mut |r, mut result| {
142159
if let Ok(MixedUnit::Char('\0')) = result {
@@ -145,16 +162,7 @@ where
145162
callback(r, result)
146163
});
147164
}
148-
RawCStr => {
149-
check_raw_common(src, mode, &mut |r, mut result| {
150-
if let Ok('\0') = result {
151-
result = Err(EscapeError::NulInCStr);
152-
}
153-
// High bytes aren't possible in raw strings.
154-
callback(r, result.map(MixedUnit::Char))
155-
});
156-
}
157-
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
165+
Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
158166
}
159167
}
160168

@@ -180,8 +188,8 @@ pub enum Mode {
180188
Str,
181189
RawStr,
182190

183-
ByteStr,
184-
RawByteStr,
191+
ByteStr { rfc3349: bool },
192+
RawByteStr { rfc3349: bool },
185193

186194
CStr,
187195
RawCStr,
@@ -190,7 +198,7 @@ pub enum Mode {
190198
impl Mode {
191199
pub fn in_double_quotes(self) -> bool {
192200
match self {
193-
Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true,
201+
Str | RawStr | ByteStr { .. } | RawByteStr { .. } | CStr | RawCStr => true,
194202
Char | Byte => false,
195203
}
196204
}
@@ -199,33 +207,39 @@ impl Mode {
199207
fn allow_high_bytes(self) -> bool {
200208
match self {
201209
Char | Str => false,
202-
Byte | ByteStr | CStr => true,
203-
RawStr | RawByteStr | RawCStr => unreachable!(),
210+
Byte | ByteStr { .. } | CStr => true,
211+
RawStr | RawByteStr { .. } | RawCStr => unreachable!(),
204212
}
205213
}
206214

207215
/// Are unicode (non-ASCII) chars allowed?
208216
#[inline]
209217
fn allow_unicode_chars(self) -> bool {
210218
match self {
211-
Byte | ByteStr | RawByteStr => false,
212-
Char | Str | RawStr | CStr | RawCStr => true,
219+
Byte | ByteStr { rfc3349: false } | RawByteStr { rfc3349: false } => false,
220+
Char
221+
| Str
222+
| RawStr
223+
| ByteStr { rfc3349: true }
224+
| RawByteStr { rfc3349: true }
225+
| CStr
226+
| RawCStr => true,
213227
}
214228
}
215229

216230
/// Are unicode escapes (`\u`) allowed?
217231
fn allow_unicode_escapes(self) -> bool {
218232
match self {
219-
Byte | ByteStr => false,
220-
Char | Str | CStr => true,
221-
RawByteStr | RawStr | RawCStr => unreachable!(),
233+
Byte | ByteStr { rfc3349: false } => false,
234+
Char | Str | ByteStr { rfc3349: true } | CStr => true,
235+
RawByteStr { .. } | RawStr | RawCStr => unreachable!(),
222236
}
223237
}
224238

225239
pub fn prefix_noraw(self) -> &'static str {
226240
match self {
227241
Char | Str | RawStr => "",
228-
Byte | ByteStr | RawByteStr => "b",
242+
Byte | ByteStr { .. } | RawByteStr { .. } => "b",
229243
CStr | RawCStr => "c",
230244
}
231245
}
@@ -263,12 +277,14 @@ fn scan_escape<T: From<char> + From<u8>>(
263277
Ok(T::from(value as u8))
264278
};
265279
}
280+
// njn: gate: is it a ByteStr?
266281
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
267282
_ => return Err(EscapeError::InvalidEscape),
268283
};
269284
Ok(T::from(res))
270285
}
271286

287+
// njn: change arg to mode in precursor?
272288
fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
273289
// We've parsed '\u', now we have to parse '{..}'.
274290

@@ -333,6 +349,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
333349
'\\' => scan_escape(chars, mode),
334350
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
335351
'\r' => Err(EscapeError::BareCarriageReturn),
352+
// njn: this is the only ascii_check that will remain
336353
_ => ascii_check(c, mode.allow_unicode_chars()),
337354
}?;
338355
if chars.next().is_some() {
@@ -373,6 +390,10 @@ where
373390
}
374391
'"' => Err(EscapeError::EscapeOnlyChar),
375392
'\r' => Err(EscapeError::BareCarriageReturn),
393+
394+
// njn: gate, similar to check_raw_common, check:
395+
// - is it a ByteStr AND does it contain a unicode char
396+
376397
_ => ascii_check(c, allow_unicode_chars).map(T::from),
377398
};
378399
let end = src.len() - chars.as_str().len();
@@ -424,6 +445,15 @@ where
424445
let start = src.len() - chars.as_str().len() - c.len_utf8();
425446
let res = match c {
426447
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
448+
449+
// njn: gate: need to somehow return an indication of whether
450+
// rfc3349 unicode char allowance was required for this literal,
451+
// i.e. check
452+
// - is it a RawByteStr AND does it contain a unicode char
453+
//
454+
// njn: but the ascii_check itself isn't necessary
455+
// - or make it return three values? ok, ok-with-3349, bad?
456+
427457
_ => ascii_check(c, allow_unicode_chars),
428458
};
429459
let end = src.len() - chars.as_str().len();
@@ -432,8 +462,8 @@ where
432462
}
433463

434464
#[inline]
435-
pub fn byte_from_char(c: char) -> u8 {
465+
pub(crate) fn byte_from_char(c: char) -> u8 {
436466
let res = c as u32;
437-
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
467+
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
438468
res as u8
439469
}

compiler/rustc_lexer/src/unescape/tests.rs

+15-10
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ fn test_unescape_char_good() {
100100
fn test_unescape_str_warn() {
101101
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
102102
let mut unescaped = Vec::with_capacity(literal.len());
103-
unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
103+
unescape_non_mixed(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
104104
assert_eq!(unescaped, expected);
105105
}
106106

@@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
124124
fn test_unescape_str_good() {
125125
fn check(literal_text: &str, expected: &str) {
126126
let mut buf = Ok(String::with_capacity(literal_text.len()));
127-
unescape_literal(literal_text, Mode::Str, &mut |range, c| {
127+
unescape_non_mixed(literal_text, Mode::Str, &mut |range, c| {
128128
if let Ok(b) = &mut buf {
129129
match c {
130130
Ok(c) => b.push(c),
@@ -240,16 +240,19 @@ fn test_unescape_byte_good() {
240240
#[test]
241241
fn test_unescape_byte_str_good() {
242242
fn check(literal_text: &str, expected: &[u8]) {
243-
let mut buf = Ok(Vec::with_capacity(literal_text.len()));
244-
unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| {
245-
if let Ok(b) = &mut buf {
243+
let mut buf_res = Ok(Vec::with_capacity(literal_text.len()));
244+
unescape_mixed(literal_text, Mode::ByteStr { rfc3349: false }, &mut |range, c| {
245+
if let Ok(buf) = &mut buf_res {
246246
match c {
247-
Ok(c) => b.push(byte_from_char(c)),
248-
Err(e) => buf = Err((range, e)),
247+
Ok(MixedUnit::Char(c)) => {
248+
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
249+
}
250+
Ok(MixedUnit::HighByte(b)) => buf.push(b),
251+
Err(e) => buf_res = Err((range, e)),
249252
}
250253
}
251254
});
252-
assert_eq!(buf.as_deref(), Ok(expected))
255+
assert_eq!(buf_res.as_deref(), Ok(expected))
253256
}
254257

255258
check("foo", b"foo");
@@ -264,7 +267,7 @@ fn test_unescape_byte_str_good() {
264267
fn test_unescape_raw_str() {
265268
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
266269
let mut unescaped = Vec::with_capacity(literal.len());
267-
unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
270+
unescape_non_mixed(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
268271
assert_eq!(unescaped, expected);
269272
}
270273

@@ -276,7 +279,9 @@ fn test_unescape_raw_str() {
276279
fn test_unescape_raw_byte_str() {
277280
fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
278281
let mut unescaped = Vec::with_capacity(literal.len());
279-
unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
282+
unescape_non_mixed(literal, Mode::RawByteStr { rfc3349: false }, &mut |range, res| {
283+
unescaped.push((range, res))
284+
});
280285
assert_eq!(unescaped, expected);
281286
}
282287

compiler/rustc_parse/messages.ftl

+4
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,10 @@ parse_unexpected_vert_vert_before_function_parameter = unexpected `||` before fu
814814
parse_unexpected_vert_vert_in_pattern = unexpected token `||` in pattern
815815
.suggestion = use a single `|` to separate multiple alternative patterns
816816
817+
# njn:
818+
# - b'\u{1234}' error says "unicode escape in byte string", should be "byte literal"
819+
# - after rfc3349 stabilizes, byte literal will be the only error case here
820+
# - could add a `.desc` field in a precursor
817821
parse_unicode_escape_in_byte = unicode escape in byte string
818822
.label = {parse_unicode_escape_in_byte}
819823
.help = unicode escape sequences cannot be used as a byte or in a byte string

0 commit comments

Comments
 (0)