Skip to content

Commit aa4f805

Browse files
committed
Make unicode text flow control chars visible as �
We already point these out quite aggressively, telling people not to use them, but they would normally be rendered as nothing. Having them visible will make it easier for people to actually deal with them.

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '�');
   |                      ^-^
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```

vs the previous

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '');
   |                      ^-
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```
1 parent ac6eb65 commit aa4f805

File tree

3 files changed

+62
-60
lines changed

3 files changed

+62
-60
lines changed

Diff for: compiler/rustc_errors/src/emitter.rs

+11-10
Original file line numberDiff line numberDiff line change
@@ -2558,18 +2558,19 @@ fn num_decimal_digits(num: usize) -> usize {
25582558
}
25592559

25602560
// We replace some characters so the CLI output is always consistent and underlines aligned.
2561+
// Keep the following list in sync with `rustc_span::char_width`.
25612562
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
2562-
('\t', " "), // We do our own tab replacement
2563+
('\t', " "), // We do our own tab replacement
25632564
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2564-
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
2565-
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
2566-
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
2567-
('\u{202E}', ""),
2568-
('\u{2066}', ""),
2569-
('\u{2067}', ""),
2570-
('\u{2068}', ""),
2571-
('\u{202C}', ""),
2572-
('\u{2069}', ""),
2565+
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
2566+
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
2567+
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
2568+
('\u{202E}', "�"),
2569+
('\u{2066}', "�"),
2570+
('\u{2067}', "�"),
2571+
('\u{2068}', "�"),
2572+
('\u{202C}', "�"),
2573+
('\u{2069}', "�"),
25732574
// In terminals without Unicode support the following will be garbled, but in *all* terminals
25742575
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
25752576
// support" gate.

Diff for: compiler/rustc_span/src/lib.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -2094,7 +2094,8 @@ pub fn char_width(ch: char) -> usize {
20942094
| '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}'
20952095
| '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}'
20962096
| '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}'
2097-
| '\u{007F}' => 1,
2097+
| '\u{007F}' | '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' | '\u{2066}'
2098+
| '\u{2067}' | '\u{2068}' | '\u{202C}' | '\u{2069}' => 1,
20982099
_ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1),
20992100
}
21002101
}

Diff for: tests/ui/parser/unicode-control-codepoints.stderr

+49-49
Original file line numberDiff line numberDiff line change
@@ -17,78 +17,78 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
1717
error: non-ASCII character in byte string literal
1818
--> $DIR/unicode-control-codepoints.rs:16:26
1919
|
20-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
20+
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
2121
| ^ must be ASCII but is '\u{202e}'
2222
|
2323
help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
2424
|
25-
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only ");
25+
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin� � begin admins only ");
2626
| ~~~~~~~~~~~~
2727

2828
error: non-ASCII character in byte string literal
2929
--> $DIR/unicode-control-codepoints.rs:16:30
3030
|
31-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
32-
| ^ must be ASCII but is '\u{2066}'
31+
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
32+
| ^ must be ASCII but is '\u{2066}'
3333
|
3434
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
3535
|
36-
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only ");
37-
| ~~~~~~~~~~~~
36+
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin� � begin admins only ");
37+
| ~~~~~~~~~~~~
3838

3939
error: non-ASCII character in byte string literal
4040
--> $DIR/unicode-control-codepoints.rs:16:41
4141
|
42-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
43-
| ^ must be ASCII but is '\u{2069}'
42+
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
43+
| ^ must be ASCII but is '\u{2069}'
4444
|
4545
help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
4646
|
47-
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
48-
| ~~~~~~~~~~~~
47+
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
48+
| ~~~~~~~~~~~~
4949

5050
error: non-ASCII character in byte string literal
5151
--> $DIR/unicode-control-codepoints.rs:16:43
5252
|
53-
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
54-
| ^ must be ASCII but is '\u{2066}'
53+
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
54+
| ^ must be ASCII but is '\u{2066}'
5555
|
5656
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
5757
|
58-
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
59-
| ~~~~~~~~~~~~
58+
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
59+
| ~~~~~~~~~~~~
6060

6161
error: non-ASCII character in raw byte string literal
6262
--> $DIR/unicode-control-codepoints.rs:21:29
6363
|
64-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
64+
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
6565
| ^ must be ASCII but is '\u{202e}'
6666

6767
error: non-ASCII character in raw byte string literal
6868
--> $DIR/unicode-control-codepoints.rs:21:33
6969
|
70-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
71-
| ^ must be ASCII but is '\u{2066}'
70+
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
71+
| ^ must be ASCII but is '\u{2066}'
7272

7373
error: non-ASCII character in raw byte string literal
7474
--> $DIR/unicode-control-codepoints.rs:21:44
7575
|
76-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
77-
| ^ must be ASCII but is '\u{2069}'
76+
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
77+
| ^ must be ASCII but is '\u{2069}'
7878

7979
error: non-ASCII character in raw byte string literal
8080
--> $DIR/unicode-control-codepoints.rs:21:46
8181
|
82-
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
83-
| ^ must be ASCII but is '\u{2066}'
82+
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
83+
| ^ must be ASCII but is '\u{2066}'
8484

8585
error: unicode codepoint changing visible direction of text present in comment
8686
--> $DIR/unicode-control-codepoints.rs:2:5
8787
|
88-
LL | // if access_level != "user" { // Check if admin
89-
| ^^^^^^^^^^^^^^^^^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
90-
| | ||
91-
| | |'\u{202a}'
88+
LL | // if access_level != "us�e�r" { // Check if admin
89+
| ^^^^^^^^^^^^^^^^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
90+
| | | |
91+
| | | '\u{202a}'
9292
| | '\u{202b}'
9393
| this comment contains invisible unicode text flow control codepoints
9494
|
@@ -99,12 +99,12 @@ LL | // if access_level != "user" { // Check if admin
9999
error: unicode codepoint changing visible direction of text present in comment
100100
--> $DIR/unicode-control-codepoints.rs:30:1
101101
|
102-
LL | //"/* } if isAdmin begin admins only */"
103-
| ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
104-
| | | | ||
105-
| | | | |'\u{2066}'
106-
| | | | '\u{2069}'
107-
| | | '\u{2066}'
102+
LL | //"/*� } �if isAdmin� � begin admins only */"
103+
| ^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
104+
| | | | | |
105+
| | | | | '\u{2066}'
106+
| | | | '\u{2069}'
107+
| | | '\u{2066}'
108108
| | '\u{202e}'
109109
| this comment contains invisible unicode text flow control codepoints
110110
|
@@ -114,12 +114,12 @@ LL | //"/* } if isAdmin begin admins only */"
114114
error: unicode codepoint changing visible direction of text present in literal
115115
--> $DIR/unicode-control-codepoints.rs:11:22
116116
|
117-
LL | println!("{:?}", "/* } if isAdmin begin admins only ");
118-
| ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^
119-
| | | | ||
120-
| | | | |'\u{2066}'
121-
| | | | '\u{2069}'
122-
| | | '\u{2066}'
117+
LL | println!("{:?}", "/*� } �if isAdmin� � begin admins only ");
118+
| ^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^
119+
| | | | | |
120+
| | | | | '\u{2066}'
121+
| | | | '\u{2069}'
122+
| | | '\u{2066}'
123123
| | '\u{202e}'
124124
| this literal contains invisible unicode text flow control codepoints
125125
|
@@ -134,12 +134,12 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi
134134
error: unicode codepoint changing visible direction of text present in literal
135135
--> $DIR/unicode-control-codepoints.rs:14:22
136136
|
137-
LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##);
138-
| ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
139-
| | | | ||
140-
| | | | |'\u{2066}'
141-
| | | | '\u{2069}'
142-
| | | '\u{2066}'
137+
LL | println!("{:?}", r##"/*� } �if isAdmin� � begin admins only "##);
138+
| ^^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
139+
| | | | | |
140+
| | | | | '\u{2066}'
141+
| | | | '\u{2069}'
142+
| | | '\u{2066}'
143143
| | '\u{202e}'
144144
| this literal contains invisible unicode text flow control codepoints
145145
|
@@ -153,8 +153,8 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b
153153
error: unicode codepoint changing visible direction of text present in literal
154154
--> $DIR/unicode-control-codepoints.rs:26:22
155155
|
156-
LL | println!("{:?}", '');
157-
| ^-
156+
LL | println!("{:?}", '�');
157+
| ^-^
158158
| ||
159159
| |'\u{202e}'
160160
| this literal contains an invisible unicode text flow control codepoint
@@ -169,8 +169,8 @@ LL | println!("{:?}", '\u{202e}');
169169
error: unicode codepoint changing visible direction of text present in doc comment
170170
--> $DIR/unicode-control-codepoints.rs:33:1
171171
|
172-
LL | /** ''); */fn foo() {}
173-
| ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
172+
LL | /** '�'); */fn foo() {}
173+
| ^^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
174174
|
175175
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
176176
= note: if their presence wasn't intentional, you can remove them
@@ -181,8 +181,8 @@ error: unicode codepoint changing visible direction of text present in doc comme
181181
|
182182
LL | / /**
183183
LL | | *
184-
LL | | * ''); */fn bar() {}
185-
| |___________^ this doc comment contains an invisible unicode text flow control codepoint
184+
LL | | * '�'); */fn bar() {}
185+
| |____________^ this doc comment contains an invisible unicode text flow control codepoint
186186
|
187187
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
188188
= note: if their presence wasn't intentional, you can remove them

0 commit comments

Comments
 (0)