@@ -424,10 +424,7 @@ pub const Tokenizer = struct {
424
424
};
425
425
state = .invalid ;
426
426
},
427
- '\r ' = > {
428
- state = .expect_newline ;
429
- },
430
- ' ' , '\n ' , '\t ' = > {
427
+ ' ' , '\n ' , '\t ' , '\r ' = > {
431
428
result .loc .start = self .index + 1 ;
432
429
},
433
430
'"' = > {
@@ -553,6 +550,13 @@ pub const Tokenizer = struct {
553
550
},
554
551
555
552
.expect_newline = > switch (c ) {
553
+ 0 = > {
554
+ if (self .index == self .buffer .len ) {
555
+ result .tag = .invalid ;
556
+ break ;
557
+ }
558
+ state = .invalid ;
559
+ },
556
560
'\n ' = > {
557
561
result .loc .start = self .index + 1 ;
558
562
state = .start ;
@@ -846,7 +850,15 @@ pub const Tokenizer = struct {
846
850
self .index += 1 ;
847
851
break ;
848
852
},
849
- 0x01... 0x08, 0x0b... 0x1f, 0x7f = > {
853
+ '\r ' = > {
854
+ if (self .buffer [self .index + 1 ] == '\n ' ) {
855
+ self .index += 2 ;
856
+ break ;
857
+ } else {
858
+ state = .invalid ;
859
+ }
860
+ },
861
+ 0x01... 0x09, 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
850
862
state = .invalid ;
851
863
},
852
864
else = > continue ,
@@ -1091,22 +1103,31 @@ pub const Tokenizer = struct {
1091
1103
state = .start ;
1092
1104
result .loc .start = self .index + 1 ;
1093
1105
},
1094
- 0x01... 0x08 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1106
+ 0x01... 0x09 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1095
1107
state = .invalid ;
1096
1108
},
1097
1109
else = > {
1098
1110
state = .line_comment ;
1099
1111
},
1100
1112
},
1101
1113
.doc_comment_start = > switch (c ) {
1102
- 0 , '\n ' , ' \r ' = > {
1114
+ 0 , '\n ' = > {
1103
1115
result .tag = .doc_comment ;
1104
1116
break ;
1105
1117
},
1118
+ '\r ' = > {
1119
+ if (self .buffer [self .index + 1 ] == '\n ' ) {
1120
+ self .index += 1 ;
1121
+ result .tag = .doc_comment ;
1122
+ break ;
1123
+ } else {
1124
+ state = .invalid ;
1125
+ }
1126
+ },
1106
1127
'/' = > {
1107
1128
state = .line_comment ;
1108
1129
},
1109
- 0x01... 0x08 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1130
+ 0x01... 0x09 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1110
1131
state = .invalid ;
1111
1132
},
1112
1133
else = > {
@@ -1135,16 +1156,24 @@ pub const Tokenizer = struct {
1135
1156
state = .start ;
1136
1157
result .loc .start = self .index + 1 ;
1137
1158
},
1138
- 0x01... 0x08 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1159
+ 0x01... 0x09 , 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1139
1160
state = .invalid ;
1140
1161
},
1141
1162
else = > continue ,
1142
1163
},
1143
1164
.doc_comment = > switch (c ) {
1144
- 0 , '\n ' , ' \r ' = > {
1165
+ 0 , '\n ' = > {
1145
1166
break ;
1146
1167
},
1147
- 0x01... 0x08, 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1168
+ '\r ' = > {
1169
+ if (self .buffer [self .index + 1 ] == '\n ' ) {
1170
+ self .index += 1 ;
1171
+ break ;
1172
+ } else {
1173
+ state = .invalid ;
1174
+ }
1175
+ },
1176
+ 0x01... 0x09, 0x0b... 0x0c, 0x0e... 0x1f, 0x7f = > {
1148
1177
state = .invalid ;
1149
1178
},
1150
1179
else = > continue ,
@@ -1386,30 +1415,6 @@ test "string identifier and builtin fns" {
1386
1415
});
1387
1416
}
1388
1417
1389
- test "multiline string literal with literal tab" {
1390
- try testTokenize (
1391
- \\\\foo bar
1392
- , &.{
1393
- .multiline_string_literal_line ,
1394
- });
1395
- }
1396
-
1397
- test "comments with literal tab" {
1398
- try testTokenize (
1399
- \\//foo bar
1400
- \\//!foo bar
1401
- \\///foo bar
1402
- \\// foo
1403
- \\/// foo
1404
- \\/// /foo
1405
- , &.{
1406
- .container_doc_comment ,
1407
- .doc_comment ,
1408
- .doc_comment ,
1409
- .doc_comment ,
1410
- });
1411
- }
1412
-
1413
1418
test "pipe and then invalid" {
1414
1419
try testTokenize ("||=" , &.{
1415
1420
.pipe_pipe ,
@@ -1767,6 +1772,60 @@ test "null byte before eof" {
1767
1772
try testTokenize ("/// NUL\x00 \n " , &.{ .doc_comment , .invalid });
1768
1773
}
1769
1774
1775
test "invalid tabs and carriage returns" {
    // Exercises how the tokenizer treats the control characters TAB and CR in
    // comments, multi-line string literals, and ordinary whitespace. Each
    // group covers the control character at end of input, followed by another
    // byte, and (for CR) directly before NL.

    // "Inside Line Comments and Documentation Comments, Any TAB is rejected by
    // the grammar since it is ambiguous how it should be rendered."
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("//\t", &.{.invalid});
    try testTokenize("// \t", &.{.invalid});
    try testTokenize("///\t", &.{.invalid});
    try testTokenize("/// \t", &.{.invalid});
    try testTokenize("//!\t", &.{.invalid});
    try testTokenize("//! \t", &.{.invalid});

    // "Inside Line Comments and Documentation Comments, CR directly preceding
    // NL is unambiguously part of the newline sequence. It is accepted by the
    // grammar and removed by zig fmt, leaving only NL. CR anywhere else is
    // rejected by the grammar."
    // https://github.com/ziglang/zig-spec/issues/38

    // CR as the last byte of the input: not followed by NL, so invalid.
    try testTokenize("//\r", &.{.invalid});
    try testTokenize("// \r", &.{.invalid});
    try testTokenize("///\r", &.{.invalid});
    try testTokenize("/// \r", &.{.invalid});
    // CR followed by a byte other than NL: a stray CR, so invalid.
    try testTokenize("//\r ", &.{.invalid});
    try testTokenize("// \r ", &.{.invalid});
    try testTokenize("///\r ", &.{.invalid});
    try testTokenize("/// \r ", &.{.invalid});
    // CR directly before NL: accepted as part of the newline sequence.
    try testTokenize("//\r\n", &.{});
    try testTokenize("// \r\n", &.{});
    try testTokenize("///\r\n", &.{.doc_comment});
    try testTokenize("/// \r\n", &.{.doc_comment});
    // The same three placements for container doc comments.
    try testTokenize("//!\r", &.{.invalid});
    try testTokenize("//! \r", &.{.invalid});
    try testTokenize("//!\r ", &.{.invalid});
    try testTokenize("//! \r ", &.{.invalid});
    try testTokenize("//!\r\n", &.{.container_doc_comment});
    try testTokenize("//! \r\n", &.{.container_doc_comment});

    // The control characters TAB and CR are rejected by the grammar inside
    // multi-line string literals, except if CR is directly before NL.
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("\\\\\r", &.{.invalid});
    try testTokenize("\\\\\r ", &.{.invalid});
    try testTokenize("\\\\ \r", &.{.invalid});
    try testTokenize("\\\\\t", &.{.invalid});
    try testTokenize("\\\\\t ", &.{.invalid});
    try testTokenize("\\\\ \t", &.{.invalid});
    try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});

    // "TAB used as whitespace is...accepted by the grammar. CR used as
    // whitespace, whether directly preceding NL or stray, is...accepted by the
    // grammar."
    // https://github.com/ziglang/zig-spec/issues/38
    try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
    try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
}
1828
+
1770
1829
fn testTokenize (source : [:0 ]const u8 , expected_token_tags : []const Token.Tag ) ! void {
1771
1830
var tokenizer = Tokenizer .init (source );
1772
1831
for (expected_token_tags ) | expected_token_tag | {
0 commit comments