Skip to content

Commit b61d9db

Browse files
committed
tokenizer: tabs and carriage returns spec conformance
1 parent ee7a094 commit b61d9db

File tree

2 files changed

+94
-43
lines changed

2 files changed

+94
-43
lines changed

lib/std/zig/tokenizer.zig

+94 -35
Original file line number | Diff line number | Diff line change
@@ -424,10 +424,7 @@ pub const Tokenizer = struct {
424424
};
425425
state = .invalid;
426426
},
427-
'\r' => {
428-
state = .expect_newline;
429-
},
430-
' ', '\n', '\t' => {
427+
' ', '\n', '\t', '\r' => {
431428
result.loc.start = self.index + 1;
432429
},
433430
'"' => {
@@ -553,6 +550,13 @@ pub const Tokenizer = struct {
553550
},
554551

555552
.expect_newline => switch (c) {
553+
0 => {
554+
if (self.index == self.buffer.len) {
555+
result.tag = .invalid;
556+
break;
557+
}
558+
state = .invalid;
559+
},
556560
'\n' => {
557561
result.loc.start = self.index + 1;
558562
state = .start;
@@ -846,7 +850,15 @@ pub const Tokenizer = struct {
846850
self.index += 1;
847851
break;
848852
},
849-
0x01...0x08, 0x0b...0x1f, 0x7f => {
853+
'\r' => {
854+
if (self.buffer[self.index + 1] == '\n') {
855+
self.index += 2;
856+
break;
857+
} else {
858+
state = .invalid;
859+
}
860+
},
861+
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
850862
state = .invalid;
851863
},
852864
else => continue,
@@ -1091,22 +1103,31 @@ pub const Tokenizer = struct {
10911103
state = .start;
10921104
result.loc.start = self.index + 1;
10931105
},
1094-
0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
1106+
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
10951107
state = .invalid;
10961108
},
10971109
else => {
10981110
state = .line_comment;
10991111
},
11001112
},
11011113
.doc_comment_start => switch (c) {
1102-
0, '\n', '\r' => {
1114+
0, '\n' => {
11031115
result.tag = .doc_comment;
11041116
break;
11051117
},
1118+
'\r' => {
1119+
if (self.buffer[self.index + 1] == '\n') {
1120+
self.index += 1;
1121+
result.tag = .doc_comment;
1122+
break;
1123+
} else {
1124+
state = .invalid;
1125+
}
1126+
},
11061127
'/' => {
11071128
state = .line_comment;
11081129
},
1109-
0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
1130+
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
11101131
state = .invalid;
11111132
},
11121133
else => {
@@ -1135,16 +1156,24 @@ pub const Tokenizer = struct {
11351156
state = .start;
11361157
result.loc.start = self.index + 1;
11371158
},
1138-
0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
1159+
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
11391160
state = .invalid;
11401161
},
11411162
else => continue,
11421163
},
11431164
.doc_comment => switch (c) {
1144-
0, '\n', '\r' => {
1165+
0, '\n' => {
11451166
break;
11461167
},
1147-
0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
1168+
'\r' => {
1169+
if (self.buffer[self.index + 1] == '\n') {
1170+
self.index += 1;
1171+
break;
1172+
} else {
1173+
state = .invalid;
1174+
}
1175+
},
1176+
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
11481177
state = .invalid;
11491178
},
11501179
else => continue,
@@ -1386,30 +1415,6 @@ test "string identifier and builtin fns" {
13861415
});
13871416
}
13881417

1389-
test "multiline string literal with literal tab" {
1390-
try testTokenize(
1391-
\\\\foo bar
1392-
, &.{
1393-
.multiline_string_literal_line,
1394-
});
1395-
}
1396-
1397-
test "comments with literal tab" {
1398-
try testTokenize(
1399-
\\//foo bar
1400-
\\//!foo bar
1401-
\\///foo bar
1402-
\\// foo
1403-
\\/// foo
1404-
\\/// /foo
1405-
, &.{
1406-
.container_doc_comment,
1407-
.doc_comment,
1408-
.doc_comment,
1409-
.doc_comment,
1410-
});
1411-
}
1412-
14131418
test "pipe and then invalid" {
14141419
try testTokenize("||=", &.{
14151420
.pipe_pipe,
@@ -1767,6 +1772,60 @@ test "null byte before eof" {
17671772
try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
17681773
}
17691774

1775+
// Pins how the tokenizer treats TAB and CR in three contexts: inside comments
// (line, doc, and container-doc), inside multiline string literal lines, and as
// ordinary whitespace between tokens. Each case passes a source string to
// `testTokenize` together with the token tags expected for it.
test "invalid tabs and carriage returns" {
1776+
// "Inside Line Comments and Documentation Comments, Any TAB is rejected by
1777+
// the grammar since it is ambiguous how it should be rendered."
1778+
// https://github.com/ziglang/zig-spec/issues/38
1779+
try testTokenize("//\t", &.{.invalid});
1780+
try testTokenize("// \t", &.{.invalid});
1781+
try testTokenize("///\t", &.{.invalid});
1782+
try testTokenize("/// \t", &.{.invalid});
1783+
try testTokenize("//!\t", &.{.invalid});
1784+
try testTokenize("//! \t", &.{.invalid});
1785+
1786+
// "Inside Line Comments and Documentation Comments, CR directly preceding
1787+
// NL is unambiguously part of the newline sequence. It is accepted by the
1788+
// grammar and removed by zig fmt, leaving only NL. CR anywhere else is
1789+
// rejected by the grammar."
1790+
// https://github.com/ziglang/zig-spec/issues/38
1791+
try testTokenize("//\r", &.{.invalid});
1792+
try testTokenize("// \r", &.{.invalid});
1793+
try testTokenize("///\r", &.{.invalid});
1794+
try testTokenize("/// \r", &.{.invalid});
1795+
try testTokenize("//\r ", &.{.invalid});
1796+
try testTokenize("// \r ", &.{.invalid});
1797+
try testTokenize("///\r ", &.{.invalid});
1798+
try testTokenize("/// \r ", &.{.invalid});
1799+
// No tags are expected for a plain `//` comment ending in CR NL, whereas the
// `///` and `//!` cases below each expect a single comment token.
try testTokenize("//\r\n", &.{});
1800+
try testTokenize("// \r\n", &.{});
1801+
try testTokenize("///\r\n", &.{.doc_comment});
1802+
try testTokenize("/// \r\n", &.{.doc_comment});
1803+
try testTokenize("//!\r", &.{.invalid});
1804+
try testTokenize("//! \r", &.{.invalid});
1805+
try testTokenize("//!\r ", &.{.invalid});
1806+
try testTokenize("//! \r ", &.{.invalid});
1807+
try testTokenize("//!\r\n", &.{.container_doc_comment});
1808+
try testTokenize("//! \r\n", &.{.container_doc_comment});
1809+
1810+
// The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
1811+
// except if CR is directly before NL.
1812+
// https://github.com/ziglang/zig-spec/issues/38
1813+
try testTokenize("\\\\\r", &.{.invalid});
1814+
try testTokenize("\\\\\r ", &.{.invalid});
1815+
try testTokenize("\\\\ \r", &.{.invalid});
1816+
try testTokenize("\\\\\t", &.{.invalid});
1817+
try testTokenize("\\\\\t ", &.{.invalid});
1818+
try testTokenize("\\\\ \t", &.{.invalid});
1819+
try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
1820+
1821+
// "TAB used as whitespace is...accepted by the grammar. CR used as
1822+
// whitespace, whether directly preceding NL or stray, is...accepted by the
1823+
// grammar."
1824+
// https://github.com/ziglang/zig-spec/issues/38
1825+
try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
1826+
try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
1827+
}
1828+
17701829
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
17711830
var tokenizer = Tokenizer.init(source);
17721831
for (expected_token_tags) |expected_token_tag| {

test/compile_errors.zig

-8
Original file line number | Diff line number | Diff line change
@@ -38,14 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
3838
});
3939
}
4040

41-
{
42-
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
43-
44-
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
45-
":1:13: error: expected expression, found 'invalid token'",
46-
});
47-
}
48-
4941
{
5042
const case = ctx.obj("missing semicolon at EOF", b.graph.host);
5143
case.addError(

0 commit comments

Comments
 (0)