tokenizer: tabs and carriage returns spec conformance

andrewrk · andrewrk · commit b61d9db4138e · 2024-07-31T14:44:36.000-07:00
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
@@ -424,10 +424,7 @@ pub const Tokenizer = struct {
                         };
                         state = .invalid;
                     },
-                    '\r' => {
-                        state = .expect_newline;
-                    },
-                    ' ', '\n', '\t' => {
+                    ' ', '\n', '\t', '\r' => {
                         result.loc.start = self.index + 1;
                     },
                     '"' => {
@@ -553,6 +550,13 @@ pub const Tokenizer = struct {
                 },
 
                 .expect_newline => switch (c) {
+                    0 => {
+                        if (self.index == self.buffer.len) {
+                            result.tag = .invalid;
+                            break;
+                        }
+                        state = .invalid;
+                    },
                     '\n' => {
                         result.loc.start = self.index + 1;
                         state = .start;
@@ -846,7 +850,15 @@ pub const Tokenizer = struct {
                         self.index += 1;
                         break;
                     },
-                    0x01...0x08, 0x0b...0x1f, 0x7f => {
+                    '\r' => {
+                        if (self.buffer[self.index + 1] == '\n') {
+                            self.index += 2;
+                            break;
+                        } else {
+                            state = .invalid;
+                        }
+                    },
+                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                         state = .invalid;
                     },
                     else => continue,
@@ -1091,22 +1103,31 @@ pub const Tokenizer = struct {
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                         state = .invalid;
                     },
                     else => {
                         state = .line_comment;
                     },
                 },
                 .doc_comment_start => switch (c) {
-                    0, '\n', '\r' => {
+                    0, '\n' => {
                         result.tag = .doc_comment;
                         break;
                     },
+                    '\r' => {
+                        if (self.buffer[self.index + 1] == '\n') {
+                            self.index += 1;
+                            result.tag = .doc_comment;
+                            break;
+                        } else {
+                            state = .invalid;
+                        }
+                    },
                     '/' => {
                         state = .line_comment;
                     },
-                    0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                         state = .invalid;
                     },
                     else => {
@@ -1135,16 +1156,24 @@ pub const Tokenizer = struct {
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                         state = .invalid;
                     },
                     else => continue,
                 },
                 .doc_comment => switch (c) {
-                    0, '\n', '\r' => {
+                    0, '\n' => {
                         break;
                     },
-                    0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
+                    '\r' => {
+                        if (self.buffer[self.index + 1] == '\n') {
+                            self.index += 1;
+                            break;
+                        } else {
+                            state = .invalid;
+                        }
+                    },
+                    0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
                         state = .invalid;
                     },
                     else => continue,
@@ -1386,30 +1415,6 @@ test "string identifier and builtin fns" {
     });
 }
 
-test "multiline string literal with literal tab" {
-    try testTokenize(
-        \\\\foo	bar
-    , &.{
-        .multiline_string_literal_line,
-    });
-}
-
-test "comments with literal tab" {
-    try testTokenize(
-        \\//foo	bar
-        \\//!foo	bar
-        \\///foo	bar
-        \\//	foo
-        \\///	foo
-        \\///	/foo
-    , &.{
-        .container_doc_comment,
-        .doc_comment,
-        .doc_comment,
-        .doc_comment,
-    });
-}
-
 test "pipe and then invalid" {
     try testTokenize("||=", &.{
         .pipe_pipe,
@@ -1767,6 +1772,60 @@ test "null byte before eof" {
     try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
 }
 
+test "invalid tabs and carriage returns" {
+    // "Inside Line Comments and Documentation Comments, Any TAB is rejected by
+    // the grammar since it is ambiguous how it should be rendered."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("//\t", &.{.invalid});
+    try testTokenize("// \t", &.{.invalid});
+    try testTokenize("///\t", &.{.invalid});
+    try testTokenize("/// \t", &.{.invalid});
+    try testTokenize("//!\t", &.{.invalid});
+    try testTokenize("//! \t", &.{.invalid});
+
+    // "Inside Line Comments and Documentation Comments, CR directly preceding
+    // NL is unambiguously part of the newline sequence. It is accepted by the
+    // grammar and removed by zig fmt, leaving only NL. CR anywhere else is
+    // rejected by the grammar."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("//\r", &.{.invalid});
+    try testTokenize("// \r", &.{.invalid});
+    try testTokenize("///\r", &.{.invalid});
+    try testTokenize("/// \r", &.{.invalid});
+    try testTokenize("//\r ", &.{.invalid});
+    try testTokenize("// \r ", &.{.invalid});
+    try testTokenize("///\r ", &.{.invalid});
+    try testTokenize("/// \r ", &.{.invalid});
+    try testTokenize("//\r\n", &.{});
+    try testTokenize("// \r\n", &.{});
+    try testTokenize("///\r\n", &.{.doc_comment});
+    try testTokenize("/// \r\n", &.{.doc_comment});
+    try testTokenize("//!\r", &.{.invalid});
+    try testTokenize("//! \r", &.{.invalid});
+    try testTokenize("//!\r ", &.{.invalid});
+    try testTokenize("//! \r ", &.{.invalid});
+    try testTokenize("//!\r\n", &.{.container_doc_comment});
+    try testTokenize("//! \r\n", &.{.container_doc_comment});
+
+    // The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
+    // except if CR is directly before NL.
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\\\\\r", &.{.invalid});
+    try testTokenize("\\\\\r ", &.{.invalid});
+    try testTokenize("\\\\ \r", &.{.invalid});
+    try testTokenize("\\\\\t", &.{.invalid});
+    try testTokenize("\\\\\t ", &.{.invalid});
+    try testTokenize("\\\\ \t", &.{.invalid});
+    try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
+
+    // "TAB used as whitespace is...accepted by the grammar. CR used as
+    // whitespace, whether directly preceding NL or stray, is...accepted by the
+    // grammar."
+    // https://github.com/ziglang/zig-spec/issues/38
+    try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
+    try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
+}
+
 fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
     var tokenizer = Tokenizer.init(source);
     for (expected_token_tags) |expected_token_tag| {
diff --git a/test/compile_errors.zig b/test/compile_errors.zig
@@ -38,14 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
         });
     }
 
-    {
-        const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
-
-        case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
-            ":1:13: error: expected expression, found 'invalid token'",
-        });
-    }
-
     {
         const case = ctx.obj("missing semicolon at EOF", b.graph.host);
         case.addError(

Original file line number	Diff line number	Diff line change
`@@ -38,14 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {`
`38`	`38`	`});`
`39`	`39`	`}`
`40`	`40`
`41`		`- {`
`42`		`- const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);`
`43`		`-`
`44`		`- case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{`
`45`		`- ":1:13: error: expected expression, found 'invalid token'",`
`46`		`- });`
`47`		`- }`
`48`		`-`
`49`	`41`	`{`
`50`	`42`	`const case = ctx.obj("missing semicolon at EOF", b.graph.host);`
`51`	`43`	`case.addError(`