Commit d6a74ed

[self-hosted] source must be valid utf8. see #663
1 parent fb96c3e commit d6a74ed

File tree

1 file changed: +149 -28 lines changed


src-self-hosted/tokenizer.zig

+149 -28
@@ -532,60 +532,181 @@ pub const Tokenizer = struct {
 
     fn checkLiteralCharacter(self: &Tokenizer) {
         if (self.pending_invalid_token != null) return;
+        const invalid_length = self.getInvalidCharacterLength();
+        if (invalid_length == 0) return;
+        self.pending_invalid_token = Token {
+            .id = Token.Id.Invalid,
+            .start = self.index,
+            .end = self.index + invalid_length,
+        };
+    }
+
+    fn getInvalidCharacterLength(self: &Tokenizer) -> u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x20 or c0 == 0x7f) {
-            // ascii control codes are never allowed
-            // (note that \n was checked before we got here)
-            self.pending_invalid_token = Token {
-                .id = Token.Id.Invalid,
-                .start = self.index,
-                .end = self.index + 1,
-            };
-            return;
+        if (c0 < 0x80) {
+            if (c0 < 0x20 or c0 == 0x7f) {
+                // ascii control codes are never allowed
+                // (note that \n was checked before we got here)
+                return 1;
+            }
+            // looks fine to me.
+            return 0;
+        } else {
+            // check utf8-encoded character.
+            // remember that the last byte in the buffer is guaranteed to be '\n',
+            // which means we really don't need to do bounds checks here,
+            // as long as we check one byte at a time for being a continuation byte.
+            var value: u32 = undefined;
+            var length: u3 = undefined;
+            if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
+            else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
+            else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
+            else return 1; // unexpected continuation or too many leading 1's
+
+            const c1 = self.buffer[self.index + 1];
+            if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
+            value <<= 6;
+            value |= c1 & 0b00111111;
+            if (length == 2) {
+                if (value < 0x80) return length; // overlong
+                if (value == 0x85) return length; // U+0085 (NEL)
+                self.index += length - 1;
+                return 0;
+            }
+            const c2 = self.buffer[self.index + 2];
+            if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
+            value <<= 6;
+            value |= c2 & 0b00111111;
+            if (length == 3) {
+                if (value < 0x800) return length; // overlong
+                if (value == 0x2028) return length; // U+2028 (LS)
+                if (value == 0x2029) return length; // U+2029 (PS)
+                if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
+                self.index += length - 1;
+                return 0;
+            }
+            const c3 = self.buffer[self.index + 3];
+            if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
+            value <<= 6;
+            value |= c3 & 0b00111111;
+            if (length == 4) {
+                if (value < 0x10000) return length; // overlong
+                if (value > 0x10FFFF) return length; // out of bounds
+                self.index += length - 1;
+                return 0;
+            }
+            unreachable;
         }
     }
 };
 
 
 
-test "tokenizer" {
-    // source must end with eol
-    testTokenize("", []Token.Id {
+test "tokenizer - source must end with eol" {
+    testTokenizeWithEol("", []Token.Id {
     }, true);
-    testTokenize("no newline", []Token.Id {
+    testTokenizeWithEol("no newline", []Token.Id {
     }, false);
-    testTokenize("test\n", []Token.Id {
+    testTokenizeWithEol("test\n", []Token.Id {
         Token.Id.Keyword_test,
     }, true);
-    testTokenize("test\nno newline", []Token.Id {
+    testTokenizeWithEol("test\nno newline", []Token.Id {
         Token.Id.Keyword_test,
     }, false);
+}
 
-    // invalid token characters
-    testTokenize("#\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
-    testTokenize("`\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
+test "tokenizer - invalid token characters" {
+    testTokenize("#\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("`\n", []Token.Id{Token.Id.Invalid});
+}
 
-    // invalid literal/comment characters
+test "tokenizer - invalid literal/comment characters" {
     testTokenize("\"\x00\"\n", []Token.Id {
         Token.Id { .StringLiteral = Token.StrLitKind.Normal },
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x00\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x1f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x7f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
+}
+
+test "tokenizer - valid unicode" {
+    testTokenize("//\xc2\x80\n", []Token.Id{});
+    testTokenize("//\xdf\xbf\n", []Token.Id{});
+    testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
+    testTokenize("//\xe1\x80\x80\n", []Token.Id{});
+    testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
+    testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
+    testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
+    testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
+    testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
 }
 
-fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
+test "tokenizer - invalid unicode continuation bytes" {
+    // unexpected continuation
+    testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
+    // too many leading 1's
+    testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 2 byte sequences
+    testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 3 byte sequences
+    testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
+    // expected continuation for 4 byte sequences
+    testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
+}
+
+test "tokenizer - overlong utf8 codepoint" {
+    testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
+}
+
+test "tokenizer - misc invalid utf8" {
+    // codepoint out of bounds
+    testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
+    // unicode newline characters: U+0085, U+2028, U+2029
+    testTokenize("//\xc2\x84\n", []Token.Id{});
+    testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xc2\x86\n", []Token.Id{});
+    testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
+    testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
+    // surrogate halves
+    testTokenize("//\xed\x9f\x80\n", []Token.Id{});
+    testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
+    testTokenize("//\xee\x80\x80\n", []Token.Id{});
+    // surrogate halves are invalid, even in surrogate pairs
+    testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
+}
+
+fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
+    testTokenizeWithEol(source, expected_tokens, true);
+}
+fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
     var tokenizer = Tokenizer.init(source);
     for (expected_tokens) |expected_token_id| {
         const token = tokenizer.next();
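
For reference, the sketch below (written in present-day Zig syntax; it is not part of this commit) decodes a few of the byte sequences used in the new tests, accumulating the code point the same way getInvalidCharacterLength accumulates value. It makes it easy to check by hand why, for example, "\xe0\x9f\xbf" is rejected as an overlong encoding of U+07FF and "\xed\xa0\x80" as the surrogate half U+D800.

const std = @import("std");

// Decode one complete 2-4 byte UTF-8 sequence without validating it,
// mirroring how getInvalidCharacterLength accumulates `value`.
// (Illustrative helper only; `decode` is not part of the tokenizer.)
fn decode(bytes: []const u8) u32 {
    var value: u32 = switch (bytes.len) {
        2 => bytes[0] & 0b00011111,
        3 => bytes[0] & 0b00001111,
        4 => bytes[0] & 0b00000111,
        else => unreachable,
    };
    for (bytes[1..]) |b| {
        value <<= 6;
        value |= b & 0b00111111;
    }
    return value;
}

test "decode some of the commit's test vectors" {
    // overlong: U+07FF fits in 2 bytes, so the 3-byte form must be rejected
    try std.testing.expect(decode("\xe0\x9f\xbf") == 0x7FF);
    // U+D800 is a surrogate half, never allowed in UTF-8
    try std.testing.expect(decode("\xed\xa0\x80") == 0xD800);
    // one past U+10FFFF, the largest valid code point
    try std.testing.expect(decode("\xf4\x90\x80\x80") == 0x110000);
    // U+0085 (NEL) is well-formed UTF-8 but rejected as a unicode newline
    try std.testing.expect(decode("\xc2\x85") == 0x85);
}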
