@@ -532,60 +532,181 @@ pub const Tokenizer = struct {
 
     fn checkLiteralCharacter(self: &Tokenizer) {
         if (self.pending_invalid_token != null) return;
+        const invalid_length = self.getInvalidCharacterLength();
+        if (invalid_length == 0) return;
+        self.pending_invalid_token = Token {
+            .id = Token.Id.Invalid,
+            .start = self.index,
+            .end = self.index + invalid_length,
+        };
+    }
+
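+    // Returns 0 when the character starting at self.index is valid; for a valid
+    // multi-byte sequence this also advances self.index to the sequence's last byte.
+    // A nonzero result is the number of bytes to mark as a single Invalid token.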
+    fn getInvalidCharacterLength(self: &Tokenizer) -> u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x20 or c0 == 0x7f) {
-            // ascii control codes are never allowed
-            // (note that \n was checked before we got here)
-            self.pending_invalid_token = Token {
-                .id = Token.Id.Invalid,
-                .start = self.index,
-                .end = self.index + 1,
-            };
-            return;
+        if (c0 < 0x80) {
+            if (c0 < 0x20 or c0 == 0x7f) {
+                // ascii control codes are never allowed
+                // (note that \n was checked before we got here)
+                return 1;
+            }
+            // looks fine to me.
+            return 0;
+        } else {
+            // check utf8-encoded character.
+            // remember that the last byte in the buffer is guaranteed to be '\n',
+            // which means we really don't need to do bounds checks here,
+            // as long as we check one byte at a time for being a continuation byte.
+            var value: u32 = undefined;
+            var length: u3 = undefined;
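+            // leading bytes 110xxxxx, 1110xxxx, and 11110xxx introduce 2-, 3-, and
+            // 4-byte sequences; the bits left after masking are the codepoint's high bits.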
+            if (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
+            else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
+            else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
+            else return 1; // unexpected continuation or too many leading 1's
+
+            const c1 = self.buffer[self.index + 1];
+            if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
+            value <<= 6;
+            value |= c1 & 0b00111111;
+            if (length == 2) {
+                if (value < 0x80) return length; // overlong
+                if (value == 0x85) return length; // U+0085 (NEL)
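+                // skip the continuation byte; only length - 1 here, since the caller's
+                // per-character loop presumably advances past the leading byte itself.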
+                self.index += length - 1;
+                return 0;
+            }
+            const c2 = self.buffer[self.index + 2];
+            if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
+            value <<= 6;
+            value |= c2 & 0b00111111;
+            if (length == 3) {
+                if (value < 0x800) return length; // overlong
+                if (value == 0x2028) return length; // U+2028 (LS)
+                if (value == 0x2029) return length; // U+2029 (PS)
+                if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
+                self.index += length - 1;
+                return 0;
+            }
+            const c3 = self.buffer[self.index + 3];
+            if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
+            value <<= 6;
+            value |= c3 & 0b00111111;
+            if (length == 4) {
+                if (value < 0x10000) return length; // overlong
+                if (value > 0x10FFFF) return length; // out of bounds
+                self.index += length - 1;
+                return 0;
+            }
+            unreachable;
         }
     }
 };
 
 
 
-test "tokenizer" {
-    // source must end with eol
-    testTokenize("", []Token.Id {
+test "tokenizer - source must end with eol" {
+    testTokenizeWithEol("", []Token.Id {
     }, true);
-    testTokenize("no newline", []Token.Id {
+    testTokenizeWithEol("no newline", []Token.Id {
     }, false);
-    testTokenize("test\n", []Token.Id {
+    testTokenizeWithEol("test\n", []Token.Id {
         Token.Id.Keyword_test,
     }, true);
-    testTokenize("test\nno newline", []Token.Id {
+    testTokenizeWithEol("test\nno newline", []Token.Id {
         Token.Id.Keyword_test,
     }, false);
+}
 
-    // invalid token characters
-    testTokenize("#\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
-    testTokenize("`\n", []Token.Id {
-        Token.Id.Invalid,
-    }, true);
+test "tokenizer - invalid token characters" {
+    testTokenize("#\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("`\n", []Token.Id {Token.Id.Invalid});
+}
 
-    // invalid literal/comment characters
+test "tokenizer - invalid literal/comment characters" {
     testTokenize("\"\x00\"\n", []Token.Id {
         Token.Id { .StringLiteral = Token.StrLitKind.Normal },
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x00\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x1f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
     testTokenize("//\x7f\n", []Token.Id {
         Token.Id.Invalid,
-    }, true);
+    });
+}
+
+test "tokenizer - valid unicode" {
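+    // boundary codepoints: \xc2\x80 = U+0080, \xdf\xbf = U+07FF, \xe0\xa0\x80 = U+0800,
+    // \xef\xbf\xbf = U+FFFF, \xf0\x90\x80\x80 = U+10000, \xf4\x8f\xbf\xbf = U+10FFFF.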
+    testTokenize("//\xc2\x80\n", []Token.Id {});
+    testTokenize("//\xdf\xbf\n", []Token.Id {});
+    testTokenize("//\xe0\xa0\x80\n", []Token.Id {});
+    testTokenize("//\xe1\x80\x80\n", []Token.Id {});
+    testTokenize("//\xef\xbf\xbf\n", []Token.Id {});
+    testTokenize("//\xf0\x90\x80\x80\n", []Token.Id {});
+    testTokenize("//\xf1\x80\x80\x80\n", []Token.Id {});
+    testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id {});
+    testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id {});
 }
 
-fn testTokenize(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
+test "tokenizer - invalid unicode continuation bytes" {
+    // unexpected continuation
+    testTokenize("//\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xbf\n", []Token.Id {Token.Id.Invalid});
+    // too many leading 1's
+    testTokenize("//\xf8\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xff\n", []Token.Id {Token.Id.Invalid});
+    // expected continuation for 2 byte sequences
+    testTokenize("//\xc2\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xc2\xc0\n", []Token.Id {Token.Id.Invalid});
+    // expected continuation for 3 byte sequences
+    testTokenize("//\xe0\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\xc0\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\xa0\xc0\n", []Token.Id {Token.Id.Invalid});
+    // expected continuation for 4 byte sequences
+    testTokenize("//\xf0\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\xc0\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x90\xc0\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\x00\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id {Token.Id.Invalid});
+}
+
+test "tokenizer - overlong utf8 codepoint" {
+    testTokenize("//\xc0\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xc1\xbf\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\x80\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe0\x9f\xbf\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x80\x80\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id {Token.Id.Invalid});
+}
+
+test "tokenizer - misc invalid utf8" {
+    // codepoint out of bounds
+    testTokenize("//\xf4\x90\x80\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id {Token.Id.Invalid});
+    // unicode newline characters: U+0085, U+2028, U+2029
+    testTokenize("//\xc2\x84\n", []Token.Id {});
+    testTokenize("//\xc2\x85\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xc2\x86\n", []Token.Id {});
+    testTokenize("//\xe2\x80\xa7\n", []Token.Id {});
+    testTokenize("//\xe2\x80\xa8\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xa9\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xe2\x80\xaa\n", []Token.Id {});
+    // surrogate halves
+    testTokenize("//\xed\x9f\x80\n", []Token.Id {});
+    testTokenize("//\xed\xa0\x80\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xed\xbf\xbf\n", []Token.Id {Token.Id.Invalid});
+    testTokenize("//\xee\x80\x80\n", []Token.Id {});
+    // surrogate halves are invalid, even in surrogate pairs
+    testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id {Token.Id.Invalid});
+}
+
+fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
+    testTokenizeWithEol(source, expected_tokens, true);
+}
+fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
     var tokenizer = Tokenizer.init(source);
     for (expected_tokens) |expected_token_id| {
         const token = tokenizer.next();