@@ -399,7 +399,7 @@ private RegExpTree parseCharset() {
399
399
CharRanges ieExplicits = CharRanges .EMPTY ;
400
400
while (pos < limit && pattern .charAt (pos ) != ']' ) {
401
401
char ch = pattern .charAt (pos );
402
- char start ;
402
+ int start ;
403
403
if (ch == '\\' ) {
404
404
++pos ;
405
405
char possibleGroupName = pattern .charAt (pos );
@@ -414,7 +414,7 @@ private RegExpTree parseCharset() {
414
414
start = ch ;
415
415
++pos ;
416
416
}
417
- char end = start ;
417
+ int end = start ;
418
418
if (pos + 1 < limit && pattern .charAt (pos ) == '-'
419
419
&& pattern .charAt (pos + 1 ) != ']' ) {
420
420
++pos ;
@@ -464,15 +464,20 @@ private RegExpTree parseCharset() {
464
464
* contexts, so contexts must filter those instead.
465
465
* E.g. '\b' means a different thing inside a charset than without.
466
466
*/
467
- private char parseEscapeChar () {
467
+ private int parseEscapeChar () {
468
468
char ch = pattern .charAt (pos ++);
469
469
switch (ch ) {
470
470
case 'b' : return '\b' ;
471
471
case 'f' : return '\f' ;
472
472
case 'n' : return '\n' ;
473
473
case 'r' : return '\r' ;
474
474
case 't' : return '\t' ;
475
- case 'u' : return parseHex (4 );
475
+ case 'u' :
476
+ if (flags .contains ("u" ) && pos < limit && pattern .charAt (pos ) == '{' ) {
477
+ return parseUnicodeEscape ();
478
+ } else {
479
+ return parseHex (4 );
480
+ }
476
481
case 'v' : return '\u000b' ;
477
482
case 'x' : return parseHex (2 );
478
483
default :
@@ -599,7 +604,7 @@ private RegExpTree parseEscape() {
599
604
++pos ;
600
605
return new Charset (charGroup , CharRanges .EMPTY );
601
606
}
602
- return new Text ("" + parseEscapeChar ());
607
+ return new Text (new String ( Character . toChars ( parseEscapeChar ()) ));
603
608
}
604
609
}
605
610
@@ -630,6 +635,42 @@ private char parseHex(int n) {
630
635
return (char ) result ;
631
636
}
632
637
638
+ private int parseUnicodeEscape () {
639
+ checkState (pattern .charAt (pos ) == '{' );
640
+ int start = pos ++;
641
+ int result = 0 ;
642
+ char ch = pattern .charAt (pos );
643
+ if (ch == '}' ) {
644
+ throw new IllegalArgumentException ("Invalid unicode escape: "
645
+ + pattern .substring (start , ++pos ));
646
+ }
647
+ while (pos < limit ) {
648
+ int digit ;
649
+ ch = pattern .charAt (pos ++);
650
+ if ('0' <= ch && ch <= '9' ) {
651
+ digit = ch - '0' ;
652
+ } else if ('a' <= ch && ch <= 'f' ) {
653
+ digit = ch + (10 - 'a' );
654
+ } else if ('A' <= ch && ch <= 'F' ) {
655
+ digit = ch + (10 - 'A' );
656
+ } else if (ch == '}' ) {
657
+ break ;
658
+ } else {
659
+ throw new IllegalArgumentException ("Invalid character in unicode escape: " + ch );
660
+ }
661
+ result = (result << 4 ) | digit ;
662
+ }
663
+ if (ch != '}' ) {
664
+ throw new IllegalArgumentException ("Malformed unicode escape: expected '}' after "
665
+ + pattern .substring (start , pos ));
666
+ }
667
+ if (result > 0x10FFFF ) {
668
+ throw new IllegalArgumentException ("Unicode must not be greater than 0x10FFFF: "
669
+ + pattern .substring (start , pos ));
670
+ }
671
+ return result ;
672
+ }
673
+
633
674
private boolean isRepetitionStart (char ch ) {
634
675
switch (ch ) {
635
676
case '?' :
0 commit comments