@@ -669,7 +669,7 @@ char *string_html_encode(const char *input, int &len,
669
669
p--;
670
670
*q++ = ' &' ; *q++ = ' a' ; *q++ = ' m' ; *q++ = ' p' ; *q++ = ' ;' ;
671
671
}
672
- } else {
672
+ } else {
673
673
*q++ = ' &' ; *q++ = ' a' ; *q++ = ' m' ; *q++ = ' p' ; *q++ = ' ;' ;
674
674
}
675
675
break ;
@@ -686,78 +686,82 @@ char *string_html_encode(const char *input, int &len,
686
686
*q++ = c;
687
687
break ;
688
688
}
689
- if (qsBitmask & static_cast <int64_t >(EntBitmask::ENT_BM_IGNORE)) {
689
+
690
+ bool should_skip =
691
+ qsBitmask & static_cast <int64_t >(EntBitmask::ENT_BM_IGNORE);
692
+ bool should_replace =
693
+ qsBitmask & static_cast <int64_t >(EntBitmask::ENT_BM_SUBSTITUTE);
694
+
695
+ if (!utf8 && should_skip) {
690
696
break ;
691
697
}
692
698
693
699
auto avail = end - p;
694
700
auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf ; };
695
701
702
+ // This has to be a macro since it needs to be able to break away from
703
+ // the for loop we're in. NOTE(review): where the macro is expanded inside a nested for loop (the trail-byte checks), `break` leaves only that inner loop and execution continues after it -- confirm this is intended.
704
+ // ENT_IGNORE has higher precedence than ENT_SUBSTITUTE: should_skip is tested before should_replace; with neither set the macro jumps to exit_error.
705
+ // \uFFFD is Unicode Replacement Character (U+FFFD); after copying it the macro advances q by 3.
706
+ #define UTF8_ERROR_IF (cond ) \
707
+ if (cond) { \
708
+ if (should_skip) { break ; } \
709
+ else if (should_replace) { strcpy (q, " \uFFFD " ); q += 3 ; break ; } \
710
+ else { goto exit_error; } \
711
+ }
712
+
696
713
if (utf8) {
697
714
if (c < 0xc2 ) {
698
- goto exit_error ;
715
+ UTF8_ERROR_IF ( true ) ;
699
716
} else if (c < 0xe0 ) {
700
- if (avail < 2 || !utf8_trail (*(p + 1 ))) {
701
- goto exit_error;
702
- }
717
+ UTF8_ERROR_IF (avail < 2 || !utf8_trail (*(p + 1 )));
703
718
704
719
uint16_t tc = ((c & 0x1f ) << 6 ) | (p[1 ] & 0x3f );
705
- if (tc < 0x80 ) { // non-shortest form
706
- goto exit_error;
707
- }
720
+ UTF8_ERROR_IF (tc < 0x80 ); // non-shortest form
721
+
708
722
codeLength = 2 ;
709
723
entity[0 ] = *p;
710
724
entity[1 ] = *(p + 1 );
711
725
entity[2 ] = ' \0 ' ;
712
726
} else if (c < 0xf0 ) {
713
- if (avail < 3 ) {
714
- goto exit_error;
715
- }
727
+ UTF8_ERROR_IF (avail < 3 );
716
728
for (int i = 1 ; i < 3 ; ++i) {
717
- if (!utf8_trail (*(p + i))) {
718
- goto exit_error;
719
- }
729
+ UTF8_ERROR_IF (!utf8_trail (*(p + i)));
720
730
}
721
731
722
732
uint32_t tc = ((c & 0x0f ) << 12 ) |
723
733
((*(p+1 ) & 0x3f ) << 6 ) |
724
734
(*(p+2 ) & 0x3f );
725
- if (tc < 0x800 ) { // non-shortest form
726
- goto exit_error;
727
- } else if (tc >= 0xd800 && tc <= 0xdfff ) { // surrogate
728
- goto exit_error;
729
- }
735
+ UTF8_ERROR_IF (tc < 0x800 ); // non-shortest form
736
+ UTF8_ERROR_IF (tc >= 0xd800 && tc <= 0xdfff ); // surrogate
737
+
730
738
codeLength = 3 ;
731
739
entity[0 ] = *p;
732
740
entity[1 ] = *(p + 1 );
733
741
entity[2 ] = *(p + 2 );
734
742
entity[3 ] = ' \0 ' ;
735
743
} else if (c < 0xf5 ) {
736
- if (avail < 4 ) {
737
- goto exit_error;
738
- }
744
+ UTF8_ERROR_IF (avail < 4 );
739
745
for (int i = 1 ; i < 4 ; ++i) {
740
- if (!utf8_trail (*(p + i))) {
741
- goto exit_error;
742
- }
746
+ UTF8_ERROR_IF (!utf8_trail (*(p + i)));
743
747
}
744
748
745
749
uint32_t tc = ((c & 0x07 ) << 18 ) |
746
750
((*(p+1 ) & 0x3f ) << 12 ) |
747
751
((*(p+2 ) & 0x3f ) << 6 ) |
748
752
(*(p+3 ) & 0x3f );
749
- if (tc < 0x10000 || tc > 0x10ffff ) {
750
- // non-shortest form or outside range
751
- goto exit_error ;
752
- }
753
+
754
+ // non-shortest form or outside range
755
+ UTF8_ERROR_IF (tc < 0x10000 || tc > 0x10ffff ) ;
756
+
753
757
codeLength = 4 ;
754
758
entity[0 ] = *p;
755
759
entity[1 ] = *(p + 1 );
756
760
entity[2 ] = *(p + 2 );
757
761
entity[3 ] = *(p + 3 );
758
762
entity[4 ] = ' \0 ' ;
759
763
} else {
760
- goto exit_error ;
764
+ UTF8_ERROR_IF ( true ) ;
761
765
}
762
766
} else {
763
767
codeLength = 1 ;
@@ -795,6 +799,8 @@ char *string_html_encode(const char *input, int &len,
795
799
796
800
}
797
801
802
+ #undef UTF8_ERROR_IF
803
+
798
804
if (q - ret > INT_MAX) {
799
805
goto exit_error;
800
806
}
0 commit comments