Skip to content

Commit 26c5071

Browse files
Liran Nunasgolemon
authored andcommitted
Corrected ENT_IGNORE and ENT_SUBSTITUTE behaviors for UTF8 strings
`htmlspecialchars` was incorrectly aborting escaping of UTF8 characters whenever `ENT_IGNORE` was passed in because of bad checks. This diff corrects that behavior. This diff also implements support for `ENT_SUBSTITUTE` since the functionality is very similar. Fixes facebook#2186 and facebook#2262 Reviewed By: @ptarjan Differential Revision: D1250810
1 parent fa6c733 commit 26c5071

File tree

5 files changed

+63
-35
lines changed

5 files changed

+63
-35
lines changed

hphp/runtime/base/string-util.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class StringUtil {
5050
QS(FBUtf8, 32768) \
5151
/* Order of the fields matters here if we're
5252
* matching on what flags are set */ \
53+
QS(Substitute, 8) /* k_ENT_SUBSTITUTE: replace invalid chars with FFFD */ \
5354
QS(Ignore, 4) /* k_ENT_IGNORE: silently discard invalid chars */ \
5455
QS(Both, 3) /* k_ENT_QUOTES: escape both double and single quotes */ \
5556
QS(Double, 2) /* k_ENT_COMPAT: escape double quotes only */ \
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<?php
2+
3+
$s = "\xc3\xa8\xc3\xa9\xc3\xa6\xc3\x83\xc2\xb3";
4+
var_dump($s);
5+
var_dump(htmlspecialchars($s, ENT_COMPAT, 'UTF-8'));
6+
var_dump(htmlspecialchars($s, ENT_COMPAT | ENT_IGNORE, 'UTF-8'));
7+
var_dump(htmlspecialchars($s, ENT_COMPAT | ENT_SUBSTITUTE, 'UTF-8'));
8+
9+
var_dump(htmlspecialchars("a\x80b"));
10+
var_dump(htmlspecialchars("a\x80b", ENT_IGNORE));
11+
var_dump(htmlspecialchars("a\x80b", ENT_SUBSTITUTE));
12+
var_dump(htmlspecialchars("a\x80b", ENT_IGNORE | ENT_SUBSTITUTE));
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
string(10) "èéæÃ³"
2+
string(10) "èéæÃ³"
3+
string(10) "èéæÃ³"
4+
string(10) "èéæÃ³"
5+
string(0) ""
6+
string(2) "ab"
7+
string(5) "a�b"
8+
string(2) "ab"

hphp/zend/zend-html.cpp

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,7 @@ char *string_html_encode(const char *input, int &len,
669669
p--;
670670
*q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
671671
}
672-
} else {
672+
} else {
673673
*q++ = '&'; *q++ = 'a'; *q++ = 'm'; *q++ = 'p'; *q++ = ';';
674674
}
675675
break;
@@ -686,78 +686,82 @@ char *string_html_encode(const char *input, int &len,
686686
*q++ = c;
687687
break;
688688
}
689-
if (qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE)) {
689+
690+
bool should_skip =
691+
qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_IGNORE);
692+
bool should_replace =
693+
qsBitmask & static_cast<int64_t>(EntBitmask::ENT_BM_SUBSTITUTE);
694+
695+
if (!utf8 && should_skip) {
690696
break;
691697
}
692698

693699
auto avail = end - p;
694700
auto utf8_trail = [](unsigned char c) { return c >= 0x80 && c <= 0xbf; };
695701

702+
// This has to be a macro since it needs to be able to break away from
703+
// the for loop we're in.
704+
// ENT_IGNORE has higher precedence than ENT_SUBSTITUTE
705+
// \uFFFD is Unicode Replacement Character (U+FFFD)
706+
#define UTF8_ERROR_IF(cond) \
707+
if (cond) { \
708+
if (should_skip) { break; } \
709+
else if (should_replace) { strcpy(q, "\uFFFD"); q += 3; break; } \
710+
else { goto exit_error; } \
711+
}
712+
696713
if (utf8) {
697714
if (c < 0xc2) {
698-
goto exit_error;
715+
UTF8_ERROR_IF(true);
699716
} else if (c < 0xe0) {
700-
if (avail < 2 || !utf8_trail(*(p + 1))) {
701-
goto exit_error;
702-
}
717+
UTF8_ERROR_IF(avail < 2 || !utf8_trail(*(p + 1)));
703718

704719
uint16_t tc = ((c & 0x1f) << 6) | (p[1] & 0x3f);
705-
if (tc < 0x80) { // non-shortest form
706-
goto exit_error;
707-
}
720+
UTF8_ERROR_IF(tc < 0x80); // non-shortest form
721+
708722
codeLength = 2;
709723
entity[0] = *p;
710724
entity[1] = *(p + 1);
711725
entity[2] = '\0';
712726
} else if (c < 0xf0) {
713-
if (avail < 3) {
714-
goto exit_error;
715-
}
727+
UTF8_ERROR_IF(avail < 3);
716728
for (int i = 1; i < 3; ++i) {
717-
if (!utf8_trail(*(p + i))) {
718-
goto exit_error;
719-
}
729+
UTF8_ERROR_IF(!utf8_trail(*(p + i)));
720730
}
721731

722732
uint32_t tc = ((c & 0x0f) << 12) |
723733
((*(p+1) & 0x3f) << 6) |
724734
(*(p+2) & 0x3f);
725-
if (tc < 0x800) { // non-shortest form
726-
goto exit_error;
727-
} else if (tc >= 0xd800 && tc <= 0xdfff) { // surrogate
728-
goto exit_error;
729-
}
735+
UTF8_ERROR_IF(tc < 0x800); // non-shortest form
736+
UTF8_ERROR_IF(tc >= 0xd800 && tc <= 0xdfff); // surrogate
737+
730738
codeLength = 3;
731739
entity[0] = *p;
732740
entity[1] = *(p + 1);
733741
entity[2] = *(p + 2);
734742
entity[3] = '\0';
735743
} else if (c < 0xf5) {
736-
if (avail < 4) {
737-
goto exit_error;
738-
}
744+
UTF8_ERROR_IF(avail < 4);
739745
for (int i = 1; i < 4; ++i) {
740-
if (!utf8_trail(*(p + i))) {
741-
goto exit_error;
742-
}
746+
UTF8_ERROR_IF(!utf8_trail(*(p + i)));
743747
}
744748

745749
uint32_t tc = ((c & 0x07) << 18) |
746750
((*(p+1) & 0x3f) << 12) |
747751
((*(p+2) & 0x3f) << 6) |
748752
(*(p+3) & 0x3f);
749-
if (tc < 0x10000 || tc > 0x10ffff) {
750-
// non-shortest form or outside range
751-
goto exit_error;
752-
}
753+
754+
// non-shortest form or outside range
755+
UTF8_ERROR_IF(tc < 0x10000 || tc > 0x10ffff);
756+
753757
codeLength = 4;
754758
entity[0] = *p;
755759
entity[1] = *(p + 1);
756760
entity[2] = *(p + 2);
757761
entity[3] = *(p + 3);
758762
entity[4] = '\0';
759763
} else {
760-
goto exit_error;
764+
UTF8_ERROR_IF(true);
761765
}
762766
} else {
763767
codeLength = 1;
@@ -795,6 +799,8 @@ char *string_html_encode(const char *input, int &len,
795799

796800
}
797801

802+
#undef UTF8_ERROR_IF
803+
798804
if (q - ret > INT_MAX) {
799805
goto exit_error;
800806
}

hphp/zend/zend-html.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,11 @@ enum StringHtmlEncoding {
6262
};
6363

6464
enum class EntBitmask {
65-
ENT_BM_NOQUOTES = 0, /* leave all quotes alone */
66-
ENT_BM_SINGLE = 1, /* escape single quotes only */
67-
ENT_BM_DOUBLE = 2, /* escape double quotes only */
68-
ENT_BM_IGNORE = 4 /* silently discard invalid chars */
65+
ENT_BM_NOQUOTES = 0, /* leave all quotes alone */
66+
ENT_BM_SINGLE = 1, /* escape single quotes only */
67+
ENT_BM_DOUBLE = 2, /* escape double quotes only */
68+
ENT_BM_IGNORE = 4, /* silently discard invalid chars */
69+
ENT_BM_SUBSTITUTE = 8, /* replace invalid chars with U+FFFD */
6970
};
7071

7172
namespace entity_charset_enum {

0 commit comments

Comments
 (0)