Skip to content

Commit b95d51c

Browse files
committed
Fix test-case failures due to character reference parsing bugs.
More completely, change the entire structure of the NCR array to be a multi-dimensional array in which each dimension has one character as its key and, as its value, the possible further steps and sometimes a 'codepoint' entry, which is the codepoint that point represents. Hence, to get &amp; now, you would need $ncrs['a']['m']['p'][';']['codepoint']. 'codepoint' is required so we can cope with both &amp and &amp; (which means it can't just be a value of &amp). This also removes Data::getNamedCharacterReferenceMaxLength(), as it is now useless.
1 parent ef63fc9 commit b95d51c

File tree

4 files changed

+63
-36
lines changed

4 files changed

+63
-36
lines changed

library/HTML5/Data.php

-10
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,6 @@ public static function getNamedCharacterReferences() {
6565
return self::$namedCharacterReferences;
6666
}
6767

68-
public static function getNamedCharacterReferenceMaxLength() {
69-
if (!self::$namedCharacterReferenceMaxLength) {
70-
$namedCharacterReferences = self::getNamedCharacterReferences();
71-
$lengths = array_map('strlen', array_keys($namedCharacterReferences));
72-
self::$namedCharacterReferenceMaxLength = max($lengths);
73-
}
74-
return self::$namedCharacterReferenceMaxLength;
75-
}
76-
77-
7868
/**
7969
* Converts a Unicode codepoint to sequence of UTF-8 bytes.
8070
* @note Shamelessly stolen from HTML Purifier, which is also

library/HTML5/Tokenizer.php

+38-18
Original file line numberDiff line numberDiff line change
@@ -2197,21 +2197,32 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
21972197
with the consumed characters matching one of the
21982198
identifiers in the first column of the named character
21992199
references table (in a case-sensitive manner). */
2200-
2201-
// we will implement this by matching the longest
2202-
// alphanumeric + semicolon string, and then working
2203-
// our way backwards
2204-
$chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
2205-
$len = strlen($chars);
2200+
// What we actually do here is consume as much as we can while it
2201+
// matches the start of one of the identifiers in the first column.
22062202

22072203
$refs = HTML5_Data::getNamedCharacterReferences();
2204+
2205+
// Get the longest string which is the start of an identifier
2206+
// ($chars) as well as the longest identifier which matches ($id)
2207+
// and its codepoint ($codepoint).
22082208
$codepoint = false;
2209-
for($c = $len; $c > 0; $c--) {
2210-
$id = substr($chars, 0, $c);
2211-
if(isset($refs[$id])) {
2212-
$codepoint = $refs[$id];
2213-
break;
2209+
$char = $chars;
2210+
while ($char !== false && isset($refs[$char])) {
2211+
$refs = $refs[$char];
2212+
if (isset($refs['codepoint'])) {
2213+
$id = $chars;
2214+
$codepoint = $refs['codepoint'];
22142215
}
2216+
$chars .= $char = $this->stream->char();
2217+
}
2218+
2219+
// Unconsume the one character we just took which caused the while
2220+
// statement to fail. This could be anything and could cause state
2221+
// changes (as if it matches the while loop it must be
2222+
// alphanumeric so we can just concat it to whatever we get later).
2223+
$this->stream->unget();
2224+
if ($char !== false) {
2225+
$chars = substr($chars, 0, -1);
22152226
}
22162227

22172228
/* If no match can be made, then this is a parse error.
@@ -2235,7 +2246,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22352246
$semicolon = false;
22362247
}
22372248

2238-
22392249
/* If the character reference is being consumed as part of
22402250
an attribute, and the last character matched is not a
22412251
U+003B SEMICOLON (;), and the next character is in the
@@ -2245,17 +2255,27 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22452255
then, for historical reasons, all the characters that were
22462256
matched after the U+0026 AMPERSAND (&) must be unconsumed,
22472257
and nothing is returned. */
2248-
if (
2249-
$inattr && !$semicolon &&
2250-
strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
2251-
) {
2252-
return '&' . $chars;
2258+
if ($inattr && !$semicolon) {
2259+
// The next character is either the next character in $chars or in the stream.
2260+
if (strlen($chars) > strlen($id)) {
2261+
$next = substr($chars, strlen($id), 1);
2262+
} else {
2263+
$next = $this->stream->char();
2264+
$this->stream->unget();
2265+
}
2266+
if (
2267+
'0' <= $next && $next <= '9' ||
2268+
'A' <= $next && $next <= 'Z' ||
2269+
'a' <= $next && $next <= 'z'
2270+
) {
2271+
return '&' . $chars;
2272+
}
22532273
}
22542274

22552275
/* Otherwise, return a character token for the character
22562276
corresponding to the character reference name (as given
22572277
by the second column of the named character references table). */
2258-
return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
2278+
return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
22592279
}
22602280
}
22612281

library/HTML5/named-character-references.ser

+1-1
Large diffs are not rendered by default.

maintenance/scrape-ncr.php

+24-7
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,36 @@
1212
}
1313

1414
$url = 'http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html';
15-
$request = new HttpRequest($url);
16-
$request->send();
17-
$html = $request->getResponseBody();
15+
if (extension_loaded('pecl_http')) {
16+
$request = new HttpRequest($url);
17+
$request->send();
18+
$html = $request->getResponseBody();
19+
} else {
20+
$html = file_get_contents($url);
21+
}
1822

1923
preg_match_all(
20-
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U+([^<]+?)\s*<#',
24+
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U\+([^<]+?)\s*<#',
2125
$html, $matches, PREG_SET_ORDER);
2226

2327
$table = array();
2428
foreach ($matches as $match) {
25-
$ncr = $match[1];
26-
$codepoint = hexdec($match[2]);
27-
$table[$ncr] = $codepoint;
29+
list(, $name, $codepoint) = $match;
30+
31+
// Set the subtable we're working with initially to the whole table.
32+
$subtable =& $table;
33+
34+
// Loop over each character to the name creating an array key for it, if it
35+
// doesn't already exist
36+
for ($i = 0, $len = strlen($name); $i < $len; $i++) {
37+
if (!isset($subtable[$name[$i]])) {
38+
$subtable[$name[$i]] = null;
39+
}
40+
$subtable =& $subtable[$name[$i]];
41+
}
42+
43+
// Set the key codepoint to the codepoint.
44+
$subtable['codepoint'] = hexdec($codepoint);
2845
}
2946

3047
file_put_contents($output, serialize($table));

0 commit comments

Comments
 (0)