Fix test-case failures due to character reference parsing bugs.

gsnedders · gsnedders · commit b95d51ca0655 · 2009-06-02T17:29:14.000+01:00
More completely, change the entire structure of the NCR array to be a multi-dimensional array in which each level is keyed by a single character and each value holds the possible further steps, plus, where applicable, a 'codepoint' entry giving the codepoint that the path up to that point represents. Hence, to get &amp; now, you would need $ncrs['a']['m']['p'][';']['codepoint']. The separate 'codepoint' key is required so we can cope with both &amp and &amp; (the node for &amp cannot simply be its codepoint, because it must also hold the further ';' step leading to &amp;).

This also removes Data::getNamedCharacterReferenceMaxLength(), as it is now useless.
diff --git a/library/HTML5/Data.php b/library/HTML5/Data.php
@@ -65,16 +65,6 @@ public static function getNamedCharacterReferences() {
         return self::$namedCharacterReferences;
     }
 
-    public static function getNamedCharacterReferenceMaxLength() {
-        if (!self::$namedCharacterReferenceMaxLength) {
-            $namedCharacterReferences = self::getNamedCharacterReferences();
-            $lengths = array_map('strlen', array_keys($namedCharacterReferences));
-            self::$namedCharacterReferenceMaxLength = max($lengths);
-        }
-        return self::$namedCharacterReferenceMaxLength;
-    }
-
-
     /**
      * Converts a Unicode codepoint to sequence of UTF-8 bytes.
      * @note Shamelessly stolen from HTML Purifier, which is also
diff --git a/library/HTML5/Tokenizer.php b/library/HTML5/Tokenizer.php
@@ -2197,21 +2197,32 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
             with the consumed characters matching one of the
             identifiers in the first column of the named character
             references table (in a case-sensitive manner). */
-
-            // we will implement this by matching the longest
-            // alphanumeric + semicolon string, and then working
-            // our way backwards
-            $chars .= $this->stream->charsWhile(self::DIGIT . self::ALPHA . ';', HTML5_Data::getNamedCharacterReferenceMaxLength() - 1);
-            $len = strlen($chars);
+            // What we actually do here is consume as much as we can while it
+            // matches the start of one of the identifiers in the first column.
 
             $refs = HTML5_Data::getNamedCharacterReferences();
+            
+            // Get the longest string which is the start of an identifier
+            // ($chars) as well as the longest identifier which matches ($id)
+            // and its codepoint ($codepoint).
             $codepoint = false;
-            for($c = $len; $c > 0; $c--) {
-                $id = substr($chars, 0, $c);
-                if(isset($refs[$id])) {
-                    $codepoint = $refs[$id];
-                    break;
+            $char = $chars;
+            while ($char !== false && isset($refs[$char])) {
+                $refs = $refs[$char];
+                if (isset($refs['codepoint'])) {
+                    $id = $chars;
+                    $codepoint = $refs['codepoint'];
                 }
+                $chars .= $char = $this->stream->char();
+            }
+            
+            // Unconsume the one character we just took which caused the while
+            // statement to fail. This could be anything and could cause state
+            // changes (as if it matches the while loop it must be
+            // alphanumeric so we can just concat it to whatever we get later).
+            $this->stream->unget();
+            if ($char !== false) {
+                $chars = substr($chars, 0, -1);
             }
 
             /* If no match can be made, then this is a parse error.
@@ -2235,7 +2246,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
                 $semicolon = false;
             }
 
-
             /* If the character reference is being consumed as part of
             an attribute, and the last character matched is not a
             U+003B SEMICOLON (;), and the next character is in the
@@ -2245,17 +2255,27 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
             then, for historical reasons, all the characters that were
             matched after the U+0026 AMPERSAND (&) must be unconsumed,
             and nothing is returned. */
-            if (
-                $inattr && !$semicolon &&
-                strspn(substr($chars, $c, 1), self::ALPHA . self::DIGIT)
-            ) {
-                return '&' . $chars;
+            if ($inattr && !$semicolon) {
+                // The next character is either the next character in $chars or in the stream.
+                if (strlen($chars) > strlen($id)) {
+                    $next = substr($chars, strlen($id), 1);
+                } else {
+                    $next = $this->stream->char();
+                    $this->stream->unget();
+                }
+                if (
+                    '0' <= $next && $next <= '9' ||
+                    'A' <= $next && $next <= 'Z' ||
+                    'a' <= $next && $next <= 'z'
+                ) {
+                    return '&' . $chars;
+                }
             }
 
             /* Otherwise, return a character token for the character
             corresponding to the character reference name (as given
             by the second column of the named character references table). */
-            return HTML5_Data::utf8chr($codepoint) . substr($chars, $c);
+            return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
         }
     }
 
diff --git a/library/HTML5/named-character-references.ser b/library/HTML5/named-character-references.ser
diff --git a/maintenance/scrape-ncr.php b/maintenance/scrape-ncr.php
@@ -12,19 +12,36 @@
 }
 
 $url = 'http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html';
-$request = new HttpRequest($url);
-$request->send();
-$html = $request->getResponseBody();
+if (extension_loaded('pecl_http')) {
+    $request = new HttpRequest($url);
+    $request->send();
+    $html = $request->getResponseBody();
+} else {
+    $html = file_get_contents($url);
+}
 
 preg_match_all(
-    '#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U+([^<]+?)\s*<#',
+    '#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U\+([^<]+?)\s*<#',
     $html, $matches, PREG_SET_ORDER);
 
 $table = array();
 foreach ($matches as $match) {
-    $ncr = $match[1];
-    $codepoint = hexdec($match[2]);
-    $table[$ncr] = $codepoint;
+    list(, $name, $codepoint) = $match;
+    
+    // Set the subtable we're working with initially to the whole table.
+    $subtable =& $table;
+    
+    // Loop over each character of the name, creating an array key for it if
+    // it doesn't already exist.
+    for ($i = 0, $len = strlen($name); $i < $len; $i++) {
+        if (!isset($subtable[$name[$i]])) {
+            $subtable[$name[$i]] = null;
+        }
+        $subtable =& $subtable[$name[$i]];
+    }
+    
+    // Set the key codepoint to the codepoint.
+    $subtable['codepoint'] = hexdec($codepoint);
 }
 
 file_put_contents($output, serialize($table));