@@ -2197,21 +2197,32 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
2197
2197
with the consumed characters matching one of the
2198
2198
identifiers in the first column of the named character
2199
2199
references table (in a case-sensitive manner). */
2200
-
2201
- // we will implement this by matching the longest
2202
- // alphanumeric + semicolon string, and then working
2203
- // our way backwards
2204
- $ chars .= $ this ->stream ->charsWhile (self ::DIGIT . self ::ALPHA . '; ' , HTML5_Data::getNamedCharacterReferenceMaxLength () - 1 );
2205
- $ len = strlen ($ chars );
2200
+ // What we actually do here is consume as much as we can while it
2201
+ // matches the start of one of the identifiers in the first column.
2206
2202
2207
2203
$ refs = HTML5_Data::getNamedCharacterReferences ();
2204
+
2205
+ // Get the longest string which is the start of an identifier
2206
+ // ($chars) as well as the longest identifier which matches ($id)
2207
+ // and its codepoint ($codepoint).
2208
2208
$ codepoint = false ;
2209
- for ($ c = $ len ; $ c > 0 ; $ c --) {
2210
- $ id = substr ($ chars , 0 , $ c );
2211
- if (isset ($ refs [$ id ])) {
2212
- $ codepoint = $ refs [$ id ];
2213
- break ;
2209
+ $ char = $ chars ;
2210
+ while ($ char !== false && isset ($ refs [$ char ])) {
2211
+ $ refs = $ refs [$ char ];
2212
+ if (isset ($ refs ['codepoint ' ])) {
2213
+ $ id = $ chars ;
2214
+ $ codepoint = $ refs ['codepoint ' ];
2214
2215
}
2216
+ $ chars .= $ char = $ this ->stream ->char ();
2217
+ }
2218
+
2219
+ // Unconsume the one character we just took, which caused the while
2220
+ // condition to fail. It could be anything and could cause state
2221
+ // changes (whereas any character that matched the while condition must
2222
+ // be alphanumeric, so we can just concat it to whatever we get later).
2223
+ $ this ->stream ->unget ();
2224
+ if ($ char !== false ) {
2225
+ $ chars = substr ($ chars , 0 , -1 );
2215
2226
}
2216
2227
2217
2228
/* If no match can be made, then this is a parse error.
@@ -2235,7 +2246,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
2235
2246
$ semicolon = false ;
2236
2247
}
2237
2248
2238
-
2239
2249
/* If the character reference is being consumed as part of
2240
2250
an attribute, and the last character matched is not a
2241
2251
U+003B SEMICOLON (;), and the next character is in the
@@ -2245,17 +2255,27 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
2245
2255
then, for historical reasons, all the characters that were
2246
2256
matched after the U+0026 AMPERSAND (&) must be unconsumed,
2247
2257
and nothing is returned. */
2248
- if (
2249
- $ inattr && !$ semicolon &&
2250
- strspn (substr ($ chars , $ c , 1 ), self ::ALPHA . self ::DIGIT )
2251
- ) {
2252
- return '& ' . $ chars ;
2258
+ if ($ inattr && !$ semicolon ) {
2259
+ // The next character is either the next character in $chars or in the stream.
2260
+ if (strlen ($ chars ) > strlen ($ id )) {
2261
+ $ next = substr ($ chars , strlen ($ id ), 1 );
2262
+ } else {
2263
+ $ next = $ this ->stream ->char ();
2264
+ $ this ->stream ->unget ();
2265
+ }
2266
+ if (
2267
+ '0 ' <= $ next && $ next <= '9 ' ||
2268
+ 'A ' <= $ next && $ next <= 'Z ' ||
2269
+ 'a ' <= $ next && $ next <= 'z '
2270
+ ) {
2271
+ return '& ' . $ chars ;
2272
+ }
2253
2273
}
2254
2274
2255
2275
/* Otherwise, return a character token for the character
2256
2276
corresponding to the character reference name (as given
2257
2277
by the second column of the named character references table). */
2258
- return HTML5_Data::utf8chr ($ codepoint ) . substr ($ chars , $ c );
2278
+ return HTML5_Data::utf8chr ($ codepoint ) . substr ($ chars , strlen ( $ id ) );
2259
2279
}
2260
2280
}
2261
2281
0 commit comments