@@ -276,7 +276,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
276
276
$ numeric_base = 16 ;
277
277
$ numeric_digits = '0123456789abcdefABCDEF ' ;
278
278
$ max_digits = 6 ; // 
279
- $ digits_at += 1 ;
279
+ ++ $ digits_at ;
280
280
} else {
281
281
$ numeric_base = 10 ;
282
282
$ numeric_digits = '0123456789 ' ;
@@ -308,49 +308,25 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
308
308
$ digits = substr ( $ text , $ digits_at + $ zero_count , $ digit_count );
309
309
$ code_point = intval ( $ digits , $ numeric_base );
310
310
311
- if (
312
- // Null character.
313
- 0 === $ code_point ||
314
-
315
- // Outside Unicode range.
316
- $ code_point > 0x10FFFF ||
317
-
318
- // Surrogate.
319
- ( $ code_point >= 0xD800 && $ code_point <= 0xDFFF )
320
- ) {
321
- $ skip_bytes = $ end_of_span - $ at ;
322
- return '� ' ;
323
- }
324
-
325
- if (
326
- /*
327
- * Noncharacters.
328
- *
329
- * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
330
- * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
331
- * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
332
- * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
333
- * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
334
- * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
335
- *
336
- * @see https://infra.spec.whatwg.org/#noncharacter
337
- */
338
- ( $ code_point >= 0xFDD0 && $ code_point <= 0xFDEF ) ||
339
- ( 0xFFFE === ( $ code_point & 0xFFFE ) ) ||
340
-
341
- // 0x0D or non-ASCII-whitespace control
342
- 0x0D === $ code_point ||
343
- (
344
- $ code_point >= 0 &&
345
- $ code_point <= 0x1F &&
346
- 0x9 !== $ code_point &&
347
- 0xA !== $ code_point &&
348
- 0xC !== $ code_point &&
349
- 0xD !== $ code_point
350
- )
351
- ) {
352
- // @todo This is an error but the code point passes through.
353
- }
311
+ /*
312
+ * Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
313
+ *
314
+ * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
315
+ * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
316
+ * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
317
+ * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
318
+ * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
319
+ * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
320
+ *
321
+ * A C0 control is a code point in the range U+00 to U+1F, inclusive;
322
+ * of these, U+09, U+0A, U+0C, and U+0D are also ASCII whitespace.
323
+ *
324
+ * These code points are parse errors, but they still decode like any valid
325
+ * character. This comment exists to explain why there is no check here to
326
+ * remove or replace them.
327
+ *
328
+ * @see https://infra.spec.whatwg.org/#noncharacter
329
+ */
354
330
355
331
/*
356
332
* > If the number is one of the numbers in the first column of
@@ -449,7 +425,36 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
449
425
return $ name ;
450
426
}
451
427
428
+ /**
429
+ * Encode a code point number into the UTF-8 encoding.
430
+ *
431
+ * This encoder implements the encoding algorithm for converting a number
432
+ * into a byte sequence, but if it receives an invalid code point it will
433
+ * return the Unicode Replacement Character U+FFFD `�`.
434
+ *
435
+ * Example:
436
+ *
437
+ * '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
438
+ *
439
+ * // Half of a surrogate pair is an invalid code point.
440
+ * '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
441
+ *
442
+ * @since 6.6.0
443
+ *
444
+ * @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8
445
+ *
446
+ * @param int $code_point Which code point to convert.
447
+ * @return string Converted code point, or `�` if invalid.
448
+ */
452
449
public static function code_point_to_utf8_bytes ( $ code_point ) {
450
+ if (
451
+ $ code_point <= 0 ||
452
+ ( $ code_point >= 0xD800 && $ code_point <= 0xDFFF ) ||
453
+ $ code_point > 0x10FFFF
454
+ ) {
455
+ return '� ' ;
456
+ }
457
+
453
458
if ( $ code_point < 0x80 ) {
454
459
return chr ( $ code_point );
455
460
}
0 commit comments