Skip to content

Commit e0ef94a

Browse files
committed
More documentation, removed a benign-error-only if
1 parent 79103bc commit e0ef94a

File tree

1 file changed

+49
-44
lines changed

1 file changed

+49
-44
lines changed

src/wp-includes/html-api/class-wp-html-decoder.php

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
276276
$numeric_base = 16;
277277
$numeric_digits = '0123456789abcdefABCDEF';
278278
$max_digits = 6; // &#x10FFFF;
279-
$digits_at += 1;
279+
++$digits_at;
280280
} else {
281281
$numeric_base = 10;
282282
$numeric_digits = '0123456789';
@@ -308,49 +308,25 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
308308
$digits = substr( $text, $digits_at + $zero_count, $digit_count );
309309
$code_point = intval( $digits, $numeric_base );
310310

311-
if (
312-
// Null character.
313-
0 === $code_point ||
314-
315-
// Outside Unicode range.
316-
$code_point > 0x10FFFF ||
317-
318-
// Surrogate.
319-
( $code_point >= 0xD800 && $code_point <= 0xDFFF )
320-
) {
321-
$skip_bytes = $end_of_span - $at;
322-
return '';
323-
}
324-
325-
if (
326-
/*
327-
* Noncharacters.
328-
*
329-
* > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
330-
* > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
331-
* > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
332-
* > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
333-
* > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
334-
* > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
335-
*
336-
* @see https://infra.spec.whatwg.org/#noncharacter
337-
*/
338-
( $code_point >= 0xFDD0 && $code_point <= 0xFDEF ) ||
339-
( 0xFFFE === ( $code_point & 0xFFFE ) ) ||
340-
341-
// 0x0D or non-ASCII-whitespace control
342-
0x0D === $code_point ||
343-
(
344-
$code_point >= 0 &&
345-
$code_point <= 0x1F &&
346-
0x9 !== $code_point &&
347-
0xA !== $code_point &&
348-
0xC !== $code_point &&
349-
0xD !== $code_point
350-
)
351-
) {
352-
// @todo This is an error but the code point passes through.
353-
}
311+
/*
312+
* Noncharacters, 0x0D, and non-ASCII-whitespace control characters.
313+
*
314+
* > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF,
315+
* > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
316+
* > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
317+
* > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
318+
* > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
319+
* > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
320+
*
321+
* A C0 control is a code point that is in the range of U+00 to U+1F,
322+
* of which U+09, U+0A, U+0C, and U+0D are also ASCII whitespace.
323+
*
324+
* These characters are invalid but still decode like any valid character.
325+
* This comment is here to note and explain why there's no check to
326+
* remove these characters or replace them.
327+
*
328+
* @see https://infra.spec.whatwg.org/#noncharacter
329+
*/
354330

355331
/*
356332
* > If the number is one of the numbers in the first column of
@@ -449,7 +425,36 @@ public static function read_character_reference( $context, $text, $at, &$skip_by
449425
return $name;
450426
}
451427

428+
/**
429+
* Encode a code point number into the UTF-8 encoding.
430+
*
431+
* This encoder implements the encoding algorithm for converting a number
432+
* into a byte sequence, but if it receives an invalid code point it will
433+
* return the Unicode Replacement Character U+FFFD `�`.
434+
*
435+
* Example:
436+
*
437+
* '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 );
438+
*
439+
* // Half of a surrogate pair is an invalid code point.
440+
* '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c );
441+
*
442+
* @since 6.6.0
443+
*
444+
* @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8
445+
*
446+
* @param int $code_point Which code point to convert.
447+
* @return string Converted code point, or `�` if invalid.
448+
*/
452449
public static function code_point_to_utf8_bytes( $code_point ) {
450+
if (
451+
$code_point <= 0 ||
452+
( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
453+
$code_point > 0x10FFFF
454+
) {
455+
return '';
456+
}
457+
453458
if ( $code_point < 0x80 ) {
454459
return chr( $code_point );
455460
}

0 commit comments

Comments
 (0)