Skip to content

Commit aa5a99b

Browse files
committed
HTML API: Add custom text decoder.
Provides a custom decoder for strings coming from HTML attributes and markup. This custom decoder is necessary because of deficiencies in PHP's `html_entity_decode()` function: - It isn't aware of 720 of the possible named character references in HTML, leaving many out that should be translated. - It isn't aware of the ambiguous ampersand rule, which allows conversion of character references in certain contexts when they are missing their closing `;`. - It doesn't draw a distinction for the ambiguous ampersand rule when decoding attribute values instead of markup values. - Use of `html_entity_decode()` requires manually passing non-default parameter values to ensure it decodes properly. This decoder also provides some conveniences, such as making a single-pass and interruptible decode operation possible. This will provide a number of opportunities to optimize detection and decoding of things like value prefixes, and whether a value contains a given substring. Developed in WordPress/wordpress-develop#6387 Discussed in https://core.trac.wordpress.org/ticket/61072 Props dmsnell, gziolo, jonsurrell, jorbin, westonruter, zieladam. Fixes #61072. Built from https://develop.svn.wordpress.org/trunk@58281 git-svn-id: https://core.svn.wordpress.org/trunk@57741 1a063a9b-81f0-0310-95a4-ce76da25c4cd
1 parent 7b88768 commit aa5a99b

File tree

5 files changed

+481
-23
lines changed

5 files changed

+481
-23
lines changed

wp-includes/class-wp-token-map.php

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -435,8 +435,8 @@ public static function from_precomputed_table( $state ) {
435435
*
436436
* @since 6.6.0
437437
*
438-
* @param string $word Determine if this word is a lookup key in the map.
439-
* @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
438+
* @param string $word Determine if this word is a lookup key in the map.
439+
* @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
440440
* @return bool Whether there's an entry for the given word in the map.
441441
*/
442442
public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
@@ -521,10 +521,10 @@ public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
521521
* @since 6.6.0
522522
*
523523
* @param string $text String in which to search for a lookup key.
524-
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
525-
* @param ?int &$matched_token_byte_length Holds byte-length of found token matched, otherwise not set.
526-
* @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
527-
* @return string|false Mapped value of lookup key if found, otherwise `false`.
524+
* @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0.
525+
* @param ?int &$matched_token_byte_length Optional. Holds byte-length of found token matched, otherwise not set. Default null.
526+
* @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
527+
* @return string|null Mapped value of lookup key if found, otherwise `null`.
528528
*/
529529
public function read_token( $text, $offset = 0, &$matched_token_byte_length = null, $case_sensitivity = 'case-sensitive' ) {
530530
$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
@@ -539,7 +539,7 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu
539539
// Perhaps a short word then.
540540
return strlen( $this->small_words ) > 0
541541
? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
542-
: false;
542+
: null;
543543
}
544544

545545
$group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ];
@@ -564,19 +564,19 @@ public function read_token( $text, $offset = 0, &$matched_token_byte_length = nu
564564
// Perhaps a short word then.
565565
return strlen( $this->small_words ) > 0
566566
? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity )
567-
: false;
567+
: null;
568568
}
569569

570570
/**
571571
* Finds a match for a short word at the index.
572572
*
573573
* @since 6.6.0
574574
*
575-
* @param string $text String in which to search for a lookup key.
576-
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
577-
* @param ?int &$matched_token_byte_length Holds byte-length of found lookup key if matched, otherwise not set.
578-
* @param ?string $case_sensitivity 'ascii-case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
579-
* @return string|false Mapped value of lookup key if found, otherwise `false`.
575+
* @param string $text String in which to search for a lookup key.
576+
* @param int $offset Optional. How many bytes into the string where the lookup key ought to start. Default 0.
577+
* @param ?int &$matched_token_byte_length Optional. Holds byte-length of found lookup key if matched, otherwise not set. Default null.
578+
* @param string $case_sensitivity Optional. Pass 'ascii-case-insensitive' to ignore ASCII case when matching. Default 'case-sensitive'.
579+
* @return string|null Mapped value of lookup key if found, otherwise `null`.
580580
*/
581581
private function read_small_token( $text, $offset, &$matched_token_byte_length, $case_sensitivity = 'case-sensitive' ) {
582582
$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
@@ -616,7 +616,7 @@ private function read_small_token( $text, $offset, &$matched_token_byte_length,
616616
return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
617617
}
618618

619-
return false;
619+
return null;
620620
}
621621

622622
/**
@@ -692,7 +692,7 @@ public function to_array() {
692692
*
693693
* @since 6.6.0
694694
*
695-
* @param ?string $indent Use this string for indentation, or rely on the default horizontal tab character.
695+
* @param string $indent Optional. Use this string for indentation, or rely on the default horizontal tab character. Default "\t".
696696
* @return string Value which can be pasted into a PHP source file for quick loading of table.
697697
*/
698698
public function precomputed_php_source_table( $indent = "\t" ) {

0 commit comments

Comments
 (0)