Skip to content

Commit 6146ecd

Browse files
committed
HTML API: Refactor wp_kses_hair() (#9248)
Trac ticket: Core-63694. `wp_kses_hair()` is built around an impressive state machine for parsing the `$attr` of an HTML tag, that is, the span of text after the tag name and before the closing `>`. Unfortunately, that parsing code doesn’t fully implement the HTML specification and may be prone to mis-parsing. This patch replaces the existing state machine with a straightforward use of the HTML API to parse the attributes for us, constructing a shell tag for the `$attr` string and reading the attributes structurally. This shell is necessary because a previous stage of the pipeline has already separated what it thinks is the so-called “attribute list” from a tag. Props: dmsnell
1 parent 3045051 commit 6146ecd

File tree

1 file changed

+27
-127
lines changed

1 file changed

+27
-127
lines changed

src/wp-includes/kses.php

Lines changed: 27 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,149 +1385,49 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
13851385
* attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
13861386
*
13871387
* @since 1.0.0
1388+
* @since 6.9.0 Rebuilt on HTML API
13881389
*
13891390
* @param string $attr Attribute list from HTML element to closing HTML element tag.
13901391
* @param string[] $allowed_protocols Array of allowed URL protocols.
13911392
* @return array[] Array of attribute information after parsing.
13921393
*/
13931394
function wp_kses_hair( $attr, $allowed_protocols ) {
1394-
$attrarr = array();
1395-
$mode = 0;
1396-
$attrname = '';
1397-
$uris = wp_kses_uri_attributes();
1395+
$attributes = array();
1396+
$uris = wp_kses_uri_attributes();
13981397

13991398
// Loop through the whole attribute list.
14001399

1401-
while ( strlen( $attr ) !== 0 ) {
1402-
$working = 0; // Was the last operation successful?
1400+
$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
1401+
$processor->next_token();
14031402

1404-
switch ( $mode ) {
1405-
case 0:
1406-
if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
1407-
$attrname = $match[1];
1408-
$working = 1;
1409-
$mode = 1;
1410-
$attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
1411-
}
1412-
1413-
break;
1414-
1415-
case 1:
1416-
if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
1417-
$working = 1;
1418-
$mode = 2;
1419-
$attr = preg_replace( '/^\s*=\s*/', '', $attr );
1420-
break;
1421-
}
1422-
1423-
if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
1424-
$working = 1;
1425-
$mode = 0;
1426-
1427-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1428-
$attrarr[ $attrname ] = array(
1429-
'name' => $attrname,
1430-
'value' => '',
1431-
'whole' => $attrname,
1432-
'vless' => 'y',
1433-
);
1434-
}
1435-
1436-
$attr = preg_replace( '/^\s+/', '', $attr );
1437-
}
1438-
1439-
break;
1440-
1441-
case 2:
1442-
if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
1443-
// "value"
1444-
$thisval = $match[1];
1445-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1446-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1447-
}
1448-
1449-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1450-
$attrarr[ $attrname ] = array(
1451-
'name' => $attrname,
1452-
'value' => $thisval,
1453-
'whole' => "$attrname=\"$thisval\"",
1454-
'vless' => 'n',
1455-
);
1456-
}
1457-
1458-
$working = 1;
1459-
$mode = 0;
1460-
$attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
1461-
break;
1462-
}
1463-
1464-
if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
1465-
// 'value'
1466-
$thisval = $match[1];
1467-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1468-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1469-
}
1470-
1471-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1472-
$attrarr[ $attrname ] = array(
1473-
'name' => $attrname,
1474-
'value' => $thisval,
1475-
'whole' => "$attrname='$thisval'",
1476-
'vless' => 'n',
1477-
);
1478-
}
1479-
1480-
$working = 1;
1481-
$mode = 0;
1482-
$attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
1483-
break;
1484-
}
1485-
1486-
if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
1487-
// value
1488-
$thisval = $match[1];
1489-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1490-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1491-
}
1492-
1493-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1494-
$attrarr[ $attrname ] = array(
1495-
'name' => $attrname,
1496-
'value' => $thisval,
1497-
'whole' => "$attrname=\"$thisval\"",
1498-
'vless' => 'n',
1499-
);
1500-
}
1501-
1502-
// We add quotes to conform to W3C's HTML spec.
1503-
$working = 1;
1504-
$mode = 0;
1505-
$attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
1506-
}
1403+
foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
1404+
$value = $processor->get_attribute( $name );
1405+
$is_bool = true === $value;
1406+
if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
1407+
$value = wp_kses_bad_protocol( $value, $allowed_protocols );
1408+
}
15071409

1508-
break;
1509-
} // End switch.
1410+
// Reconstruct and normalize the attribute value.
1411+
$syntax_characters = array(
1412+
'&' => '&amp;',
1413+
'<' => '&lt;',
1414+
'>' => '&gt;',
1415+
"'" => '&apos;',
1416+
'"' => '&quot;',
1417+
);
15101418

1511-
if ( 0 === $working ) { // Not well-formed, remove and try again.
1512-
$attr = wp_kses_html_error( $attr );
1513-
$mode = 0;
1514-
}
1515-
} // End while.
1419+
$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
1420+
$whole = $is_bool ? $name : "{$name}=\"{$recoded}\"";
15161421

1517-
if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
1518-
/*
1519-
* Special case, for when the attribute list ends with a valueless
1520-
* attribute like "selected".
1521-
*/
1522-
$attrarr[ $attrname ] = array(
1523-
'name' => $attrname,
1524-
'value' => '',
1525-
'whole' => $attrname,
1526-
'vless' => 'y',
1422+
$attributes[] = array(
1423+
'name' => $name,
1424+
'value' => $recoded,
1425+
'whole' => $whole,
1426+
'vless' => $is_bool,
15271427
);
15281428
}
15291429

1530-
return $attrarr;
1430+
return $attributes;
15311431
}
15321432

15331433
/**

0 commit comments

Comments
 (0)