Skip to content

Commit d4e028d

Browse files
gh-141756: Fix handling non-ASCII element and attribute names in HTMLParser
Only ASCII letters are now converted to lower case.
1 parent b3b63e8 commit d4e028d

3 files changed

Lines changed: 56 additions & 7 deletions

File tree

Lib/html/parser.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
# and CDATA (character data -- only end tags are special).
99

1010

11+
import string
1112
import re
1213
import _markupbase
1314

@@ -105,6 +106,10 @@ def _replace_attr_charref(match):
105106
def _unescape_attrvalue(s):
106107
return attr_charref.sub(_replace_attr_charref, s)
107108

109+
def _ascii_lower(s, *, table=str.maketrans(string.ascii_uppercase,
110+
string.ascii_lowercase)):
111+
return s.translate(table)
112+
108113

109114
class HTMLParser(_markupbase.ParserBase):
110115
"""Find tags and other markup and call handler functions.
@@ -179,7 +184,7 @@ def get_starttag_text(self):
179184
return self.__starttag_text
180185

181186
def set_cdata_mode(self, elem, *, escapable=False):
182-
self.cdata_elem = elem.lower()
187+
self.cdata_elem = _ascii_lower(elem)
183188
self._escapable = escapable
184189
if self.cdata_elem == 'plaintext':
185190
self.interesting = re.compile(r'\z')
@@ -284,7 +289,7 @@ def goahead(self, end):
284289
self.handle_comment(rawdata[i+4:j])
285290
elif startswith("<![CDATA[", i) and self._support_cdata:
286291
self.unknown_decl(rawdata[i+3:])
287-
elif rawdata[i:i+9].lower() == '<!doctype':
292+
elif _ascii_lower(rawdata[i:i+9]) == '<!doctype':
288293
self.handle_decl(rawdata[i+2:])
289294
elif startswith("<!", i):
290295
# bogus comment
@@ -372,7 +377,7 @@ def parse_html_declaration(self, i):
372377
return -1
373378
self.unknown_decl(rawdata[i+3: j])
374379
return j + 3
375-
elif rawdata[i:i+9].lower() == '<!doctype':
380+
elif _ascii_lower(rawdata[i:i+9]) == '<!doctype':
376381
# find the closing >
377382
gtpos = rawdata.find('>', i+9)
378383
if gtpos == -1:
@@ -438,7 +443,7 @@ def parse_starttag(self, i):
438443
match = tagfind_tolerant.match(rawdata, i+1)
439444
assert match, 'unexpected call to parse_starttag()'
440445
k = match.end()
441-
self.lasttag = tag = match.group(1).lower()
446+
self.lasttag = tag = _ascii_lower(match.group(1))
442447
while k < endpos:
443448
m = attrfind_tolerant.match(rawdata, k)
444449
if not m:
@@ -451,7 +456,7 @@ def parse_starttag(self, i):
451456
attrvalue = attrvalue[1:-1]
452457
if attrvalue:
453458
attrvalue = _unescape_attrvalue(attrvalue)
454-
attrs.append((attrname.lower(), attrvalue))
459+
attrs.append((_ascii_lower(attrname), attrvalue))
455460
k = m.end()
456461

457462
end = rawdata[k:endpos].strip()
@@ -507,7 +512,7 @@ def parse_endtag(self, i):
507512
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
508513
match = tagfind_tolerant.match(rawdata, i+2)
509514
assert match
510-
tag = match.group(1).lower()
515+
tag = _ascii_lower(match.group(1))
511516
self.handle_endtag(tag)
512517
self.clear_cdata_mode()
513518
return j

Lib/test/test_htmlparser.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,37 @@ def test_invalid_nonascii_closing_tag(self, tag, endtag):
533533
("endtag", tag),
534534
], collector=EventCollector(convert_charrefs=False, scripting=True))
535535

536+
@support.subTests('tag,converted', [
537+
('TıTLE', 'tıtle'),
538+
('NOFRAMEſ', 'noframeſ'),
539+
('NOſCRIPT', 'noſcript'),
540+
('NOSCRıPT', 'noscrıpt'),
541+
('SCRıPT', 'scrıpt'),
542+
('ADDREß', 'addreß'),
543+
('DATALI\ufb06', 'datali\ufb06'),
544+
('Lı', 'lı'),
545+
('LIN\u212a', 'lin\u212a'),
546+
])
547+
def test_nonascii_tag(self, tag, converted):
548+
# Starts with ASCII letter
549+
source = f"<{tag}><br></{tag}>"
550+
self._run_check(source, [
551+
("starttag", converted, []),
552+
("starttag", "br", []),
553+
("endtag", converted),
554+
], collector=EventCollector(convert_charrefs=False, scripting=True))
555+
556+
@support.subTests('tag', ['ſtyle', '\ufb05yle', '\ufb06yle', 'ıframe', 'ſcript',
557+
'ı', '\u212aBD', 'ſMALL', '\ufb06RONG'])
558+
def test_invalid_nonascii_tag(self, tag):
559+
# Starts with non-ASCII letter
560+
source = f"<{tag}><br></{tag}>"
561+
self._run_check(source, [
562+
("data", f"<{tag}>"),
563+
("starttag", "br", []),
564+
("comment", f"{tag}"),
565+
], collector=EventCollector(convert_charrefs=False, scripting=True))
566+
536567
@support.subTests('tail,end', [
537568
('', False),
538569
('<', False),
@@ -1068,7 +1099,7 @@ def test_attr_values(self):
10681099
"<a href=mailto:xyz@example.com>",
10691100
[("starttag", "a", [("href", "mailto:xyz@example.com")])])
10701101

1071-
def test_attr_nonascii(self):
1102+
def test_attr_value_nonascii(self):
10721103
# see issue 7311
10731104
self._run_check(
10741105
"<img src=/foo/bar.png alt=\u4e2d\u6587>",
@@ -1083,6 +1114,16 @@ def test_attr_nonascii(self):
10831114
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
10841115
("href", "\u30c6\u30b9\u30c8.html")])])
10851116

1117+
def test_attr_name_nonascii(self):
1118+
self._run_check(
1119+
'<BUTTON ACCEß\u212aEY="s">',
1120+
[('starttag', 'button', [('acceß\u212aey', 's')])])
1121+
self._run_check(
1122+
'<TRACK \u212aIND="chapters" ſRC="sampleChapters.vtt" SRCLANG="en" />',
1123+
[('startendtag', 'track', [('\u212aind', 'chapters'),
1124+
('ſrc', 'sampleChapters.vtt'),
1125+
('srclang', 'en')])])
1126+
10861127
def test_attr_entity_replacement(self):
10871128
self._run_check(
10881129
"<a b='&amp;&gt;&lt;&quot;&apos;'>",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix handling non-ASCII element and attribute names in
2+
:class:`html.parser.HTMLParser`. Only ASCII letters are now converted to
3+
lower case.

0 commit comments

Comments
 (0)