Skip to content

Commit 9b8d8eb

Browse files
committed
Fix #11, #12: quote attributes that need escaping in legacy browsers
These are mostly out of the market now, so this isn't massively needed any more; nevertheless, avoiding XSS as much as possible is inevitably desirable. This alters the API so that quote_attr_values is now a ternary setting, choosing between legacy-safe behaviour, spec behaviour, and always quoting.
1 parent 4768c64 commit 9b8d8eb

File tree

5 files changed

+132
-11
lines changed

5 files changed

+132
-11
lines changed

CHANGES.rst

+7
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ Released on XXX
3333
* **Use scripting disabled by default (as we don't implement
3434
scripting).**
3535

36+
* **Fix #11, avoiding the XSS bug potentially caused by serializer
37+
allowing attribute values to be escaped out of in old browser versions,
38+
changing the quote_attr_values option on serializer to take one of
39+
three values, "always" (the old True value), "legacy" (the new option,
40+
and the new default), and "spec" (the old False value, and the old
41+
default).**
42+
3643

3744
0.9999999/1.0b8
3845
~~~~~~~~~~~~~~~

html5lib/serializer/htmlserializer.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,16 @@
1010

1111
spaceCharacters = "".join(spaceCharacters)
1212

13-
quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
13+
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
14+
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
15+
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
16+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
17+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
18+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
19+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
20+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
21+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
22+
"\u3000]")
1423

1524
try:
1625
from codecs import register_error, xmlcharrefreplace_errors
@@ -72,7 +81,7 @@ def htmlentityreplace_errors(exc):
7281
class HTMLSerializer(object):
7382

7483
# attribute quoting options
75-
quote_attr_values = False
84+
quote_attr_values = "legacy" # be secure by default
7685
quote_char = '"'
7786
use_best_quote_char = True
7887

@@ -108,9 +117,9 @@ def __init__(self, **kwargs):
108117
inject_meta_charset=True|False
109118
Whether it insert a meta element to define the character set of the
110119
document.
111-
quote_attr_values=True|False
120+
quote_attr_values="legacy"|"spec"|"always"
112121
Whether to quote attribute values that don't require quoting
113-
per HTML5 parsing rules.
122+
per legacy browser behaviour, when required by the standard, or always.
114123
quote_char=u'"'|u"'"
115124
Use given quote character for attribute quoting. Default is to
116125
use double quote unless attribute value contains a double quote,
@@ -239,10 +248,15 @@ def serialize(self, treewalker, encoding=None):
239248
(k not in booleanAttributes.get(name, tuple()) and
240249
k not in booleanAttributes.get("", tuple())):
241250
yield self.encodeStrict("=")
242-
if self.quote_attr_values:
251+
if self.quote_attr_values == "always" or len(v) == 0:
243252
quote_attr = True
253+
elif self.quote_attr_values == "spec":
254+
quote_attr = quoteAttributeSpec.search(v) is not None
255+
elif self.quote_attr_values == "legacy":
256+
quote_attr = quoteAttributeLegacy.search(v) is not None
244257
else:
245-
quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
258+
raise ValueError("quote_attr_values must be one of: "
259+
"'always', 'spec', or 'legacy'")
246260
v = v.replace("&", "&amp;")
247261
if self.escape_lt_in_attrs:
248262
v = v.replace("<", "&lt;")

html5lib/tests/serializer-testdata/core.test

+1-1
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@
242242
},
243243
{
244244
"expected": [
245-
"<span title=foo\u000bbar>"
245+
"<span title=\"foo\u000bbar\">"
246246
],
247247
"input": [
248248
[

html5lib/tests/serializer-testdata/options.test

+73-4
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@
4141
]
4242
]
4343
],
44-
"description": "quote_attr_values=true",
44+
"description": "quote_attr_values='always'",
4545
"options": {
46-
"quote_attr_values": true
46+
"quote_attr_values": "always"
4747
}
4848
},
4949
{
@@ -64,9 +64,78 @@
6464
]
6565
]
6666
],
67-
"description": "quote_attr_values=true with irrelevant",
67+
"description": "quote_attr_values='always' with irrelevant",
6868
"options": {
69-
"quote_attr_values": true
69+
"quote_attr_values": "always"
70+
}
71+
},
72+
{
73+
"expected": [
74+
"<div class=\"foo\">"
75+
],
76+
"input": [
77+
[
78+
"StartTag",
79+
"http://www.w3.org/1999/xhtml",
80+
"div",
81+
[
82+
{
83+
"namespace": null,
84+
"name": "class",
85+
"value": "foo"
86+
}
87+
]
88+
]
89+
],
90+
"description": "non-minimized quote_attr_values='always'",
91+
"options": {
92+
"quote_attr_values": "always"
93+
}
94+
},
95+
{
96+
"expected": [
97+
"<div class=foo>"
98+
],
99+
"input": [
100+
[
101+
"StartTag",
102+
"http://www.w3.org/1999/xhtml",
103+
"div",
104+
[
105+
{
106+
"namespace": null,
107+
"name": "class",
108+
"value": "foo"
109+
}
110+
]
111+
]
112+
],
113+
"description": "non-minimized quote_attr_values='legacy'",
114+
"options": {
115+
"quote_attr_values": "legacy"
116+
}
117+
},
118+
{
119+
"expected": [
120+
"<div class=foo>"
121+
],
122+
"input": [
123+
[
124+
"StartTag",
125+
"http://www.w3.org/1999/xhtml",
126+
"div",
127+
[
128+
{
129+
"namespace": null,
130+
"name": "class",
131+
"value": "foo"
132+
}
133+
]
134+
]
135+
],
136+
"description": "non-minimized quote_attr_values='spec'",
137+
"options": {
138+
"quote_attr_values": "spec"
70139
}
71140
},
72141
{

html5lib/tests/test_serializer.py

+31
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,37 @@ def testComment():
146146
throwsWithLatin1([["Comment", "\u0101"]])
147147

148148

149+
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
150+
def testSpecQuoteAttribute(c):
151+
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
152+
[{"namespace": None, "name": "foo", "value": c}]]]
153+
if c == '"':
154+
output_ = ["<span foo='%s'>" % c]
155+
else:
156+
output_ = ['<span foo="%s">' % c]
157+
options_ = {"quote_attr_values": "spec"}
158+
runSerializerTest(input_, output_, options_)
159+
160+
161+
@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
162+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
163+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
164+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
165+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
166+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
167+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
168+
"\u3000"))
169+
def testLegacyQuoteAttribute(c):
170+
input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
171+
[{"namespace": None, "name": "foo", "value": c}]]]
172+
if c == '"':
173+
output_ = ["<span foo='%s'>" % c]
174+
else:
175+
output_ = ['<span foo="%s">' % c]
176+
options_ = {"quote_attr_values": "legacy"}
177+
runSerializerTest(input_, output_, options_)
178+
179+
149180
@pytest.fixture
150181
def lxml_parser():
151182
return etree.XMLParser(resolve_entities=False)

0 commit comments

Comments
 (0)