Skip to content

Commit 35a2f57

Browse files
committed
Fix case-sensitivity issues
* (Functional) pseudo-classes are always case-insensitive * Add the 'xhtml' flag * Element names and attribute names are case-insensitive for HTML, but case-sensitive for XHTML and XML.
1 parent a08663d commit 35a2f57

File tree

6 files changed

+117
-62
lines changed

6 files changed

+117
-62
lines changed

CHANGES

+9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Changelog
22
=========
33

4+
Version 0.5
5+
-----------
6+
7+
Not released yet.
8+
9+
* Fix case sensitivity issues.
10+
* Add the ``xhtml`` parameter for :class:`HTMLTranslator`.
11+
12+
413
Version 0.4
514
-----------
615

cssselect/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@
1717
from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError
1818

1919

20-
VERSION = '0.4'
20+
VERSION = '0.5'
2121
__version__ = VERSION

cssselect/parser.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,8 @@ def parse_simple_selector(stream, inside_negation=False):
376376
if stream.peek() == '(':
377377
stream.next()
378378
stream.skip_whitespace()
379-
if ident == 'not':
379+
is_negation = ident.lower() == 'not'
380+
if is_negation:
380381
if inside_negation:
381382
raise SelectorSyntaxError('Got nested :not()')
382383
argument, argument_pseudo_element = parse_simple_selector(
@@ -396,7 +397,7 @@ def parse_simple_selector(stream, inside_negation=False):
396397
if not next == ')':
397398
raise SelectorSyntaxError(
398399
"Expected ')', got '%s'" % next)
399-
if ident == 'not':
400+
if is_negation:
400401
result = Negation(result, argument)
401402
else:
402403
result = Function(result, ident, argument)

cssselect/tests.py

+44-36
Original file line numberDiff line numberDiff line change
@@ -284,72 +284,72 @@ def xpath(css):
284284
return str(GenericTranslator().css_to_xpath(css, prefix=''))
285285

286286
assert xpath('*') == "*"
287-
assert xpath('E') == "e"
288-
assert xpath('E[foo]') == "e[@foo]"
289-
assert xpath('E[foo="bar"]') == "e[@foo = 'bar']"
290-
assert xpath('E[foo~="bar"]') == (
287+
assert xpath('e') == "e"
288+
assert xpath('e[foo]') == "e[@foo]"
289+
assert xpath('e[foo="bar"]') == "e[@foo = 'bar']"
290+
assert xpath('e[foo~="bar"]') == (
291291
"e[@foo and contains("
292292
"concat(' ', normalize-space(@foo), ' '), ' bar ')]")
293-
assert xpath('E[foo^="bar"]') == (
293+
assert xpath('e[foo^="bar"]') == (
294294
"e[@foo and starts-with(@foo, 'bar')]")
295-
assert xpath('E[foo$="bar"]') == (
295+
assert xpath('e[foo$="bar"]') == (
296296
"e[@foo and substring(@foo, string-length(@foo)-2) = 'bar']")
297-
assert xpath('E[foo*="bar"]') == (
297+
assert xpath('e[foo*="bar"]') == (
298298
"e[@foo and contains(@foo, 'bar')]")
299-
assert xpath('E[hreflang|="en"]') == (
299+
assert xpath('e[hreflang|="en"]') == (
300300
"e[@hreflang and ("
301301
"@hreflang = 'en' or starts-with(@hreflang, 'en-'))]")
302-
assert xpath('E:nth-child(1)') == (
302+
assert xpath('e:nth-child(1)') == (
303303
"*/*[name() = 'e' and (position() = 1)]")
304-
assert xpath('E:nth-last-child(1)') == (
304+
assert xpath('e:nth-last-child(1)') == (
305305
"*/*[name() = 'e' and (position() = last() - 1)]")
306-
assert xpath('E:nth-last-child(2n+2)') == (
306+
assert xpath('e:nth-last-child(2n+2)') == (
307307
"*/*[name() = 'e' and ("
308308
"(position() +2) mod -2 = 0 and position() < (last() -2))]")
309-
assert xpath('E:nth-of-type(1)') == (
309+
assert xpath('e:nth-of-type(1)') == (
310310
"*/e[position() = 1]")
311-
assert xpath('E:nth-last-of-type(1)') == (
311+
assert xpath('e:nth-last-of-type(1)') == (
312312
"*/e[position() = last() - 1]")
313-
assert xpath('E:nth-last-of-type(1)') == (
313+
assert xpath('e:nth-last-of-type(1)') == (
314314
"*/e[position() = last() - 1]")
315-
assert xpath('div E:nth-last-of-type(1) .aclass') == (
315+
assert xpath('div e:nth-last-of-type(1) .aclass') == (
316316
"div/descendant-or-self::*/e[position() = last() - 1]"
317317
"/descendant-or-self::*/*[@class and contains("
318318
"concat(' ', normalize-space(@class), ' '), ' aclass ')]")
319-
assert xpath('E:first-child') == (
319+
assert xpath('e:first-child') == (
320320
"*/*[name() = 'e' and (position() = 1)]")
321-
assert xpath('E:last-child') == (
321+
assert xpath('e:last-child') == (
322322
"*/*[name() = 'e' and (position() = last())]")
323-
assert xpath('E:first-of-type') == (
323+
assert xpath('e:first-of-type') == (
324324
"*/e[position() = 1]")
325-
assert xpath('E:last-of-type') == (
325+
assert xpath('e:last-of-type') == (
326326
"*/e[position() = last()]")
327-
assert xpath('E:only-child') == (
327+
assert xpath('e:only-child') == (
328328
"*/*[name() = 'e' and (last() = 1)]")
329-
assert xpath('E:only-of-type') == (
329+
assert xpath('e:only-of-type') == (
330330
"e[last() = 1]")
331-
assert xpath('E:empty') == (
331+
assert xpath('e:empty') == (
332332
"e[not(*) and not(normalize-space())]")
333-
assert xpath('E:root') == (
333+
assert xpath('e:root') == (
334334
"e[not(parent::*)]")
335-
assert xpath('E:contains("foo")') == (
335+
assert xpath('e:contains("foo")') == (
336336
"e[contains(string(.), 'foo')]")
337-
assert xpath('E:contains(foo)') == (
337+
assert xpath('e:contains(foo)') == (
338338
"e[contains(string(.), 'foo')]")
339-
assert xpath('E.warning') == (
339+
assert xpath('e.warning') == (
340340
"e[@class and contains("
341341
"concat(' ', normalize-space(@class), ' '), ' warning ')]")
342-
assert xpath('E#myid') == (
342+
assert xpath('e#myid') == (
343343
"e[@id = 'myid']")
344-
assert xpath('E:not(:nth-child(odd))') == (
344+
assert xpath('e:not(:nth-child(odd))') == (
345345
"e[not((position() -1) mod 2 = 0 and position() >= 1)]")
346-
assert xpath('E F') == (
346+
assert xpath('e f') == (
347347
"e/descendant-or-self::*/f")
348-
assert xpath('E > F') == (
348+
assert xpath('e > f') == (
349349
"e/f")
350-
assert xpath('E + F') == (
350+
assert xpath('e + f') == (
351351
"e/following-sibling::*[name() = 'f' and (position() = 1)]")
352-
assert xpath('E ~ F') == (
352+
assert xpath('e ~ f') == (
353353
"e/following-sibling::f")
354354
assert xpath('div#container p') == (
355355
"div[@id = 'container']/descendant-or-self::*/p")
@@ -426,12 +426,17 @@ def pcss(main, *selectors, **kwargs):
426426
return result
427427

428428
all_ids = pcss('*')
429+
assert len(all_ids) == 27
429430
assert all_ids[:4] == ['html', 'nil', 'nil', 'outer-div']
430431
assert all_ids[-1:] == ['foobar-span']
431432
assert pcss('div') == ['outer-div', 'li-div', 'foobar-div']
433+
assert pcss('DIV', html_only=True) == [
434+
'outer-div', 'li-div', 'foobar-div'] # case-insensitive in HTML
432435
assert pcss('div div') == ['li-div']
433436
assert pcss('div, div div') == ['outer-div', 'li-div', 'foobar-div']
434437
assert pcss('a[name]') == ['name-anchor']
438+
assert pcss('a[NAme]', html_only=True) == [
439+
'name-anchor'] # case-insensitive in HTML:
435440
assert pcss('a[rel]') == ['tag-anchor', 'nofollow-anchor']
436441
assert pcss('a[rel="tag"]') == ['tag-anchor']
437442
assert pcss('a[href*="localhost"]') == ['tag-anchor']
@@ -441,7 +446,7 @@ def pcss(main, *selectors, **kwargs):
441446
assert pcss('div[foobar~="bc"]', 'div[foobar~="cde"]') == [
442447
'foobar-div']
443448
assert pcss('div[foobar~="cd"]') == []
444-
assert pcss('*[lang|="en"]', '*[lang|="en-US"]') == ['second-li']
449+
assert pcss('*[lang|="en"]', '[lang|="en-US"]') == ['second-li']
445450
assert pcss('*[lang|="e"]') == []
446451
assert pcss('li:nth-child(3)') == ['third-li']
447452
assert pcss('li:nth-child(10)') == []
@@ -471,12 +476,12 @@ def pcss(main, *selectors, **kwargs):
471476
self.assertRaises(ExpressionError, pcss, 'p *:only-of-type')
472477
self.assertRaises(ExpressionError, pcss, 'p:lang(fr)')
473478
assert pcss('p:only-of-type') == ['paragraph']
474-
assert pcss('a:empty') == ['name-anchor']
479+
assert pcss('a:empty', 'a:EMpty') == ['name-anchor']
475480
assert pcss('li:empty') == [
476481
'third-li', 'fourth-li', 'fifth-li', 'sixth-li', 'seventh-li']
477482
assert pcss(':root', 'html:root') == ['html']
478483
assert pcss('li:root', '* :root') == []
479-
assert pcss('*:contains("link")') == [
484+
assert pcss('*:contains("link")', ':CONtains("link")') == [
480485
'html', 'nil', 'outer-div', 'tag-anchor', 'nofollow-anchor']
481486
assert pcss('*:contains("LInk")') == [] # case sensitive
482487
assert pcss('*:contains("e")') == [
@@ -488,7 +493,6 @@ def pcss(main, *selectors, **kwargs):
488493
assert pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c') == [
489494
'third-li', 'fourth-li']
490495
assert pcss('#first-li', 'li#first-li', '*#first-li') == ['first-li']
491-
# Need some tests of :not()']
492496
assert pcss('li div', 'li > div', 'div div') == ['li-div']
493497
assert pcss('div > div') == []
494498
assert pcss('div>.c', 'div > .c') == ['first-ol']
@@ -507,6 +511,10 @@ def pcss(main, *selectors, **kwargs):
507511
'fieldset', 'checkbox-disabled']
508512
assert pcss(':enabled', html_only=True) == [
509513
'checkbox-unchecked', 'checkbox-checked']
514+
assert pcss('a:not([href])') == ['name-anchor']
515+
assert pcss('ol :Not(li[class])') == [
516+
'first-li', 'second-li', 'li-div',
517+
'fifth-li', 'sixth-li', 'seventh-li']
510518

511519
def test_select_shakespeare(self):
512520
document = html.document_fromstring(HTML_SHAKESPEARE)

cssselect/xpath.py

+60-20
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ def join(self, combiner, other):
9393
class GenericTranslator(object):
9494
"""
9595
Translator for "generic" XML documents.
96+
97+
Everything is case-sensitive, no assumption is made on the meaning
98+
of element names and attribute names.
99+
96100
"""
97101
combinator_mapping = {
98102
' ': 'descendant',
@@ -116,6 +120,24 @@ class GenericTranslator(object):
116120
#: http://www.w3.org/TR/selectors/#id-selectors
117121
id_attribute = 'id'
118122

123+
#: The case sensitivity of document language element names,
124+
#: attribute names, and attribute values in selectors depends
125+
#: on the document language.
126+
#: http://www.w3.org/TR/selectors/#casesens
127+
#:
128+
#: When a document language defines one of these as case-insensitive,
129+
#: cssselect assumes that the document parser makes the parsed values
130+
#: lower-case. Making the selector lower-case too makes the comparison
131+
#: case-insensitive.
132+
#:
133+
#: In HTML, element names and attribute names (but not attribute values)
134+
#: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
135+
#: and HTMLParser make them lower-case in their parse result, so
136+
#: the assumption holds.
137+
lower_case_element_names = False
138+
lower_case_attribute_names = False
139+
lower_case_attribute_values = False
140+
119141
def css_to_xpath(self, css, prefix='descendant-or-self::'):
120142
"""Translate a *group of selectors* to XPath.
121143
@@ -201,7 +223,7 @@ def xpath_negation(self, negation):
201223

202224
def xpath_function(self, function):
203225
"""Translate a functional pseudo-class."""
204-
method = 'xpath_%s_function' % function.name.replace('-', '_')
226+
method = 'xpath_%s_function' % function.name.replace('-', '_').lower()
205227
method = getattr(self, method, None)
206228
if not method:
207229
raise ExpressionError(
@@ -210,7 +232,7 @@ def xpath_function(self, function):
210232

211233
def xpath_pseudo(self, pseudo):
212234
"""Translate a pseudo-class."""
213-
method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_')
235+
method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_').lower()
214236
method = getattr(self, method, None)
215237
if not method:
216238
# TODO: better error message for pseudo-elements?
@@ -226,12 +248,19 @@ def xpath_attrib(self, selector):
226248
raise ExpressionError(
227249
"Unknown attribute operator: %r" % selector.operator)
228250
method = getattr(self, 'xpath_attrib_%s' % operator)
229-
# FIXME: what if attrib is *?
251+
if self.lower_case_attribute_names:
252+
name = selector.attrib.lower()
253+
else:
254+
name = selector.attrib
230255
if selector.namespace == '*':
231-
name = '@' + selector.attrib
256+
name = '@' + name
257+
else:
258+
name = '@%s:%s' % (selector.namespace, name)
259+
if self.lower_case_attribute_values:
260+
value = selector.value.lower()
232261
else:
233-
name = '@%s:%s' % (selector.namespace, selector.attrib)
234-
return method(self.xpath(selector.selector), name, selector.value)
262+
value = selector.value
263+
return method(self.xpath(selector.selector), name, value)
235264

236265
def xpath_class(self, class_selector):
237266
"""Translate a class selector."""
@@ -243,23 +272,18 @@ def xpath_class(self, class_selector):
243272
def xpath_hash(self, id_selector):
244273
"""Translate an ID selector."""
245274
xpath = self.xpath(id_selector.selector)
246-
return xpath.add_condition('@%s = %s' % (
247-
self.id_attribute, self.xpath_literal(id_selector.id)))
275+
return self.xpath_attrib_equals(xpath, '@id', id_selector.id)
248276

249277
def xpath_element(self, selector):
250278
"""Translate a type or universal selector."""
251-
if selector.namespace == '*':
252-
# Fixed case sensitive matching on lxml 2.3.4 patched for external cssselect with Python 2.7 64-bit on Windows.
253-
# Case insensitive matching is not working unless source elements are lower case.
254-
# For HTMLTranslator, I kept the existing behavior of setting the element to lower case.
255-
# "...in HTML, element names are case-insensitive, but in XML they are case-sensitive."
256-
# http://www.w3.org/TR/CSS2/selector.html#pattern-matching
257-
element = selector.element
258-
if isinstance(self, HTMLTranslator):
259-
element = element.lower()
279+
if self.lower_case_element_names:
280+
element = selector.element.lower()
260281
else:
261-
# FIXME: Should we lowercase here?
262-
element = '%s:%s' % (selector.namespace, selector.element)
282+
element = selector.element
283+
if selector.namespace != '*':
284+
# Namespace prefixes are case-sensitive.
285+
# http://www.w3.org/TR/css3-namespace/#prefixes
286+
element = '%s:%s' % (selector.namespace, element)
263287
return XPathExpr(element=element)
264288

265289

@@ -465,8 +489,24 @@ def xpath_attrib_substringmatch(self, xpath, name, value):
465489

466490
class HTMLTranslator(GenericTranslator):
467491
"""
468-
Translator for HTML documents.
492+
Translator for (X)HTML documents.
493+
494+
Has a more useful implementation of some pseudo-classes, based on
495+
HTML-specific element names and attribute names.
496+
The API is the same as :class:`GenericTranslator`.
497+
498+
:param xhtml:
499+
If false (the default), element names and attribute names
500+
are case-insensitive.
501+
469502
"""
503+
def __init__(self, xhtml=False):
504+
self.xhtml = xhtml # Might be useful for sub-classes?
505+
if not xhtml:
506+
# See their definition in GenericTranslator.
507+
self.lower_case_element_names = True
508+
self.lower_case_attribute_names = True
509+
470510
def xpath_checked_pseudo(self, xpath):
471511
# FIXME: is this really all the elements?
472512
return xpath.add_condition(

docs/index.rst

-3
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,6 @@ selectors. For example, ``div, h1.title + p`` is a group of 2 selectors.
5757
:members: css_to_xpath, selector_to_xpath
5858

5959
.. autoclass:: HTMLTranslator
60-
61-
The API is the same as :class:`GenericTranslator`.
62-
6360
.. autoexception:: SelectorError
6461
.. autoexception:: SelectorSyntaxError
6562
.. autoexception:: ExpressionError

0 commit comments

Comments
 (0)