Skip to content

Commit e5454f4

Browse files
committed
extend css selectors with ":text" and :attribute(<name>) scrapy#176
1 parent 0ea8c38 commit e5454f4

File tree

2 files changed

+189
-21
lines changed

2 files changed

+189
-21
lines changed

scrapy/selector/csssel.py

+72-21
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,88 @@
11
from cssselect import GenericTranslator, HTMLTranslator
2-
from scrapy.utils.python import flatten
3-
from scrapy.selector import HtmlXPathSelector, XmlXPathSelector
4-
from .list import SelectorList
2+
from cssselect.xpath import XPathExpr, ExpressionError
3+
from scrapy.selector import XPathSelector, HtmlXPathSelector, XmlXPathSelector
54

65

7-
class CSSSelectorList(SelectorList):
8-
def xpath(self, xpath):
9-
return self.__class__(flatten([x.xpath(xpath) for x in self]))
6+
class ScrapyXPathExpr(XPathExpr):
    """XPathExpr that can be rendered as a text-node or attribute selection.

    The two extra flags below are only consulted when the expression is
    turned into a string by ``__str__``.
    """

    # When true, the rendered XPath ends in a ``text()`` step.
    textnode = False
    # When not None, the rendered XPath ends in ``/@<attribute>``.
    attribute = None

    @classmethod
    def from_xpath(cls, xpath, textnode=False, attribute=None):
        """Build an instance from the path, element and condition of *xpath*.

        The ``textnode`` and ``attribute`` flags are set on the new object
        from the corresponding arguments.
        """
        expr = cls(
            path=xpath.path,
            element=xpath.element,
            condition=xpath.condition,
        )
        expr.textnode = textnode
        expr.attribute = attribute
        return expr

    def __str__(self):
        """Render the parent expression, then apply the text/attribute flags."""
        rendered = super(ScrapyXPathExpr, self).__str__()
        wildcard_tail = '::*/*'

        if self.textnode:
            if rendered == '*':
                rendered = 'text()'
            elif rendered.endswith(wildcard_tail):
                # Replace the trailing '*/*' so the axis applies to text().
                rendered = rendered[:-3] + 'text()'
            else:
                rendered = rendered + '/text()'

        if self.attribute is not None:
            if rendered.endswith(wildcard_tail):
                # Drop the trailing '/*' before appending the attribute step.
                rendered = rendered[:-2]
            rendered = rendered + '/@%s' % self.attribute

        return rendered

    def join(self, combiner, other):
        """Join with *other* via the parent class and adopt *other*'s flags."""
        super(ScrapyXPathExpr, self).join(combiner, other)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
40+
41+
42+
class TranslatorMixin(object):
    """Translator mixin adding the ``:text`` and ``:attribute(<name>)`` selectors.

    Meant to be combined with a cssselect translator class, which supplies
    the ``xpath_element`` implementation reached through ``super``.
    """

    def xpath_element(self, selector):
        """Translate *selector* with the parent class, as a ScrapyXPathExpr."""
        xpath = super(TranslatorMixin, self).xpath_element(selector)
        return ScrapyXPathExpr.from_xpath(xpath)

    def xpath_text_pseudo(self, xpath):
        """Support selecting text nodes using :text pseudo-element"""
        return ScrapyXPathExpr.from_xpath(xpath, textnode=True)

    def xpath_attribute_function(self, xpath, function):
        """Support selecting attribute values using :attribute(<name>).

        Raises ExpressionError unless the function received exactly one
        STRING or IDENT argument.
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            # The message names :attribute() (it previously said :contains()).
            # The argument is wrapped in a tuple so that %-formatting always
            # sees exactly one value.
            raise ExpressionError(
                "Expected a single string or ident for :attribute(), got %r"
                % (function.arguments,))
        value = function.arguments[0].value
        return ScrapyXPathExpr.from_xpath(xpath, attribute=value)
59+
60+
61+
class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
    """GenericTranslator combined with TranslatorMixin."""
63+
64+
65+
class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
    """HTMLTranslator combined with TranslatorMixin."""
1667

1768

1869
class CSSSelectorMixin(object):
    """Mixin that makes ``select`` accept CSS expressions.

    The concrete class is expected to define a ``translator`` attribute
    exposing ``css_to_xpath``.
    """

    def select(self, css):
        """Translate *css* to XPath and pass it to the parent ``select``."""
        translated = self._css2xpath(css)
        parent = super(CSSSelectorMixin, self)
        return parent.select(translated)

    def _css2xpath(self, css):
        """Return the XPath that ``self.translator`` produces for *css*."""
        convert = self.translator.css_to_xpath
        return convert(css)
2477

25-
def text(self, all=False):
26-
return self.xpath('string()') if all else self.xpath('text()')
2778

28-
def get(self, attr):
29-
return self.xpath('@' + attr)
79+
class CSSSelector(CSSSelectorMixin, XPathSelector):
    """XPathSelector queried with CSS, translated by ScrapyHTMLTranslator."""

    translator = ScrapyHTMLTranslator()
3081

3182

32-
class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector):
33-
translator = GenericTranslator()
83+
class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector):
    """HtmlXPathSelector queried with CSS, translated by ScrapyHTMLTranslator."""

    translator = ScrapyHTMLTranslator()
3485

3586

36-
class HtmlCSSSelector(CSSSelectorMixin, HtmlXPathSelector):
37-
translator = HTMLTranslator()
87+
class XmlCSSSelector(CSSSelectorMixin, XmlXPathSelector):
    """XmlXPathSelector queried with CSS, translated by ScrapyGenericTranslator."""

    translator = ScrapyGenericTranslator()
+117
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""
2+
Selector tests for cssselect backend
3+
"""
4+
from twisted.trial import unittest
5+
from scrapy.http import TextResponse, HtmlResponse, XmlResponse
6+
from scrapy.selector import CSSSelector, XmlCSSSelector, HtmlCSSSelector
7+
from scrapy.selector.csssel import ScrapyHTMLTranslator
8+
9+
HTMLBODY = '''
10+
<html>
11+
<body>
12+
<div>
13+
<a id="name-anchor" name="foo"></a>
14+
<a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a>
15+
<a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a>
16+
<p id="paragraph">
17+
lorem ipsum text
18+
<b id="p-b">hi</b> <em id="p-em">there</em>
19+
<b id="p-b2">guy</b>
20+
<input type="checkbox" id="checkbox-unchecked" />
21+
<input type="checkbox" id="checkbox-disabled" disabled="" />
22+
<input type="text" id="text-checked" checked="checked" />
23+
<input type="hidden" />
24+
<input type="hidden" disabled="disabled" />
25+
<input type="checkbox" id="checkbox-checked" checked="checked" />
26+
<input type="checkbox" id="checkbox-disabled-checked"
27+
disabled="disabled" checked="checked" />
28+
<fieldset id="fieldset" disabled="disabled">
29+
<input type="checkbox" id="checkbox-fieldset-disabled" />
30+
<input type="hidden" />
31+
</fieldset>
32+
</p>
33+
<map name="dummymap">
34+
<area shape="circle" coords="200,250,25" href="foo.html" id="area-href" />
35+
<area shape="default" id="area-nohref" />
36+
</map>
37+
</div>
38+
<div class="cool-footer" id="foobar-div" foobar="ab bc cde">
39+
<span id="foobar-span">foo ter</span>
40+
</div>
41+
</body></html>
42+
'''
43+
44+
45+
class TranslatorMixinTest(unittest.TestCase):
    """Check the XPath generated for the :text and :attribute() extensions."""

    tr_cls = ScrapyHTMLTranslator

    def setUp(self):
        self.tr = self.tr_cls()
        self.c2x = self.tr.css_to_xpath

    def _assert_translations(self, expectations):
        """Assert that each CSS expression translates to the paired XPath."""
        for css, expected_xpath in expectations:
            # The CSS expression doubles as the failure message.
            self.assertEqual(self.c2x(css), expected_xpath, css)

    def test_attribute_function(self):
        self._assert_translations([
            (':attribute(name)',
             u'descendant-or-self::*/@name'),
            ('a:attribute(name)',
             u'descendant-or-self::a/@name'),
            ('a :attribute(name)',
             u'descendant-or-self::a/descendant-or-self::*/@name'),
            ('a > :attribute(name)',
             u'descendant-or-self::a/*/@name'),
        ])

    def test_text_pseudo_element(self):
        self._assert_translations([
            (':text',
             u'descendant-or-self::text()'),
            ('p:text',
             u'descendant-or-self::p/text()'),
            ('p :text',
             u'descendant-or-self::p/descendant-or-self::text()'),
            ('#id:text',
             u"descendant-or-self::*[@id = 'id']/text()"),
            ('p#id:text',
             u"descendant-or-self::p[@id = 'id']/text()"),
            ('p#id :text',
             u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
            ('p#id > :text',
             u"descendant-or-self::p[@id = 'id']/*/text()"),
            ('p#id ~ :text',
             u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
            ('a[href]:text',
             u'descendant-or-self::a[@href]/text()'),
            ('a[href] :text',
             u'descendant-or-self::a[@href]/descendant-or-self::text()'),
            ('p:text, a:text',
             u"descendant-or-self::p/text() | descendant-or-self::a/text()"),
        ])
79+
80+
81+
class HTMLCSSSelectorTest(unittest.TestCase):
    """Run CSS queries, including :text and :attribute(), against HTMLBODY."""

    hcs_cls = HtmlCSSSelector

    def setUp(self):
        self.htmlresponse = HtmlResponse('http://example.com', body=HTMLBODY)
        self.hcs = self.hcs_cls(self.htmlresponse)

    def x(self, *a, **kw):
        """Select, extract, strip each value and drop the empty ones."""
        stripped = [v.strip() for v in self.hcs.select(*a, **kw).extract()]
        return [v for v in stripped if v]

    def _assert_extracted(self, expectations):
        """Assert that ``self.x(css)`` equals the paired list for every entry."""
        for css, expected in expectations:
            self.assertEqual(self.x(css), expected)

    def test_selector_simple(self):
        selector_type = self.hcs.__class__
        for node in self.hcs.select('input'):
            self.assertTrue(isinstance(node, selector_type), node)

        extracted_at_once = self.hcs.select('input').extract()
        extracted_one_by_one = [
            node.extract() for node in self.hcs.select('input')
        ]
        self.assertEqual(extracted_at_once, extracted_one_by_one)

    def test_text_pseudo_element(self):
        self._assert_extracted([
            ('#p-b2', [u'<b id="p-b2">guy</b>']),
            ('#p-b2:text', [u'guy']),
            ('#p-b2 :text', [u'guy']),
            ('#paragraph:text', [u'lorem ipsum text']),
            ('#paragraph :text',
             [u'lorem ipsum text', u'hi', u'there', u'guy']),
            ('p:text', [u'lorem ipsum text']),
            ('p :text', [u'lorem ipsum text', u'hi', u'there', u'guy']),
        ])

    def test_attribute_function(self):
        self._assert_extracted([
            ('#p-b2:attribute(id)', [u'p-b2']),
            ('.cool-footer:attribute(class)', [u'cool-footer']),
            ('.cool-footer :attribute(id)', [u'foobar-div', u'foobar-span']),
            ('map[name="dummymap"] :attribute(shape)',
             [u'circle', u'default']),
        ])

    def test_nested_selector(self):
        bold_texts = self.hcs.select('p').select('b:text').extract()
        self.assertEqual(bold_texts, [u'hi', u'guy'])

        last_areas = self.hcs.select('div').select('area:last-child').extract()
        self.assertEqual(last_areas,
                         [u'<area shape="default" id="area-nohref">'])

0 commit comments

Comments
 (0)