|
| 1 | +""" |
| 2 | +Selector tests for cssselect backend |
| 3 | +""" |
| 4 | +from twisted.trial import unittest |
| 5 | +from scrapy.http import TextResponse, HtmlResponse, XmlResponse |
| 6 | +from scrapy.selector import CSSSelector, XmlCSSSelector, HtmlCSSSelector |
| 7 | +from scrapy.selector.csssel import ScrapyHTMLTranslator |
| 8 | + |
# HTML fixture parsed by HTMLCSSSelectorTest.setUp. It contains anchors, a
# paragraph with inline elements, form inputs and a fieldset, an image map
# with two areas, and a footer div; most elements carry an id attribute that
# the tests use to target them.
HTMLBODY = '''
<html>
<body>
<div>
 <a id="name-anchor" name="foo"></a>
 <a id="tag-anchor" rel="tag" href="http://localhost/foo">link</a>
 <a id="nofollow-anchor" rel="nofollow" href="https://example.org"> link</a>
 <p id="paragraph">
 lorem ipsum text
 <b id="p-b">hi</b> <em id="p-em">there</em>
 <b id="p-b2">guy</b>
 <input type="checkbox" id="checkbox-unchecked" />
 <input type="checkbox" id="checkbox-disabled" disabled="" />
 <input type="text" id="text-checked" checked="checked" />
 <input type="hidden" />
 <input type="hidden" disabled="disabled" />
 <input type="checkbox" id="checkbox-checked" checked="checked" />
 <input type="checkbox" id="checkbox-disabled-checked"
 disabled="disabled" checked="checked" />
 <fieldset id="fieldset" disabled="disabled">
 <input type="checkbox" id="checkbox-fieldset-disabled" />
 <input type="hidden" />
 </fieldset>
 </p>
 <map name="dummymap">
 <area shape="circle" coords="200,250,25" href="foo.html" id="area-href" />
 <area shape="default" id="area-nohref" />
 </map>
</div>
<div class="cool-footer" id="foobar-div" foobar="ab bc cde">
 <span id="foobar-span">foo ter</span>
</div>
</body></html>
'''
| 43 | + |
| 44 | + |
class TranslatorMixinTest(unittest.TestCase):
    """Check the XPath produced by ``tr_cls().css_to_xpath``.

    Covers the ``:attribute(<name>)`` function and the ``:text``
    pseudo-element, alone and combined with other CSS constructs.
    """

    # Translator class under test; a new instance is created in setUp.
    tr_cls = ScrapyHTMLTranslator

    def setUp(self):
        # Fresh translator per test, plus a short alias for its
        # css_to_xpath method.
        translator = self.tr_cls()
        self.tr = translator
        self.c2x = translator.css_to_xpath

    def _assert_translations(self, expectations):
        # Translate every CSS expression and compare it with the expected
        # XPath; the CSS source doubles as the failure message.
        for selector, expected in expectations:
            self.assertEqual(self.c2x(selector), expected, selector)

    def test_attribute_function(self):
        # ``:attribute(name)`` should end the XPath with an ``/@name`` step.
        self._assert_translations((
            (':attribute(name)',
             u'descendant-or-self::*/@name'),
            ('a:attribute(name)',
             u'descendant-or-self::a/@name'),
            ('a :attribute(name)',
             u'descendant-or-self::a/descendant-or-self::*/@name'),
            ('a > :attribute(name)',
             u'descendant-or-self::a/*/@name'),
        ))

    def test_text_pseudo_element(self):
        # ``:text`` should end the XPath with a ``text()`` node test.
        self._assert_translations((
            (':text',
             u'descendant-or-self::text()'),
            ('p:text',
             u'descendant-or-self::p/text()'),
            ('p :text',
             u'descendant-or-self::p/descendant-or-self::text()'),
            ('#id:text',
             u"descendant-or-self::*[@id = 'id']/text()"),
            ('p#id:text',
             u"descendant-or-self::p[@id = 'id']/text()"),
            ('p#id :text',
             u"descendant-or-self::p[@id = 'id']/descendant-or-self::text()"),
            ('p#id > :text',
             u"descendant-or-self::p[@id = 'id']/*/text()"),
            ('p#id ~ :text',
             u"descendant-or-self::p[@id = 'id']/following-sibling::*/text()"),
            ('a[href]:text',
             u'descendant-or-self::a[@href]/text()'),
            ('a[href] :text',
             u'descendant-or-self::a[@href]/descendant-or-self::text()'),
            ('p:text, a:text',
             u"descendant-or-self::p/text() | descendant-or-self::a/text()"),
        ))
| 79 | + |
| 80 | + |
class HTMLCSSSelectorTest(unittest.TestCase):
    """Run CSS queries through ``hcs_cls`` against the ``HTMLBODY`` fixture."""

    # Selector class under test; instantiated on the fixture in setUp.
    hcs_cls = HtmlCSSSelector

    def setUp(self):
        # Wrap the HTML fixture in a response and build the root selector.
        response = HtmlResponse('http://example.com', body=HTMLBODY)
        self.htmlresponse = response
        self.hcs = self.hcs_cls(response)

    def x(self, *a, **kw):
        # Run select() on the root selector and return the extracted values
        # with surrounding whitespace removed; values that are empty after
        # stripping are dropped.
        extracted = self.hcs.select(*a, **kw).extract()
        trimmed = (value.strip() for value in extracted)
        return [value for value in trimmed if value]

    def _assert_queries(self, expectations):
        # Each entry pairs a CSS query with the list self.x() must return.
        for query, expected in expectations:
            self.assertEqual(self.x(query), expected)

    def test_selector_simple(self):
        # Every item yielded by select() must be an instance of the root
        # selector's class.
        selector_cls = self.hcs.__class__
        for node in self.hcs.select('input'):
            self.assertTrue(isinstance(node, selector_cls), node)

        # Extracting the whole result at once must match extracting each
        # item individually.
        extracted_at_once = self.hcs.select('input').extract()
        extracted_one_by_one = [
            node.extract() for node in self.hcs.select('input')
        ]
        self.assertEqual(extracted_at_once, extracted_one_by_one)

    def test_text_pseudo_element(self):
        self._assert_queries((
            ('#p-b2', [u'<b id="p-b2">guy</b>']),
            ('#p-b2:text', [u'guy']),
            ('#p-b2 :text', [u'guy']),
            ('#paragraph:text', [u'lorem ipsum text']),
            ('#paragraph :text',
             [u'lorem ipsum text', u'hi', u'there', u'guy']),
            ('p:text', [u'lorem ipsum text']),
            ('p :text', [u'lorem ipsum text', u'hi', u'there', u'guy']),
        ))

    def test_attribute_function(self):
        self._assert_queries((
            ('#p-b2:attribute(id)', [u'p-b2']),
            ('.cool-footer:attribute(class)', [u'cool-footer']),
            ('.cool-footer :attribute(id)', [u'foobar-div', u'foobar-span']),
            ('map[name="dummymap"] :attribute(shape)',
             [u'circle', u'default']),
        ))

    def test_nested_selector(self):
        # select() called on a previous select() result.
        paragraphs = self.hcs.select('p')
        self.assertEqual(paragraphs.select('b:text').extract(),
                         [u'hi', u'guy'])

        divs = self.hcs.select('div')
        self.assertEqual(divs.select('area:last-child').extract(),
                         [u'<area shape="default" id="area-nohref">'])
0 commit comments