-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtmp_spider.py
30 lines (22 loc) · 966 Bytes
/
tmp_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class StackOverflowSpider(scrapy.Spider):
    """Scrape product details from the mirrorkart designer-mirrors listing.

    Starting from the listing URL in ``start_urls``, ``parse`` schedules a
    request for every product linked from the page and for every link found
    in the pagination bar; ``parse_product`` turns each product page into a
    single item dict.
    """

    name = 'stackoverflow'
    start_urls = ['http://www.mirrorkart.com/Buy-Designers-Mirrors-online']

    # NOTE: ``rules`` is only processed by ``scrapy.spiders.CrawlSpider``.
    # This class derives from plain ``scrapy.Spider``, so the rule below is
    # never applied; pagination is followed explicitly in ``parse`` instead.
    # The attribute is kept so existing readers of ``rules`` keep working.
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(),
                restrict_xpaths=('//ul[@class="pagination"]/li',),
            ),
            callback="parse",
            follow=True,
        ),
    )

    # hrefs of the anchors inside the pagination region targeted by ``rules``.
    PAGINATION_LINKS_XPATH = '//ul[@class="pagination"]/li//a/@href'

    def parse(self, response):
        """Handle one listing page.

        Yields a request per product link (handled by ``parse_product``) and
        a request per pagination link (handled by this method again).
        """
        product_hrefs = response.css(
            '.product-thumb .image a::attr(href)'
        ).extract()
        for href in product_hrefs:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_product,
            )

        # Every listing page links to its siblings, so the same URL is
        # requested repeatedly; this relies on Scrapy's default duplicate
        # request filter (requests are not created with dont_filter=True).
        for href in response.xpath(self.PAGINATION_LINKS_XPATH).extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)

    def parse_product(self, response):
        """Yield one item dict describing the product page in ``response``.

        Keys: ``title`` (text of the first ``h1``), ``image`` (first href
        under ``.thumbnails``), ``desc`` (list of the serialized ``p``
        elements inside ``#tab-description``) and ``link`` (the page URL).
        ``title`` and ``image`` are ``None`` when nothing matches.
        """
        yield {
            'title': response.css('h1::text').extract_first(),
            'image': response.css('.thumbnails a::attr(href)').extract_first(),
            'desc' : response.css('div[id="tab-description"] p').extract(),
            'link': response.url,
        }