-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlekiaan_tables.py
64 lines (43 loc) · 2.16 KB
/
lekiaan_tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Paginated crawl: one request per PageNo of the lekiaan.com product showcase.
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from scrapy.item import Item, Field
# Template for the product-showcase handler endpoint. The ProductShowcaseInput
# query parameter is a percent-encoded JSON object; its braces are written as
# %7B / %7D, so `{page}` is the only str.format placeholder and fills PageNo.
# NOTE(review): the trailing `_=1471488752780` looks like a cache-busting
# timestamp captured from a browser request — confirm it is still accepted.
URL = 'http://www.lekiaan.com/Handler/ProductShowcaseHandler.ashx?ProductShowcaseInput=%7B%22PgControlId%22:2544135,%22IsConfigured%22:true,%22ConfigurationType%22:%22%22,%22CombiIds%22:%22%22,%22PageNo%22:{page},%22DivClientId%22:%222544135_CU00204262%22,%22SortingValues%22:%22LIFO%22,%22ShowViewType%22:%22H%22,%22PropertyBag%22:null,%22IsRefineExsists%22:false,%22CID%22:%22CU00204262%22,%22CT%22:0,%22TabId%22:%220%22,%22LocationIds%22:%220%22,%22CurrencyCode%22:%22INR%22,%22ContentType%22:%22B%22%7D&_=1471488752780'
class ScrapySampleItem(Item):
    """Container for the data scraped from a single product page."""

    # Text of the page's <h1>.
    title = Field()

    # URL of the product page the item was scraped from.
    link = Field()

    # Raw <p> fragments from the description block, or the title as a fallback.
    desc = Field()

    # `src` of the large product image.
    image = Field()

    # Price text exactly as shown on the page (not parsed to a number).
    price = Field()
class StackOverflowSpider(scrapy.Spider):
    """Crawl the lekiaan.com product showcase and scrape every product page.

    The crawl has two hops: each showcase page (``URL`` with its ``PageNo``
    filled in) is scanned for product links by ``parse``, and each linked
    product page is turned into one ``ScrapySampleItem`` by ``parse_product``.
    """

    # Let 404 responses reach the callbacks instead of being filtered out
    # before the spider sees them.
    handle_httpstatus_list = [404]
    name = "lekiaan tables"

    # Showcase pages are numbered from 1; pages 1..max_pages (inclusive) are
    # requested. Can be overridden per run with ``-a max_pages=N``.
    max_pages = 5

    def start_requests(self):
        """Yield one request per showcase page, each handled by ``parse``."""
        # Spider arguments supplied with ``-a`` arrive as strings, hence int().
        last_page = int(self.max_pages)
        for page in range(1, last_page + 1):
            yield scrapy.Request(URL.format(page=page), callback=self.parse)

    def parse(self, response):
        """Follow every product link found on a showcase page.

        Yields one request per ``href`` under ``.bucket_left``, resolved
        against the response URL and handled by ``parse_product``.
        """
        for href in response.css('.bucket_left a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Build and yield one ``ScrapySampleItem`` from a product page.

        ``desc`` is normally the list of raw ``<p>`` fragments inside
        ``.product_desc``; when that list is empty the page title (a single
        string, possibly ``None``) is stored instead.
        """
        title = response.css('h1::text').extract_first()

        item = ScrapySampleItem()
        item['title'] = title
        item['image'] = response.css('.product-largimg::attr(src)').extract_first()
        item['desc'] = response.css('.product_desc p').extract()
        item['price'] = response.css('.sp_amt::text').extract_first()

        if not item['desc']:
            # No description paragraphs on the page: fall back to the title so
            # the field is not left as an empty list.
            logging.info("EMPTY RECIEVED")
            item['desc'] = title

        item['link'] = response.url
        yield item