Skip to content

Commit d9b86e7

Browse files
author
Ubuntu
committed
pushing scrapers for first lot
0 parents  commit d9b86e7

34 files changed

+1204
-0
lines changed

furnstyl/furnstyl.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
class ScrapySampleItem(Item):
    """Container for the data scraped from one furnstyl.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # description markup (or the title when none was found)
    price = Field()  # price element

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the product image
21+
22+
23+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the furnstyl.com furniture listing and emit one item per product.

    ``parse`` follows every product link found on the listing page and
    ``parse_product`` turns each product page into a ``ScrapySampleItem``.
    """

    name = 'furnstyl'
    start_urls = ['http://www.furnstyl.com/furniture']

    def parse(self, response):
        """Yield a request for each product linked from the listing page.

        Links are read from the ``href`` attribute of ``.product-image``
        elements and resolved against the listing URL before being requested.
        """
        for href in response.css('.product-image::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('.cloud-zoom img::attr(src)').extract_first()
        item['desc'] = response.css('div[id="product_tabs_description_contents"] .std').extract()
        # NOTE(review): there is no ``::text`` here, so this stores the first
        # ``.price`` element's markup rather than just its text -- confirm
        # that is what downstream consumers expect.
        item['price'] = response.css('.price').extract_first()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)
            # Fall back to the page heading. It was already extracted above,
            # so reuse it instead of running the same selector twice.
            # NOTE(review): this makes ``desc`` a string (or None) here but a
            # list in the normal case -- confirm consumers handle both.
            item['desc'] = item['title']
        item['link'] = response.url

        yield item

furnstyl/furnstyl.pyc

2.14 KB
Binary file not shown.

kwaldecal.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
16+
# Listing-page URL template; the spider's start_requests fills in {page}
# with the page number.
URL = 'http://kcwalldecals.com/21-ethnic-indian?p={page}'
17+
18+
19+
20+
class ScrapySampleItem(Item):
    """Container for the data scraped from one kcwalldecals.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # short-description paragraphs

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the main product image
25+
26+
27+
28+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the paginated "ethnic indian" listing on kcwalldecals.com.

    Listing pages 1 through ``last_page`` of the module-level ``URL``
    template are requested, and every product linked from them is parsed
    into a ``ScrapySampleItem``.
    """

    # Let 404 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [404]
    name = "kwaldecal"
    # Number of the last listing page to request (inclusive). Can be
    # overridden per run with a spider argument, e.g. ``-a last_page=5``.
    last_page = 2

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        # int() because spider arguments given on the command line are strings.
        for page in range(1, int(self.last_page) + 1):
            # No callback is given, so Scrapy routes the response to ``parse``.
            yield Request(URL.format(page=page))

    def parse(self, response):
        """Yield a request for each product linked from a listing page."""
        for href in response.css('.product_img_link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('img[id="bigpic"]::attr(src)').extract_first()
        item['desc'] = response.css('div[id="short_description_content"] p').extract()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)

        item['link'] = response.url

        yield item
63+

kwaldecal.pyc

2.3 KB
Binary file not shown.

kwaldecal/13_kwd.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
16+
# Listing-page URL template; the spider's start_requests fills in {page}
# with the page number.
URL = 'http://kcwalldecals.com/13-birds-animals?p={page}'
17+
18+
19+
20+
class ScrapySampleItem(Item):
    """Container for the data scraped from one kcwalldecals.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # short-description paragraphs

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the main product image
25+
26+
27+
28+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the paginated "birds & animals" listing on kcwalldecals.com.

    Listing pages 1 through ``last_page`` of the module-level ``URL``
    template are requested, and every product linked from them is parsed
    into a ``ScrapySampleItem``.
    """

    # Let 404 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [404]
    name = "kwaldecal"
    # Number of the last listing page to request (inclusive). Can be
    # overridden per run with a spider argument, e.g. ``-a last_page=5``.
    last_page = 8

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        # int() because spider arguments given on the command line are strings.
        for page in range(1, int(self.last_page) + 1):
            # No callback is given, so Scrapy routes the response to ``parse``.
            yield Request(URL.format(page=page))

    def parse(self, response):
        """Yield a request for each product linked from a listing page."""
        for href in response.css('.product_img_link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('img[id="bigpic"]::attr(src)').extract_first()
        item['desc'] = response.css('div[id="short_description_content"] p').extract()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)

        item['link'] = response.url

        yield item
63+

kwaldecal/13_kwd.pyc

2.34 KB
Binary file not shown.

kwaldecal/14_kwd.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
16+
# Listing-page URL template; the spider's start_requests fills in {page}
# with the page number.
URL = 'http://kcwalldecals.com/14-modern-graphics?p={page}'
17+
18+
19+
20+
class ScrapySampleItem(Item):
    """Container for the data scraped from one kcwalldecals.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # short-description paragraphs

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the main product image
25+
26+
27+
28+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the paginated "modern graphics" listing on kcwalldecals.com.

    Listing pages 1 through ``last_page`` of the module-level ``URL``
    template are requested, and every product linked from them is parsed
    into a ``ScrapySampleItem``.
    """

    # Let 404 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [404]
    name = "kwaldecal"
    # Number of the last listing page to request (inclusive). Can be
    # overridden per run with a spider argument, e.g. ``-a last_page=5``.
    last_page = 12

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        # int() because spider arguments given on the command line are strings.
        for page in range(1, int(self.last_page) + 1):
            # No callback is given, so Scrapy routes the response to ``parse``.
            yield Request(URL.format(page=page))

    def parse(self, response):
        """Yield a request for each product linked from a listing page."""
        for href in response.css('.product_img_link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('img[id="bigpic"]::attr(src)').extract_first()
        item['desc'] = response.css('div[id="short_description_content"] p').extract()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)

        item['link'] = response.url

        yield item
63+

kwaldecal/14_kwd.pyc

2.34 KB
Binary file not shown.

kwaldecal/15_kwd.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
16+
# Listing-page URL template; the spider's start_requests fills in {page}
# with the page number.
URL = 'http://kcwalldecals.com/15-nature-wall-decals?p={page}'
17+
18+
19+
20+
class ScrapySampleItem(Item):
    """Container for the data scraped from one kcwalldecals.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # short-description paragraphs

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the main product image
25+
26+
27+
28+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the paginated "nature wall decals" listing on kcwalldecals.com.

    Listing pages 1 through ``last_page`` of the module-level ``URL``
    template are requested, and every product linked from them is parsed
    into a ``ScrapySampleItem``.
    """

    # Let 404 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [404]
    name = "kwaldecal"
    # Number of the last listing page to request (inclusive). Can be
    # overridden per run with a spider argument, e.g. ``-a last_page=5``.
    last_page = 6

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        # int() because spider arguments given on the command line are strings.
        for page in range(1, int(self.last_page) + 1):
            # No callback is given, so Scrapy routes the response to ``parse``.
            yield Request(URL.format(page=page))

    def parse(self, response):
        """Yield a request for each product linked from a listing page."""
        for href in response.css('.product_img_link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('img[id="bigpic"]::attr(src)').extract_first()
        item['desc'] = response.css('div[id="short_description_content"] p').extract()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)

        item['link'] = response.url

        yield item
63+

kwaldecal/15_kwd.pyc

2.35 KB
Binary file not shown.

kwaldecal/16_kwd.py

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
2+
import scrapy
3+
import logging
4+
from scrapy.contrib.spiders import Rule
5+
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6+
from scrapy.selector import HtmlXPathSelector
7+
8+
from scrapy import Request, Spider
9+
from scrapy.exceptions import CloseSpider
10+
from scrapy.selector import Selector
11+
12+
13+
from scrapy.item import Item, Field
14+
15+
16+
# Listing-page URL template; the spider's start_requests fills in {page}
# with the page number.
URL = 'http://kcwalldecals.com/16-kids-decals?p={page}'
17+
18+
19+
20+
class ScrapySampleItem(Item):
    """Container for the data scraped from one kcwalldecals.com product page."""

    # Content pulled from the page.
    title = Field()  # product heading
    desc = Field()   # short-description paragraphs

    # URLs.
    link = Field()   # address of the product page itself
    image = Field()  # source of the main product image
25+
26+
27+
28+
class StackOverflowSpider(scrapy.Spider):
    """Crawl the paginated "kids decals" listing on kcwalldecals.com.

    Listing pages 1 through ``last_page`` of the module-level ``URL``
    template are requested, and every product linked from them is parsed
    into a ``ScrapySampleItem``.
    """

    # Let 404 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [404]
    name = "kwaldecal"
    # Number of the last listing page to request (inclusive). Can be
    # overridden per run with a spider argument, e.g. ``-a last_page=5``.
    last_page = 6

    def start_requests(self):
        """Yield one request per listing page, from page 1 to ``last_page``."""
        # int() because spider arguments given on the command line are strings.
        for page in range(1, int(self.last_page) + 1):
            # No callback is given, so Scrapy routes the response to ``parse``.
            yield Request(URL.format(page=page))

    def parse(self, response):
        """Yield a request for each product linked from a listing page."""
        for href in response.css('.product_img_link::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_product)

    def parse_product(self, response):
        """Yield a single ``ScrapySampleItem`` built from a product page."""
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('img[id="bigpic"]::attr(src)').extract_first()
        item['desc'] = response.css('div[id="short_description_content"] p').extract()

        if not item['desc']:
            # Include the URL so the log says which page had no description.
            logging.info("EMPTY RECEIVED: %s", response.url)

        item['link'] = response.url

        yield item
63+

kwaldecal/16_kwd.pyc

2.34 KB
Binary file not shown.

0 commit comments

Comments
 (0)