
Commit b86d004

Author: Ubuntu (committed)
added new scrapers and made structure changes to folder
1 parent 4b68692 commit b86d004


61 files changed: +327 −76 lines changed

amazonin/amazonin.py → amazonin.py

File renamed without changes.

amazonin/amazonin.pyc → amazonin.pyc

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

anhad.py

+51
@@ -0,0 +1,51 @@
#single page crawl
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'anhad'
    start_urls = ["https://www.theanhadshop.com/collections/home-textile","https://www.theanhadshop.com/collections/home-textile?page=2","https://www.theanhadshop.com/collections/home-textile?page=3","https://www.theanhadshop.com/collections/tableware"]


    def parse(self, response):
        for href in response.css('.ci a::attr(href)'):
            full_url = response.urljoin(href.extract())
            logging.info(full_url)
            yield scrapy.Request(full_url, callback=self.parse_product, dont_filter=True)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('.thumbnail::attr(src)').extract_first()
        item['desc'] = response.css('.rte span::text').extract()
        item['price'] = response.css('p[id="product-price"] .product-price .money::text').extract_first()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item
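
For quick local testing, one way to drive a spider like this from a plain Python script is sketched below. None of it is part of the commit: the module name anhad, the items.json output path, and the feed settings are illustrative assumptions, and the legacy FEED_URI / FEED_FORMAT settings are used because the scrapy.contrib imports above imply an older Scrapy 1.x release.

# Sketch only: run the anhad spider programmatically and export items as JSON.
# Assumes anhad.py is importable and an older Scrapy (1.x) where the
# scrapy.contrib imports still resolve; the file name items.json is arbitrary.
from scrapy.crawler import CrawlerProcess

from anhad import StackOverflowSpider  # spider class defined in anhad.py above

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',     # legacy feed-export settings (pre Scrapy 2.1)
    'FEED_URI': 'items.json',
    'LOG_LEVEL': 'INFO',
})
process.crawl(StackOverflowSpider)
process.start()  # blocks until the crawl finishes

Roughly the same effect from the command line: scrapy runspider anhad.py -o items.json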

anhad.pyc

2.34 KB
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

furnstyl/furnstyl.pyc

-2.14 KB
Binary file not shown.

greenessencestore/greenessencestore.py greenessencestore.py

+3 −3
@@ -23,7 +23,7 @@ class ScrapySampleItem(Item):
 class StackOverflowSpider(scrapy.Spider):

     name = 'green essence'
-    start_urls = ["http://www.greenessencestore.com/Planters-depid-423-page-1.html"]
+    start_urls = ["http://www.greenessencestore.com/Planters-depid-423-page-1.html","http://www.greenessencestore.com/Garden-Accessories-depid-431-page-1.html","http://www.greenessencestore.com/Planter-Stand---Shelves-depid-264466-page-1.html","http://www.greenessencestore.com/Wall-Pot-Holders---Brackets-depid-273823-page-1.html","http://www.greenessencestore.com/Potted-Plants-depid-16325-page-1.html"]


     def parse(self, response):
@@ -36,14 +36,14 @@ def parse_product(self, response):
         items = []
         item = ScrapySampleItem()

-        item['title'] = map(unicode.strip, response.css('.title::text').extract())
+        item['title'] = map(unicode.strip, response.css('.lht19::text').extract())
         item['image'] = response.css('img[id="largeImage"]::attr(src)').extract_first()
         item['desc'] = response.css('.desc_shorttext').extract()
         item['price'] = response.css('.amt::text').extract_first()

         if not item['desc']:
             logging.info("EMPTY RECIEVED")
-            item['desc'] = response.css('.title::text').extract()
+            item['desc'] = response.css('.lht19::text').extract()
         item['link'] = response.url
         items.append(item)

Binary file not shown.
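
The selector change above swaps .title for .lht19. A quick, self-contained way to sanity-check such a selector is sketched below; the HTML fragment is invented for illustration and is not taken from the live site. Note that map(unicode.strip, ...), as used in the changed line, only exists on Python 2, so the sketch strips with a list comprehension instead.

# Illustrative only: confirm that the .lht19 selector extracts and strips a title.
# The HTML string is made up; substitute a saved page from the real site to test.
from scrapy.selector import Selector

html = '<div class="lht19">  Ceramic Planter  </div>'
sel = Selector(text=html)
titles = [t.strip() for t in sel.css('.lht19::text').extract()]
print(titles)  # ['Ceramic Planter']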

idamstore.py

+51
@@ -0,0 +1,51 @@
#single page crawl
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'idamstore'
    start_urls = ["https://www.idamstore.com/collections/all?page=1","https://www.idamstore.com/collections/all?page=2","https://www.idamstore.com/collections/all?page=3"]


    def parse(self, response):
        for href in response.css('.ci a::attr(href)'):
            full_url = response.urljoin(href.extract())
            logging.info(full_url)
            yield scrapy.Request(full_url, callback=self.parse_product, dont_filter=True)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('.section-title h1::text').extract_first()
        item['image'] = response.css('.thumbnail::attr(src)').extract_first()
        item['desc'] = response.css('.rte span::text').extract()
        item['price'] = response.css('p[id="product-price"] .product-price::text').extract_first()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('.section-title h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item

idamstore.pyc

2.28 KB
Binary file not shown.

kilishop/kilishop.py → kilishop.py

File renamed without changes.

kilishop/kilishop.pyc → kilishop.pyc

File renamed without changes.

lekiaan/lekiaan.py → lekiaan.py

File renamed without changes.

lekiaan/lekiaan.pyc → lekiaan.pyc

File renamed without changes.
File renamed without changes.
File renamed without changes.

lotushouse.py

+55
@@ -0,0 +1,55 @@

import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'lotushouse'
    start_urls = ["http://thelotushouse.com/paintings-wall-hangings.html","http://thelotushouse.com/showpieces-figurines.html","http://thelotushouse.com/fashion-accessories.html","http://thelotushouse.com/home-furnishings.html"]


    def parse(self, response):
        for href in response.css('.product-image::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_product)

        next_page = response.css(".next::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('.product-name h1::text').extract_first()
        item['image'] = response.css('img[id="image"]::attr(src)').extract_first()
        item['desc'] = response.css('.product-collateral').extract()
        item['price'] = response.css('.add_to_cart .add-to-box .price_box .price-box .regular-price .price::text').extract()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item

lotushouse/lotushouse.pyc

2.54 KB
Binary file not shown.

mudfingers.py

+51
@@ -0,0 +1,51 @@
#single page crawl
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'mudfingers'
    start_urls = ["http://www.mudfingers.com/AIR-PLANTS-depid-956101-page-1.html","http://www.mudfingers.com/COLORED-CLAYS-depid-932279-page-1.html","http://www.mudfingers.com/SUN-LOVERS-depid-421491-page-1.html","http://www.mudfingers.com/TERRARIUMS-depid-421269-page-1.html","http://www.mudfingers.com/AIR-PLANTS-depid-956101-page-1.html",]


    def parse(self, response):
        for href in response.css('.standard a::attr(href)'):
            full_url = response.urljoin(href.extract())
            logging.info(full_url)
            yield scrapy.Request(full_url, callback=self.parse_product, dont_filter=True)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('h1::text').extract_first()
        item['image'] = response.css('.z-product-thumbs img::attr(src)').extract_first()
        item['desc'] = response.css('div[id="description"]').extract()
        item['price'] = response.css('.prices .price_original span[id="price-standard"]::text').extract_first()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item

mudfingers.pyc

2.42 KB
Binary file not shown.

masalaworks/mw.py → mw.py

File renamed without changes.

masalaworks/mw.pyc → mw.pyc

File renamed without changes.

olieco.py

+55
@@ -0,0 +1,55 @@

import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'olie'
    start_urls = ["http://www.olie.co.in/collections/lighting","http://www.olie.co.in/collections/cushion-covers","http://www.olie.co.in/collections/storage"]


    def parse(self, response):
        for href in response.css('.image a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_product)

        next_page = response.css(".next::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('.breadcrumb strong::text').extract_first()
        item['image'] = response.css('.zoomImg::attr(src)').extract()
        item['desc'] = response.css('.description').extract()
        item['price'] = response.css('h2[id="price-preview"] span::text').extract_first()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('.product-name h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item

olieco.pyc

2.34 KB
Binary file not shown.
File renamed without changes.
File renamed without changes.

posboxin.py

+51
@@ -0,0 +1,51 @@
#single page crawl
import scrapy
import logging
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from scrapy import Request, Spider
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector


from scrapy.item import Item, Field

class ScrapySampleItem(Item):
    title = Field()
    link = Field()
    desc = Field()
    price = Field()
    image = Field()


class StackOverflowSpider(scrapy.Spider):

    name = 'postbox'
    start_urls = ["https://www.thepostbox.in/collections/vibrant-funky-india-cotton-cushion-covers","https://www.thepostbox.in/collections/wall-art","https://www.thepostbox.in/collections/trays-by-kalakaari-haath","https://www.thepostbox.in/collections/terracotta-mugs","https://www.thepostbox.in/collections/handpainted-blue-pottery-the-postbox","https://www.thepostbox.in/collections/coasters-cork-board-city-themes-graphic-art","https://www.thepostbox.in/collections/vibrant-printed-ceramic-mugs-the-postbox"]


    def parse(self, response):
        for href in response.css('.ci a::attr(href)'):
            full_url = response.urljoin(href.extract())
            logging.info(full_url)
            yield scrapy.Request(full_url, callback=self.parse_product, dont_filter=True)

    def parse_product(self, response):
        items = []
        item = ScrapySampleItem()

        item['title'] = response.css('.section-title h1::text').extract_first()
        item['image'] = response.css('.main-product-image img::attr(src)').extract_first()
        item['desc'] = response.css('.rte').extract()
        item['price'] = response.css('.product-price .money::text').extract_first()

        if not item['desc']:
            logging.info("EMPTY RECIEVED")
            item['desc'] = response.css('.section-title h1::text').extract_first()
        item['link'] = response.url
        items.append(item)

        for item in items:
            yield item

posboxin.pyc

2.62 KB
Binary file not shown.

0 commit comments
