Commit c86c4f1: initial commit (0 parents)

File tree: 8 files changed (+264, -0 lines)

LICENSE

+27
@@ -0,0 +1,27 @@
Copyright (c) Rolando Espinoza La fuente
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

3. Neither the name of scrapy-redis nor the names of its contributors may be used
   to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

README

+52
@@ -0,0 +1,52 @@
Redis-based components for Scrapy
=================================

This is initial work on the Scrapy-Redis integration and is not production-tested.
Use it at your own risk!

Features:
* Distributed crawling/scraping
* Distributed post-processing

Requirements:
* Scrapy >= 0.13 (development version)
* redis-py (tested on 2.4.9)
* redis server (tested on 2.2-2.4)

Available Scrapy components:
* Scheduler
* Duplication Filter
* Item Pipeline
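To enable these components in a Scrapy project, settings along the following lines should work. This is a minimal sketch: SCHEDULER is a standard Scrapy setting, the remaining names are the ones read by this package's from_settings() methods, and the values shown are illustrative.

    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'

    # Keep the request queue and fingerprint set in redis across runs,
    # so stopped crawls can be resumed (read by Scheduler.from_settings)
    SCHEDULER_PERSIST = True

    # Push serialized items into the '<spider>:items' list
    ITEM_PIPELINES = ['scrapy_redis.pipelines.RedisPipeline']

    # Redis connection parameters (the defaults, shown for completeness)
    REDIS_HOST = 'localhost'
    REDIS_PORT = 6379

Note that the scheduler builds its own redis-backed dupefilter when it opens a spider, so no separate dupefilter setting is needed for the distributed setup.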

Running the example project
---------------------------

You can test the functionality by following these steps:

1. Set up the scrapy_redis package in your PYTHONPATH

2. Run the crawler for the first time, then stop it

    $ cd examples
    $ scrapy crawl dmoz
    ^C

3. Run the crawler again to resume the stopped crawl

    $ scrapy crawl dmoz
    ... [dmoz] DEBUG: Resuming crawl (9019 requests scheduled)

4. Start one or more additional scrapy crawlers

    $ scrapy crawl dmoz
    ... [dmoz] DEBUG: Resuming crawl (8712 requests scheduled)

5. Start one or more post-processing workers

    $ python process_items.py
    Processing: Kilani Giftware (http://www.dmoz.org/Computers/Shopping/Gifts/)
    Processing: NinjaGizmos.com (http://www.dmoz.org/Computers/Shopping/Gifts/)
    ...

That's it.

scrapy_redis/__init__.py

Whitespace-only changes.

scrapy_redis/dupefilter.py

+44
@@ -0,0 +1,44 @@
import redis
import time
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplication filter"""

    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters:
            server -- Redis connection
            key -- redis key to store fingerprints

        """
        self.server = server
        self.key = key

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        server = redis.Redis(host, port)
        # Create a one-time key, needed to support using this class as a
        # standalone dupefilter with scrapy's default scheduler. If scrapy
        # passed the spider to the open() method, this wouldn't be needed.
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    def request_seen(self, request):
        fp = request_fingerprint(request)
        added = self.server.sadd(self.key, fp)
        return not added

    def close(self, reason):
        """Delete data on close. Called by scrapy's scheduler."""
        self.clear()

    def clear(self):
        """Clears fingerprints data"""
        self.server.delete(self.key)
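A quick interactive check of the filter against a local redis server (a hedged sketch; the key name is arbitrary). To use the filter standalone with Scrapy's default scheduler, pointing the DUPEFILTER_CLASS setting at scrapy_redis.dupefilter.RFPDupeFilter is the intended route, per the comment in from_settings above.

    import redis
    from scrapy.http import Request
    from scrapy_redis.dupefilter import RFPDupeFilter

    df = RFPDupeFilter(redis.Redis('localhost', 6379), key='dupefilter:test')
    print(df.request_seen(Request('http://example.com')))  # False: new fingerprint
    print(df.request_seen(Request('http://example.com')))  # True: already in the set
    df.clear()  # delete the fingerprint set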

scrapy_redis/pipelines.py

+32
@@ -0,0 +1,32 @@
import redis

from twisted.internet.threads import deferToThread
from scrapy.utils.serialize import ScrapyJSONEncoder


class RedisPipeline(object):
    """Pushes serialized item into a redis list/queue"""

    def __init__(self, host, port):
        self.server = redis.Redis(host, port)
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', 'localhost')
        port = settings.get('REDIS_PORT', 6379)
        return cls(host, port)

    def process_item(self, item, spider):
        # serialize and push in a thread so the reactor isn't blocked
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(dict(item))
        self.server.rpush(key, data)
        return item

    def item_key(self, item, spider):
        """Returns redis key based on given spider"""
        return "%s:items" % spider.name
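RedisPipeline only produces; a separate process is expected to pop and handle the serialized items. The examples' process_items.py is not part of this diff, but a minimal consumer might look like this (a hedged sketch; the 'dmoz' key and the name/url item fields are assumptions based on the README output):

    import json
    import redis

    server = redis.Redis('localhost', 6379)
    while True:
        # blpop blocks until an item appears on the '<spider>:items' list
        key, data = server.blpop(['dmoz:items'])
        item = json.loads(data)
        print('Processing: %s (%s)' % (item.get('name'), item.get('url')))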

scrapy_redis/queue.py

+40
@@ -0,0 +1,40 @@
import marshal
from scrapy.utils.reqser import request_to_dict, request_from_dict


class SpiderQueue(object):
    """Per-spider queue abstraction on top of redis using sorted set"""

    def __init__(self, server, spider, key):
        """Initialize per-spider redis queue

        Parameters:
            server -- redis connection
            spider -- spider instance
            key -- key for this queue (e.g. "%(spider)s:queue")

        """
        self.server = server
        self.spider = spider
        self.key = key % {'spider': spider.name}

    def __len__(self):
        return self.server.zcard(self.key)

    def push(self, request):
        data = marshal.dumps(request_to_dict(request, self.spider))
        # negate the priority so higher-priority requests get lower
        # scores and are returned first by zrange
        pairs = {data: -request.priority}
        self.server.zadd(self.key, **pairs)

    def pop(self):
        # atomically fetch and remove the head using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0)
        results, count = pipe.execute()
        if results:
            return request_from_dict(marshal.loads(results[0]), self.spider)

    def clear(self):
        self.server.delete(self.key)
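Because priorities are stored negated as sorted-set scores, zrange returns the highest-priority request first. A small sketch against a local redis server (FakeSpider is a hypothetical stand-in; only its name attribute is used here):

    import redis
    from scrapy.http import Request
    from scrapy_redis.queue import SpiderQueue

    class FakeSpider(object):
        name = 'demo'

    q = SpiderQueue(redis.Redis('localhost', 6379), FakeSpider(), '%(spider)s:queue')
    q.push(Request('http://example.com/low'))                # priority 0 -> score 0
    q.push(Request('http://example.com/high', priority=10))  # score -10, pops first
    print(len(q))        # 2
    print(q.pop().url)   # http://example.com/high
    q.clear()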

scrapy_redis/scheduler.py

+57
@@ -0,0 +1,57 @@
import redis
from scrapy_redis.queue import SpiderQueue
from scrapy_redis.dupefilter import RFPDupeFilter


# default values
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
SCHEDULER_PERSIST = False
QUEUE_KEY = '%(spider)s:requests'
DUPEFILTER_KEY = '%(spider)s:dupefilter'


class Scheduler(object):
    """Redis-based scheduler"""

    def __init__(self, server, persist, queue_key):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
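The scheduler can be exercised outside a full crawl. In this sketch a plain dict stands in for Scrapy's settings object (both expose get(key, default)), and FakeSpider is a hypothetical stand-in providing the name and log attributes the scheduler touches:

    from scrapy.http import Request
    from scrapy_redis.scheduler import Scheduler

    class FakeSpider(object):
        name = 'demo'
        def log(self, msg):
            print(msg)

    scheduler = Scheduler.from_settings({'SCHEDULER_PERSIST': True})
    scheduler.open(FakeSpider())  # logs "Resuming crawl ..." if the queue is non-empty
    scheduler.enqueue_request(Request('http://example.com'))
    print(scheduler.has_pending_requests())  # True
    scheduler.close('shutdown')  # persist=True, so queue and dupefilter survive

Running this twice against the same redis server shows both behaviors at once: the second run logs the resumed queue, and the duplicate request is filtered out.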

setup.py

+12
@@ -0,0 +1,12 @@
from distutils.core import setup

setup(name='scrapy-redis',
      version='0.1',
      description='Redis-based components for Scrapy',
      author='Rolando Espinoza La fuente',
      author_email='[email protected]',
      url='http://github.com/darkrho/scrapy-redis',
      packages=['scrapy_redis'],
      license='BSD',
      #install_requires=['Scrapy>=0.13'],
      )
