-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Streaming commands / Communication (#3)
* initial streaming * initial communication protocol * commands tests * separating communication logic from protocol * streaming args and readme
- Loading branch information
1 parent
303d882
commit bea89a7
Showing
16 changed files
with
456 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,17 @@ | ||
from scrapy.commands.crawl import Command | ||
|
||
from scrapy_streaming.external_spiderloader import ExternalSpiderLoader | ||
|
||
|
||
class CrawlCommand(Command):
    """
    Extends the scrapy crawl command, adding the possibility to start an external spider.

    The regular crawl command is attempted first; when it raises ``KeyError``,
    the first positional argument is handed to ``ExternalSpiderLoader`` instead.
    """

    def run(self, args, opts):
        """
        Run the crawl command, falling back to the external spider loader.

        :param args: positional command-line arguments; ``args[0]`` is used as the spider name
        :param opts: parsed command-line options, forwarded unchanged to the parent command
        """
        try:
            super(CrawlCommand, self).run(args, opts)
        except KeyError:
            # NOTE(review): presumably the parent raises KeyError when the name is not a
            # regular scrapy spider - confirm, since any other KeyError also lands here.
            external_name = args[0]
            loader = ExternalSpiderLoader.from_settings(self.settings)
            loader.crawl(external_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from scrapy_streaming.communication.map import CommunicationMap | ||
from scrapy_streaming.communication.wrappers import * | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import json | ||
|
||
from scrapy.utils.python import to_unicode, to_native_str | ||
|
||
from scrapy_streaming.communication import wrappers | ||
from scrapy_streaming.utils import MessageError | ||
|
||
|
||
class CommunicationMap(object):
    """
    Helper class to parse the incoming json messages and to create the outgoing ones
    """

    # Maps the "type" field of an incoming message to the wrapper class built from it
    mapping = {
        'spider': wrappers.SpiderMessage,
        'request': wrappers.RequestMessage,
        'log': wrappers.LogMessage
    }

    @staticmethod
    def parse(line):
        """
        Convert a received line into the message wrapper selected by its "type" field.

        :param line: the raw line holding a json object
        :return: the result of ``from_dict`` on the wrapper class mapped to the message type,
                 called with the remaining fields ("type" is removed first)
        :raises MessageError: if the line is not valid json, is not a json object,
                              has no "type" field, or its type is not in ``mapping``
        """
        # Only the decoding step is guarded, so a ValueError raised later on
        # is not misreported as an invalid json.
        try:
            msg = json.loads(to_native_str(line))
        except ValueError:
            raise MessageError('Received message is not a valid json.')

        if not isinstance(msg, dict):
            raise MessageError('This message is not a json object.')
        if 'type' not in msg:
            raise MessageError('"type" field not provided.')

        msg_type = msg.pop('type')
        # Only the lookup is guarded, so a KeyError raised while building the wrapper
        # is not misreported as an invalid type. TypeError covers unhashable "type"
        # values (json arrays and objects), which cannot be used as a dict key.
        try:
            wrapper = CommunicationMap.mapping[msg_type]
        except (KeyError, TypeError):
            raise MessageError('%s is not a valid message type.' % (msg_type,))
        return wrapper.from_dict(msg)

    @staticmethod
    def ready():
        """
        Create the json message with type and status set to "ready".
        """
        fields = {'type': 'ready', 'status': 'ready'}
        return json.dumps(fields)

    @staticmethod
    def error(message, details):
        """
        Create the json error message.

        :param message: the received message that caused the error
        :param details: a description of the error
        """
        fields = {'type': 'error',
                  'received_message': to_unicode(message),
                  'details': to_unicode(details)}
        return json.dumps(fields)

    @staticmethod
    def response(resp, request_id='parse'):
        """
        Create the json message with the url, headers, status, body, meta and flags of ``resp``.

        :param resp: the object to read the fields from
        :param request_id: value stored in the "id" field of the message
        """
        fields = _extract_fields(resp, ['url', 'headers', 'status', 'body', 'meta', 'flags'])
        fields['id'] = to_unicode(request_id)
        return json.dumps(fields)
|
||
|
||
def _extract_fields(item, fields): | ||
""" | ||
Given a list of fields, generate a dict with key being the name of the field | ||
mapping to the serialized item.field value | ||
""" | ||
data = {} | ||
for field in fields: | ||
data[field] = json.loads(json.dumps(getattr(item, field))) | ||
return data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import six | ||
|
||
from scrapy_streaming.spiders import StreamingSpider | ||
from scrapy_streaming.utils import MessageError, RequiredField | ||
|
||
|
||
class ExternalSpiderMessageWrapper(object):
    """
    Base class for the message wrappers.

    The received fields are validated against ``validator``, merged over the
    defaults and then exposed as attributes of the instance.
    """

    # Maps each accepted field name to the type its value must be an instance of
    validator = {}

    def __init__(self, default, fields):
        """
        :param default: dict with the default value of each field; a ``RequiredField``
                        value marks a field that must be provided. This dict is modified in place.
        :param fields: dict with the received fields
        :raises MessageError: if a field is unknown, has the wrong type, or a required one is missing
        """
        self.data = fields
        self.validate(fields)
        self.update(default, fields)

    @classmethod
    def from_dict(cls, data):
        """
        Create the wrapper from a dict with the received fields.
        """
        return cls(data)

    def validate(self, data):
        """
        Check that every field in ``data`` is known and is either None or of the expected type.

        :raises MessageError: on the first unknown field or type mismatch
        """
        rules = self.validator
        for name in data:
            if name not in rules:
                raise MessageError('Unknown message field: %s' % name)

            received, expected = data[name], rules[name]
            if received is None or isinstance(received, expected):
                continue
            raise MessageError('%s field must be defined as %s, received: %s' %
                               (name, expected.__name__, type(received).__name__))

    def update(self, default, data):
        """
        Merge ``data`` into ``default`` and set each resulting field as an attribute.

        :raises MessageError: if a field is still a ``RequiredField`` after the merge
        """
        default.update(data)
        for name in default:
            merged = default[name]
            if isinstance(merged, RequiredField):
                raise MessageError('Required field: %s' % name)
            setattr(self, name, merged)
|
||
|
||
class RequestMessage(ExternalSpiderMessageWrapper):
    """
    Wrapper for the "request" message.

    Every field defaults to None when it is not provided.
    """

    # Every field listed in the defaults needs an entry here, otherwise the base
    # validation rejects it as an unknown field.
    # NOTE(review): the types of the fields other than id and url are assumed from the
    # arguments of scrapy's Request - confirm against the protocol.
    # NOTE(review): cookies accepts only a dict; the base validation reads
    # ``validator[key].__name__`` to build its error message, so each entry
    # must be a single type rather than a tuple of types.
    validator = {'id': six.text_type, 'url': six.text_type, 'method': six.text_type,
                 'meta': dict, 'body': six.text_type, 'headers': dict,
                 'cookies': dict, 'encoding': six.text_type,
                 'priority': int, 'dont_filter': bool}

    def __init__(self, fields):
        """
        :param fields: dict with the received request fields
        :raises MessageError: if a field is unknown or has the wrong type
        """
        # 'url' replaces 'start_urls' here: start_urls was never accepted by the validator,
        # while url was accepted but left undefined on the instance when not provided.
        default = {'id': None, 'url': None, 'method': None, 'meta': None,
                   'body': None, 'headers': None, 'cookies': None, 'encoding': None,
                   'priority': None, 'dont_filter': None}

        super(RequestMessage, self).__init__(default, fields)
|
||
|
||
class SpiderMessage(ExternalSpiderMessageWrapper):
    """
    Wrapper for the "spider" message.

    name and start_urls are required; allowed_domains and custom_settings default to None.
    """

    validator = {'name': six.text_type, 'start_urls': list,
                 'allowed_domains': list, 'custom_settings': dict}

    def __init__(self, fields):
        """
        :param fields: dict with the received spider fields
        :raises MessageError: if a field is unknown, has the wrong type, or a required one is missing
        """
        default = dict(name=RequiredField(), start_urls=RequiredField(),
                       allowed_domains=None, custom_settings=None)

        super(SpiderMessage, self).__init__(default, fields)
|
||
|
||
class LogMessage(ExternalSpiderMessageWrapper):
    """
    Wrapper for the "log" message; both message and level are required.
    """

    validator = {'message': six.text_type, 'level': six.text_type}

    def __init__(self, fields):
        """
        :param fields: dict with the received log fields
        :raises MessageError: if a field is unknown, has the wrong type, or a required one is missing
        """
        default = dict(message=RequiredField(), level=RequiredField())

        super(LogMessage, self).__init__(default, fields)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from scrapy.utils.python import to_bytes | ||
from twisted.internet import protocol | ||
|
||
|
||
class LineProcessProtocol(protocol.ProcessProtocol, object):
    """
    This class extends the twisted ProcessProtocol to split the incoming data in lines.
    The data received by ``outReceived`` is added to an internal buffer, and each
    complete line is dispatched to ``lineReceived``.
    """

    def __init__(self):
        self.__buffer = b''
        self.__delimiter = b'\n'

    def outReceived(self, data):
        """
        Implement the outReceived method, buffering the incoming data and
        dispatching line by line in the ``lineReceived`` method.

        Lines are split on the delimiter only, so a carriage return inside a line is
        kept. A single carriage return right before the delimiter is removed, so
        lines ending with ``\\r\\n`` are dispatched without the line ending.
        """
        pieces = (self.__buffer + data).split(self.__delimiter)
        # split always returns at least one piece; the last one is what follows the
        # final delimiter: empty when the data ended with it, otherwise an incomplete
        # line that is kept untouched until the rest of it arrives.
        self.__buffer = pieces.pop()

        for line in pieces:
            if line.endswith(b'\r'):
                line = line[:-1]
            self.lineReceived(line)

    def lineReceived(self, line):
        """
        An entire line received by process stdout. You must implement this method to use this class.
        """
        raise NotImplementedError

    def writeLine(self, data):
        """
        Write the data to the process stdin, adding the new-line delimiter if necessary
        """
        data = to_bytes(data)
        if not data.endswith(self.__delimiter):
            data += self.__delimiter
        self.transport.write(data)
Oops, something went wrong.