Commit bb94c45 (1 parent: d8c94d1)

Initial project structure with tox, travis, and initial commands (#2)

* init tests and project structure
* readme
* added spider loader
* get_project_root() raises an exception if not in project

Showing 17 changed files with 356 additions and 1 deletion.
.gitignore
@@ -60,3 +60,6 @@ target/

#Ipython Notebook
.ipynb_checkpoints

# IDEs
.idea/
.travis.yml
@@ -0,0 +1,14 @@
language: python
python: 3.5
sudo: false
env:
  - TOXENV=py27
  - TOXENV=py35
install:
  - pip install -U tox twine wheel codecov
script: tox
after_success:
  - codecov
cache:
  directories:
    - $HOME/.cache/pip
README.md
@@ -1 +1,9 @@
-# scrapy-streaming
+# Scrapy Streaming (WIP)
+
+[Build Status](https://travis-ci.org/scrapy-plugins/scrapy-streaming)
+[Code Coverage](https://codecov.io/gh/scrapy-plugins/scrapy-streaming)
+
+Scrapy Streaming provides an interface for writing spiders in any programming language, using JSON objects to make requests, parse web content, extract data, and more.
+
+We also officially provide helper libraries for developing your spiders in Java, JS, and R.
requirements.txt
@@ -0,0 +1 @@
scrapy
Empty file.
Empty file.
scrapy_streaming/commands/crawl.py
@@ -0,0 +1,8 @@
from scrapy.commands.crawl import Command


class CrawlCommand(Command):
    """
    Extends the Scrapy crawl command, adding the possibility to start an external spider with the crawl command.
    """
    pass
scrapy_streaming/commands/list.py
@@ -0,0 +1,19 @@
from scrapy.commands.list import Command

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader


class ListCommand(Command):
    """
    Extends the Scrapy list command, adding external spiders to the list.
    """

    def run(self, args, opts):
        print('[Scrapy Spiders]')
        super(ListCommand, self).run(args, opts)

        spiders = [spider.name for spider in ExternalSpiderLoader.from_settings(self.settings).list()]
        if spiders:
            print('[External Spiders]')
            for spider in sorted(spiders):
                print(spider)
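
Taken together with the external.json fixture created in the tests further down (which declares PythonSpider and JavaSpider), running `scrapy list` inside such a project should produce output along these lines. This is an illustrative sketch only; the tests merely assert that the two names appear in the output:

```text
[Scrapy Spiders]
...any spiders from the project's own spider modules...
[External Spiders]
JavaSpider
PythonSpider
```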
scrapy_streaming/commands/streaming.py
@@ -0,0 +1,27 @@
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class StreamingCommand(ScrapyCommand):
    """
    Command to start stand-alone executables with Scrapy Streaming.
    """

    requires_project = False

    def syntax(self):
        return "[options] <path of executable>"

    def short_desc(self):
        return "Run an external spider using Scrapy Streaming given its path (doesn't require a project)"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)

        raise NotImplementedError()
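
As registered under the `scrapy.commands` entry points in setup.py further down, this command is exposed as `scrapy streaming`. An illustrative invocation follows; the script path is hypothetical, and since run() ends in NotImplementedError, the command currently only validates its arguments:

```text
$ scrapy streaming scripts/my_spider.py
# passes the usage checks, then raises NotImplementedError (the runner is not implemented yet)
```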
scrapy_streaming/external_spiderloader.py
@@ -0,0 +1,71 @@
import json
import os

from scrapy_streaming.utils import get_project_root


class ExternalSpider(object):
    """
    Object to represent external spiders defined in ``external.json``.
    """

    def __init__(self, name, command, args=None):
        if args is not None and not isinstance(args, list):
            raise ValueError("'args' must be defined as an array of strings")
        self.name = name
        self.command = command
        self.args = args

    @classmethod
    def from_dict(cls, spider):
        return cls(**spider)


class ExternalSpiderLoader(object):
    """
    This class manages external spiders defined in ``external.json``.
    """

    def __init__(self, settings):
        path = settings.get('EXTERNAL_SPIDERS_PATH', get_project_root())
        # TODO add EXTERNAL_SPIDERS_PATH in docs
        path = os.path.abspath(path)
        self.external = os.path.join(path, 'external.json')
        self._spiders = {}
        self._fetch_spiders()

    @classmethod
    def from_settings(cls, settings):
        return cls(settings)

    def _fetch_spiders(self):
        """
        Loads the content of the ``external.json`` file and generates a mapping of external spiders.
        Raises an exception if the file cannot be found, and a ValueError if it is not valid JSON.
        """
        for spider in _read_json(self.external):
            if not isinstance(spider, dict):
                raise ValueError('External spiders must be defined as json objects.'
                                 ' Read the docs for more information')

            external_spider = ExternalSpider.from_dict(spider)
            self._spiders[external_spider.name] = external_spider
        return self._spiders

    def list(self):
        """
        Returns a list with the instances of loaded spiders (ExternalSpider objects).
        """
        return list(self._spiders.values())


def _read_json(path):
    """
    Parses the JSON file at the given path. Raises an exception if the file doesn't exist.
    """
    if os.path.isfile(path):
        return json.loads(open(path).read())
    else:
        raise Exception('Could not find the "%s" file. Please check that it is in your project root '
                        'or in the path defined by the EXTERNAL_SPIDERS_PATH setting.' % path)
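
A minimal usage sketch (not part of the commit), assuming it runs from inside a Scrapy project whose root contains the external.json fixture used by the tests below. Note that the loader evaluates get_project_root() even when EXTERNAL_SPIDERS_PATH is set (the default argument of settings.get() is computed eagerly), so a project is required either way:

```python
from scrapy_streaming.external_spiderloader import ExternalSpiderLoader

# The tests pass a plain dict as the settings object; from_settings() simply forwards it.
loader = ExternalSpiderLoader.from_settings({})

# list() returns the loaded ExternalSpider instances.
for spider in loader.list():
    print(spider.name, spider.command, spider.args)

# With the test fixture this prints, in no particular order:
#   PythonSpider scripts/dmoz.py None
#   JavaSpider java ['MyClass']
```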
scrapy_streaming.utils
@@ -0,0 +1,15 @@
import os

from scrapy.utils.conf import closest_scrapy_cfg
from scrapy.utils.project import inside_project


def get_project_root():
    """
    Returns the absolute path of the root of the project, and raises an exception
    if the current directory is not inside a project path.
    """
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception("%s does not belong to a Scrapy project" % os.getcwd())
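
A quick sketch of the helper's contract (illustrative only; the first branch assumes the working directory is inside a Scrapy project):

```python
from scrapy_streaming.utils import get_project_root

try:
    root = get_project_root()   # absolute path of the directory containing scrapy.cfg
    print('project root:', root)
except Exception as exc:        # raised when the cwd is not inside a Scrapy project
    print('not inside a project:', exc)
```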
setup.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python
from setuptools import setup

setup(
    name='scrapy-streaming',
    version='0.1',
    url='https://github.com/scrapy-plugins/scrapy-streaming',
    description='Develop Spiders using any Programming Language',
    author='Scrapy developers',
    packages=['scrapy_streaming'],
    requires=['scrapy'],

    entry_points={
        'scrapy.commands': [
            'streaming=scrapy_streaming.commands.streaming:StreamingCommand',
            'list=scrapy_streaming.commands.list:ListCommand',
            'crawl=scrapy_streaming.commands.crawl:CrawlCommand'
        ],
    },
)
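
The `scrapy.commands` entry-point group is how Scrapy discovers third-party commands, so once the package is installed (tox performs an editable install below) the `streaming` command becomes available and the `list`/`crawl` variants defined here are picked up in place of the built-ins. A rough illustration, assuming a working Scrapy installation:

```text
$ pip install -e .   # registers the scrapy.commands entry points
$ scrapy -h          # the available commands should now include "streaming"
```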
Empty file.
tests/test_commands.py
@@ -0,0 +1,85 @@
import os
import subprocess
import tempfile
from tempfile import mkdtemp

from os.path import join

import sys
from time import sleep

from scrapy.utils.python import to_native_str
from scrapy.utils.test import get_testenv
from shutil import rmtree
from twisted.trial import unittest


class ProjectTest(unittest.TestCase):
    project_name = 'testproject'

    def setUp(self):
        self.temp_path = mkdtemp()
        self.cwd = self.temp_path
        self.proj_path = join(self.temp_path, self.project_name)
        self.proj_mod_path = join(self.proj_path, self.project_name)
        self.env = get_testenv()

        self.call('startproject', self.project_name)
        self.cwd = join(self.temp_path, self.project_name)
        os.chdir(self.cwd)
        self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
        self.external_path = join(self.cwd, 'external.json')
        with open(self.external_path, 'w') as external:
            external.write('''
                [
                    {
                        "name": "PythonSpider",
                        "command": "scripts/dmoz.py"
                    },
                    {
                        "name": "JavaSpider",
                        "command": "java",
                        "args": ["MyClass"]
                    }
                ]
            ''')

    def tearDown(self):
        rmtree(self.temp_path)

    def call(self, *new_args, **kwargs):
        with tempfile.NamedTemporaryFile() as out:
            args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
            return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
                                   env=self.env, **kwargs)

    def proc(self, *new_args, **kwargs):
        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             **kwargs)

        waited = 0
        interval = 0.2
        while p.poll() is None:
            sleep(interval)
            waited += interval
            if waited > 15:
                p.kill()
                assert False, 'Command took too much time to complete'

        return p


class ListCommandTest(ProjectTest):

    def test_list_is_running(self):
        self.assertEqual(0, self.call('list'))

    def test_external_spiders(self):
        p = self.proc('list')
        out = to_native_str(p.stdout.read())

        self.assertIn("JavaSpider", out)
        self.assertIn("PythonSpider", out)
tests/test_external_spiderloader.py
@@ -0,0 +1,41 @@
from tests.test_commands import ProjectTest
from twisted.trial import unittest

from scrapy_streaming.external_spiderloader import ExternalSpider, ExternalSpiderLoader, _read_json


class ExternalSpiderTest(unittest.TestCase):

    def test_wrong_arg_type(self):
        params = {'name': 'Name', 'command': 'python', 'args': {'a': 'b'}}
        self.assertRaises(ValueError, ExternalSpider.from_dict, params)


class ExternalSpiderLoaderTest(ProjectTest):

    def test_list(self):
        e = ExternalSpiderLoader({})

        self.assertEqual(2, len(e.list()))

    def test_invalid_json(self):
        open(self.external_path, 'w').write('''
            [
                {
                    "name": "PythonSpider",
                    "command": "scripts/dmoz.py"
                },
        ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_json_content(self):
        open(self.external_path, 'w').write('''
            {
                "name": "PythonSpider",
                "command": "scripts/dmoz.py"
            }
        ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_file(self):
        self.assertRaises(Exception, _read_json, '/home')
tests/test_utils.py
@@ -0,0 +1,16 @@
import os

from tests.test_commands import ProjectTest

from scrapy_streaming.utils import get_project_root


class UtilsTest(ProjectTest):

    def test_get_project(self):
        self.assertEqual(get_project_root(), self.cwd)

    def test_get_project_default(self):
        os.chdir('../')
        self.assertRaises(Exception, get_project_root)
tox.ini
@@ -0,0 +1,27 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py27,py35

[testenv]
deps =
    -rrequirements.txt
    pytest
    pytest-cov
    hypothesis
    hypothesis-pytest
commands =
    pip install -e .
    py.test --doctest-modules --cov=scrapy_streaming {posargs:scrapy_streaming tests}

[testenv:py33]
basepython = python3.3

[testenv:py34]
basepython = python3.4

[testenv:py35]
basepython = python3.5