From bb94c458053a4392e3560a79157de8f01e10f9d3 Mon Sep 17 00:00:00 2001
From: Aron Barreira Bordin
Date: Mon, 23 May 2016 11:53:00 -0300
Subject: [PATCH] Initial project structure with tox, travis, and initial
 commands (#2)

* init tests and project structure
* readme
* added spider loader
* get_project_root() raises an exception if not in project
---
 .gitignore                                |  3 +
 .travis.yml                               | 14 ++++
 README.md                                 | 10 ++-
 requirements.txt                          |  1 +
 scrapy_streaming/__init__.py              |  0
 scrapy_streaming/commands/__init__.py     |  0
 scrapy_streaming/commands/crawl.py        |  8 +++
 scrapy_streaming/commands/list.py         | 19 +++++
 scrapy_streaming/commands/streaming.py    | 27 +++++++
 scrapy_streaming/external_spiderloader.py | 71 +++++++++++++++++++
 scrapy_streaming/utils.py                 | 15 ++++
 setup.py                                  | 20 ++++++
 tests/__init__.py                         |  0
 tests/test_commands.py                    | 85 +++++++++++++++++++++++
 tests/test_external_spiderloader.py       | 41 +++++++++++
 tests/test_utils.py                       | 16 +++++
 tox.ini                                   | 27 +++++++
 17 files changed, 356 insertions(+), 1 deletion(-)
 create mode 100644 .travis.yml
 create mode 100644 requirements.txt
 create mode 100644 scrapy_streaming/__init__.py
 create mode 100644 scrapy_streaming/commands/__init__.py
 create mode 100644 scrapy_streaming/commands/crawl.py
 create mode 100644 scrapy_streaming/commands/list.py
 create mode 100644 scrapy_streaming/commands/streaming.py
 create mode 100644 scrapy_streaming/external_spiderloader.py
 create mode 100644 scrapy_streaming/utils.py
 create mode 100644 setup.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_commands.py
 create mode 100644 tests/test_external_spiderloader.py
 create mode 100644 tests/test_utils.py
 create mode 100644 tox.ini

diff --git a/.gitignore b/.gitignore
index 1dbc687..a7fe1a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,6 @@ target/
 
 #Ipython Notebook
 .ipynb_checkpoints
+
+# IDEs
+.idea/
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..d087e44
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,14 @@
+language: python
+python: 3.5
+sudo: false
+env:
+  - TOXENV=py27
+  - TOXENV=py35
+install:
+  - pip install -U tox twine wheel codecov
+script: tox
+after_success:
+  - codecov
+cache:
+  directories:
+    - $HOME/.cache/pip
diff --git a/README.md b/README.md
index 5262583..9bc1a90 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,9 @@
-# scrapy-streaming
\ No newline at end of file
+# Scrapy Streaming (WIP)
+
+[![Build Status](https://travis-ci.org/scrapy-plugins/scrapy-streaming.svg?branch=master)](https://travis-ci.org/scrapy-plugins/scrapy-streaming)
+[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-streaming/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-streaming)
+
+Scrapy Streaming provides an interface for writing spiders in any programming language, using JSON objects to make requests, parse web content, extract data, and more.
+
+We also provide official helper libraries for developing spiders in Java, JavaScript, and R.
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ccee6de
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+scrapy
\ No newline at end of file
diff --git a/scrapy_streaming/__init__.py b/scrapy_streaming/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_streaming/commands/__init__.py b/scrapy_streaming/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy_streaming/commands/crawl.py b/scrapy_streaming/commands/crawl.py
new file mode 100644
index 0000000..7efdbef
--- /dev/null
+++ b/scrapy_streaming/commands/crawl.py
@@ -0,0 +1,8 @@
+from scrapy.commands.crawl import Command
+
+
+class CrawlCommand(Command):
+    """
+    Extends the Scrapy crawl command, adding the possibility to start an external spider with the crawl command
+    """
+    pass
diff --git a/scrapy_streaming/commands/list.py b/scrapy_streaming/commands/list.py
new file mode 100644
index 0000000..b0a8715
--- /dev/null
+++ b/scrapy_streaming/commands/list.py
@@ -0,0 +1,19 @@
+from scrapy.commands.list import Command
+
+from scrapy_streaming.external_spiderloader import ExternalSpiderLoader
+
+
+class ListCommand(Command):
+    """
+    Extends the Scrapy list command, adding external spiders to the list
+    """
+
+    def run(self, args, opts):
+        print('[Scrapy Spiders]')
+        super(ListCommand, self).run(args, opts)
+
+        spiders = [spider.name for spider in ExternalSpiderLoader.from_settings(self.settings).list()]
+        if spiders:
+            print('[External Spiders]')
+            for spider in sorted(spiders):
+                print(spider)
diff --git a/scrapy_streaming/commands/streaming.py b/scrapy_streaming/commands/streaming.py
new file mode 100644
index 0000000..7315833
--- /dev/null
+++ b/scrapy_streaming/commands/streaming.py
@@ -0,0 +1,27 @@
+import os
+
+from scrapy.commands import ScrapyCommand
+from scrapy.exceptions import UsageError
+
+
+class StreamingCommand(ScrapyCommand):
+    """
+    Command to start a stand-alone external spider executable with Scrapy Streaming
+    """
+
+    requires_project = False
+
+    def syntax(self):
+        return "[options] <path>"
+
+    def short_desc(self):
+        return "Run an external spider using Scrapy Streaming given its path (doesn't require a project)"
+
+    def run(self, args, opts):
+        if len(args) != 1:
+            raise UsageError()
+        filename = args[0]
+        if not os.path.exists(filename):
+            raise UsageError("File not found: %s\n" % filename)
+
+        raise NotImplementedError()
diff --git a/scrapy_streaming/external_spiderloader.py b/scrapy_streaming/external_spiderloader.py
new file mode 100644
index 0000000..52b44fc
--- /dev/null
+++ b/scrapy_streaming/external_spiderloader.py
@@ -0,0 +1,71 @@
+import json
+import os
+
+from scrapy_streaming.utils import get_project_root
+
+
+class ExternalSpider(object):
+    """
+    Object to represent external spiders defined in ``external.json``.
+    """
+
+    def __init__(self, name, command, args=None):
+        if args is not None and not isinstance(args, list):
+            raise ValueError("'args' must be defined as an array of strings")
+        self.name = name
+        self.command = command
+        self.args = args
+
+    @classmethod
+    def from_dict(cls, spider):
+        return cls(**spider)
+
+
+class ExternalSpiderLoader(object):
+    """
+    This class manages external spiders defined in ``external.json``
+    """
+
+    def __init__(self, settings):
+        path = settings.get('EXTERNAL_SPIDERS_PATH', get_project_root())
+        # TODO add EXTERNAL_SPIDERS_PATH to the docs
+        path = os.path.abspath(path)
+        self.external = os.path.join(path, 'external.json')
+        self._spiders = {}
+        self._fetch_spiders()
+
+    @classmethod
+    def from_settings(cls, settings):
+        return cls(settings)
+
+    def _fetch_spiders(self):
+        """
+        Loads the content of the ``external.json`` file and generates a mapping of external spiders.
+        Raises an exception if the file is not found,
+        and a ValueError if it is not a valid json file.
+        """
+        for spider in _read_json(self.external):
+            if not isinstance(spider, dict):
+                raise ValueError('External spiders must be defined as json objects.'
+                                 ' Read the docs for more information')
+
+            external_spider = ExternalSpider.from_dict(spider)
+            self._spiders[external_spider.name] = external_spider
+        return self._spiders
+
+    def list(self):
+        """
+        Returns a list with instances of the loaded spiders (ExternalSpider objects)
+        """
+        return list(self._spiders.values())
+
+
+def _read_json(path):
+    """
+    Parses the json file at the given path. Raises an exception if the file doesn't exist.
+    """
+    if os.path.isfile(path):
+        return json.loads(open(path).read())
+    else:
+        raise Exception('Could not find "%s". Please check that it is in your project root '
+                        'or at the path defined by the EXTERNAL_SPIDERS_PATH setting.' % path)
diff --git a/scrapy_streaming/utils.py b/scrapy_streaming/utils.py
new file mode 100644
index 0000000..91fed5c
--- /dev/null
+++ b/scrapy_streaming/utils.py
@@ -0,0 +1,15 @@
+import os
+
+from scrapy.utils.conf import closest_scrapy_cfg
+from scrapy.utils.project import inside_project
+
+
+def get_project_root():
+    """
+    Returns the absolute path of the project root, and raises an exception
+    if the current directory is not inside a project path
+    """
+    os.path.abspath('.')
+    if inside_project():
+        return os.path.dirname(closest_scrapy_cfg())
+    raise Exception(os.getcwd(), " does not belong to a Scrapy project")
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..afee67e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+from setuptools import setup
+
+setup(
+    name='scrapy-streaming',
+    version='0.1',
+    url='https://github.com/scrapy-plugins/scrapy-streaming',
+    description='Develop Spiders using any Programming Language',
+    author='Scrapy developers',
+    packages=['scrapy_streaming'],
+    requires=['scrapy'],
+
+    entry_points={
+        'scrapy.commands': [
+            'streaming=scrapy_streaming.commands.streaming:StreamingCommand',
+            'list=scrapy_streaming.commands.list:ListCommand',
+            'crawl=scrapy_streaming.commands.crawl:CrawlCommand'
+        ],
+    },
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_commands.py b/tests/test_commands.py
new file mode 100644
index 0000000..e1ded91
--- /dev/null
+++ b/tests/test_commands.py
@@ -0,0 +1,85 @@
+import os
+import subprocess
+import tempfile
+from tempfile import mkdtemp
+
+from os.path import join
+
+import sys
+from time import sleep
+
+from scrapy.utils.python import to_native_str
+from scrapy.utils.test import get_testenv
+from shutil import rmtree
+from twisted.trial import unittest
+
+
+class ProjectTest(unittest.TestCase):
+    project_name = 'testproject'
+
+    def setUp(self):
+        self.temp_path = mkdtemp()
+        self.cwd = self.temp_path
+        self.proj_path = join(self.temp_path, self.project_name)
+        self.proj_mod_path = join(self.proj_path, self.project_name)
+        self.env = get_testenv()
+
+        self.call('startproject', self.project_name)
+        self.cwd = join(self.temp_path, self.project_name)
+        os.chdir(self.cwd)
+        self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
+        self.external_path = join(self.cwd, 'external.json')
+        with open(self.external_path, 'w') as external:
+            external.write('''
+[
+    {
+        "name": "PythonSpider",
+        "command": "scripts/dmoz.py"
+    },
+
+    {
+        "name": "JavaSpider",
+        "command": "java",
+        "args": ["MyClass"]
+    }
+]
+''')
+
+    def tearDown(self):
+        rmtree(self.temp_path)
+
+    def call(self, *new_args, **kwargs):
+        with tempfile.NamedTemporaryFile() as out:
+            args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
+            return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
+                                   env=self.env, **kwargs)
+
+    def proc(self, *new_args, **kwargs):
+        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
+        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
+                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                             **kwargs)
+
+        waited = 0
+        interval = 0.2
+        while p.poll() is None:
+            sleep(interval)
+            waited += interval
+            if waited > 15:
+                p.kill()
+                assert False, 'Command took too much time to complete'
+
+        return p
+
+
+class ListCommandTest(ProjectTest):
+
+    def test_list_is_running(self):
+        self.assertEqual(0, self.call('list'))
+
+    def test_external_spiders(self):
+        p = self.proc('list')
+        out = to_native_str(p.stdout.read())
+
+        self.assertIn("JavaSpider", out)
+        self.assertIn("PythonSpider", out)
diff --git a/tests/test_external_spiderloader.py b/tests/test_external_spiderloader.py
new file mode 100644
index 0000000..dfb3ce6
--- /dev/null
+++ b/tests/test_external_spiderloader.py
@@ -0,0 +1,41 @@
+from tests.test_commands import ProjectTest
+from twisted.trial import unittest
+
+from scrapy_streaming.external_spiderloader import ExternalSpider, ExternalSpiderLoader, _read_json
+
+
+class ExternalSpiderTest(unittest.TestCase):
+
+    def test_wrong_arg_type(self):
+        params = {'name': 'Name', 'command': 'python', 'args': {'a': 'b'}}
+        self.assertRaises(ValueError, ExternalSpider.from_dict, params)
+
+
+class ExternalSpiderLoaderTest(ProjectTest):
+
+    def test_list(self):
+        e = ExternalSpiderLoader({})
+
+        self.assertEqual(2, len(e.list()))
+
+    def test_invalid_json(self):
+        open(self.external_path, 'w').write('''
+[
+    {
+        "name": "PythonSpider",
+        "command": "scripts/dmoz.py"
+    },
+''')
+        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})
+
+    def test_invalid_json_content(self):
+        open(self.external_path, 'w').write('''
+{
+    "name": "PythonSpider",
+    "command": "scripts/dmoz.py"
+}
+''')
+        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})
+
+    def test_invalid_file(self):
+        self.assertRaises(Exception, _read_json, '/home')
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..3a93098
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,16 @@
+import os
+
+from tests.test_commands import ProjectTest
+
+
+from scrapy_streaming.utils import get_project_root
+
+
+class UtilsTest(ProjectTest):
+
+    def test_get_project(self):
+        self.assertEqual(get_project_root(), self.cwd)
+
+    def test_get_project_default(self):
+        os.chdir('../')
+        self.assertRaises(Exception, get_project_root)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..04f8c48
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,27 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27,py35
+
+[testenv]
+deps =
+    -rrequirements.txt
+    pytest
+    pytest-cov
+    hypothesis
+    hypothesis-pytest
+commands =
+    pip install -e .
+    py.test --doctest-modules --cov=scrapy_streaming {posargs:scrapy_streaming tests}

+[testenv:py33]
+basepython = python3.3
+
+[testenv:py34]
+basepython = python3.4
+
+[testenv:py35]
+basepython = python3.5
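
For reference, the sketch below (not part of the patch) shows how the ExternalSpiderLoader introduced above is expected to pick up external spiders from an ``external.json`` file. It assumes the snippet is run from the root of an existing Scrapy project (next to scrapy.cfg), since the loader falls back to get_project_root() to locate the file, and it reuses the same illustrative spider entries as the fixture in tests/test_commands.py; the "scripts/dmoz.py" and "MyClass" values are placeholders, not real spiders.

```python
import json

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader

# Write an external.json next to scrapy.cfg, mirroring the test fixture.
spiders = [
    {"name": "PythonSpider", "command": "scripts/dmoz.py"},
    {"name": "JavaSpider", "command": "java", "args": ["MyClass"]},
]
with open('external.json', 'w') as f:
    json.dump(spiders, f)

# A plain dict works here because the loader only calls settings.get();
# without an EXTERNAL_SPIDERS_PATH entry it uses the project root.
loader = ExternalSpiderLoader.from_settings({})
for spider in loader.list():
    print(spider.name, spider.command, spider.args)
```

With such a file in place, the overridden ``scrapy list`` command prints these names under ``[External Spiders]``; actually running one via ``scrapy streaming <path>`` is still left as NotImplementedError in this patch.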