Initial project structure with tox, travis, and initial commands (#2)
* init tests and project structure

* readme

* added spider loader

* get_project_root() raises an exception if not in project
aron-bordin authored and Raul Gallegos committed May 23, 2016
1 parent d8c94d1 commit bb94c45
Showing 17 changed files with 356 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
@@ -60,3 +60,6 @@ target/

#Ipython Notebook
.ipynb_checkpoints

# IDEs
.idea/
14 changes: 14 additions & 0 deletions .travis.yml
@@ -0,0 +1,14 @@
language: python
python: 3.5
sudo: false
env:
- TOXENV=py27
- TOXENV=py35
install:
- pip install -U tox twine wheel codecov
script: tox
after_success:
- codecov
cache:
  directories:
    - $HOME/.cache/pip
10 changes: 9 additions & 1 deletion README.md
@@ -1 +1,9 @@
# scrapy-streaming
# Scrapy Streaming (WIP)

[![Build Status](https://travis-ci.org/scrapy-plugins/scrapy-streaming.svg?branch=master)](https://travis-ci.org/scrapy-plugins/scrapy-streaming)
[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-streaming/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-streaming)

Scrapy Streaming provides an interface for writing spiders in any programming language, using JSON objects to make requests, parse web content, extract data, and more.

We also provide official helper libraries for developing your spiders in Java, JavaScript, and R.
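External spiders are registered in an `external.json` file at the project root (see `scrapy_streaming/external_spiderloader.py` below); a sample registry, adapted from the test fixture added in this commit:

```json
[
  {
    "name": "PythonSpider",
    "command": "scripts/dmoz.py"
  },
  {
    "name": "JavaSpider",
    "command": "java",
    "args": ["MyClass"]
  }
]
```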

1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
scrapy
Empty file added scrapy_streaming/__init__.py
Empty file.
Empty file added scrapy_streaming/commands/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions scrapy_streaming/commands/crawl.py
@@ -0,0 +1,8 @@
from scrapy.commands.crawl import Command


class CrawlCommand(Command):
    """
    Extends the Scrapy crawl command, adding the ability to start an
    external spider with the crawl command.
    """
    pass
19 changes: 19 additions & 0 deletions scrapy_streaming/commands/list.py
@@ -0,0 +1,19 @@
from scrapy.commands.list import Command

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader


class ListCommand(Command):
    """
    Extends the Scrapy list command, adding external spiders to the list.
    """

    def run(self, args, opts):
        print('[Scrapy Spiders]')
        super(ListCommand, self).run(args, opts)

        spiders = [spider.name for spider in
                   ExternalSpiderLoader.from_settings(self.settings).list()]
        if spiders:
            print('[External Spiders]')
            for spider in sorted(spiders):
                print(spider)
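With the `external.json` fixture used by the tests in this commit, running `scrapy list` in a fresh project would print roughly the following (a sketch derived from the code above, not captured output):

```
[Scrapy Spiders]
[External Spiders]
JavaSpider
PythonSpider
```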
27 changes: 27 additions & 0 deletions scrapy_streaming/commands/streaming.py
@@ -0,0 +1,27 @@
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class StreamingCommand(ScrapyCommand):
    """
    Command to start a stand-alone external spider executable with Scrapy Streaming.
    """

    requires_project = False

    def syntax(self):
        return "[options] <path of executable>"

    def short_desc(self):
        return "Run an external spider using Scrapy Streaming, given its path (doesn't require a project)"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)

        raise NotImplementedError()
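A sketch of how the command is meant to be driven, mirroring the way the tests in this commit invoke `scrapy.cmdline` through a subprocess; the `scripts/dmoz.py` path is only the example used in the test fixture, and in this commit `run()` still ends in `NotImplementedError`:

```python
import subprocess
import sys

# Equivalent to running `scrapy streaming scripts/dmoz.py` in a shell.
subprocess.call([sys.executable, '-m', 'scrapy.cmdline',
                 'streaming', 'scripts/dmoz.py'])
```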
71 changes: 71 additions & 0 deletions scrapy_streaming/external_spiderloader.py
@@ -0,0 +1,71 @@
import json
import os

from scrapy_streaming.utils import get_project_root


class ExternalSpider(object):
    """
    Object representing an external spider defined in ``external.json``.
    """

    def __init__(self, name, command, args=None):
        if args is not None and not isinstance(args, list):
            raise ValueError("'args' must be defined as an array of strings")
        self.name = name
        self.command = command
        self.args = args

    @classmethod
    def from_dict(cls, spider):
        return cls(**spider)


class ExternalSpiderLoader(object):
    """
    This class manages the external spiders defined in the ``external.json`` file.
    """

    def __init__(self, settings):
        path = settings.get('EXTERNAL_SPIDERS_PATH', get_project_root())
        # TODO add EXTERNAL_SPIDERS_PATH in docs
        path = os.path.abspath(path)
        self.external = os.path.join(path, 'external.json')
        self._spiders = {}
        self._fetch_spiders()

    @classmethod
    def from_settings(cls, settings):
        return cls(settings)

    def _fetch_spiders(self):
        """
        Loads the content of the ``external.json`` file and generates a mapping
        of external spiders. Raises an exception if the file is missing, and a
        ValueError if it does not contain valid JSON.
        """
        for spider in _read_json(self.external):
            if not isinstance(spider, dict):
                raise ValueError('External spiders must be defined as json objects.'
                                 ' Read the docs for more information')

            external_spider = ExternalSpider.from_dict(spider)
            self._spiders[external_spider.name] = external_spider
        return self._spiders

    def list(self):
        """
        Returns a list of the loaded spiders (ExternalSpider instances).
        """
        return list(self._spiders.values())


def _read_json(path):
    """
    Parses the JSON file at the given path. Raises an exception if the file
    doesn't exist.
    """
    if not os.path.isfile(path):
        raise Exception('Could not find the file "%s". Please check that it is in '
                        'your project root or at the path defined in the '
                        'EXTERNAL_SPIDERS_PATH setting.' % path)
    with open(path) as external:
        return json.loads(external.read())
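A minimal usage sketch for the loader, assuming it runs inside a Scrapy project whose root contains an `external.json`; `get_project_settings` is Scrapy's standard helper, and the rest of the names come from the code above:

```python
from scrapy.utils.project import get_project_settings

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader

loader = ExternalSpiderLoader.from_settings(get_project_settings())
for spider in loader.list():
    # Each entry is an ExternalSpider with a name, a command, and optional args.
    print(spider.name, spider.command, spider.args)
```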
15 changes: 15 additions & 0 deletions scrapy_streaming/utils.py
@@ -0,0 +1,15 @@
import os

from scrapy.utils.conf import closest_scrapy_cfg
from scrapy.utils.project import inside_project


def get_project_root():
    """
    Returns the absolute path of the project root, and raises an exception
    if the current directory is not inside a Scrapy project.
    """
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception('%s does not belong to a Scrapy project' % os.getcwd())
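The helper's contract in miniature (a sketch; the path shown is hypothetical):

```python
from scrapy_streaming.utils import get_project_root

# Inside a project: returns the directory containing scrapy.cfg,
# e.g. '/home/user/myproject'. Outside a project: raises an Exception.
print(get_project_root())
```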
20 changes: 20 additions & 0 deletions setup.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python
from setuptools import setup

setup(
    name='scrapy-streaming',
    version='0.1',
    url='https://github.com/scrapy-plugins/scrapy-streaming',
    description='Develop Spiders using any Programming Language',
    author='Scrapy developers',
    packages=['scrapy_streaming', 'scrapy_streaming.commands'],
    install_requires=['scrapy'],

    entry_points={
        'scrapy.commands': [
            'streaming=scrapy_streaming.commands.streaming:StreamingCommand',
            'list=scrapy_streaming.commands.list:ListCommand',
            'crawl=scrapy_streaming.commands.crawl:CrawlCommand'
        ],
    },
)
Empty file added tests/__init__.py
Empty file.
85 changes: 85 additions & 0 deletions tests/test_commands.py
@@ -0,0 +1,85 @@
import os
import subprocess
import sys
import tempfile
from os.path import join
from shutil import rmtree
from tempfile import mkdtemp
from time import sleep

from scrapy.utils.python import to_native_str
from scrapy.utils.test import get_testenv
from twisted.trial import unittest


class ProjectTest(unittest.TestCase):
    project_name = 'testproject'

    def setUp(self):
        self.temp_path = mkdtemp()
        self.cwd = self.temp_path
        self.proj_path = join(self.temp_path, self.project_name)
        self.proj_mod_path = join(self.proj_path, self.project_name)
        self.env = get_testenv()

        self.call('startproject', self.project_name)
        self.cwd = join(self.temp_path, self.project_name)
        os.chdir(self.cwd)
        self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
        self.external_path = join(self.cwd, 'external.json')
        with open(self.external_path, 'w') as external:
            external.write('''
            [
              {
                "name": "PythonSpider",
                "command": "scripts/dmoz.py"
              },
              {
                "name": "JavaSpider",
                "command": "java",
                "args": ["MyClass"]
              }
            ]
            ''')

    def tearDown(self):
        rmtree(self.temp_path)

    def call(self, *new_args, **kwargs):
        with tempfile.NamedTemporaryFile() as out:
            args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
            return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
                                   env=self.env, **kwargs)

    def proc(self, *new_args, **kwargs):
        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             **kwargs)

        waited = 0
        interval = 0.2
        while p.poll() is None:
            sleep(interval)
            waited += interval
            if waited > 15:
                p.kill()
                assert False, 'Command took too much time to complete'

        return p


class ListCommandTest(ProjectTest):

    def test_list_is_running(self):
        self.assertEqual(0, self.call('list'))

    def test_external_spiders(self):
        p = self.proc('list')
        out = to_native_str(p.stdout.read())

        self.assertIn("JavaSpider", out)
        self.assertIn("PythonSpider", out)
41 changes: 41 additions & 0 deletions tests/test_external_spiderloader.py
@@ -0,0 +1,41 @@
from twisted.trial import unittest

from scrapy_streaming.external_spiderloader import ExternalSpider, ExternalSpiderLoader, _read_json
from tests.test_commands import ProjectTest


class ExternalSpiderTest(unittest.TestCase):

    def test_wrong_arg_type(self):
        params = {'name': 'Name', 'command': 'python', 'args': {'a': 'b'}}
        self.assertRaises(ValueError, ExternalSpider.from_dict, params)


class ExternalSpiderLoaderTest(ProjectTest):

    def test_list(self):
        e = ExternalSpiderLoader({})

        self.assertEqual(2, len(e.list()))

    def test_invalid_json(self):
        with open(self.external_path, 'w') as external:
            external.write('''
            [
              {
                "name": "PythonSpider",
                "command": "scripts/dmoz.py"
              },
            ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_json_content(self):
        with open(self.external_path, 'w') as external:
            external.write('''
            {
              "name": "PythonSpider",
              "command": "scripts/dmoz.py"
            }
            ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_file(self):
        self.assertRaises(Exception, _read_json, '/home')
16 changes: 16 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,16 @@
import os

from scrapy_streaming.utils import get_project_root
from tests.test_commands import ProjectTest


class UtilsTest(ProjectTest):

    def test_get_project(self):
        self.assertEqual(get_project_root(), self.cwd)

    def test_get_project_default(self):
        os.chdir('../')
        self.assertRaises(Exception, get_project_root)
27 changes: 27 additions & 0 deletions tox.ini
@@ -0,0 +1,27 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py27,py35

[testenv]
deps =
    -rrequirements.txt
    pytest
    pytest-cov
    hypothesis
    hypothesis-pytest
commands =
    pip install -e .
    py.test --doctest-modules --cov=scrapy_streaming {posargs:scrapy_streaming tests}

[testenv:py33]
basepython = python3.3

[testenv:py34]
basepython = python3.4

[testenv:py35]
basepython = python3.5
