Initial project structure with tox, travis, and initial commands (#2)
* init tests and project structure

* readme

* added spider loader

* get_project_root() raises an exception if not in project
aron-bordin authored and Raul Gallegos committed May 23, 2016
1 parent d8c94d1 commit bb94c45
Showing 17 changed files with 356 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
@@ -60,3 +60,6 @@ target/

#Ipython Notebook
.ipynb_checkpoints

# IDEs
.idea/
14 changes: 14 additions & 0 deletions .travis.yml
@@ -0,0 +1,14 @@
language: python
python: 3.5
sudo: false
env:
- TOXENV=py27
- TOXENV=py35
install:
- pip install -U tox twine wheel codecov
script: tox
after_success:
- codecov
cache:
  directories:
    - $HOME/.cache/pip
10 changes: 9 additions & 1 deletion README.md
@@ -1 +1,9 @@
# scrapy-streaming
# Scrapy Streaming (WIP)

[![Build Status](https://travis-ci.org/scrapy-plugins/scrapy-streaming.svg?branch=master)](https://travis-ci.org/scrapy-plugins/scrapy-streaming)
[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-streaming/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-streaming)

Scrapy Streaming provides an interface for writing spiders in any programming language, using JSON objects to make requests, parse web content, extract data, and more.

We also provide official helper libraries for developing your spiders in Java, JavaScript, and R.
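External spiders are registered in an `external.json` file at the project root (see `scrapy_streaming/external_spiderloader.py` below); a sample registry, adapted from the test fixture added in this commit:

```json
[
  {
    "name": "PythonSpider",
    "command": "scripts/dmoz.py"
  },
  {
    "name": "JavaSpider",
    "command": "java",
    "args": ["MyClass"]
  }
]
```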

1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
scrapy
Empty file added scrapy_streaming/__init__.py
Empty file.
Empty file added scrapy_streaming/commands/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions scrapy_streaming/commands/crawl.py
@@ -0,0 +1,8 @@
from scrapy.commands.crawl import Command


class CrawlCommand(Command):
    """
    Extends the Scrapy crawl command, adding the ability to start an
    external spider with the crawl command.
    """
    pass
19 changes: 19 additions & 0 deletions scrapy_streaming/commands/list.py
@@ -0,0 +1,19 @@
from scrapy.commands.list import Command

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader


class ListCommand(Command):
    """
    Extends the Scrapy list command, adding external spiders to the list.
    """

    def run(self, args, opts):
        print('[Scrapy Spiders]')
        super(ListCommand, self).run(args, opts)

        spiders = [spider.name for spider in
                   ExternalSpiderLoader.from_settings(self.settings).list()]
        if spiders:
            print('[External Spiders]')
            for spider in sorted(spiders):
                print(spider)
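With the `external.json` fixture used by the tests in this commit, running `scrapy list` in a fresh project would print roughly the following (a sketch derived from the code above, not captured output):

```
[Scrapy Spiders]
[External Spiders]
JavaSpider
PythonSpider
```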
27 changes: 27 additions & 0 deletions scrapy_streaming/commands/streaming.py
@@ -0,0 +1,27 @@
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class StreamingCommand(ScrapyCommand):
    """
    Command to start a stand-alone external spider executable with Scrapy Streaming.
    """

    requires_project = False

    def syntax(self):
        return "[options] <path of executable>"

    def short_desc(self):
        return "Run an external spider using Scrapy Streaming, given its path (doesn't require a project)"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)

        raise NotImplementedError()
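A sketch of how the command is meant to be driven, mirroring the way the tests in this commit invoke `scrapy.cmdline` through a subprocess; the `scripts/dmoz.py` path is only the example used in the test fixture, and in this commit `run()` still ends in `NotImplementedError`:

```python
import subprocess
import sys

# Equivalent to running `scrapy streaming scripts/dmoz.py` in a shell.
subprocess.call([sys.executable, '-m', 'scrapy.cmdline',
                 'streaming', 'scripts/dmoz.py'])
```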
71 changes: 71 additions & 0 deletions scrapy_streaming/external_spiderloader.py
@@ -0,0 +1,71 @@
import json
import os

from scrapy_streaming.utils import get_project_root


class ExternalSpider(object):
    """
    Object representing an external spider defined in ``external.json``.
    """

    def __init__(self, name, command, args=None):
        if args is not None and not isinstance(args, list):
            raise ValueError("'args' must be defined as an array of strings")
        self.name = name
        self.command = command
        self.args = args

    @classmethod
    def from_dict(cls, spider):
        return cls(**spider)


class ExternalSpiderLoader(object):
    """
    This class manages the external spiders defined in the ``external.json`` file.
    """

    def __init__(self, settings):
        path = settings.get('EXTERNAL_SPIDERS_PATH', get_project_root())
        # TODO add EXTERNAL_SPIDERS_PATH in docs
        path = os.path.abspath(path)
        self.external = os.path.join(path, 'external.json')
        self._spiders = {}
        self._fetch_spiders()

    @classmethod
    def from_settings(cls, settings):
        return cls(settings)

    def _fetch_spiders(self):
        """
        Loads the content of the ``external.json`` file and generates a mapping
        of external spiders. Raises an exception if the file is missing, and a
        ValueError if it does not contain valid JSON.
        """
        for spider in _read_json(self.external):
            if not isinstance(spider, dict):
                raise ValueError('External spiders must be defined as json objects.'
                                 ' Read the docs for more information')

            external_spider = ExternalSpider.from_dict(spider)
            self._spiders[external_spider.name] = external_spider
        return self._spiders

    def list(self):
        """
        Returns a list of the loaded spiders (ExternalSpider instances).
        """
        return list(self._spiders.values())


def _read_json(path):
    """
    Parses the JSON file at the given path. Raises an exception if the file
    doesn't exist.
    """
    if not os.path.isfile(path):
        raise Exception('Could not find the file "%s". Please check that it is in '
                        'your project root or at the path defined in the '
                        'EXTERNAL_SPIDERS_PATH setting.' % path)
    with open(path) as external:
        return json.loads(external.read())
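A minimal usage sketch for the loader, assuming it runs inside a Scrapy project whose root contains an `external.json`; `get_project_settings` is Scrapy's standard helper, and the rest of the names come from the code above:

```python
from scrapy.utils.project import get_project_settings

from scrapy_streaming.external_spiderloader import ExternalSpiderLoader

loader = ExternalSpiderLoader.from_settings(get_project_settings())
for spider in loader.list():
    # Each entry is an ExternalSpider with a name, a command, and optional args.
    print(spider.name, spider.command, spider.args)
```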
15 changes: 15 additions & 0 deletions scrapy_streaming/utils.py
@@ -0,0 +1,15 @@
import os

from scrapy.utils.conf import closest_scrapy_cfg
from scrapy.utils.project import inside_project


def get_project_root():
    """
    Returns the absolute path of the project root, and raises an exception
    if the current directory is not inside a Scrapy project.
    """
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception('%s does not belong to a Scrapy project' % os.getcwd())
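The helper's contract in miniature (a sketch; the path shown is hypothetical):

```python
from scrapy_streaming.utils import get_project_root

# Inside a project: returns the directory containing scrapy.cfg,
# e.g. '/home/user/myproject'. Outside a project: raises an Exception.
print(get_project_root())
```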
20 changes: 20 additions & 0 deletions setup.py
@@ -0,0 +1,20 @@
#!/usr/bin/env python
from setuptools import setup

setup(
    name='scrapy-streaming',
    version='0.1',
    url='https://github.com/scrapy-plugins/scrapy-streaming',
    description='Develop Spiders using any Programming Language',
    author='Scrapy developers',
    packages=['scrapy_streaming', 'scrapy_streaming.commands'],
    install_requires=['scrapy'],

    entry_points={
        'scrapy.commands': [
            'streaming=scrapy_streaming.commands.streaming:StreamingCommand',
            'list=scrapy_streaming.commands.list:ListCommand',
            'crawl=scrapy_streaming.commands.crawl:CrawlCommand'
        ],
    },
)
Empty file added tests/__init__.py
Empty file.
85 changes: 85 additions & 0 deletions tests/test_commands.py
@@ -0,0 +1,85 @@
import os
import subprocess
import sys
import tempfile
from os.path import join
from shutil import rmtree
from tempfile import mkdtemp
from time import sleep

from scrapy.utils.python import to_native_str
from scrapy.utils.test import get_testenv
from twisted.trial import unittest


class ProjectTest(unittest.TestCase):
    project_name = 'testproject'

    def setUp(self):
        self.temp_path = mkdtemp()
        self.cwd = self.temp_path
        self.proj_path = join(self.temp_path, self.project_name)
        self.proj_mod_path = join(self.proj_path, self.project_name)
        self.env = get_testenv()

        self.call('startproject', self.project_name)
        self.cwd = join(self.temp_path, self.project_name)
        os.chdir(self.cwd)
        self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
        self.external_path = join(self.cwd, 'external.json')
        with open(self.external_path, 'w') as external:
            external.write('''
            [
              {
                "name": "PythonSpider",
                "command": "scripts/dmoz.py"
              },
              {
                "name": "JavaSpider",
                "command": "java",
                "args": ["MyClass"]
              }
            ]
            ''')

    def tearDown(self):
        rmtree(self.temp_path)

    def call(self, *new_args, **kwargs):
        with tempfile.NamedTemporaryFile() as out:
            args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
            return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
                                   env=self.env, **kwargs)

    def proc(self, *new_args, **kwargs):
        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             **kwargs)

        waited = 0
        interval = 0.2
        while p.poll() is None:
            sleep(interval)
            waited += interval
            if waited > 15:
                p.kill()
                assert False, 'Command took too much time to complete'

        return p


class ListCommandTest(ProjectTest):

    def test_list_is_running(self):
        self.assertEqual(0, self.call('list'))

    def test_external_spiders(self):
        p = self.proc('list')
        out = to_native_str(p.stdout.read())

        self.assertIn("JavaSpider", out)
        self.assertIn("PythonSpider", out)
41 changes: 41 additions & 0 deletions tests/test_external_spiderloader.py
@@ -0,0 +1,41 @@
from twisted.trial import unittest

from scrapy_streaming.external_spiderloader import ExternalSpider, ExternalSpiderLoader, _read_json
from tests.test_commands import ProjectTest


class ExternalSpiderTest(unittest.TestCase):

    def test_wrong_arg_type(self):
        params = {'name': 'Name', 'command': 'python', 'args': {'a': 'b'}}
        self.assertRaises(ValueError, ExternalSpider.from_dict, params)


class ExternalSpiderLoaderTest(ProjectTest):

    def test_list(self):
        e = ExternalSpiderLoader({})

        self.assertEqual(2, len(e.list()))

    def test_invalid_json(self):
        with open(self.external_path, 'w') as external:
            external.write('''
            [
              {
                "name": "PythonSpider",
                "command": "scripts/dmoz.py"
              },
            ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_json_content(self):
        with open(self.external_path, 'w') as external:
            external.write('''
            {
              "name": "PythonSpider",
              "command": "scripts/dmoz.py"
            }
            ''')
        self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})

    def test_invalid_file(self):
        self.assertRaises(Exception, _read_json, '/home')
16 changes: 16 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,16 @@
import os

from scrapy_streaming.utils import get_project_root
from tests.test_commands import ProjectTest


class UtilsTest(ProjectTest):

    def test_get_project(self):
        self.assertEqual(get_project_root(), self.cwd)

    def test_get_project_default(self):
        os.chdir('../')
        self.assertRaises(Exception, get_project_root)
27 changes: 27 additions & 0 deletions tox.ini
@@ -0,0 +1,27 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py27,py35

[testenv]
deps =
    -rrequirements.txt
    pytest
    pytest-cov
    hypothesis
    hypothesis-pytest
commands =
    pip install -e .
    py.test --doctest-modules --cov=scrapy_streaming {posargs:scrapy_streaming tests}

[testenv:py33]
basepython = python3.3

[testenv:py34]
basepython = python3.4

[testenv:py35]
basepython = python3.5
