Skip to content

Commit e6174f2

Browse files
authored
Add unit tests for scrapy.utils and scrapy.pipelines (#171)
1 parent f9afff6 commit e6174f2

File tree

8 files changed

+300
-22
lines changed

8 files changed

+300
-22
lines changed

src/apify/scrapy/utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
4747
scrapy_request: The Scrapy request to be converted.
4848
spider: The Scrapy spider that the request is associated with.
4949
50+
Raises:
51+
TypeError: If the scrapy_request is not an instance of the scrapy.Request class.
52+
5053
Returns:
5154
The converted Apify request.
5255
"""
@@ -88,6 +91,10 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
8891
apify_request: The Apify request to be converted.
8992
spider: The Scrapy spider that the request is associated with.
9093
94+
Raises:
95+
TypeError: If the apify_request is not a dictionary.
96+
ValueError: If the apify_request does not contain the required keys.
97+
9198
Returns:
9299
The converted Scrapy request.
93100
"""
@@ -98,7 +105,7 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
98105
missing_keys = [key for key in required_keys if key not in apify_request]
99106

100107
if missing_keys:
101-
raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")
108+
raise ValueError(f'apify_request must contain {", ".join(map(repr, missing_keys))} key(s)')
102109

103110
call_id = crypto_random_object_id(8)
104111
Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')

tests/unit/scrapy/pipelines/__init__.py

Whitespace-only changes.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
import pytest
6+
from scrapy import Field, Item, Spider
7+
8+
from apify import Actor
9+
from apify.scrapy.pipelines import ActorDatasetPushPipeline
10+
11+
12+
class DummySpider(Spider):
    """Minimal spider stub; the pipeline under test only needs *some* spider."""

    name = 'dummy_spider'
14+
15+
16+
class DummyItem(Item):
    """Generic three-field item used to exercise the pipeline with mixed value types."""

    a = Field()
    b = Field()
    c = Field()
20+
21+
22+
class TitleItem(Item):
    """Item shaped like a typical scrape result: a URL plus a page title."""

    url = Field()
    title = Field()
25+
26+
27+
@pytest.fixture()
def spider() -> DummySpider:
    """Provide a fresh DummySpider instance for each test."""
    dummy = DummySpider()
    return dummy
31+
32+
33+
@pytest.fixture()
def pipeline() -> ActorDatasetPushPipeline:
    """Provide a fresh ActorDatasetPushPipeline instance for each test."""
    dataset_pipeline = ActorDatasetPushPipeline()
    return dataset_pipeline
37+
38+
39+
@dataclass(frozen=True)
class TestCase:
    """One parametrized scenario for the dataset-push pipeline test.

    This is a data holder, not a test class, but its name matches pytest's
    default ``Test*`` collection pattern and (as a dataclass) it has an
    ``__init__``, which makes pytest emit a collection warning. Setting
    ``__test__ = False`` explicitly opts it out of collection.
    """

    __test__ = False  # not a test class; keep pytest from trying to collect it

    # Scrapy item fed to the pipeline (None to provoke the error path).
    item: Item
    # Plain-dict form the pipeline is expected to push to the dataset.
    item_dict: dict
    # Exception type the call should raise, or None for the success path.
    expected_exception: type[Exception] | None
44+
45+
46+
# Parametrized scenarios for `test__process_item` below.
test_cases = [
    # Item with mixed value types: pushed to the dataset as its dict form.
    TestCase(
        item=DummyItem(a='string', b=123, c=False),
        item_dict={'a': 'string', 'b': 123, 'c': False},
        expected_exception=None,
    ),
    # Typical scraped item (url + title): also the success path.
    TestCase(
        item=TitleItem(url='https://example.com', title='Example'),
        item_dict={'url': 'https://example.com', 'title': 'Example'},
        expected_exception=None,
    ),
    # Non-item input (None): the pipeline is expected to raise TypeError.
    TestCase(
        item=None,
        item_dict={},
        expected_exception=TypeError,
    ),
]
63+
64+
65+
@pytest.mark.parametrize('tc', test_cases)
async def test__process_item(
    monkeypatch: pytest.MonkeyPatch,
    pipeline: ActorDatasetPushPipeline,
    spider: Spider,
    tc: TestCase,
) -> None:
    """The pipeline pushes each item via Actor.push_data and returns the item unchanged."""
    pushed: list = []

    async def fake_push_data(item: dict) -> None:
        pushed.append(item)

    # Intercept dataset writes so the test never talks to a real Actor run.
    monkeypatch.setattr(Actor, 'push_data', fake_push_data)

    if tc.expected_exception:
        with pytest.raises(tc.expected_exception):
            await pipeline.process_item(tc.item, spider)
        return

    result = await pipeline.process_item(tc.item, spider)
    assert result == tc.item
    assert pushed == [tc.item_dict]

tests/unit/scrapy/test_utils.py

Lines changed: 0 additions & 21 deletions
This file was deleted.

tests/unit/scrapy/utils/__init__.py

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
import pytest
6+
7+
from apify.scrapy import get_basic_auth_header
8+
9+
10+
@dataclass(frozen=True)
class TestCase:
    """A username/password pair with its expected Basic-auth header value.

    Named ``TestCase`` it matches pytest's default ``Test*`` collection
    pattern, and its dataclass-generated ``__init__`` triggers a collection
    warning; ``__test__ = False`` opts it out of collection explicitly.
    """

    __test__ = False  # not a test class; keep pytest from trying to collect it

    username: str
    password: str
    # Full header value, i.e. b'Basic ' + base64("username:password").
    expected_auth_header: bytes
15+
16+
17+
# (username, password, expected header) triples; the bytes value is
# b'Basic ' + base64("username:password") precomputed by hand.
test_cases = [
    TestCase('username', 'password', b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
    TestCase('john_smith', 'secret_password_123', b'Basic am9obl9zbWl0aDpzZWNyZXRfcGFzc3dvcmRfMTIz'),
]
21+
22+
23+
@pytest.mark.parametrize('tc', test_cases)
def test__get_basic_auth_header(tc: TestCase) -> None:
    """The header built from the credentials matches the precomputed value."""
    assert get_basic_auth_header(tc.username, tc.password) == tc.expected_auth_header
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
import pytest
6+
from scrapy import Request, Spider
7+
8+
from apify.scrapy.utils import to_apify_request
9+
10+
11+
class DummySpider(Spider):
    """Minimal spider stub; `to_apify_request` only needs *some* spider argument."""

    name = 'dummy_spider'
13+
14+
15+
@pytest.fixture()
def spider() -> DummySpider:
    """Provide a fresh DummySpider instance for each test."""
    dummy = DummySpider()
    return dummy
19+
20+
21+
@dataclass(frozen=True)
class TestCase:
    """One scenario for converting a Scrapy request into an Apify request dict.

    Not a test class: the ``TestCase`` name matches pytest's default
    ``Test*`` collection pattern and the dataclass ``__init__`` triggers a
    collection warning, so ``__test__ = False`` opts it out explicitly.
    """

    __test__ = False  # not a test class; keep pytest from trying to collect it

    # Input passed to to_apify_request (deliberately invalid in some cases).
    scrapy_request: Request
    # Expected output dict, or None when an exception is expected instead.
    expected_apify_request: dict | None
    # Exception type the call should raise, or None for the success path.
    expected_exception: type[Exception] | None
26+
27+
28+
# Scenarios for `test__to_apify_request`. The 'userData.scrapy_request'
# values are placeholders: the test below only asserts 'url' and 'method'.
test_cases = [
    # Valid Scrapy request with 'apify_request_id' and 'apify_request_unique_key'
    TestCase(
        scrapy_request=Request(
            url='https://example.com',
            method='GET',
            meta={'apify_request_id': 'abc123', 'apify_request_unique_key': 'https://example.com'},
        ),
        expected_apify_request={
            'url': 'https://example.com',
            'method': 'GET',
            'id': 'abc123',
            'uniqueKey': 'https://example.com',
            'userData': {'scrapy_request': 'gANjCg...'},  # Example base64-encoded pickle data
        },
        expected_exception=None,
    ),
    # Valid Scrapy request without 'apify_request_id' and 'apify_request_unique_key'
    TestCase(
        scrapy_request=Request(url='https://apify.com', method='GET'),
        expected_apify_request={
            'url': 'https://apify.com',
            'method': 'GET',
            'userData': {'scrapy_request': 'fhSnfa...'},  # Example base64-encoded pickle data
        },
        expected_exception=None,
    ),
    # Invalid Scrapy request (not an instance of scrapy.Request)
    TestCase(
        scrapy_request=Spider(name='invalid_request'),  # Not a valid Scrapy request
        expected_apify_request=None,
        expected_exception=TypeError,
    ),
]
62+
63+
64+
@pytest.mark.parametrize('tc', test_cases)
def test__to_apify_request(spider: Spider, tc: TestCase) -> None:
    """to_apify_request yields the expected url/method, or raises on invalid input."""
    if tc.expected_exception:
        with pytest.raises(tc.expected_exception):
            to_apify_request(tc.scrapy_request, spider)
        return

    converted = to_apify_request(tc.scrapy_request, spider)

    assert isinstance(converted, dict)
    assert tc.expected_apify_request is not None
    # Only url and method are stable enough to compare; the pickled
    # 'userData' payload differs between runs.
    for key in ('url', 'method'):
        assert converted.get(key) == tc.expected_apify_request.get(key)
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
import pytest
6+
from scrapy import Request, Spider
7+
8+
from apify.scrapy.utils import to_scrapy_request
9+
10+
11+
class DummySpider(Spider):
    """Minimal spider stub; `to_scrapy_request` only needs *some* spider argument."""

    name = 'dummy_spider'
13+
14+
15+
@pytest.fixture()
def spider() -> DummySpider:
    """Provide a fresh DummySpider instance for each test."""
    dummy = DummySpider()
    return dummy
19+
20+
21+
@dataclass(frozen=True)
class TestCase:
    """One scenario for converting an Apify request dict into a Scrapy Request.

    Not a test class: the ``TestCase`` name matches pytest's default
    ``Test*`` collection pattern and the dataclass ``__init__`` triggers a
    collection warning, so ``__test__ = False`` opts it out explicitly.
    """

    __test__ = False  # not a test class; keep pytest from trying to collect it

    # Input dict passed to to_scrapy_request (may be deliberately malformed).
    apify_request: dict
    # Expected reconstructed request, or None when an exception is expected.
    expected_scrapy_request: Request | None
    # Exception type the call should raise, or None for the success path.
    expected_exception: type[Exception] | None
26+
27+
28+
# Scenarios for `test__to_scrapy_request` below.
test_cases = [
    # Valid Apify request without 'userData' (directly from Request Queue)
    TestCase(
        apify_request={'url': 'https://apify.com/', 'method': 'GET', 'uniqueKey': 'https://apify.com/', 'id': 'fvwscO2UJLdr10B'},
        expected_scrapy_request=Request(
            url='https://apify.com/',
            method='GET',
            meta={'apify_request_id': 'fvwscO2UJLdr10B', 'apify_request_unique_key': 'https://apify.com/'},
        ),
        expected_exception=None,
    ),
    # Valid Apify request with 'userData' (reconstruction from encoded Scrapy request)
    TestCase(
        apify_request={
            'url': 'https://apify.com',
            'method': 'GET',
            'id': 'fvwscO2UJLdr10B',
            'uniqueKey': 'https://apify.com',
            'userData': {
                # Base64-encoded pickle of a real Scrapy Request (headers, meta, etc.).
                'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
            },
        },
        expected_scrapy_request=Request(
            url='https://apify.com',
            method='GET',
            meta={'apify_request_id': 'fvwscO2UJLdr10B', 'apify_request_unique_key': 'https://apify.com'},
        ),
        expected_exception=None,
    ),
    # Invalid Apify request (missing 'url' key)
    TestCase(
        apify_request={'method': 'GET', 'id': 'invalid123', 'uniqueKey': 'https://invalid.com'},
        expected_scrapy_request=None,
        expected_exception=ValueError,
    ),
    # Invalid Apify request (missing 'id' key)
    TestCase(
        apify_request={'url': 'https://example.com', 'method': 'GET', 'uniqueKey': 'invalid123'},
        expected_scrapy_request=None,
        expected_exception=ValueError,
    ),
    # Invalid Apify request (non-string 'userData.scrapy_request')
    TestCase(
        apify_request={
            'url': 'https://example.com',
            'method': 'GET',
            'id': 'invalid123',
            'uniqueKey': 'https://example.com',
            'userData': {'scrapy_request': 123},
        },
        expected_scrapy_request=None,
        expected_exception=TypeError,
    ),
]
82+
83+
84+
@pytest.mark.parametrize('tc', test_cases)
def test__to_scrapy_request(spider: Spider, tc: TestCase) -> None:
    """to_scrapy_request rebuilds a Scrapy Request, or raises on malformed input."""
    if tc.expected_exception:
        with pytest.raises(tc.expected_exception):
            to_scrapy_request(tc.apify_request, spider)
        return

    rebuilt = to_scrapy_request(tc.apify_request, spider)

    assert isinstance(rebuilt, Request)
    assert tc.expected_scrapy_request is not None
    assert rebuilt.url == tc.expected_scrapy_request.url
    assert rebuilt.method == tc.expected_scrapy_request.method

    # Check meta fields
    for meta_key in ('apify_request_id', 'apify_request_unique_key'):
        assert rebuilt.meta.get(meta_key) == tc.expected_scrapy_request.meta.get(meta_key)

    # Check if meta field is updated properly when apify_request comes from Scrapy
    if 'userData' in tc.apify_request and 'scrapy_request' in tc.apify_request['userData']:
        assert rebuilt.meta['apify_request_id'] == tc.apify_request['id']
        assert rebuilt.meta['apify_request_unique_key'] == tc.apify_request['uniqueKey']

0 commit comments

Comments
 (0)