Skip to content

Commit 45788bc

Browse files
authored
Replace asserts with custom checks in Scrapy subpackage (#174)
1 parent fbf2465 commit 45788bc

File tree

5 files changed

+40
-24
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44

55
### Internal changes
66

7+
- Create a new subpackage for Scrapy pipelines
78
- Remove some noqas thanks to the new Ruff release
9+
- Replace asserts with custom checks in Scrapy subpackage
810

911
### Fixed
1012

src/apify/scrapy/middlewares/apify_proxy.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
43
from urllib.parse import ParseResult, urlparse
54

65
try:
6+
from scrapy import Request, Spider # noqa: TCH002
77
from scrapy.core.downloader.handlers.http11 import TunnelError
8+
from scrapy.crawler import Crawler # noqa: TCH002
89
from scrapy.exceptions import NotConfigured
910
except ImportError as exc:
1011
raise ImportError(
@@ -15,10 +16,6 @@
1516
from ...proxy_configuration import ProxyConfiguration
1617
from ..utils import get_basic_auth_header
1718

18-
if TYPE_CHECKING:
19-
from scrapy import Request, Spider
20-
from scrapy.crawler import Crawler
21-
2219

2320
class ApifyHttpProxyMiddleware:
2421
"""Apify HTTP proxy middleware for Scrapy.

src/apify/scrapy/middlewares/apify_retry.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
from typing import TYPE_CHECKING, Any
55

66
try:
7+
from scrapy import Spider # noqa: TCH002
78
from scrapy.downloadermiddlewares.retry import RetryMiddleware
9+
from scrapy.http import Request, Response # noqa: TCH002
810
from scrapy.utils.response import response_status_message
911
except ImportError as exc:
1012
raise ImportError(
@@ -15,9 +17,6 @@
1517
from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
1618

1719
if TYPE_CHECKING:
18-
from scrapy import Spider
19-
from scrapy.http import Request, Response
20-
2120
from ...storages import RequestQueue
2221

2322

src/apify/scrapy/scheduler.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def has_pending_requests(self: ApifyScheduler) -> bool:
5555
Returns:
5656
True if the scheduler has any pending requests, False otherwise.
5757
"""
58-
assert isinstance(self._rq, RequestQueue) # noqa: S101
58+
if not isinstance(self._rq, RequestQueue):
59+
raise TypeError('self._rq must be an instance of the RequestQueue class')
60+
5961
try:
6062
is_finished = nested_event_loop.run_until_complete(self._rq.is_finished())
6163
except BaseException:
@@ -76,10 +78,14 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
7678
call_id = crypto_random_object_id(8)
7779
Actor.log.debug(f'[{call_id}]: ApifyScheduler.enqueue_request was called (scrapy_request={request})...')
7880

79-
assert isinstance(self.spider, Spider) # noqa: S101
81+
if not isinstance(self.spider, Spider):
82+
raise TypeError('self.spider must be an instance of the Spider class')
83+
8084
apify_request = to_apify_request(request, spider=self.spider)
8185
Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
82-
assert isinstance(self._rq, RequestQueue) # noqa: S101
86+
87+
if not isinstance(self._rq, RequestQueue):
88+
raise TypeError('self._rq must be an instance of the RequestQueue class')
8389

8490
try:
8591
result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
@@ -98,7 +104,9 @@ def next_request(self: ApifyScheduler) -> Request | None:
98104
"""
99105
call_id = crypto_random_object_id(8)
100106
Actor.log.debug(f'[{call_id}]: ApifyScheduler.next_request was called...')
101-
assert isinstance(self._rq, RequestQueue) # noqa: S101
107+
108+
if not isinstance(self._rq, RequestQueue):
109+
raise TypeError('self._rq must be an instance of the RequestQueue class')
102110

103111
try:
104112
apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
@@ -111,7 +119,9 @@ def next_request(self: ApifyScheduler) -> Request | None:
111119
if apify_request is None:
112120
return None
113121

114-
assert isinstance(self.spider, Spider) # noqa: S101
122+
if not isinstance(self.spider, Spider):
123+
raise TypeError('self.spider must be an instance of the Spider class')
124+
115125
scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
116126
Actor.log.debug(
117127
f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',

src/apify/scrapy/utils.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,9 @@
66
from base64 import b64encode
77
from urllib.parse import unquote
88

9-
from scrapy.utils.python import to_bytes
10-
119
try:
1210
from scrapy import Request, Spider
11+
from scrapy.utils.python import to_bytes
1312
from scrapy.utils.request import request_from_dict
1413
except ImportError as exc:
1514
raise ImportError(
@@ -51,7 +50,8 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
5150
Returns:
5251
The converted Apify request.
5352
"""
54-
assert isinstance(scrapy_request, Request) # noqa: S101
53+
if not isinstance(scrapy_request, Request):
54+
raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
5555

5656
call_id = crypto_random_object_id(8)
5757
Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
@@ -91,11 +91,14 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
9191
Returns:
9292
The converted Scrapy request.
9393
"""
94-
assert isinstance(apify_request, dict) # noqa: S101
95-
assert 'url' in apify_request # noqa: S101
96-
assert 'method' in apify_request # noqa: S101
97-
assert 'id' in apify_request # noqa: S101
98-
assert 'uniqueKey' in apify_request # noqa: S101
94+
if not isinstance(apify_request, dict):
95+
raise TypeError('apify_request must be a dictionary')
96+
97+
required_keys = ['url', 'method', 'id', 'uniqueKey']
98+
missing_keys = [key for key in required_keys if key not in apify_request]
99+
100+
if missing_keys:
101+
raise ValueError(f"apify_request must contain {', '.join(map(repr, missing_keys))} key(s)")
99102

100103
call_id = crypto_random_object_id(8)
101104
Actor.log.debug(f'[{call_id}]: to_scrapy_request was called (apify_request={apify_request})...')
@@ -106,14 +109,19 @@ def to_scrapy_request(apify_request: dict, spider: Spider) -> Request:
106109
# - This process involves decoding the base64-encoded request data and reconstructing
107110
# the Scrapy Request object from its dictionary representation.
108111
Actor.log.debug(f'[{call_id}]: Restoring the Scrapy Request from the apify_request...')
112+
109113
scrapy_request_dict_encoded = apify_request['userData']['scrapy_request']
110-
assert isinstance(scrapy_request_dict_encoded, str) # noqa: S101
114+
if not isinstance(scrapy_request_dict_encoded, str):
115+
raise TypeError('scrapy_request_dict_encoded must be a string')
111116

112117
scrapy_request_dict = pickle.loads(codecs.decode(scrapy_request_dict_encoded.encode(), 'base64'))
113-
assert isinstance(scrapy_request_dict, dict) # noqa: S101
118+
if not isinstance(scrapy_request_dict, dict):
119+
raise TypeError('scrapy_request_dict must be a dictionary')
114120

115121
scrapy_request = request_from_dict(scrapy_request_dict, spider=spider)
116-
assert isinstance(scrapy_request, Request) # noqa: S101
122+
if not isinstance(scrapy_request, Request):
123+
raise TypeError('scrapy_request must be an instance of the Request class')
124+
117125
Actor.log.debug(f'[{call_id}]: Scrapy Request successfully reconstructed (scrapy_request={scrapy_request})...')
118126

119127
# Update the meta field with the meta field from the apify_request

0 commit comments

Comments (0)