Commit c3d72eb

Add test server and some top level Crawler tests
1 parent 20ecb24 commit c3d72eb

6 files changed: +312 additions, -96 deletions


pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -36,14 +36,15 @@ keywords = [
 dependencies = [
     "apify-client>=1.11.0",
     "apify-shared>=1.3.0",
-    "crawlee~=0.6.0",
+    "crawlee[parsel]~=0.6.0",
     "cryptography>=42.0.0",
     "httpx>=0.27.0",
     # TODO: ensure compatibility with the latest version of lazy-object-proxy
     # https://github.com/apify/apify-sdk-python/issues/460
     "lazy-object-proxy<1.11.0",
     "more_itertools>=10.2.0",
     "typing-extensions>=4.1.0",
+    "uvicorn",
     "websockets>=14.0",
 ]

tests/integration/actor_source_base/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@ RUN echo "Python version:" \
     && echo "All installed Python packages:" \
     && pip freeze

-CMD ["python3", "-m", "src"]
+CMD ["sh", "-c", "python test_server.py & python -m src"]
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+"""
+Test server exposing an infinite site at http://localhost:8080/{any_number}; each page contains links to the next 10 pages.
+For example:
+http://localhost:8080/ contains links:
+http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9
+
+http://localhost:8080/1 contains links:
+http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19
+
+... and so on.
+"""
+
+import asyncio
+import logging
+from collections.abc import Awaitable, Callable, Coroutine
+from typing import Any
+
+from uvicorn import Config
+from uvicorn.server import Server
+from yarl import URL
+
+Receive = Callable[[], Awaitable[dict[str, Any]]]
+Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]
+
+
+async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
+    """Send an HTML response to the client."""
+    await send(
+        {
+            'type': 'http.response.start',
+            'status': status,
+            'headers': [[b'content-type', b'text/html; charset=utf-8']],
+        }
+    )
+    await send({'type': 'http.response.body', 'body': html_content})
+
+
+async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
+    """Main ASGI application handler; it serves a generated page of links for every path.
+
+    Args:
+        scope: The ASGI connection scope.
+        _: The ASGI receive function.
+        send: The ASGI send function.
+    """
+    assert scope['type'] == 'http'
+    path = scope['path']
+
+    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
+    await send_html_response(
+        send,
+        f"""\
+<html><head>
+<title>Title for {path} </title>
+</head>
+<body>
+{links}
+</body></html>""".encode(),
+    )
+
+
+class TestServer(Server):
+    """A test HTTP server implementation based on Uvicorn Server."""
+
+    @property
+    def url(self) -> URL:
+        """Get the base URL of the server.
+
+        Returns:
+            A URL instance with the server's base URL.
+        """
+        protocol = 'https' if self.config.is_ssl else 'http'
+        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')
+
+    async def serve(self) -> None:
+        """Run the server."""
+        self.restart_requested = asyncio.Event()
+
+        loop = asyncio.get_event_loop()
+        tasks = {
+            loop.create_task(super().serve()),
+        }
+        await asyncio.wait(tasks)
+
+
+if __name__ == '__main__':
+    asyncio.run(
+        TestServer(
+            config=Config(
+                app=app,
+                lifespan='off',
+                loop='asyncio',
+                port=8080,
+                log_config=None,
+                log_level=logging.CRITICAL,
+            )
+        ).serve()
+    )
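
The test server above is started by the Dockerfile change and gives the crawler tests a deterministic, infinitely deep site to walk. Below is a minimal sketch (not part of the commit) of how its documented link structure can be checked, assuming the server is already running on localhost:8080 and using httpx, which is already a project dependency:

    # Hypothetical smoke check for the test server's documented link structure.
    import httpx

    response = httpx.get('http://localhost:8080/1')
    response.raise_for_status()

    # The page for path /1 should contain anchors for /10 through /19.
    for i in range(10):
        assert f'href="/1{i}"' in response.text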

tests/integration/test_actor_api_helpers.py

Lines changed: 2 additions & 2 deletions
@@ -400,12 +400,12 @@ async def main_server() -> None:
         async with Actor:

             class WebhookHandler(BaseHTTPRequestHandler):
-                def do_GET(self) -> None:  # noqa: N802
+                def do_GET(self) -> None:
                     self.send_response(200)
                     self.end_headers()
                     self.wfile.write(bytes('Hello, world!', encoding='utf-8'))

-                def do_POST(self) -> None:  # noqa: N802
+                def do_POST(self) -> None:
                     nonlocal webhook_body
                     content_length = self.headers.get('content-length')
                     length = int(content_length) if content_length else 0
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+from tests.integration.conftest import MakeActorFunction, RunActorFunction
+
+
+async def test_actor_on_platform_max_crawl_depth(
+    make_actor: MakeActorFunction,
+    run_actor: RunActorFunction,
+) -> None:
+    """Test that the actor respects max_crawl_depth."""
+
+    async def main() -> None:
+        """The crawler entry point."""
+        import re
+
+        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+        from apify import Actor
+
+        async with Actor:
+            crawler = ParselCrawler(max_crawl_depth=2)
+            finished = []
+            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')
+
+            @crawler.router.default_handler
+            async def default_handler(context: ParselCrawlingContext) -> None:
+                """Default request handler."""
+                context.log.info(f'Processing {context.request.url} ...')
+                await context.enqueue_links(include=[enqueue_pattern])
+                await context.push_data({'Url': context.request.url})
+                finished.append(context.request.url)
+
+            await crawler.run(['http://localhost:8080/'])
+            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']
+            # assert some dataset
+
+    actor = await make_actor(label='parsel-crawler', main_func=main)
+    run_result = await run_actor(actor)
+
+    assert run_result.status == 'SUCCEEDED'
+
+
+async def test_actor_on_platform_max_requests_per_crawl(
+    make_actor: MakeActorFunction,
+    run_actor: RunActorFunction,
+) -> None:
+    """Test that the actor respects max_requests_per_crawl."""
+
+    async def main() -> None:
+        """The crawler entry point."""
+        from crawlee import ConcurrencySettings
+        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+
+        from apify import Actor
+
+        async with Actor:
+            crawler = ParselCrawler(
+                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
+            )
+            finished = []
+
+            @crawler.router.default_handler
+            async def default_handler(context: ParselCrawlingContext) -> None:
+                """Default request handler."""
+                context.log.info(f'Processing {context.request.url} ...')
+                await context.enqueue_links()
+                await context.push_data({'Url': context.request.url})
+                finished.append(context.request.url)
+
+            await crawler.run(['http://localhost:8080/'])
+            assert len(finished) == 3
+            # assert some dataset
+
+    actor = await make_actor(label='parsel-crawler', main_func=main)
+    run_result = await run_actor(actor)
+
+    assert run_result.status == 'SUCCEEDED'
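
For context on the first test's expected result: the enqueue pattern admits only URLs whose path is one or more '2' characters, and max_crawl_depth=2 allows two hops from the start URL, so the crawl visits exactly /, /2 and /22. A minimal sketch (not part of the commit) of which links the pattern admits:

    # Hypothetical illustration of the include pattern used in test_actor_on_platform_max_crawl_depth.
    import re

    enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

    assert enqueue_pattern.match('http://localhost:8080/2')       # enqueued from /
    assert enqueue_pattern.match('http://localhost:8080/22')      # enqueued from /2
    assert not enqueue_pattern.match('http://localhost:8080/21')  # rejected: mixed digits
    assert not enqueue_pattern.match('http://localhost:8080/')    # rejected: path has no '2'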
