feat: Add requests argument to EnqueueLinksFunction #1024

Merged: 11 commits, Apr 3, 2025
36 changes: 36 additions & 0 deletions docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py
@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
36 changes: 36 additions & 0 deletions docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py
@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
22 changes: 21 additions & 1 deletion docs/examples/crawl_specific_links_on_website.mdx
@@ -11,9 +11,12 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py';

import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py';
import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py';

This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.

<Tabs groupId="main">
<Tabs groupId="first-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python" language="python">
{BeautifulSoupExample}
@@ -25,3 +28,20 @@
</RunnableCodeBlock>
</TabItem>
</Tabs>

## Even more control over the enqueued links

<ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is a convenience helper: internally it calls <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> to find the links and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> instead of <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>.

<Tabs groupId="second-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python">
{BeautifulSoupExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
<RunnableCodeBlock className="language-python">
{PlaywrightExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
</Tabs>
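
The new `requests` argument that this pull request adds to `enqueue_links` is not shown in either tab above. Below is a minimal sketch of how a handler might use it; the URLs, the `label` value, and the handler body are illustrative and not taken from the PR's examples.

import asyncio

from crawlee import Request
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue explicitly constructed requests instead of links extracted from the page.
        # Strings and `Request` objects can be mixed in the same call.
        await context.enqueue_links(
            requests=[
                'https://crawlee.dev/docs/quick-start',
                Request.from_url('https://crawlee.dev/docs/examples', label='EXAMPLES'),
            ],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Note that per this PR, `requests` cannot be combined with `selector`, `label`, `user_data` or `transform_request_function`; labels therefore go on the `Request` objects themselves, as above.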
69 changes: 64 additions & 5 deletions src/crawlee/_types.py
@@ -324,24 +324,83 @@ def __call__(

@docs_group('Functions')
class EnqueueLinksFunction(Protocol):
"""A function for enqueueing new URLs to crawl based on elements selected by a given selector.
"""A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.

It extracts URLs from the current page and enqueues them for further crawling. It allows filtering through
selectors and other options. You can also specify labels and user data to be associated with the newly
created `Request` objects.
It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues
them for further crawling. It allows filtering through selectors and other options. You can also specify labels and
user data to be associated with the newly created `Request` objects.

It should not be called with the `selector`, `label`, `user_data` or `transform_request_function` arguments together
with the `requests` argument.

For even more control over the enqueued links, you can use a combination of `ExtractLinksFunction` and
`AddRequestsFunction`.
"""

@overload
def __call__(
self,
*,
selector: str = 'a',
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]: ...

@overload
def __call__(
self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
) -> Coroutine[None, None, None]: ...

def __call__(
self,
*,
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
requests: Sequence[str | Request] | None = None,
Review thread on the `requests` parameter:

janbuchar (Collaborator), Feb 28, 2025:
I now noticed that the JS counterpart accepts just urls as an array of strings. We should either restrict this, or extend the JS version 🙂
If we choose restricting this one, then most of the other parameters (barring selector) would actually start making sense in combination with urls.

Reply from a collaborator:
I would prefer to keep it as it is for consistency, since we use request: str | Request everywhere else.

**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""Call enqueue links function.

Args:
selector: A selector used to find the elements containing the links. The behaviour differs based
on the crawler used:
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
- Modified `RequestOptions` to update the request configuration,
- `'skip'` to exclude the request from being enqueued,
- `'unchanged'` to use the original request options without modification.
requests: Requests to be added to the `RequestManager`.
**kwargs: Additional keyword arguments.
"""

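Taken together, the overloads describe two mutually exclusive call styles. A minimal hypothetical fragment, assuming a handler that receives a crawling context such as `BeautifulSoupCrawlingContext` (the selector, label, and URLs are illustrative):

from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Selector-based overload: extract links from the current page and enqueue them.
    await context.enqueue_links(selector='a', label='DETAIL')

    # Requests-based overload: enqueue explicitly passed requests (strings or `Request` objects).
    await context.enqueue_links(requests=['https://crawlee.dev/docs/quick-start'])

    # Combining the two styles is rejected at runtime with a ValueError, as stated in the docstring above.
    # await context.enqueue_links(requests=['https://crawlee.dev'], selector='a')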

@docs_group('Functions')
class ExtractLinksFunction(Protocol):
"""A function for extracting URLs to crawl based on elements selected by a given selector.

It extracts URLs from the current page and allows filtering through selectors and other options. You can also
specify labels and user data to be associated with the newly created `Request` objects.
"""

def __call__(
self,
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, list[Request]]:
"""Call extract links function.

Args:
selector: A selector used to find the elements containing the links. The behaviour differs based
on the crawler used:
70 changes: 59 additions & 11 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -2,7 +2,7 @@

import logging
from abc import ABC
from typing import TYPE_CHECKING, Any, Callable, Generic
from typing import TYPE_CHECKING, Any, Callable, Generic, Union

from pydantic import ValidationError
from typing_extensions import TypeVar
@@ -17,12 +17,12 @@
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable
from collections.abc import AsyncGenerator, Awaitable, Sequence

from typing_extensions import Unpack

from crawlee import RequestTransformAction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction

from ._abstract_http_parser import AbstractHttpParser

@@ -124,34 +124,36 @@ async def _parse_http_response(
The original crawling context enhanced by the parsing result and enqueue links function.
"""
parsed_content = await self._parser.parse(context.http_response)
extract_links = self._create_extract_links_function(context, parsed_content)
yield ParsedHttpCrawlingContext.from_http_crawling_context(
context=context,
parsed_content=parsed_content,
enqueue_links=self._create_enqueue_links_function(context, parsed_content),
enqueue_links=self._create_enqueue_links_function(context, extract_links),
extract_links=extract_links,
)

def _create_enqueue_links_function(
def _create_extract_links_function(
self, context: HttpCrawlingContext, parsed_content: TParseResult
) -> EnqueueLinksFunction:
"""Create a callback function for extracting links from parsed content and enqueuing them to the crawl.
) -> ExtractLinksFunction:
"""Create a callback function for extracting links from parsed content.

Args:
context: The current crawling context.
parsed_content: The parsed http response.

Returns:
Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
Awaitable that is used for extracting links from parsed content.
"""

async def enqueue_links(
async def extract_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
) -> list[Request]:
kwargs.setdefault('strategy', 'same-hostname')

requests = list[Request]()
@@ -183,8 +185,54 @@ async def enqueue_links(
continue

requests.append(request)
return requests

return extract_links

def _create_enqueue_links_function(
self, context: HttpCrawlingContext, extract_links: ExtractLinksFunction
) -> EnqueueLinksFunction:
"""Create a callback function for extracting links from parsed content and enqueuing them to the crawl.

Args:
context: The current crawling context.
extract_links: Function used to extract links from the page.

await context.add_requests(requests, **kwargs)
Returns:
Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
"""

async def enqueue_links(
*,
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
requests: Sequence[str | Request] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', 'same-hostname')

if requests:
if any((selector, label, user_data, transform_request_function)):
raise ValueError(
'You cannot provide `selector`, `label`, `user_data` or '
'`transform_request_function` arguments when `requests` is provided.'
)
# Add directly passed requests.
await context.add_requests(requests or list[Union[str, Request]](), **kwargs)
else:
# Add requests from extracted links.
await context.add_requests(
await extract_links(
selector=selector or 'a',
label=label,
user_data=user_data,
transform_request_function=transform_request_function,
),
**kwargs,
)

return enqueue_links

8 changes: 6 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
@@ -5,7 +5,7 @@

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

@@ -35,14 +35,18 @@ class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):

parsed_content: TParseResult
enqueue_links: EnqueueLinksFunction
extract_links: ExtractLinksFunction

@classmethod
def from_http_crawling_context(
cls,
context: HttpCrawlingContext,
parsed_content: TParseResult,
enqueue_links: EnqueueLinksFunction,
extract_links: ExtractLinksFunction,
) -> Self:
"""Initialize a new instance from an existing `HttpCrawlingContext`."""
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs)
return cls(
parsed_content=parsed_content, enqueue_links=enqueue_links, extract_links=extract_links, **context_kwargs
)