feat: Add requests argument to EnqueueLinksFunction #1024

Merged: 11 commits, Apr 3, 2025
36 changes: 36 additions & 0 deletions docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py
@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
36 changes: 36 additions & 0 deletions docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py
@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
22 changes: 21 additions & 1 deletion docs/examples/crawl_specific_links_on_website.mdx
@@ -11,9 +11,12 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py';
import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py';

import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py';
import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py';

This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.

<Tabs groupId="main">
<Tabs groupId="first-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python" language="python">
{BeautifulSoupExample}
@@ -25,3 +28,20 @@
</RunnableCodeBlock>
</TabItem>
</Tabs>

## Even more control over the enqueued links

<ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is a convenience helper: internally it calls <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> to find the links and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> instead of <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>.

<Tabs groupId="second-example">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<RunnableCodeBlock className="language-python">
{BeautifulSoupExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
<RunnableCodeBlock className="language-python">
{PlaywrightExampleExtractAndAdd}
</RunnableCodeBlock>
</TabItem>
</Tabs>
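
The new `requests` argument that this pull request adds to `enqueue_links` is not shown in either tab above. Below is a minimal sketch of how a handler might use it; the URLs, the `label` value, and the handler body are illustrative and not taken from the PR's examples.

import asyncio

from crawlee import Request
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=10)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue explicitly constructed requests instead of links extracted from the page.
        # Strings and `Request` objects can be mixed in the same call.
        await context.enqueue_links(
            requests=[
                'https://crawlee.dev/docs/quick-start',
                Request.from_url('https://crawlee.dev/docs/examples', label='EXAMPLES'),
            ],
        )

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())

Note that per this PR, `requests` cannot be combined with `selector`, `label`, `user_data` or `transform_request_function`; labels therefore go on the `Request` objects themselves, as above.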
69 changes: 64 additions & 5 deletions src/crawlee/_types.py
@@ -324,24 +324,83 @@ def __call__(

@docs_group('Functions')
class EnqueueLinksFunction(Protocol):
"""A function for enqueueing new URLs to crawl based on elements selected by a given selector.
"""A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.

It extracts URLs from the current page and enqueues them for further crawling. It allows filtering through
selectors and other options. You can also specify labels and user data to be associated with the newly
created `Request` objects.
It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues
them for further crawling. It allows filtering through selectors and other options. You can also specify labels and
user data to be associated with the newly created `Request` objects.

It should not be called with the `selector`, `label`, `user_data` or `transform_request_function` arguments together
with the `requests` argument.

For even more control over the enqueued links, you can use a combination of `ExtractLinksFunction` and
`AddRequestsFunction`.
"""

@overload
def __call__(
self,
*,
selector: str = 'a',
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]: ...

@overload
def __call__(
self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
) -> Coroutine[None, None, None]: ...

def __call__(
self,
*,
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
requests: Sequence[str | Request] | None = None,
Review thread on the `requests` parameter:

janbuchar (Collaborator), Feb 28, 2025:
I now noticed that the JS counterpart accepts just urls as an array of strings. We should either restrict this, or extend the JS version 🙂
If we choose restricting this one, then most of the other parameters (barring selector) would actually start making sense in combination with urls.

Reply from a collaborator:
I would prefer to keep it as it is for consistency, since we use request: str | Request everywhere else.

**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""Call enqueue links function.

Args:
selector: A selector used to find the elements containing the links. The behaviour differs based
on the crawler used:
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
- Modified `RequestOptions` to update the request configuration,
- `'skip'` to exclude the request from being enqueued,
- `'unchanged'` to use the original request options without modification.
requests: Requests to be added to the `RequestManager`.
**kwargs: Additional keyword arguments.
"""

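Taken together, the overloads describe two mutually exclusive call styles. A minimal hypothetical fragment, assuming a handler that receives a crawling context such as `BeautifulSoupCrawlingContext` (the selector, label, and URLs are illustrative):

from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Selector-based overload: extract links from the current page and enqueue them.
    await context.enqueue_links(selector='a', label='DETAIL')

    # Requests-based overload: enqueue explicitly passed requests (strings or `Request` objects).
    await context.enqueue_links(requests=['https://crawlee.dev/docs/quick-start'])

    # Combining the two styles is rejected at runtime with a ValueError, as stated in the docstring above.
    # await context.enqueue_links(requests=['https://crawlee.dev'], selector='a')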

@docs_group('Functions')
class ExtractLinksFunction(Protocol):
"""A function for extracting URLs to crawl based on elements selected by a given selector.

It extracts URLs from the current page and allows filtering through selectors and other options. You can also
specify labels and user data to be associated with the newly created `Request` objects.
"""

def __call__(
self,
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, list[Request]]:
"""Call extract links function.

Args:
selector: A selector used to find the elements containing the links. The behaviour differs based
on the crawler used:
70 changes: 59 additions & 11 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -2,7 +2,7 @@

import logging
from abc import ABC
from typing import TYPE_CHECKING, Any, Callable, Generic
from typing import TYPE_CHECKING, Any, Callable, Generic, Union

from pydantic import ValidationError
from typing_extensions import TypeVar
@@ -17,12 +17,12 @@
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable
from collections.abc import AsyncGenerator, Awaitable, Sequence

from typing_extensions import Unpack

from crawlee import RequestTransformAction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction

from ._abstract_http_parser import AbstractHttpParser

@@ -124,34 +124,36 @@ async def _parse_http_response(
The original crawling context enhanced by the parsing result and enqueue links function.
"""
parsed_content = await self._parser.parse(context.http_response)
extract_links = self._create_extract_links_function(context, parsed_content)
yield ParsedHttpCrawlingContext.from_http_crawling_context(
context=context,
parsed_content=parsed_content,
enqueue_links=self._create_enqueue_links_function(context, parsed_content),
enqueue_links=self._create_enqueue_links_function(context, extract_links),
extract_links=extract_links,
)

def _create_enqueue_links_function(
def _create_extract_links_function(
self, context: HttpCrawlingContext, parsed_content: TParseResult
) -> EnqueueLinksFunction:
"""Create a callback function for extracting links from parsed content and enqueuing them to the crawl.
) -> ExtractLinksFunction:
"""Create a callback function for extracting links from parsed content.

Args:
context: The current crawling context.
parsed_content: The parsed http response.

Returns:
Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
Awaitable that is used for extracting links from parsed content.
"""

async def enqueue_links(
async def extract_links(
*,
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
) -> list[Request]:
kwargs.setdefault('strategy', 'same-hostname')

requests = list[Request]()
@@ -183,8 +185,54 @@ async def enqueue_links(
continue

requests.append(request)
return requests

return extract_links

def _create_enqueue_links_function(
self, context: HttpCrawlingContext, extract_links: ExtractLinksFunction
) -> EnqueueLinksFunction:
"""Create a callback function for extracting links from parsed content and enqueuing them to the crawl.

Args:
context: The current crawling context.
extract_links: Function used to extract links from the page.

await context.add_requests(requests, **kwargs)
Returns:
Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
"""

async def enqueue_links(
*,
selector: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
| None = None,
requests: Sequence[str | Request] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
kwargs.setdefault('strategy', 'same-hostname')

if requests:
if any((selector, label, user_data, transform_request_function)):
raise ValueError(
'You cannot provide `selector`, `label`, `user_data` or '
'`transform_request_function` arguments when `requests` is provided.'
)
# Add directly passed requests.
await context.add_requests(requests or list[Union[str, Request]](), **kwargs)
else:
# Add requests from extracted links.
await context.add_requests(
await extract_links(
selector=selector or 'a',
label=label,
user_data=user_data,
transform_request_function=transform_request_function,
),
**kwargs,
)

return enqueue_links

8 changes: 6 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_http_crawling_context.py
@@ -5,7 +5,7 @@

from typing_extensions import Self, TypeVar

from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
from crawlee._utils.docs import docs_group
from crawlee.http_clients import HttpCrawlingResult, HttpResponse

@@ -35,14 +35,18 @@ class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):

parsed_content: TParseResult
enqueue_links: EnqueueLinksFunction
extract_links: ExtractLinksFunction

@classmethod
def from_http_crawling_context(
cls,
context: HttpCrawlingContext,
parsed_content: TParseResult,
enqueue_links: EnqueueLinksFunction,
extract_links: ExtractLinksFunction,
) -> Self:
"""Initialize a new instance from an existing `HttpCrawlingContext`."""
context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs)
return cls(
parsed_content=parsed_content, enqueue_links=enqueue_links, extract_links=extract_links, **context_kwargs
)