Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async def main() -> None:
# Generator will generate real looking browser fingerprint based on the options.
# Unspecified fingerprint options will be automatically selected by the generator.
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
header_options=HeaderGeneratorOptions(browsers=['chrome']),
screen_options=ScreenOptions(min_width=400),
)

Expand Down
27 changes: 27 additions & 0 deletions docs/upgrading/upgrading_to_v1.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,33 @@ title: Upgrading to v1

This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.

## Distinct use of the word `browser` in similar contexts

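The word `browser` is used in two different contexts:
- Playwright-related browser (`chromium`, `firefox`, `webkit`)
- fingerprinting-related browser (`chrome`, `firefox`, `safari`, `edge`)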

The browser names accepted by `HeaderGeneratorOptions.browsers` changed from `Literal['chromium', 'firefox', 'webkit', 'edge']` to `Literal['chrome', 'firefox', 'safari', 'edge']`, because this option belongs to the fingerprinting context and not to the Playwright context.

Before:

```python
from crawlee.fingerprint_suite import HeaderGeneratorOptions

HeaderGeneratorOptions(browsers=['chromium'])
HeaderGeneratorOptions(browsers=['webkit'])
```

Now:

```python
from crawlee.fingerprint_suite import HeaderGeneratorOptions

HeaderGeneratorOptions(browsers=['chrome'])
HeaderGeneratorOptions(browsers=['safari'])
```


## Storage clients

In v1.0, we are introducing a new storage clients system. We have completely reworked their interface,
Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from crawlee._utils.docs import docs_group
from crawlee.browsers._browser_controller import BrowserController
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type

if TYPE_CHECKING:
from collections.abc import Mapping
Expand Down Expand Up @@ -234,7 +235,7 @@ async def _create_browser_context(
'sec-ch-ua-mobile',
'sec-ch-ua-platform',
},
browser_type=self.browser_type,
browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
)
)
else:
Expand Down
7 changes: 6 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import HttpxHttpClient
from crawlee.sessions._cookies import PlaywrightCookieParam
from crawlee.statistics import StatisticsState
Expand Down Expand Up @@ -158,7 +159,11 @@ def __init__(
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
else:
if fingerprint_generator == 'default':
generator_browser_type = None if browser_type is None else [browser_type]
if not browser_type:
generator_browser_type = None
else:
generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]

fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
)
Expand Down
57 changes: 24 additions & 33 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import random
from collections.abc import Iterable
from copy import deepcopy
from functools import reduce
Expand All @@ -11,7 +12,6 @@
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from browserforge.headers import Browser
from browserforge.headers.generator import DATA_DIR, ListOrString
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
from typing_extensions import override
Expand All @@ -22,6 +22,8 @@
from ._fingerprint_generator import FingerprintGenerator

if TYPE_CHECKING:
from browserforge.headers import Browser

from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType


Expand Down Expand Up @@ -69,15 +71,6 @@ def generate(
This patched version of the method adds additional quality checks on the output of the original method. It tries
to generate headers several times until they match the requirements.

The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome
but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`
input, such as:
```
Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1
```
To maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.

Returns:
A generated headers.
"""
Expand All @@ -86,21 +79,18 @@ def generate(

single_browser = self._get_single_browser_type(browser)

if single_browser == 'chromium':
# `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
# other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
if single_browser == 'chrome':
# The `BrowserForge` header generator treats `chrome` in a general sense and will therefore also generate
# headers of other `chrome`-based browsers. This adapter wants only the subset of `chrome` headers
# that contains all of the 'sec-ch-ua', 'sec-ch-ua-mobile' and 'sec-ch-ua-platform' headers.
# Increase the max attempts, because from the `BrowserForge` header generator's perspective even `chrome`
# headers without the `sec-...` headers are valid.
max_attempts += 50

# Browserforge uses term 'safari', we use term 'webkit'
bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser

# Use browserforge to generate headers until it satisfies our additional requirements.
for _attempt in range(max_attempts):
generated_header: dict[str, str] = super().generate(
browser=bf_browser_type,
browser=single_browser,
os=os,
device=device,
locale=locale,
Expand All @@ -120,7 +110,7 @@ def generate(
keyword in generated_header['User-Agent']
for keyword in self._get_expected_browser_keywords(single_browser)
):
if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header):
if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header):
# Accept a `chrome` header only if it contains all the sec headers.
continue

Expand All @@ -145,19 +135,20 @@ def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> s
Handling the original multitype would be pointlessly complex.
"""
# In our case we never pass more than one browser type. In general case more browsers are just bigger pool to
# select from, so narrowing it to the first one is still a valid action.
first_browser = (
next(iter(browser)) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser
)

if isinstance(first_browser, str):
single_name = first_browser
elif isinstance(first_browser, Browser):
single_name = first_browser.name
else:
single_name = None

return single_name
# select from, so narrowing it to any of them is still a valid action as we are going to pick just one anyway.
if isinstance(browser, str):
return browser
if isinstance(browser, Iterable):
Copy link
Preview

Copilot AI Jul 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Strings are iterable and will be treated as a sequence of characters here. Exclude str from this check (e.g., isinstance(browser, Iterable) and not isinstance(browser, str)) to avoid iterating over individual characters.

Suggested change
if isinstance(browser, Iterable):
if isinstance(browser, Iterable) and not isinstance(browser, str):

Copilot uses AI. Check for mistakes.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

choice = random.choice(
[
single_browser if isinstance(single_browser, str) else single_browser.name
for single_browser in browser
]
)
if choice in {'chrome', 'firefox', 'safari', 'edge'}:
return choice
raise ValueError('Invalid browser type.')
return None


class PatchedFingerprintGenerator(bf_FingerprintGenerator):
Expand Down Expand Up @@ -254,9 +245,9 @@ class BrowserforgeHeaderGenerator:
def __init__(self) -> None:
self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])

def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]:
"""Generate headers."""
return self._generator.generate(browser=browser_type)
return self._generator.generate(browser=[browser_type])


def get_available_header_network() -> dict:
Expand Down
4 changes: 2 additions & 2 deletions src/crawlee/fingerprint_suite/_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

BROWSER_TYPE_HEADER_KEYWORD = {
'chromium': {'Chrome', 'CriOS'},
'chrome': {'Chrome', 'CriOS'},
'firefox': {'Firefox', 'FxiOS'},
'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'},
'webkit': {'Safari'},
'safari': {'Safari'},
}
24 changes: 18 additions & 6 deletions src/crawlee/fingerprint_suite/_header_generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Literal

from crawlee._types import HttpHeaders
from crawlee._utils.docs import docs_group
Expand All @@ -10,6 +10,18 @@
from crawlee.fingerprint_suite._types import SupportedBrowserType


def fingerprint_browser_type_from_playwright_browser_type(
    playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
) -> SupportedBrowserType:
    """Convert a Playwright browser type to the matching fingerprinting browser type.

    Playwright and the fingerprinting code name browsers differently: Playwright uses `chromium` and `webkit`,
    while the fingerprinting context uses `chrome` and `safari`. `firefox` is named the same in both.

    Args:
        playwright_browser_type: Browser type in the Playwright naming.

    Returns:
        The corresponding browser type in the fingerprinting naming.

    Raises:
        ValueError: If `playwright_browser_type` is not one of `chromium`, `firefox` or `webkit`.
    """
    playwright_to_fingerprint: dict[str, SupportedBrowserType] = {
        'chromium': 'chrome',
        'firefox': 'firefox',
        'webkit': 'safari',
    }
    try:
        return playwright_to_fingerprint[playwright_browser_type]
    except (KeyError, TypeError):
        # Unknown names raise KeyError and unhashable values raise TypeError; report both the same way.
        raise ValueError(f'Unsupported browser type: {playwright_browser_type}') from None


@docs_group('Classes')
class HeaderGenerator:
"""Generate realistic looking or browser-like HTTP headers."""
Expand All @@ -21,7 +33,7 @@ def _select_specific_headers(self, all_headers: dict[str, str], header_names: se
return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names})

def get_specific_headers(
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chromium'
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome'
) -> HttpHeaders:
"""Return subset of headers based on the selected `header_names`.

Expand Down Expand Up @@ -50,21 +62,21 @@ def get_random_user_agent_header(self) -> HttpHeaders:
def get_user_agent_header(
self,
*,
browser_type: SupportedBrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chrome',
) -> HttpHeaders:
"""Get the User-Agent header based on the browser type."""
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
raise ValueError(f'Unsupported browser type: {browser_type}')
all_headers = self._generator.generate(browser_type=browser_type)
return self._select_specific_headers(all_headers, header_names={'User-Agent'})

def get_sec_ch_ua_headers(
self,
*,
browser_type: SupportedBrowserType = 'chromium',
browser_type: SupportedBrowserType = 'chrome',
) -> HttpHeaders:
"""Get the sec-ch-ua headers based on the browser type."""
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
raise ValueError(f'Unsupported browser type: {browser_type}')
all_headers = self._generator.generate(browser_type=browser_type)
return self._select_specific_headers(
Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/fingerprint_suite/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']
SupportedDevices = Literal['desktop', 'mobile']
SupportedHttpVersion = Literal['1', '2']
SupportedBrowserType = Literal['chromium', 'firefox', 'webkit', 'edge']
SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']


@docs_group('Data structures')
Expand Down
15 changes: 11 additions & 4 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
from crawlee.http_clients import HttpxHttpClient
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.sessions import Session, SessionPool
Expand Down Expand Up @@ -190,7 +191,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
[
pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),
pytest.param(
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])),
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])),
id='Explicitly passed fingerprint generator.',
),
pytest.param('default', id='Default fingerprint generator.'),
Expand All @@ -214,8 +215,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await crawler.run([str(server_url / 'headers')])

user_agent = headers.get('user-agent')
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent
assert any(
keyword in user_agent
for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
), user_agent

assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
Expand Down Expand Up @@ -249,7 +253,10 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

user_agent = headers.get('user-agent')
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type])
assert any(
keyword in user_agent
for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
)


async def test_custom_headers(server_url: URL) -> None:
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/fingerprint_suite/test_adapters.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
from collections.abc import Iterable

import pytest
from browserforge.headers import Browser

from crawlee.fingerprint_suite import (
DefaultFingerprintGenerator,
HeaderGeneratorOptions,
ScreenOptions,
)
from crawlee.fingerprint_suite._browserforge_adapter import PatchedHeaderGenerator
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD


def test_fingerprint_generator_has_default() -> None:
Expand Down Expand Up @@ -64,3 +71,17 @@ def test_fingerprint_generator_all_options() -> None:
assert 'Firefox' in fingerprint.navigator.userAgent
assert 'Win' in fingerprint.navigator.oscpu
assert 'en-US' in fingerprint.navigator.languages


@pytest.mark.parametrize(
    'browser',
    ['firefox', ['firefox'], [Browser(name='firefox')]],
)
def test_patched_header_generator_generate(browser: Iterable[str | Browser]) -> None:
    """Check that `PatchedHeaderGenerator.generate` accepts a string, a list of strings and a list of `Browser`."""
    user_agent = PatchedHeaderGenerator().generate(browser=browser)['User-Agent']
    expected_keywords = BROWSER_TYPE_HEADER_KEYWORD['firefox']
    # Every parametrized input asks for firefox, so the User-Agent must carry one of the firefox keywords.
    assert any(keyword in user_agent for keyword in expected_keywords)
6 changes: 3 additions & 3 deletions tests/unit/fingerprint_suite/test_header_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_get_random_user_agent_header() -> None:
assert headers['User-Agent']


@pytest.mark.parametrize('browser_type', ['chromium', 'firefox', 'edge', 'webkit'])
@pytest.mark.parametrize('browser_type', ['chrome', 'firefox', 'edge', 'safari'])
def test_get_user_agent_header_stress_test(browser_type: SupportedBrowserType, header_network: dict) -> None:
"""Test that the User-Agent header is consistently generated correctly.

Expand All @@ -55,9 +55,9 @@ def test_get_user_agent_header_invalid_browser_type() -> None:


def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None:
"""Test that Sec-Ch-Ua headers are generated correctly for Chromium."""
"""Test that Sec-Ch-Ua headers are generated correctly for Chrome."""
header_generator = HeaderGenerator()
headers = header_generator.get_sec_ch_ua_headers(browser_type='chromium')
headers = header_generator.get_sec_ch_ua_headers(browser_type='chrome')

assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
Expand Down