Skip to content

Commit 72b5698

Browse files
authored
refactor!: Split BrowserType literal into two different literals based on context (#1070)
### Description Split the `BrowserType` literal into two different literals based on context. This avoids confusion and replaces implicit string manipulation with an explicit name mapping between the two literals. In the Playwright context the values are `'chromium', 'firefox', 'webkit'`. In the browser fingerprinting context they are `'chrome', 'firefox', 'safari', 'edge'`.
1 parent 8f2327d commit 72b5698

File tree

11 files changed

+116
-52
lines changed

11 files changed

+116
-52
lines changed

docs/examples/code_examples/playwright_crawler_with_fingerprint_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ async def main() -> None:
1313
# Generator will generate real looking browser fingerprint based on the options.
1414
# Unspecified fingerprint options will be automatically selected by the generator.
1515
fingerprint_generator = DefaultFingerprintGenerator(
16-
header_options=HeaderGeneratorOptions(browsers=['chromium']),
16+
header_options=HeaderGeneratorOptions(browsers=['chrome']),
1717
screen_options=ScreenOptions(min_width=400),
1818
)
1919

docs/upgrading/upgrading_to_v1.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,33 @@ title: Upgrading to v1
55

66
This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0.
77

8+
## Distinct use of the word `browser` in similar contexts
9+
10+
The word `browser` is used in two different contexts:
11+
- Playwright-related browser
12+
- Fingerprinting-related browser
13+
14+
The type of `HeaderGeneratorOptions.browsers` changed from `Literal['chromium', 'firefox', 'webkit', 'edge']` to `Literal['chrome', 'firefox', 'safari', 'edge']` because it belongs to the fingerprinting context, not the Playwright context.
15+
16+
Before:
17+
18+
```python
19+
from crawlee.fingerprint_suite import HeaderGeneratorOptions
20+
21+
HeaderGeneratorOptions(browsers=['chromium'])
22+
HeaderGeneratorOptions(browsers=['webkit'])
23+
```
24+
25+
Now:
26+
27+
```python
28+
from crawlee.fingerprint_suite import HeaderGeneratorOptions
29+
30+
HeaderGeneratorOptions(browsers=['chrome'])
31+
HeaderGeneratorOptions(browsers=['safari'])
32+
```
33+
34+
835
## Storage clients
936

1037
In v1.0, we are introducing a new storage clients system. We have completely reworked their interface,

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from crawlee._utils.docs import docs_group
1313
from crawlee.browsers._browser_controller import BrowserController
1414
from crawlee.fingerprint_suite import HeaderGenerator
15+
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
1516

1617
if TYPE_CHECKING:
1718
from collections.abc import Mapping
@@ -234,7 +235,7 @@ async def _create_browser_context(
234235
'sec-ch-ua-mobile',
235236
'sec-ch-ua-platform',
236237
},
237-
browser_type=self.browser_type,
238+
browser_type=fingerprint_browser_type_from_playwright_browser_type(self.browser_type),
238239
)
239240
)
240241
else:

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
2121
from crawlee.errors import SessionError
2222
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
23+
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
2324
from crawlee.http_clients import HttpxHttpClient
2425
from crawlee.sessions._cookies import PlaywrightCookieParam
2526
from crawlee.statistics import StatisticsState
@@ -158,7 +159,11 @@ def __init__(
158159
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
159160
else:
160161
if fingerprint_generator == 'default':
161-
generator_browser_type = None if browser_type is None else [browser_type]
162+
if not browser_type:
163+
generator_browser_type = None
164+
else:
165+
generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)]
166+
162167
fingerprint_generator = DefaultFingerprintGenerator(
163168
header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
164169
)

src/crawlee/fingerprint_suite/_browserforge_adapter.py

Lines changed: 24 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import random
34
from collections.abc import Iterable
45
from copy import deepcopy
56
from functools import reduce
@@ -11,7 +12,6 @@
1112
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
1213
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
1314
from browserforge.fingerprints import Screen
14-
from browserforge.headers import Browser
1515
from browserforge.headers.generator import DATA_DIR, ListOrString
1616
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
1717
from typing_extensions import override
@@ -22,6 +22,8 @@
2222
from ._fingerprint_generator import FingerprintGenerator
2323

2424
if TYPE_CHECKING:
25+
from browserforge.headers import Browser
26+
2527
from ._types import HeaderGeneratorOptions, ScreenOptions, SupportedBrowserType
2628

2729

@@ -69,15 +71,6 @@ def generate(
6971
This patched version of the method adds additional quality checks on the output of the original method. It tries
7072
to generate headers several times until they match the requirements.
7173
72-
The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome
73-
but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`
74-
input, such as:
75-
```
76-
Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
77-
CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1
78-
```
79-
To maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.
80-
8174
Returns:
8275
A generated headers.
8376
"""
@@ -86,21 +79,18 @@ def generate(
8679

8780
single_browser = self._get_single_browser_type(browser)
8881

89-
if single_browser == 'chromium':
90-
# `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
91-
# other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
82+
if single_browser == 'chrome':
83+
# `BrowserForge` header generator considers `chrome` in general sense and therefore will generate also
84+
# other `chrome` based browser headers. This adapter desires only specific subset of `chrome` headers
9285
# that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
9386
# Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
9487
# headers without `sec-...` headers are valid.
9588
max_attempts += 50
9689

97-
# Browserforge uses term 'safari', we use term 'webkit'
98-
bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser
99-
10090
# Use browserforge to generate headers until it satisfies our additional requirements.
10191
for _attempt in range(max_attempts):
10292
generated_header: dict[str, str] = super().generate(
103-
browser=bf_browser_type,
93+
browser=single_browser,
10494
os=os,
10595
device=device,
10696
locale=locale,
@@ -120,7 +110,7 @@ def generate(
120110
keyword in generated_header['User-Agent']
121111
for keyword in self._get_expected_browser_keywords(single_browser)
122112
):
123-
if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header):
113+
if single_browser == 'chrome' and not self._contains_all_sec_headers(generated_header):
124114
# Accept chromium header only with all sec headers.
125115
continue
126116

@@ -145,19 +135,20 @@ def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> s
145135
Handling the original multitype would be pointlessly complex.
146136
"""
147137
# In our case we never pass more than one browser type. In general case more browsers are just bigger pool to
148-
# select from, so narrowing it to the first one is still a valid action.
149-
first_browser = (
150-
next(iter(browser)) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser
151-
)
152-
153-
if isinstance(first_browser, str):
154-
single_name = first_browser
155-
elif isinstance(first_browser, Browser):
156-
single_name = first_browser.name
157-
else:
158-
single_name = None
159-
160-
return single_name
138+
# select from, so narrowing it to any of them is still a valid action as we are going to pick just one anyway.
139+
if isinstance(browser, str):
140+
return browser
141+
if isinstance(browser, Iterable):
142+
choice = random.choice(
143+
[
144+
single_browser if isinstance(single_browser, str) else single_browser.name
145+
for single_browser in browser
146+
]
147+
)
148+
if choice in {'chrome', 'firefox', 'safari', 'edge'}:
149+
return choice
150+
raise ValueError('Invalid browser type.')
151+
return None
161152

162153

163154
class PatchedFingerprintGenerator(bf_FingerprintGenerator):
@@ -254,9 +245,9 @@ class BrowserforgeHeaderGenerator:
254245
def __init__(self) -> None:
255246
self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])
256247

257-
def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
248+
def generate(self, browser_type: SupportedBrowserType = 'chrome') -> dict[str, str]:
258249
"""Generate headers."""
259-
return self._generator.generate(browser=browser_type)
250+
return self._generator.generate(browser=[browser_type])
260251

261252

262253
def get_available_header_network() -> dict:

src/crawlee/fingerprint_suite/_consts.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
COMMON_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'
44

55
BROWSER_TYPE_HEADER_KEYWORD = {
6-
'chromium': {'Chrome', 'CriOS'},
6+
'chrome': {'Chrome', 'CriOS'},
77
'firefox': {'Firefox', 'FxiOS'},
88
'edge': {'Edg', 'Edge', 'EdgA', 'EdgiOS'},
9-
'webkit': {'Safari'},
9+
'safari': {'Safari'},
1010
}

src/crawlee/fingerprint_suite/_header_generator.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import TYPE_CHECKING, Literal
44

55
from crawlee._types import HttpHeaders
66
from crawlee._utils.docs import docs_group
@@ -10,6 +10,18 @@
1010
from crawlee.fingerprint_suite._types import SupportedBrowserType
1111

1212

13+
def fingerprint_browser_type_from_playwright_browser_type(
14+
playwright_browser_type: Literal['chromium', 'firefox', 'webkit'],
15+
) -> SupportedBrowserType:
16+
if playwright_browser_type == 'chromium':
17+
return 'chrome'
18+
if playwright_browser_type == 'firefox':
19+
return 'firefox'
20+
if playwright_browser_type == 'webkit':
21+
return 'safari'
22+
raise ValueError(f'Unsupported browser type: {playwright_browser_type}')
23+
24+
1325
@docs_group('Classes')
1426
class HeaderGenerator:
1527
"""Generate realistic looking or browser-like HTTP headers."""
@@ -21,7 +33,7 @@ def _select_specific_headers(self, all_headers: dict[str, str], header_names: se
2133
return HttpHeaders({key: value for key, value in all_headers.items() if key in header_names})
2234

2335
def get_specific_headers(
24-
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chromium'
36+
self, header_names: set[str] | None = None, browser_type: SupportedBrowserType = 'chrome'
2537
) -> HttpHeaders:
2638
"""Return subset of headers based on the selected `header_names`.
2739
@@ -50,21 +62,21 @@ def get_random_user_agent_header(self) -> HttpHeaders:
5062
def get_user_agent_header(
5163
self,
5264
*,
53-
browser_type: SupportedBrowserType = 'chromium',
65+
browser_type: SupportedBrowserType = 'chrome',
5466
) -> HttpHeaders:
5567
"""Get the User-Agent header based on the browser type."""
56-
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
68+
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
5769
raise ValueError(f'Unsupported browser type: {browser_type}')
5870
all_headers = self._generator.generate(browser_type=browser_type)
5971
return self._select_specific_headers(all_headers, header_names={'User-Agent'})
6072

6173
def get_sec_ch_ua_headers(
6274
self,
6375
*,
64-
browser_type: SupportedBrowserType = 'chromium',
76+
browser_type: SupportedBrowserType = 'chrome',
6577
) -> HttpHeaders:
6678
"""Get the sec-ch-ua headers based on the browser type."""
67-
if browser_type not in {'chromium', 'firefox', 'webkit', 'edge'}:
79+
if browser_type not in {'chrome', 'firefox', 'safari', 'edge'}:
6880
raise ValueError(f'Unsupported browser type: {browser_type}')
6981
all_headers = self._generator.generate(browser_type=browser_type)
7082
return self._select_specific_headers(

src/crawlee/fingerprint_suite/_types.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
SupportedOperatingSystems = Literal['windows', 'macos', 'linux', 'android', 'ios']
1010
SupportedDevices = Literal['desktop', 'mobile']
1111
SupportedHttpVersion = Literal['1', '2']
12-
SupportedBrowserType = Literal['chromium', 'firefox', 'webkit', 'edge']
12+
SupportedBrowserType = Literal['chrome', 'firefox', 'safari', 'edge']
1313

1414

1515
@docs_group('Data structures')

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
)
3131
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
3232
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
33+
from crawlee.fingerprint_suite._header_generator import fingerprint_browser_type_from_playwright_browser_type
3334
from crawlee.http_clients import HttpxHttpClient
3435
from crawlee.proxy_configuration import ProxyConfiguration
3536
from crawlee.sessions import Session, SessionPool
@@ -190,7 +191,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
190191
[
191192
pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),
192193
pytest.param(
193-
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])),
194+
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chrome'])),
194195
id='Explicitly passed fingerprint generator.',
195196
),
196197
pytest.param('default', id='Default fingerprint generator.'),
@@ -214,8 +215,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
214215
await crawler.run([str(server_url / 'headers')])
215216

216217
user_agent = headers.get('user-agent')
217-
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
218-
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent
218+
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'}), user_agent
219+
assert any(
220+
keyword in user_agent
221+
for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
222+
), user_agent
219223

220224
assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
221225
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
@@ -249,7 +253,10 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
249253

250254
user_agent = headers.get('user-agent')
251255
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
252-
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type])
256+
assert any(
257+
keyword in user_agent
258+
for keyword in BROWSER_TYPE_HEADER_KEYWORD[fingerprint_browser_type_from_playwright_browser_type(browser_type)]
259+
)
253260

254261

255262
async def test_custom_headers(server_url: URL) -> None:

tests/unit/fingerprint_suite/test_adapters.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1+
from collections.abc import Iterable
2+
3+
import pytest
4+
from browserforge.headers import Browser
5+
16
from crawlee.fingerprint_suite import (
27
DefaultFingerprintGenerator,
38
HeaderGeneratorOptions,
49
ScreenOptions,
510
)
11+
from crawlee.fingerprint_suite._browserforge_adapter import PatchedHeaderGenerator
12+
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
613

714

815
def test_fingerprint_generator_has_default() -> None:
@@ -64,3 +71,17 @@ def test_fingerprint_generator_all_options() -> None:
6471
assert 'Firefox' in fingerprint.navigator.userAgent
6572
assert 'Win' in fingerprint.navigator.oscpu
6673
assert 'en-US' in fingerprint.navigator.languages
74+
75+
76+
@pytest.mark.parametrize(
77+
'browser',
78+
[
79+
'firefox',
80+
['firefox'],
81+
[Browser(name='firefox')],
82+
],
83+
)
84+
def test_patched_header_generator_generate(browser: Iterable[str | Browser]) -> None:
85+
"""Test that PatchedHeaderGenerator works with all the possible types correctly."""
86+
header = PatchedHeaderGenerator().generate(browser=browser)
87+
assert any(keyword in header['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD['firefox'])

0 commit comments

Comments
 (0)