Skip to content

fix!: Refactor service usage to rely on service_locator #691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
f0c09ed
refactor!: Refactor service container and other related components
vdusek Nov 13, 2024
e5af190
same approach for event manager and storage client
vdusek Nov 13, 2024
37161a5
Fix cycle imports, tests and everything
vdusek Nov 20, 2024
aeeccb5
Rebase & fix tests
vdusek Nov 25, 2024
5a58054
mypy is broken - add type ignore
vdusek Nov 26, 2024
0310ce3
Add ServiceConflictError
vdusek Nov 26, 2024
7312f3e
Rm explicit EM from Snapshotter
vdusek Nov 26, 2024
13fc248
address some changes in tests
vdusek Dec 2, 2024
d90f445
basic crawler accepts storage client
vdusek Dec 4, 2024
60a1fee
stay with config identifiers
vdusek Dec 4, 2024
2fe908a
rm force flag
vdusek Dec 4, 2024
cd57e69
Refactor storages to allow force_cloud feature
vdusek Dec 4, 2024
9643bd0
Details in storages
vdusek Dec 4, 2024
37c0e38
utilize lazy object proxy
vdusek Dec 5, 2024
c62843c
revert utilization of lazy object proxy
vdusek Dec 5, 2024
8d7dc8c
service_container -> service_locator
vdusek Dec 5, 2024
b81f5b4
address feedback
vdusek Dec 5, 2024
6ebcca0
mem storage client fixture minor changes
vdusek Dec 5, 2024
45124a8
let's keep only prepare_test_env fixture
vdusek Dec 10, 2024
a42190f
update test_default_storage_path_used test
vdusek Dec 10, 2024
b7b0c5e
better conftest
vdusek Dec 11, 2024
1dcecf9
better service locator
vdusek Dec 11, 2024
f85d811
add memory storage from_config constructor
vdusek Dec 12, 2024
50b2fda
tests update
vdusek Dec 12, 2024
99d9468
Update tests/unit/test_configuration.py
janbuchar Dec 12, 2024
c87c395
rebased
vdusek Dec 13, 2024
2fc1509
new release of pydantic-settings issues
vdusek Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ __pycache__
# Poetry
poetry.toml

# Other Python tools
.ropeproject

# Mise
mise.toml
.mise.toml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


async def main() -> None:
storage_client = MemoryStorageClient()
storage_client = MemoryStorageClient.from_config()
# highlight-next-line
await storage_client.purge_on_start()

Expand Down
9 changes: 9 additions & 0 deletions docs/upgrading/upgrading_to_v0x.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,18 @@ This page summarizes the breaking changes between Crawlee for Python zero-based
This section summarizes the breaking changes between v0.4.x and v0.5.0.

### BeautifulSoupParser

- Renamed `BeautifulSoupParser` to `BeautifulSoupParserType`. It was most likely used only in type hints. Please replace previous usages of `BeautifulSoupParser` with `BeautifulSoupParserType`.
- `BeautifulSoupParser` is now a new class that is used in refactored class `BeautifulSoupCrawler`.

### Service locator

- The `crawlee.service_container` was completely refactored and renamed to `crawlee.service_locator`.

### Statistics

- The `crawlee.statistics.Statistics` class no longer accepts an event manager as an input argument. It uses the default, global one.

## Upgrading to v0.4

This section summarizes the breaking changes between v0.3.x and v0.4.0.
Expand Down
58 changes: 29 additions & 29 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@ parsel = { version = ">=1.9.0", optional = true }
playwright = { version = ">=1.27.0", optional = true }
psutil = ">=6.0.0"
pydantic = ">=2.8.1, !=2.10.0, !=2.10.1, !=2.10.2"
pydantic-settings = ">=2.2.0"
# TODO: relax the upper bound once the issue is resolved:
# https://github.com/apify/crawlee-python/issues/814
pydantic-settings = ">=2.2.0,<2.7.0"
pyee = ">=9.0.0"
sortedcollections = ">=2.1.0"
tldextract = ">=5.1.0"
typer = ">=0.12.0"
typing-extensions = ">=4.1.0"
yarl = "^1.18.0"
yarl = ">=1.18.0"

[tool.poetry.group.dev.dependencies]
build = "~1.2.0"
Expand Down Expand Up @@ -206,9 +208,9 @@ warn_unused_ignores = true

[[tool.mypy.overrides]]
# Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee.
module =[
"apify", # Example code shows integration of apify and crawlee.
"camoufox" # Example code shows integration of camoufox and crawlee.
module = [
"apify", # Example code shows integration of apify and crawlee.
"camoufox", # Example code shows integration of camoufox and crawlee.
]
ignore_missing_imports = true

Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from importlib import metadata

from ._request import Request
from ._service_locator import service_locator
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders
from ._utils.globs import Glob

__version__ = metadata.version('crawlee')

__all__ = ['ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request']
__all__ = ['ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request', 'service_locator']
25 changes: 8 additions & 17 deletions src/crawlee/_autoscaling/snapshotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,8 @@
import psutil
from sortedcontainers import SortedList

from crawlee._autoscaling.types import (
ClientSnapshot,
CpuSnapshot,
EventLoopSnapshot,
MemorySnapshot,
Snapshot,
)
from crawlee import service_locator
from crawlee._autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
from crawlee._utils.byte_size import ByteSize
from crawlee._utils.context import ensure_context
from crawlee._utils.docs import docs_group
Expand All @@ -26,8 +21,6 @@
if TYPE_CHECKING:
from types import TracebackType

from crawlee.events import EventManager

logger = getLogger(__name__)

T = TypeVar('T')
Expand All @@ -45,7 +38,6 @@ class Snapshotter:

def __init__(
self,
event_manager: EventManager,
*,
event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
Expand All @@ -63,8 +55,6 @@ def __init__(
"""A default constructor.

Args:
event_manager: The event manager used to emit system info events. From data provided by this event
the CPU and memory usage are read.
event_loop_snapshot_interval: The interval at which the event loop is sampled.
client_snapshot_interval: The interval at which the client is sampled.
max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
Expand All @@ -90,7 +80,6 @@ def __init__(
if available_memory_ratio is None and max_memory_size is None:
raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')

self._event_manager = event_manager
self._event_loop_snapshot_interval = event_loop_snapshot_interval
self._client_snapshot_interval = client_snapshot_interval
self._max_event_loop_delay = max_event_loop_delay
Expand Down Expand Up @@ -145,8 +134,9 @@ async def __aenter__(self) -> Snapshotter:
raise RuntimeError(f'The {self.__class__.__name__} is already active.')

self._active = True
self._event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)
self._event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)
event_manager = service_locator.get_event_manager()
event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)
event_manager.on(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)
self._snapshot_event_loop_task.start()
self._snapshot_client_task.start()
return self
Expand All @@ -168,8 +158,9 @@ async def __aexit__(
if not self._active:
raise RuntimeError(f'The {self.__class__.__name__} is not active.')

self._event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)
self._event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)
event_manager = service_locator.get_event_manager()
event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_cpu)
event_manager.off(event=Event.SYSTEM_INFO, listener=self._snapshot_memory)
await self._snapshot_event_loop_task.stop()
await self._snapshot_client_task.stop()
self._active = False
Expand Down
34 changes: 15 additions & 19 deletions src/crawlee/_log_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
import logging
import sys
import textwrap
from typing import TYPE_CHECKING, Any
from typing import Any

from colorama import Fore, Style, just_fix_windows_console
from typing_extensions import assert_never

if TYPE_CHECKING:
from crawlee.configuration import Configuration
from crawlee import service_locator

just_fix_windows_console()

Expand All @@ -35,35 +34,32 @@
_LOG_MESSAGE_INDENT = ' ' * 6


def get_configured_log_level(configuration: Configuration) -> int:
verbose_logging_requested = 'verbose_log' in configuration.model_fields_set and configuration.verbose_log
def get_configured_log_level() -> int:
config = service_locator.get_configuration()

if 'log_level' in configuration.model_fields_set:
if configuration.log_level == 'DEBUG':
verbose_logging_requested = 'verbose_log' in config.model_fields_set and config.verbose_log

if 'log_level' in config.model_fields_set:
if config.log_level == 'DEBUG':
return logging.DEBUG
if configuration.log_level == 'INFO':
if config.log_level == 'INFO':
return logging.INFO
if configuration.log_level == 'WARNING':
if config.log_level == 'WARNING':
return logging.WARNING
if configuration.log_level == 'ERROR':
if config.log_level == 'ERROR':
return logging.ERROR
if configuration.log_level == 'CRITICAL':
if config.log_level == 'CRITICAL':
return logging.CRITICAL

assert_never(configuration.log_level)
assert_never(config.log_level)

if sys.flags.dev_mode or verbose_logging_requested:
return logging.DEBUG

return logging.INFO


def configure_logger(
logger: logging.Logger,
configuration: Configuration,
*,
remove_old_handlers: bool = False,
) -> None:
def configure_logger(logger: logging.Logger, *, remove_old_handlers: bool = False) -> None:
handler = logging.StreamHandler()
handler.setFormatter(CrawleeLogFormatter())

Expand All @@ -72,7 +68,7 @@ def configure_logger(
logger.removeHandler(old_handler)

logger.addHandler(handler)
logger.setLevel(get_configured_log_level(configuration))
logger.setLevel(get_configured_log_level())


class CrawleeLogFormatter(logging.Formatter):
Expand Down
Loading
Loading