feat: Add statistics_log_format parameter to BasicCrawler #1061

Merged · 11 commits · Mar 18, 2025
91 changes: 91 additions & 0 deletions docs/examples/code_examples/configure_json_logging.py
@@ -0,0 +1,91 @@
from __future__ import annotations

import asyncio
import inspect
import logging
import sys
from typing import TYPE_CHECKING

from loguru import logger

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

if TYPE_CHECKING:
from loguru import Record


# Configure loguru interceptor to capture standard logging output
class InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
# Get corresponding Loguru level if it exists
try:
level: str | int = logger.level(record.levelname).name
except ValueError:
level = record.levelno

# Find the caller from which the logged message originated
frame, depth = inspect.currentframe(), 0
while frame:
filename = frame.f_code.co_filename
is_logging = filename == logging.__file__
is_frozen = 'importlib' in filename and '_bootstrap' in filename
if depth > 0 and not (is_logging | is_frozen):
break
frame = frame.f_back
depth += 1

dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
standard_attrs = set(dummy_record.__dict__.keys())
extra_dict = {
key: value
for key, value in record.__dict__.items()
if key not in standard_attrs
}

(
logger.bind(**extra_dict)
.opt(depth=depth, exception=record.exc_info)
.patch(lambda loguru_record: loguru_record.update({'name': record.name}))
.log(level, record.getMessage())
)


# Configure loguru formatter
def formatter(record: Record) -> str:
basic_format = '[{name}] | <level>{level: ^8}</level> | - {message}'
if record['extra']:
basic_format = basic_format + ' {extra}'
return f'{basic_format}\n'


# Remove default loguru logger
logger.remove()

# Set up loguru with JSONL serialization in file `crawler.log`
logger.add('crawler.log', format=formatter, serialize=True, level='INFO')

# Set up loguru logger for console
logger.add(sys.stderr, format=formatter, colorize=True, level='INFO')

# Configure standard logging to use our interceptor
logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)


async def main() -> None:
# Initialize crawler with disabled table logs
crawler = HttpCrawler(
configure_logging=False, # Disable default logging configuration
statistics_log_format='inline', # Set inline formatting for statistics logs
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Run the crawler
await crawler.run(['https://www.crawlee.dev/'])


if __name__ == '__main__':
asyncio.run(main())
57 changes: 57 additions & 0 deletions docs/examples/json_logging.mdx
@@ -0,0 +1,57 @@
---
id: configure-json-logging
title: Configure JSON logging
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import JsonLoggingExample from '!!raw-loader!./code_examples/configure_json_logging.py';

This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By passing the `statistics_log_format='inline'` parameter, you can disable the table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.
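
The switch itself is a single constructor argument. A minimal sketch (shown here with `HttpCrawler`, though any crawler built on `BasicCrawler` accepts the same option):

```python
from crawlee.crawlers import HttpCrawler

# Inside your `async def main()` coroutine, as in the full example below:
# with 'inline', statistics are logged as plain messages and the individual
# metrics are attached to each record through the standard logging `extra` mechanism.
crawler = HttpCrawler(
    configure_logging=False,  # keep Crawlee from installing its own log handlers
    statistics_log_format='inline',
)
```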

The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.

<CodeBlock className="language-python">
{JsonLoggingExample}
</CodeBlock>

Here's an example of what a crawler statistics log entry looks like in JSONL format:

```json
{
"text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n",
"record": {
"elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 },
"exception": null,
"extra": {
"requests_finished": 1,
"requests_failed": 0,
"retry_histogram": [1],
"request_avg_failed_duration": null,
"request_avg_finished_duration": 3.57098,
"requests_finished_per_minute": 17,
"requests_failed_per_minute": 0,
"request_total_duration": 3.57098,
"requests_total": 1,
"crawler_runtime": 3.59165
},
"file": {
"name": "_basic_crawler.py",
"path": "/crawlers/_basic/_basic_crawler.py"
},
"function": "run",
"level": { "icon": "ℹ️", "name": "INFO", "no": 20 },
"line": 583,
"message": "Final request statistics:",
"module": "_basic_crawler",
"name": "HttpCrawler",
"process": { "id": 198383, "name": "MainProcess" },
"thread": { "id": 135312814966592, "name": "MainThread" },
"time": {
"repr": "2025-03-17 17:14:45.339150+00:00",
"timestamp": 1742231685.33915
}
}
}
```
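
Since `serialize=True` writes one JSON object per line, the log file can be post-processed with nothing but the standard library. The helper below is a hypothetical sketch (not part of this PR); the `'Final request statistics'` message text and the `extra` field names are taken from the sample above:

```python
import json


def read_final_statistics(log_path: str = 'crawler.log') -> list[dict]:
    """Collect the `extra` payload of every final-statistics entry in a loguru JSONL log."""
    entries = []
    with open(log_path, encoding='utf-8') as log_file:
        for line in log_file:
            record = json.loads(line)['record']
            if record['message'].startswith('Final request statistics'):
                entries.append(record['extra'])
    return entries


print(read_final_statistics())  # e.g. [{'requests_finished': 1, 'requests_failed': 0, ...}]
```
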
1 change: 1 addition & 0 deletions pyproject.toml
@@ -230,6 +230,7 @@ module = [
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"jaro", # Untyped and stubs not available
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"sklearn.linear_model", # Untyped and stubs not available
]
ignore_missing_imports = true
18 changes: 15 additions & 3 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -13,7 +13,7 @@
from datetime import timedelta
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union, cast
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary

@@ -135,6 +135,11 @@ class _BasicCrawlerOptions(TypedDict):
configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""

statistics_log_format: NotRequired[Literal['table', 'inline']]
"""If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
text log messages.
"""

keep_alive: NotRequired[bool]
"""Flag that can keep crawler running even when there are no requests in queue."""

@@ -231,6 +236,7 @@ def __init__(
abort_on_error: bool = False,
keep_alive: bool = False,
configure_logging: bool = True,
statistics_log_format: Literal['table', 'inline'] = 'table',
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
_additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
_logger: logging.Logger | None = None,
@@ -271,6 +277,8 @@ def __init__(
keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
Use `crawler.stop()` to exit the crawler.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
outputs statistics as plain text log messages.
_context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -346,12 +354,14 @@ def __init__(
httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
self._logger = _logger or logging.getLogger(__name__)
self._statistics_log_format = statistics_log_format

# Statistics
self._statistics = statistics or cast(
'Statistics[TStatisticsState]',
Statistics.with_default_state(
periodic_message_logger=self._logger,
statistics_log_format=self._statistics_log_format,
log_message='Current request statistics:',
),
)
@@ -567,8 +577,10 @@ def sigint_handler() -> None:
await self._save_crawler_state()

final_statistics = self._statistics.calculate()
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')

if self._statistics_log_format == 'table':
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
else:
self._logger.info('Final request statistics:', extra=final_statistics.to_dict())
return final_statistics

async def _run_crawler(self) -> None:
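
Because the `inline` branch above hands the statistics to the logger via the standard `extra=` argument, the values end up as attributes on the emitted `LogRecord` even when loguru is not involved. A hedged sketch of observing them with a plain `logging` filter (the filter class is hypothetical; the attribute names come from `FinalStatistics.to_dict()`):

```python
import logging


class FinalStatisticsFilter(logging.Filter):
    """Print a few statistics attached to 'Final request statistics:' records."""

    def filter(self, record: logging.LogRecord) -> bool:
        if record.getMessage().startswith('Final request statistics'):
            # Each key of FinalStatistics.to_dict() becomes an attribute on the record.
            print(
                getattr(record, 'requests_finished', None),
                getattr(record, 'requests_failed', None),
                getattr(record, 'crawler_runtime', None),
            )
        return True  # never drop the record, only observe it
```

Attaching the filter to whichever handler receives Crawlee's output (for example the handler installed by `logging.basicConfig`) is enough, since handler-level filters see every record routed through that handler.
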
3 changes: 3 additions & 0 deletions src/crawlee/statistics/_models.py
@@ -35,6 +35,9 @@ def to_table(self) -> str:

return make_table([(str(k), str(v)) for k, v in str_dict.items()], width=60)

def to_dict(self) -> dict[str, float | int | list[int]]:
return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}

@override
def __str__(self) -> str:
return json.dumps(
11 changes: 9 additions & 2 deletions src/crawlee/statistics/_statistics.py
@@ -4,7 +4,7 @@
import math
from datetime import datetime, timedelta, timezone
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Any, Generic, cast
from typing import TYPE_CHECKING, Any, Generic, Literal, cast

from typing_extensions import Self, TypeVar

@@ -76,6 +76,7 @@ def __init__(
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
state_model: type[TStatisticsState],
statistics_log_format: Literal['table', 'inline'] = 'table',
) -> None:
self._id = Statistics.__next_id
Statistics.__next_id += 1
@@ -99,6 +100,7 @@ def __init__(
self._key_value_store: KeyValueStore | None = key_value_store

self._log_message = log_message
self._statistics_log_format = statistics_log_format
self._periodic_message_logger = periodic_message_logger or logger
self._periodic_logger = RecurringTask(self._log, log_interval)

@@ -129,6 +131,7 @@ def with_default_state(
log_message: str = 'Statistics',
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
statistics_log_format: Literal['table', 'inline'] = 'table',
) -> Statistics[StatisticsState]:
"""Convenience constructor for creating a `Statistics` with default state model `StatisticsState`."""
return Statistics[StatisticsState](
@@ -140,6 +143,7 @@ def with_default_state(
periodic_message_logger=periodic_message_logger,
log_interval=log_interval,
state_model=StatisticsState,
statistics_log_format=statistics_log_format,
)

@property
Expand Down Expand Up @@ -281,7 +285,10 @@ async def reset(self) -> None:

def _log(self) -> None:
stats = self.calculate()
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
if self._statistics_log_format == 'table':
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
else:
self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

async def _maybe_load_statistics(self) -> None:
if not self._persistence_enabled:
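
The same format flag also drives the periodic `_log` task, so it can be set when building a `Statistics` object manually. A sketch, assuming `Statistics` is re-exported from `crawlee.statistics` and that the crawler accepts a pre-built instance via its `statistics` argument (as the `statistics or ...` fallback above suggests):

```python
from datetime import timedelta

from crawlee.crawlers import HttpCrawler
from crawlee.statistics import Statistics  # assumed public re-export

# Emit periodic statistics every 30 seconds as inline log messages.
statistics = Statistics.with_default_state(
    log_message='Current request statistics:',
    log_interval=timedelta(seconds=30),
    statistics_log_format='inline',
)

crawler = HttpCrawler(
    statistics=statistics,
    statistics_log_format='inline',  # keep the final statistics inline as well
)
```
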
60 changes: 42 additions & 18 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -11,7 +11,7 @@
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
from typing import TYPE_CHECKING, Any, Literal, cast
from unittest.mock import AsyncMock, Mock, call

import httpx
@@ -889,11 +889,20 @@ async def handler(context: BasicCrawlingContext) -> None:


@pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
async def test_logs_final_statistics(monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture) -> None:
@pytest.mark.parametrize(
('statistics_log_format'),
[
pytest.param('table', id='With table for logs'),
pytest.param('inline', id='With inline logs'),
],
)
async def test_logs_final_statistics(
monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline']
) -> None:
# Set the log level to INFO to capture the final statistics log.
caplog.set_level(logging.INFO)

crawler = BasicCrawler(configure_logging=False)
crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format)

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
@@ -923,21 +932,36 @@ async def handler(context: BasicCrawlingContext) -> None:
)

assert final_statistics is not None
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
if statistics_log_format == 'table':
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
else:
assert final_statistics.msg == 'Final request statistics:'

# ignore[attr-defined] since `extra` parameters are not defined for `LogRecord`
assert final_statistics.requests_finished == 4 # type: ignore[attr-defined]
assert final_statistics.requests_failed == 33 # type: ignore[attr-defined]
assert final_statistics.retry_histogram == [1, 4, 8] # type: ignore[attr-defined]
assert final_statistics.request_avg_failed_duration == 99.0 # type: ignore[attr-defined]
assert final_statistics.request_avg_finished_duration == 0.483 # type: ignore[attr-defined]
assert final_statistics.requests_finished_per_minute == 0.33 # type: ignore[attr-defined]
assert final_statistics.requests_failed_per_minute == 0.1 # type: ignore[attr-defined]
assert final_statistics.request_total_duration == 720.0 # type: ignore[attr-defined]
assert final_statistics.requests_total == 37 # type: ignore[attr-defined]
assert final_statistics.crawler_runtime == 300.0 # type: ignore[attr-defined]


async def test_crawler_manual_stop(httpbin: URL) -> None: