feat: Add use_table_logs parameter to control using tables in logs #1061

Open · wants to merge 4 commits into master
59 changes: 59 additions & 0 deletions docs/examples/code_examples/configure_json_logging.py
@@ -0,0 +1,59 @@
from __future__ import annotations

import asyncio
import inspect
import logging

from loguru import logger

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


# Configure loguru interceptor to capture standard logging output
class InterceptHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # Get corresponding Loguru level if it exists
        try:
            level: str | int = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find the caller from which the logged message originated
        frame, depth = inspect.currentframe(), 0
        while frame:
            filename = frame.f_code.co_filename
            is_logging = filename == logging.__file__
            is_frozen = 'importlib' in filename and '_bootstrap' in filename
            if depth > 0 and not (is_logging or is_frozen):
                break
            frame = frame.f_back
            depth += 1

        logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())


# Set up loguru with JSONL serialization in the file `crawler.log`
logger.add('crawler.log', serialize=True, level='INFO')

# Configure standard logging to use our interceptor
logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)


async def main() -> None:
    # Initialize the crawler with table logs disabled
    crawler = HttpCrawler(
        configure_logging=False,  # Disable the default logging configuration
        use_table_logs=False,  # Disable table formatting in statistics logs
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Run the crawler
    await crawler.run(['https://www.crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
46 changes: 46 additions & 0 deletions docs/examples/json_logging.mdx
@@ -0,0 +1,46 @@
---
id: configure-jsonl-logging
title: Configure JSON logging
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import JsonLoggingExample from '!!raw-loader!./code_examples/configure_json_logging.py';

This example demonstrates how to configure JSON Lines (JSONL) logging with Crawlee. By setting `use_table_logs=False`, you can disable table-formatted statistics logs, which makes the output easier to parse with external tools or to serialize as JSON.

The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis or monitoring, or when you integrate with logging platforms such as the ELK Stack or Grafana Loki.

<CodeBlock className="language-python">
{JsonLoggingExample}
</CodeBlock>

Here's an example of what a crawler statistics log entry looks like in JSONL format:

```json
{
"text": "2025-03-07 16:51:09.947 | INFO | crawlee.crawlers._basic._basic_crawler:run:580 - Final request statistics: requests_finished: 1; requests_failed: 0; retry_histogram: [1]; request_avg_failed_duration: None; request_avg_finished_duration: 0.795506; requests_finished_per_minute: 73; requests_failed_per_minute: 0; request_total_duration: 0.795506; requests_total: 1; crawler_runtime: 0.818803\n",
"record": {
"elapsed": { "repr": "0:00:01.921982", "seconds": 1.921982 },
"exception": null,
"extra": {},
"file": {
"name": "_basic_crawler.py",
"path": "/src/crawlee/crawlers/_basic/_basic_crawler.py"
},
"function": "run",
"level": { "icon": "ℹ️", "name": "INFO", "no": 20 },
"line": 580,
"message": "Final request statistics: requests_finished: 1; requests_failed: 0; retry_histogram: [1]; request_avg_failed_duration: None; request_avg_finished_duration: 0.795506; requests_finished_per_minute: 73; requests_failed_per_minute: 0; request_total_duration: 0.795506; requests_total: 1; crawler_runtime: 0.818803",
"module": "_basic_crawler",
"name": "crawlee.crawlers._basic._basic_crawler",
"process": { "id": 32118, "name": "MainProcess" },
"thread": { "id": 139760540858176, "name": "MainThread" },
"time": {
"repr": "2025-03-07 16:51:09.947345+00:00",
"timestamp": 1741366269.947345
}
}
}
```
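
Because each line is a self-contained JSON object, the log file is straightforward to post-process. The snippet below is a minimal, illustrative sketch (not part of this pull request) that assumes the `crawler.log` sink configured above and prints only the statistics entries:

```python
import json

# Read the JSONL log produced by loguru and print only the statistics entries.
with open('crawler.log', encoding='utf-8') as log_file:
    for line in log_file:
        record = json.loads(line)['record']
        if 'request statistics' in record['message']:
            print(record['time']['repr'], record['message'])
```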
1 change: 1 addition & 0 deletions pyproject.toml
@@ -229,6 +229,7 @@ module = [
"apify", # Example code shows integration of apify and crawlee.
"camoufox", # Example code shows integration of camoufox and crawlee.
"jaro", # Untyped and stubs not available
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"sklearn.linear_model", # Untyped and stubs not available
]
ignore_missing_imports = true
16 changes: 14 additions & 2 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -134,6 +134,11 @@ class _BasicCrawlerOptions(TypedDict):
configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""

use_table_logs: NotRequired[bool]
"""If True, displays crawler statistics as formatted tables in logs. If False, outputs statistics as plain
text log messages.
"""

keep_alive: NotRequired[bool]
"""Flag that can keep crawler running even when there are no requests in queue."""

@@ -230,6 +235,7 @@ def __init__(
abort_on_error: bool = False,
keep_alive: bool = False,
configure_logging: bool = True,
use_table_logs: bool = True,
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
_additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
_logger: logging.Logger | None = None,
@@ -270,6 +276,8 @@ def __init__(
keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
Use `crawler.stop()` to exit the crawler.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
use_table_logs: If True, displays crawler statistics as formatted tables in logs. If False, outputs
statistics as plain text log messages.
_context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -345,12 +353,14 @@ def __init__(
httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
self._logger = _logger or logging.getLogger(__name__)
self._use_table_logs = use_table_logs

# Statistics
self._statistics = statistics or cast(
Statistics[TStatisticsState],
Statistics.with_default_state(
periodic_message_logger=self._logger,
use_table_logs=self._use_table_logs,
log_message='Current request statistics:',
),
)
@@ -564,8 +574,10 @@ def sigint_handler() -> None:
await self._save_crawler_state()

final_statistics = self._statistics.calculate()
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')

if self._use_table_logs:
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
else:
self._logger.info(f'Final request statistics: {final_statistics.to_string()}')
return final_statistics

async def _run_crawler(self) -> None:
5 changes: 5 additions & 0 deletions src/crawlee/statistics/_models.py
@@ -47,6 +47,11 @@ def to_table(self) -> str:

return capture.get().strip('\n')

def to_string(self) -> str:
return '; '.join(
[f'{k}: {v.total_seconds() if isinstance(v, timedelta) else v}' for k, v in asdict(self).items()]
)

@override
def __str__(self) -> str:
return json.dumps(
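For reference, `to_string()` joins the dataclass fields into a single line and converts `timedelta` values to seconds. The following is a rough, self-contained sketch of that formatting logic (illustrative only, with made-up field values):

```python
from datetime import timedelta

# Mimics the join performed by to_string(): timedelta fields become seconds,
# everything else is rendered as-is.
fields = {'requests_finished': 4, 'request_total_duration': timedelta(minutes=12)}
line = '; '.join(
    f'{k}: {v.total_seconds() if isinstance(v, timedelta) else v}' for k, v in fields.items()
)
print(line)  # requests_finished: 4; request_total_duration: 720.0
```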
9 changes: 8 additions & 1 deletion src/crawlee/statistics/_statistics.py
@@ -76,6 +76,7 @@ def __init__(
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
state_model: type[TStatisticsState],
use_table_logs: bool = True,
) -> None:
self._id = Statistics.__next_id
Statistics.__next_id += 1
@@ -99,6 +100,7 @@ def __init__(
self._key_value_store: KeyValueStore | None = key_value_store

self._log_message = log_message
self._use_table_logs = use_table_logs
self._periodic_message_logger = periodic_message_logger or logger
self._periodic_logger = RecurringTask(self._log, log_interval)

@@ -129,6 +131,7 @@ def with_default_state(
log_message: str = 'Statistics',
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
use_table_logs: bool = True,
) -> Statistics[StatisticsState]:
"""Convenience constructor for creating a `Statistics` with default state model `StatisticsState`."""
return Statistics[StatisticsState](
@@ -140,6 +143,7 @@ def with_default_state(
periodic_message_logger=periodic_message_logger,
log_interval=log_interval,
state_model=StatisticsState,
use_table_logs=use_table_logs,
)

@property
@@ -281,7 +285,10 @@ async def reset(self) -> None:

def _log(self) -> None:
stats = self.calculate()
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
if self._use_table_logs:
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
else:
self._periodic_message_logger.info(f'{self._log_message}: {stats.to_string()}')

async def _maybe_load_statistics(self) -> None:
if not self._persistence_enabled:
51 changes: 34 additions & 17 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -889,11 +889,20 @@ async def handler(context: BasicCrawlingContext) -> None:


@pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
async def test_logs_final_statistics(monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture) -> None:
@pytest.mark.parametrize(
'use_table_logs',
[
pytest.param(True, id='With table for logs'),
pytest.param(False, id='Without table for logs'),
],
)
async def test_logs_final_statistics(
monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, *, use_table_logs: bool
) -> None:
# Set the log level to INFO to capture the final statistics log.
caplog.set_level(logging.INFO)

crawler = BasicCrawler(configure_logging=False)
crawler = BasicCrawler(configure_logging=False, use_table_logs=use_table_logs)

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
@@ -923,21 +932,29 @@ async def handler(context: BasicCrawlingContext) -> None:
)

assert final_statistics is not None
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
if use_table_logs:
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
else:
assert final_statistics.msg == (
'Final request statistics: requests_finished: 4; requests_failed: 33; '
'retry_histogram: [1, 4, 8]; request_avg_failed_duration: 99.0; request_avg_finished_duration: 0.483; '
'requests_finished_per_minute: 0.33; requests_failed_per_minute: 0.1; request_total_duration: 720.0; '
'requests_total: 37; crawler_runtime: 300.0'
)


async def test_crawler_manual_stop(httpbin: URL) -> None: