feat: Add statistics_log_format parameter to BasicCrawler #1061

Merged · 11 commits · Mar 18, 2025
91 changes: 91 additions & 0 deletions docs/examples/code_examples/configure_json_logging.py
@@ -0,0 +1,91 @@
from __future__ import annotations

import asyncio
import inspect
import logging
import sys
from typing import TYPE_CHECKING

from loguru import logger

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

if TYPE_CHECKING:
from loguru import Record


# Configure loguru interceptor to capture standard logging output
class InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
# Get corresponding Loguru level if it exists
try:
level: str | int = logger.level(record.levelname).name
except ValueError:
level = record.levelno

# Find the caller from which the logged message originated
frame, depth = inspect.currentframe(), 0
while frame:
filename = frame.f_code.co_filename
is_logging = filename == logging.__file__
is_frozen = 'importlib' in filename and '_bootstrap' in filename
if depth > 0 and not (is_logging | is_frozen):
break
frame = frame.f_back
depth += 1

dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
standard_attrs = set(dummy_record.__dict__.keys())
extra_dict = {
key: value
for key, value in record.__dict__.items()
if key not in standard_attrs
}

(
logger.bind(**extra_dict)
.opt(depth=depth, exception=record.exc_info)
.patch(lambda loguru_record: loguru_record.update({'name': record.name}))
.log(level, record.getMessage())
)


# Configure loguru formatter
def formatter(record: Record) -> str:
basic_format = '[{name}] | <level>{level: ^8}</level> | - {message}'
if record['extra']:
basic_format = basic_format + ' {extra}'
return f'{basic_format}\n'


# Remove default loguru logger
logger.remove()

# Set up loguru with JSONL serialization in file `crawler.log`
logger.add('crawler.log', format=formatter, serialize=True, level='INFO')

# Set up loguru logger for console
logger.add(sys.stderr, format=formatter, colorize=True, level='INFO')

# Configure standard logging to use our interceptor
logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)


async def main() -> None:
# Initialize crawler with disabled table logs
crawler = HttpCrawler(
configure_logging=False, # Disable default logging configuration
statistics_log_format='inline', # Set inline formatting for statistics logs
)

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# Run the crawler
await crawler.run(['https://www.crawlee.dev/'])


if __name__ == '__main__':
asyncio.run(main())
57 changes: 57 additions & 0 deletions docs/examples/json_logging.mdx
@@ -0,0 +1,57 @@
---
id: configure-json-logging
title: Configure JSON logging
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import JsonLoggingExample from '!!raw-loader!./code_examples/configure_json_logging.py';

This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By passing the `statistics_log_format='inline'` parameter, you can disable the table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.
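
The switch itself is a single constructor argument. A minimal sketch (shown here with `HttpCrawler`, though any crawler built on `BasicCrawler` accepts the same option):

```python
from crawlee.crawlers import HttpCrawler

# Inside your `async def main()` coroutine, as in the full example below:
# with 'inline', statistics are logged as plain messages and the individual
# metrics are attached to each record through the standard logging `extra` mechanism.
crawler = HttpCrawler(
    configure_logging=False,  # keep Crawlee from installing its own log handlers
    statistics_log_format='inline',
)
```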

The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.

<CodeBlock className="language-python">
{JsonLoggingExample}
</CodeBlock>

Here's an example of what a crawler statistics log entry looks like in JSONL format:

```json
{
"text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n",
"record": {
"elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 },
"exception": null,
"extra": {
"requests_finished": 1,
"requests_failed": 0,
"retry_histogram": [1],
"request_avg_failed_duration": null,
"request_avg_finished_duration": 3.57098,
"requests_finished_per_minute": 17,
"requests_failed_per_minute": 0,
"request_total_duration": 3.57098,
"requests_total": 1,
"crawler_runtime": 3.59165
},
"file": {
"name": "_basic_crawler.py",
"path": "/crawlers/_basic/_basic_crawler.py"
},
"function": "run",
"level": { "icon": "ℹ️", "name": "INFO", "no": 20 },
"line": 583,
"message": "Final request statistics:",
"module": "_basic_crawler",
"name": "HttpCrawler",
"process": { "id": 198383, "name": "MainProcess" },
"thread": { "id": 135312814966592, "name": "MainThread" },
"time": {
"repr": "2025-03-17 17:14:45.339150+00:00",
"timestamp": 1742231685.33915
}
}
}
```
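
Since `serialize=True` writes one JSON object per line, the log file can be post-processed with nothing but the standard library. The helper below is a hypothetical sketch (not part of this PR); the `'Final request statistics'` message text and the `extra` field names are taken from the sample above:

```python
import json


def read_final_statistics(log_path: str = 'crawler.log') -> list[dict]:
    """Collect the `extra` payload of every final-statistics entry in a loguru JSONL log."""
    entries = []
    with open(log_path, encoding='utf-8') as log_file:
        for line in log_file:
            record = json.loads(line)['record']
            if record['message'].startswith('Final request statistics'):
                entries.append(record['extra'])
    return entries


print(read_final_statistics())  # e.g. [{'requests_finished': 1, 'requests_failed': 0, ...}]
```
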
1 change: 1 addition & 0 deletions pyproject.toml
@@ -230,6 +230,7 @@ module = [
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"jaro", # Untyped and stubs not available
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"sklearn.linear_model", # Untyped and stubs not available
]
ignore_missing_imports = true
18 changes: 15 additions & 3 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -13,7 +13,7 @@
from datetime import timedelta
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union, cast
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary

@@ -135,6 +135,11 @@ class _BasicCrawlerOptions(TypedDict):
configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""

statistics_log_format: NotRequired[Literal['table', 'inline']]
"""If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
text log messages.
"""

keep_alive: NotRequired[bool]
"""Flag that can keep crawler running even when there are no requests in queue."""

@@ -231,6 +236,7 @@ def __init__(
abort_on_error: bool = False,
keep_alive: bool = False,
configure_logging: bool = True,
statistics_log_format: Literal['table', 'inline'] = 'table',
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
_additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
_logger: logging.Logger | None = None,
@@ -271,6 +277,8 @@ def __init__(
keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
Use `crawler.stop()` to exit the crawler.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
outputs statistics as plain text log messages.
_context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
_additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -346,12 +354,14 @@ def __init__(
httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
self._logger = _logger or logging.getLogger(__name__)
self._statistics_log_format = statistics_log_format

# Statistics
self._statistics = statistics or cast(
'Statistics[TStatisticsState]',
Statistics.with_default_state(
periodic_message_logger=self._logger,
statistics_log_format=self._statistics_log_format,
log_message='Current request statistics:',
),
)
@@ -567,8 +577,10 @@ def sigint_handler() -> None:
await self._save_crawler_state()

final_statistics = self._statistics.calculate()
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')

if self._statistics_log_format == 'table':
self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
else:
self._logger.info('Final request statistics:', extra=final_statistics.to_dict())
return final_statistics

async def _run_crawler(self) -> None:
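
Because the `inline` branch above hands the statistics to the logger via the standard `extra=` argument, the values end up as attributes on the emitted `LogRecord` even when loguru is not involved. A hedged sketch of observing them with a plain `logging` filter (the filter class is hypothetical; the attribute names come from `FinalStatistics.to_dict()`):

```python
import logging


class FinalStatisticsFilter(logging.Filter):
    """Print a few statistics attached to 'Final request statistics:' records."""

    def filter(self, record: logging.LogRecord) -> bool:
        if record.getMessage().startswith('Final request statistics'):
            # Each key of FinalStatistics.to_dict() becomes an attribute on the record.
            print(
                getattr(record, 'requests_finished', None),
                getattr(record, 'requests_failed', None),
                getattr(record, 'crawler_runtime', None),
            )
        return True  # never drop the record, only observe it
```

Attaching the filter to whichever handler receives Crawlee's output (for example the handler installed by `logging.basicConfig`) is enough, since handler-level filters see every record routed through that handler.
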
3 changes: 3 additions & 0 deletions src/crawlee/statistics/_models.py
@@ -35,6 +35,9 @@ def to_table(self) -> str:

return make_table([(str(k), str(v)) for k, v in str_dict.items()], width=60)

def to_dict(self) -> dict[str, float | int | list[int]]:
return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}

@override
def __str__(self) -> str:
return json.dumps(
11 changes: 9 additions & 2 deletions src/crawlee/statistics/_statistics.py
@@ -4,7 +4,7 @@
import math
from datetime import datetime, timedelta, timezone
from logging import Logger, getLogger
from typing import TYPE_CHECKING, Any, Generic, cast
from typing import TYPE_CHECKING, Any, Generic, Literal, cast

from typing_extensions import Self, TypeVar

@@ -76,6 +76,7 @@ def __init__(
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
state_model: type[TStatisticsState],
statistics_log_format: Literal['table', 'inline'] = 'table',
) -> None:
self._id = Statistics.__next_id
Statistics.__next_id += 1
@@ -99,6 +100,7 @@ def __init__(
self._key_value_store: KeyValueStore | None = key_value_store

self._log_message = log_message
self._statistics_log_format = statistics_log_format
self._periodic_message_logger = periodic_message_logger or logger
self._periodic_logger = RecurringTask(self._log, log_interval)

@@ -129,6 +131,7 @@ def with_default_state(
log_message: str = 'Statistics',
periodic_message_logger: Logger | None = None,
log_interval: timedelta = timedelta(minutes=1),
statistics_log_format: Literal['table', 'inline'] = 'table',
) -> Statistics[StatisticsState]:
"""Convenience constructor for creating a `Statistics` with default state model `StatisticsState`."""
return Statistics[StatisticsState](
@@ -140,6 +143,7 @@ def with_default_state(
periodic_message_logger=periodic_message_logger,
log_interval=log_interval,
state_model=StatisticsState,
statistics_log_format=statistics_log_format,
)

@property
Expand Down Expand Up @@ -281,7 +285,10 @@ async def reset(self) -> None:

def _log(self) -> None:
stats = self.calculate()
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
if self._statistics_log_format == 'table':
self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
else:
self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

async def _maybe_load_statistics(self) -> None:
if not self._persistence_enabled:
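
The same format flag also drives the periodic `_log` task, so it can be set when building a `Statistics` object manually. A sketch, assuming `Statistics` is re-exported from `crawlee.statistics` and that the crawler accepts a pre-built instance via its `statistics` argument (as the `statistics or ...` fallback above suggests):

```python
from datetime import timedelta

from crawlee.crawlers import HttpCrawler
from crawlee.statistics import Statistics  # assumed public re-export

# Emit periodic statistics every 30 seconds as inline log messages.
statistics = Statistics.with_default_state(
    log_message='Current request statistics:',
    log_interval=timedelta(seconds=30),
    statistics_log_format='inline',
)

crawler = HttpCrawler(
    statistics=statistics,
    statistics_log_format='inline',  # keep the final statistics inline as well
)
```
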
60 changes: 42 additions & 18 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -11,7 +11,7 @@
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
from typing import TYPE_CHECKING, Any, Literal, cast
from unittest.mock import AsyncMock, Mock, call

import httpx
@@ -889,11 +889,20 @@ async def handler(context: BasicCrawlingContext) -> None:


@pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
async def test_logs_final_statistics(monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture) -> None:
@pytest.mark.parametrize(
('statistics_log_format'),
[
pytest.param('table', id='With table for logs'),
pytest.param('inline', id='With inline logs'),
],
)
async def test_logs_final_statistics(
monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline']
) -> None:
# Set the log level to INFO to capture the final statistics log.
caplog.set_level(logging.INFO)

crawler = BasicCrawler(configure_logging=False)
crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format)

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
@@ -923,21 +932,36 @@ async def handler(context: BasicCrawlingContext) -> None:
)

assert final_statistics is not None
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
if statistics_log_format == 'table':
assert final_statistics.msg.splitlines() == [
'Final request statistics:',
'┌───────────────────────────────┬───────────┐',
'│ requests_finished │ 4 │',
'│ requests_failed │ 33 │',
'│ retry_histogram │ [1, 4, 8] │',
'│ request_avg_failed_duration │ 99.0 │',
'│ request_avg_finished_duration │ 0.483 │',
'│ requests_finished_per_minute │ 0.33 │',
'│ requests_failed_per_minute │ 0.1 │',
'│ request_total_duration │ 720.0 │',
'│ requests_total │ 37 │',
'│ crawler_runtime │ 300.0 │',
'└───────────────────────────────┴───────────┘',
]
else:
assert final_statistics.msg == 'Final request statistics:'

# ignore[attr-defined] since `extra` parameters are not defined for `LogRecord`
assert final_statistics.requests_finished == 4 # type: ignore[attr-defined]
assert final_statistics.requests_failed == 33 # type: ignore[attr-defined]
assert final_statistics.retry_histogram == [1, 4, 8] # type: ignore[attr-defined]
assert final_statistics.request_avg_failed_duration == 99.0 # type: ignore[attr-defined]
assert final_statistics.request_avg_finished_duration == 0.483 # type: ignore[attr-defined]
assert final_statistics.requests_finished_per_minute == 0.33 # type: ignore[attr-defined]
assert final_statistics.requests_failed_per_minute == 0.1 # type: ignore[attr-defined]
assert final_statistics.request_total_duration == 720.0 # type: ignore[attr-defined]
assert final_statistics.requests_total == 37 # type: ignore[attr-defined]
assert final_statistics.crawler_runtime == 300.0 # type: ignore[attr-defined]


async def test_crawler_manual_stop(httpbin: URL) -> None: