Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,54 @@ Use this mark to auto-use the `saas_mode` fixture.

Use this mark to auto-use the `enterprise_mode` fixture.

### OpenTelemetry

Flagsmith supports exporting traces and structured logs over OTLP.

#### Configuration

OTel instrumentation is opt-in, controlled by environment variables:

| Variable | Description | Default |
| --------------------------------- | --------------------------------------------------------------------------------------------------------------------- | --------------- |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | Base OTLP endpoint (e.g. `http://collector:4318`). If unset, no OTel setup occurs. | _(disabled)_ |
| `OTEL_SERVICE_NAME` | The `service.name` resource attribute. | `flagsmith-api` |
| `OTEL_TRACING_EXCLUDED_URL_PATHS` | Comma-separated URL paths to exclude from tracing (e.g. `health/liveness,health/readiness`). | _(none)_ |

Standard `OTEL_*` env vars (e.g. `OTEL_RESOURCE_ATTRIBUTES`, `OTEL_EXPORTER_OTLP_HEADERS`) are also respected by the OTel SDK.

#### What gets configured

When `OTEL_EXPORTER_OTLP_ENDPOINT` is set, `ensure_cli_env()` sets up:

- **Tracing**: `TracerProvider` with OTLP/HTTP span export, W3C `TraceContext` + `Baggage` propagation, and auto-instrumentation for:
  - **Django** (`DjangoInstrumentor`): creates a root span per HTTP request with span names formatted as `{METHOD} {route_template}` (e.g. `GET /api/v1/projects/{pk}/`).
  - **psycopg2** (`Psycopg2Instrumentor`): creates child spans for each SQL query with `db.system`, `db.statement`, and `db.name` attributes. SQL commenter is enabled, adding trace context as SQL comments for database-side correlation.
  - **Redis** (`RedisInstrumentor`): creates child spans for each Redis command with `db.system` and `db.statement` attributes.
- **Structured log export**: A structlog processor that emits each log event as both an OTLP log record and a span event (when an active span exists).

#### Emitting OTel log events via structlog

Use structlog as usual. The OTel processor captures events and maps them to OTLP log records:

```python
import structlog

log = structlog.get_logger("code_references")
log.info("scan-created", code_references__count=3, feature__count=2)
```

This produces:

1. An **OTLP log record** with:
   - `Body: scan-created`
   - `EventName: code_references.scan_created` (logger name + `inflection.underscore` of the event)
   - `Severity: INFO`
   - `Attributes: code_references.count=3, feature.count=2` (double underscores are converted to dots)
   - W3C Baggage entries from the current OTel context are copied into log attributes (e.g. `amplitude.device_id`, `amplitude.session_id`).

2. A **span event** on the active span (if one exists) with the same name and attributes. This makes structlog events visible in trace backends (e.g. SigNoz's "Events" tab) without requiring separate log correlation. When no span is active (e.g. during startup or management commands), only the OTLP log record is emitted.

### Metrics

Flagsmith uses Prometheus to track performance metrics.
Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ optional-dependencies = { test-tools = [
"drf-writable-nested",
"environs (<15)",
"gunicorn (>=19.1)",
"inflection",
"opentelemetry-api (>=1.25,<2)",
"opentelemetry-sdk (>=1.25,<2)",
"opentelemetry-exporter-otlp-proto-http (>=1.25,<2)",
"opentelemetry-instrumentation-django (>=0.46b0,<1)",
"opentelemetry-instrumentation-psycopg2 (>=0.46b0,<1)",
"opentelemetry-instrumentation-redis (>=0.46b0,<1)",
"redis (>=5,<6)",
"prometheus-client (>=0.0.16)",
"psycopg2-binary (>=2.9,<3)",
"requests",
Expand Down
7 changes: 6 additions & 1 deletion src/common/core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def setup_logging(
logging_configuration_file: str | None = None,
application_loggers: list[str] | None = None,
extra_foreign_processors: list[Processor] | None = None,
otel_processors: list[Processor] | None = None,
) -> None:
"""
Set up logging for the application.
Expand Down Expand Up @@ -111,7 +112,9 @@ def setup_logging(
logging.config.dictConfig(dict_config)

setup_structlog(
log_format=log_format, extra_foreign_processors=extra_foreign_processors
log_format=log_format,
extra_foreign_processors=extra_foreign_processors,
otel_processors=otel_processors,
)


Expand Down Expand Up @@ -215,6 +218,7 @@ def drop_internal_keys(
def setup_structlog(
log_format: str,
extra_foreign_processors: list[Processor] | None = None,
otel_processors: list[Processor] | None = None,
) -> None:
"""Configure structlog to route through stdlib logging."""
from common.core.sentry import sentry_processor
Expand All @@ -240,6 +244,7 @@ def setup_structlog(
structlog.processors.format_exc_info,
structlog.processors.TimeStamper(fmt="iso"),
sentry_processor,
*(otel_processors or []),
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
],
wrapper_class=structlog.stdlib.BoundLogger,
Expand Down
30 changes: 30 additions & 0 deletions src/common/core/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,35 @@ def ensure_cli_env() -> typing.Generator[None, None, None]:
"""
ctx = contextlib.ExitStack()

# Set up OTel instrumentation (opt-in via OTEL_EXPORTER_OTLP_ENDPOINT).
otel_processors = None
otel_endpoint = env.str("OTEL_EXPORTER_OTLP_ENDPOINT", None)
if otel_endpoint:
from common.core.otel import (
add_otel_trace_context,
build_otel_log_provider,
build_tracer_provider,
make_structlog_otel_processor,
setup_tracing,
)

service_name = env.str("OTEL_SERVICE_NAME", "flagsmith-api")
log_provider = build_otel_log_provider(
endpoint=f"{otel_endpoint}/v1/logs",
service_name=service_name,
)
otel_processors = [
add_otel_trace_context,
make_structlog_otel_processor(log_provider),
]
tracer_provider = build_tracer_provider(
endpoint=f"{otel_endpoint}/v1/traces",
service_name=service_name,
)
excluded_urls = env.str("OTEL_TRACING_EXCLUDED_URL_PATHS", None)
ctx.enter_context(setup_tracing(tracer_provider, excluded_urls=excluded_urls))
ctx.callback(log_provider.shutdown)

# Set up logging early, before Django settings are loaded.
setup_logging(
log_level=env.str("LOG_LEVEL", "INFO"),
Expand All @@ -48,6 +77,7 @@ def ensure_cli_env() -> typing.Generator[None, None, None]:
env.list("ACCESS_LOG_EXTRA_ITEMS", []) or None,
),
],
otel_processors=otel_processors,
)

# Prometheus multiproc support
Expand Down
204 changes: 204 additions & 0 deletions src/common/core/otel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import contextlib
import json
from collections.abc import Generator
from datetime import datetime, timezone
from importlib.metadata import version
from typing import cast

import inflection
import structlog
from opentelemetry import baggage, trace
from opentelemetry import context as otel_context
from opentelemetry._logs import SeverityNumber
from opentelemetry.baggage.propagation import W3CBaggagePropagator
from opentelemetry.exporter.otlp.proto.http._log_exporter import (
OTLPLogExporter,
)
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
OTLPSpanExporter,
)
from opentelemetry.instrumentation.django import DjangoInstrumentor
from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.propagate import set_global_textmap
from opentelemetry.propagators.composite import CompositePropagator
from opentelemetry.propagators.textmap import TextMapPropagator
from opentelemetry.sdk._logs import LoggerProvider
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator,
)
from opentelemetry.util.types import AnyValue, Attributes
from structlog.typing import EventDict, Processor

# Translation table from the level name found on a structlog event to the
# OTel severity number emitted with the log record. Names missing from this
# table are handled by the default supplied at the lookup site.
_SEVERITY_MAP: dict[str, SeverityNumber] = dict(
    debug=SeverityNumber.DEBUG,
    info=SeverityNumber.INFO,
    warning=SeverityNumber.WARN,
    error=SeverityNumber.ERROR,
    critical=SeverityNumber.FATAL,
)

# Event-dict keys that are excluded when an event dict is turned into
# OTel attributes.
_RESERVED_KEYS = frozenset(
    {"event", "level", "timestamp", "logger", "trace_id", "span_id"}
)


def add_otel_trace_context(
    logger: structlog.types.WrappedLogger,
    method_name: str,
    event_dict: EventDict,
) -> EventDict:
    """Stamp the event dict with the ids of the currently active OTel span.

    When the current span carries a valid span context, ``trace_id`` and
    ``span_id`` are written as zero-padded lowercase hex (32 and 16 digits
    respectively). Otherwise the event dict is returned untouched.
    """
    span_context = trace.get_current_span().get_span_context()
    if not span_context or not span_context.is_valid:
        return event_dict

    event_dict["trace_id"] = f"{span_context.trace_id:032x}"
    event_dict["span_id"] = f"{span_context.span_id:016x}"
    return event_dict


def make_structlog_otel_processor(logger_provider: LoggerProvider) -> Processor:
    """Create a structlog processor that emits log records to OpenTelemetry.

    Sits in the processor chain *before* the final renderer so that
    only structlog-originated logs reach OTel. Passes the event_dict
    through unchanged so downstream processors (console/JSON renderers)
    still work normally.

    Pass the returned processor to :func:`~common.core.logging.setup_logging`
    via ``otel_processors``.

    Args:
        logger_provider: Provider from which the OTel logger used for
            emitting records is obtained.

    Returns:
        A structlog processor that, for every event, emits an OTel log
        record and, when the current span is recording, adds a span event
        with the same name and attributes.
    """
    otel_logger = logger_provider.get_logger(__name__, version("flagsmith-common"))

    def processor(
        logger: structlog.types.WrappedLogger,
        method_name: str,
        event_dict: EventDict,
    ) -> EventDict:
        attributes = map_event_dict_to_otel_attributes(event_dict)

        # Resolve the OTel context once so the baggage copied below and the
        # context attached to the emitted record come from the same lookup.
        ctx = otel_context.get_current()

        # Copy W3C baggage entries into log attributes so downstream
        # exporters can access them.
        for key, value in baggage.get_all(ctx).items():
            attributes[key] = str(value)

        body = event_dict.get("event", "")
        # structlog does not force events to be strings, while the event
        # name is derived with ``inflection.underscore``, which expects one.
        # Stringify anything else so a non-string event cannot make the
        # logging call itself raise.
        if body is not None and not isinstance(body, str):
            body = str(body)

        logger_name = event_dict.get("logger")
        event_name = inflection.underscore(body) if body else "unknown"
        if logger_name:
            event_name = f"{logger_name}.{event_name}"

        # Some observability platforms don't surface OTel's EventName.
        # Keep a custom attribute for better visibility.
        attributes["flagsmith.event"] = event_name

        # Prefer the level recorded on the event; fall back to the name of
        # the logger method that was called.
        log_level = event_dict.get("level", method_name)

        otel_logger.emit(
            timestamp=int(datetime.now(timezone.utc).timestamp() * 1e9),
            context=ctx,
            severity_text=log_level,
            severity_number=_SEVERITY_MAP.get(log_level, SeverityNumber.TRACE),
            body=body,
            event_name=event_name,
            attributes=attributes,
        )

        # Also attach as a span event if there's an active span.
        span = trace.get_current_span()
        if span.is_recording():
            # AnyValue is a superset of AttributeValue at runtime;
            # the cast keeps mypy happy.
            span.add_event(event_name, attributes=cast(Attributes, attributes))

        return event_dict

    return processor


def map_event_dict_to_otel_attributes(event_dict: EventDict) -> dict[str, AnyValue]:
    """Build the OTel attribute mapping for a structlog event dict.

    Keys listed in ``_RESERVED_KEYS`` are skipped. For every other entry,
    each ``__`` in the key is replaced with ``.`` and the value is coerced
    with :func:`map_value_to_otel_value`.
    """
    attributes: dict[str, AnyValue] = {}
    for key, value in event_dict.items():
        if key in _RESERVED_KEYS:
            continue
        dotted_key = key.replace("__", ".")
        attributes[dotted_key] = map_value_to_otel_value(value)
    return attributes


def map_value_to_otel_value(value: object) -> str | int | float | bool:
"""Coerce a value to an OTel-attribute-compatible type."""
if isinstance(value, (bool, str, int, float)):
return value
return json.dumps(value, default=str)


def build_otel_log_provider(*, endpoint: str, service_name: str) -> LoggerProvider:
    """Return a LoggerProvider that exports log records over OTLP/HTTP.

    Args:
        endpoint: URL handed to the OTLP log exporter.
        service_name: Value of the ``service.name`` resource attribute.
    """
    log_provider = LoggerProvider(
        resource=Resource.create({"service.name": service_name}),
    )
    # Records go through a batching processor in front of the exporter.
    batch_processor = BatchLogRecordProcessor(OTLPLogExporter(endpoint=endpoint))
    log_provider.add_log_record_processor(batch_processor)
    return log_provider


def build_tracer_provider(*, endpoint: str, service_name: str) -> TracerProvider:
    """Return a TracerProvider that exports spans over OTLP/HTTP.

    Args:
        endpoint: URL handed to the OTLP span exporter.
        service_name: Value of the ``service.name`` resource attribute.
    """
    provider = TracerProvider(
        resource=Resource.create({"service.name": service_name}),
    )
    # Spans go through a batching processor in front of the exporter.
    batch_processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint))
    provider.add_span_processor(batch_processor)
    return provider


@contextlib.contextmanager
def setup_tracing(
    tracer_provider: TracerProvider,
    excluded_urls: str | None = None,
) -> Generator[None, None, None]:
    """Set up and tear down OTel distributed tracing.

    Sets the global TracerProvider, configures W3C trace context +
    baggage propagation, and instruments Django, psycopg2 (with SQL
    commenter enabled) and Redis.

    On exit, uninstruments Redis, psycopg2 and Django (in that order) and
    then shuts down the tracer provider. The same teardown runs for the
    steps already completed if a later setup step raises, so a partial
    failure does not leave instrumentation installed or the provider
    running. Every teardown step is attempted even if an earlier one
    raises.

    Must be called *before* Django's WSGI app is created.

    Args:
        tracer_provider: The TracerProvider to use.
        excluded_urls: Comma-separated URL paths to exclude from tracing
            (e.g. ``"health/liveness,health/readiness"``). If not provided,
            falls back to the ``OTEL_PYTHON_DJANGO_EXCLUDED_URLS`` env var.
    """
    with contextlib.ExitStack() as teardown:
        # ExitStack unwinds in reverse registration order, so registering
        # the provider shutdown first makes it the last teardown step.
        teardown.callback(tracer_provider.shutdown)
        trace.set_tracer_provider(tracer_provider)

        propagator: TextMapPropagator = CompositePropagator(
            [
                TraceContextTextMapPropagator(),
                W3CBaggagePropagator(),
            ]
        )
        set_global_textmap(propagator)

        # Register each uninstrument call only once the matching
        # instrument call has succeeded.
        django_instrumentor = DjangoInstrumentor()
        django_instrumentor.instrument(excluded_urls=excluded_urls)
        teardown.callback(django_instrumentor.uninstrument)

        psycopg2_instrumentor = Psycopg2Instrumentor()
        psycopg2_instrumentor.instrument(enable_commenter=True, skip_dep_check=True)
        teardown.callback(psycopg2_instrumentor.uninstrument)

        redis_instrumentor = RedisInstrumentor()
        redis_instrumentor.instrument()
        teardown.callback(redis_instrumentor.uninstrument)

        yield
10 changes: 8 additions & 2 deletions src/common/gunicorn/middleware.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from typing import Callable

from django.http import HttpRequest, HttpResponse
from opentelemetry import trace

from common.gunicorn.utils import get_route_template, log_extra


class RouteLoggerMiddleware:
"""
Make the resolved Django route available to the WSGI server
(e.g. Gunicorn) for logging purposes.
(e.g. Gunicorn) for logging and tracing purposes.
"""

def __init__(
Expand All @@ -21,10 +22,15 @@ def __call__(self, request: HttpRequest) -> HttpResponse:
response = self.get_response(request)

if resolver_match := request.resolver_match:
route_template = get_route_template(resolver_match.route)
log_extra(
request=request,
key="route",
value=get_route_template(resolver_match.route),
value=route_template,
)
span = trace.get_current_span()
if span.is_recording():
span.update_name(f"{request.method} {route_template}")
span.set_attribute("http.route", route_template)

return response
Loading
Loading