1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.13
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
FROM python:3.12-slim
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

WORKDIR /app

# Install system packages and third-party dependencies only (project code is copied later) so this layer caches well
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
apt-get update && apt-get install -y git && \
uv sync --locked --no-install-project

COPY pyproject.toml uv.lock main.py /app/.

# Sync the project
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --locked

CMD ["uv", "run", "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "main:app", "--bind", "0.0.0.0:8000", "--workers", "1"]
84 changes: 84 additions & 0 deletions README.md
@@ -0,0 +1,84 @@
# TiTiler OpenTelemetry Observability Stack

A complete observability stack for TiTiler applications using OpenTelemetry, demonstrating distributed tracing, metrics collection, and structured logging with Grafana visualization.
The goal is to show how OpenTelemetry produces correlated traces and logs, and how that telemetry can be pulled into an observability platform such as Grafana.

## Architecture

The stack consists of six containerized services orchestrated on a single Docker bridge network (`observability-net`):

### Core Services

**titiler** (`main.py:139`)

- FastAPI application with OpenTelemetry instrumentation
- Exports traces, metrics, and logs to OpenTelemetry Collector via OTLP/gRPC
- Custom metrics middleware tracks tile requests, HTTP requests, and response times (see the sketch below)
- Configured with GDAL optimizations for raster processing
- Exposed on port 8000
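
A minimal sketch of what such a metrics middleware can look like with the OpenTelemetry Python SDK. The meter name, metric names, attribute keys, and the `/tiles/` route check are illustrative assumptions, not the exact code in `main.py`:

```python
# Sketch only: illustrative middleware, not the exact implementation in main.py.
import time

from fastapi import FastAPI, Request
from opentelemetry import metrics

app = FastAPI()
meter = metrics.get_meter("titiler.custom")  # assumes a MeterProvider is already configured

tile_requests = meter.create_counter(
    "tile_requests_total", description="Number of tile requests served"
)
request_duration = meter.create_histogram(
    "http_request_duration_seconds", unit="s", description="HTTP request duration"
)


@app.middleware("http")
async def record_metrics(request: Request, call_next):
    start = time.perf_counter()
    response = await call_next(request)
    elapsed = time.perf_counter() - start

    attrs = {"http.route": request.url.path, "http.status_code": response.status_code}
    request_duration.record(elapsed, attributes=attrs)
    if "/tiles/" in request.url.path:  # hypothetical check for tile endpoints
        tile_requests.add(1, attributes=attrs)
    return response
```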

**otel-collector** (`otel-collector-config.yml`)

- OpenTelemetry Collector with contrib distribution
- Receives telemetry data via OTLP protocol (ports 4317/4318)
- Processes and routes data to appropriate backends:
  - Traces → Jaeger
  - Metrics → Prometheus
  - Logs → Loki
- Applies resource attributes and batching for performance

### Observability Backends

**prometheus** (`prometheus.yml`)

- Time-series database for metrics storage
- Receives metrics from OpenTelemetry Collector via remote write API
- Exposed on port 9090

**jaeger** (`jaegertracing/all-in-one:latest`)

- Distributed tracing backend with OpenTelemetry Protocol (OTLP) ingestion enabled
- Receives traces from OpenTelemetry Collector
- Web UI exposed on port 16686

**loki** (`loki-config.yaml`)

- Log aggregation system for structured log storage
- Receives logs from OpenTelemetry Collector via push API
- Exposed on port 3100

**grafana** (`grafana-*.yml`)

- Visualization platform with pre-configured datasources
- Unified dashboard for metrics, traces, and logs correlation
- Default credentials: admin/admin
- Exposed on port 3000

## Network Flow

1. TiTiler application generates telemetry data during request processing
2. OpenTelemetry SDKs export data to Collector via OTLP (see the sketch below)
3. Collector processes and forwards data to appropriate backends
4. Grafana queries all backends for unified observability dashboard
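
A minimal sketch of step 2, wiring the SDK's trace exporter to the collector endpoint (`http://otel-collector:4317`, the value of `OTEL_EXPORTER_OTLP_ENDPOINT` in `docker-compose.yml`). The resource attributes are illustrative, and in practice the SDK can pick the endpoint up from that environment variable instead:

```python
# Sketch only: manual SDK wiring; environment-based auto-configuration works as well.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

resource = Resource.create({"service.name": "titiler"})  # illustrative service name
provider = TracerProvider(resource=resource)
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://otel-collector:4317", insecure=True))
)
trace.set_tracer_provider(provider)
```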

## Key Features

- **Distributed Tracing**: Full request flow visibility with OpenTelemetry spans
- **Custom Metrics**: Tile-specific metrics including zoom levels and response times
- **Structured Logging**: JSON-formatted logs with trace correlation
- **Zero-configuration**: Pre-provisioned Grafana datasources and dashboards
- **Hot Reload**: Development mode with file watching for rapid iteration

## Usage

```bash
docker compose up
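
# Or, to use the file-watch hot reload defined under "develop:" in docker-compose.yml
# (requires a recent Docker Compose release):
docker compose watch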
```

Access services:

- TiTiler API: <http://localhost:8000>
- Grafana: <http://localhost:3000>
- Prometheus: <http://localhost:9090>
- Jaeger: <http://localhost:16686>

156 changes: 156 additions & 0 deletions SERVICE-TRACING-DECISION.md
@@ -0,0 +1,156 @@
# Service Tracing Decision & Design Document

## Overview

This document outlines the decision framework and design considerations for implementing service tracing in applications like TiTiler, comparing deployment environments and summarizing the expected performance overhead of instrumentation.

## Advantages Over Basic Logging

### Traditional Setup (e.g. Lambda + CloudWatch)

**Limitations:**

- No request correlation across services
- Manual log parsing and filtering
- Limited performance insights
- Reactive debugging only

### OpenTelemetry Approach

**Benefits:**

- Automatic trace correlation with `trace_id`
- Request flow visualization across distributed services
- Proactive performance monitoring with service-level indicators (SLIs)
- Structured metrics for capacity planning
- Root cause analysis through span relationships

## Key Telemetry Points

### Essential Metrics

- **Request rate, error rate, duration (RED metrics)**
- **Resource utilization (CPU, memory, I/O)**
- **Usage metrics (tiles served, requests per data source, cache hits)**

### Critical Trace Points

- **HTTP request boundaries**
- **Database queries and external API calls**
- **Async operations and queue processing**
- **Error conditions and retry logic**

### Recommended Sampling

- **Production:** 10-25% sampling rate (see the sampler sketch below)
- **Development:** 100% sampling
- **Error traces:** Always sample (100%)
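
A minimal sketch of applying a 10% parent-based ratio sampler with the OpenTelemetry Python SDK. The rate and wiring are illustrative, and the "always sample errors" rule generally needs tail-based sampling in the collector rather than head sampling in the SDK:

```python
# Sketch only: 10% head sampling; error-aware sampling usually needs tail sampling in the collector.
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.sampling import ParentBased, TraceIdRatioBased

sampler = ParentBased(root=TraceIdRatioBased(0.10))  # sample 10% of new traces, honor upstream decisions
trace.set_tracer_provider(TracerProvider(sampler=sampler))
```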

## Unintrusive Tracing Infrastructure

### Standard Logging with Trace Correlation

**Philosophy:** Minimize code changes while maximizing observability value through automatic trace correlation.

**Approach:**

- Use existing `logging` statements without modification
- OpenTelemetry automatically injects `trace_id` and `span_id` into log records
- Correlate logs with traces using shared identifiers
- Preserve existing log formatting and practices

**Example Implementation:**

```python
# Standard logging (no changes needed)
logger.info("Processing tile request", extra={"zoom": z, "x": x, "y": y})

# OpenTelemetry automatically adds:
# - trace_id: correlates with distributed trace
# - span_id: links to specific operation
# - resource attributes: service.name, service.version
```
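
One way to enable that injection with the Python SDK is the logging instrumentation hook, sketched below under the assumption that the `opentelemetry-instrumentation-logging` package is installed:

```python
# Sketch only: injects otelTraceID / otelSpanID into standard log records.
import logging

from opentelemetry.instrumentation.logging import LoggingInstrumentor

LoggingInstrumentor().instrument(set_logging_format=True)

logger = logging.getLogger(__name__)
logger.info("Processing tile request")  # record now carries trace/span identifiers
```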

**Benefits:**

- No learning curve for developers
- Existing log statements gain trace context
- Gradual adoption without code rewrites
- Backward compatibility with non-instrumented services

### Automatic Instrumentation Priority

**Zero-Code Instrumentation:**

OpenTelemetry provides automatic instrumentation libraries that add tracing without code modifications by patching common frameworks and libraries at runtime (see the sketch after the list below).

- FastAPI/Flask/Django automatic request tracing
- Database query instrumentation (SQLAlchemy, psycopg2)
- HTTP client libraries (requests, httpx, aiohttp)
- Redis/Memcached cache operations
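
A sketch of enabling a couple of these instrumentors explicitly (the corresponding `opentelemetry-instrumentation-*` packages are assumed to be installed); the same effect can usually be had with zero code via the `opentelemetry-instrument` wrapper:

```python
# Sketch only: explicit activation of a few auto-instrumentation libraries.
from fastapi import FastAPI
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor

app = FastAPI()

FastAPIInstrumentor.instrument_app(app)  # spans for every incoming HTTP request
RequestsInstrumentor().instrument()      # spans for outgoing calls made with requests
```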

**Manual Instrumentation Only When Needed:**

- Business logic spans for critical operations (see the sketch below)
- Custom metrics for domain-specific KPIs
- Error context for debugging complex workflows
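
When a business-logic span is genuinely needed, a short manual span is usually enough; in this sketch the `render_tile` helper, span name, and attribute keys are hypothetical:

```python
# Sketch only: one manual span around a critical operation, with domain attributes.
from opentelemetry import trace

tracer = trace.get_tracer(__name__)


def render_tile(z: int, x: int, y: int) -> bytes:  # hypothetical helper
    with tracer.start_as_current_span("render_tile") as span:
        span.set_attribute("tile.zoom", z)
        span.set_attribute("tile.x", x)
        span.set_attribute("tile.y", y)
        # ... do the actual rendering work here ...
        return b""
```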

## Deployment Environment Considerations

### AWS Lambda

**Characteristics:**

- Cold start overhead: ~50-100ms additional latency for OTLP exporters
- Memory impact: +20-30MB for OpenTelemetry SDKs
- Use AWS X-Ray SDK instead of OTLP for native integration
- Async exporters essential to avoid timeout issues
- CloudWatch Logs automatic, but limited correlation

**Recommendation:** Use AWS X-Ray for Lambda deployments to minimize cold start impact and leverage native AWS integration.

### Kubernetes/Container Platforms

**Characteristics:**

- Sidecar collector pattern reduces application overhead
- Service mesh (Istio) provides automatic trace propagation
- Persistent connections to collectors reduce export latency
- Resource allocation: Reserve 0.1 CPU, 128MB for telemetry
- Better suited for complex distributed applications

**Recommendation:** Full OpenTelemetry stack with collector pattern for containerized environments.

## Performance Impact Benchmarks

### Instrumentation Overhead

- **HTTP request latency:** +2-5ms per request
- **Memory usage:** +15-25MB baseline
- **CPU overhead:** ~1-3% under normal load
- **Batch size optimization:** Batches of 100-500 spans reduce the number of export calls

### Network Impact

- **OTLP/gRPC:** ~1KB per span, 200-500 bytes per metric
- **Sampling strategies:** 10-50% for high-volume services
- **Async export:** Prevents blocking application threads (see the batch-export sketch below)
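
A sketch of the batch and async export tuning mentioned above, using the SDK's `BatchSpanProcessor`; the endpoint and the specific values are illustrative starting points, not measured optima:

```python
# Sketch only: batched, non-blocking span export to the collector.
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(
        OTLPSpanExporter(endpoint="http://otel-collector:4317", insecure=True),
        max_queue_size=2048,        # spans buffered in memory before dropping
        max_export_batch_size=512,  # spans sent per network call
        schedule_delay_millis=5000, # flush interval for partially filled batches
    )
)
```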

## Decision Matrix

| Factor | AWS Lambda | Kubernetes | Container Platform |
|--------|------------|------------|-------------------|
| Cold Start Impact | High | Low | Low |
| Memory Overhead | Critical | Manageable | Manageable |
| Trace Correlation | Limited | Excellent | Excellent |
| Implementation Complexity | Low | Medium | Medium |
| Operational Overhead | Low | High | Medium |

## Implementation Recommendations

1. **Start with sampling:** Begin with 10% sampling in production
2. **Prioritize errors:** Always trace error conditions
3. **Monitor overhead:** Track instrumentation performance impact
4. **Batch exports:** Use batch processors to reduce network calls
5. **Resource allocation:** Reserve compute resources for telemetry
6. **Preserve existing logging:** Use standard logging statements with automatic trace correlation
109 changes: 109 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,109 @@
services:
titiler:
build:
context: .
ports:
- "8000:8000"
command: ["uv", "run", "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "main:app", "--bind", "0.0.0.0:8000", "--workers", "1"]
environment:
# GDAL config
- CPL_TMPDIR=/tmp
- GDAL_CACHEMAX=75%
- GDAL_INGESTED_BYTES_AT_OPEN=32768
- GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR
- GDAL_HTTP_MERGE_CONSECUTIVE_RANGES=YES
- GDAL_HTTP_MULTIPLEX=YES
- GDAL_HTTP_VERSION=2
- PYTHONWARNINGS=ignore
- VSI_CACHE=TRUE
- VSI_CACHE_SIZE=536870912
- TITILER_API_DEBUG=TRUE
- TITILER_API_TELEMETRY_ENABLED=True
- OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
depends_on:
- otel-collector
develop:
watch:
- action: sync+restart
path: main.py
target: /app/main.py

- action: rebuild
path: ./pyproject.toml
networks:
- observability-net

otel-collector:
image: otel/opentelemetry-collector-contrib:latest
command: ["--config=/etc/otel-collector-config.yml"]
volumes:
- ./otel-collector-config.yml:/etc/otel-collector-config.yml
ports:
- "4317:4317"
- "4318:4318"
- "8888:8888"
depends_on:
- prometheus
- jaeger
networks:
- observability-net
develop:
watch:
- action: sync+restart
path: otel-collector-config.yml
target: /etc/otel-collector-config.yml

prometheus:
image: prom/prometheus:latest
command:
- --config.file=/etc/prometheus/prometheus.yml
- --web.enable-remote-write-receiver
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
networks:
- observability-net

jaeger:
image: jaegertracing/all-in-one:latest
ports:
- "16686:16686"
- "14268:14268"
- "14250:14250"
environment:
- COLLECTOR_OTLP_ENABLED=true
networks:
- observability-net

loki:
image: grafana/loki:latest
ports:
- "3100:3100"
command: -config.file=/etc/loki/local-config.yaml
volumes:
- ./loki-config.yaml:/etc/loki/local-config.yaml
networks:
- observability-net

grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
volumes:
- ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
- ./grafana-dashboards.yml:/etc/grafana/provisioning/dashboards/dashboards.yml
- ./grafana-dashboard.json:/etc/grafana/provisioning/dashboards/grafana-dashboard.json
depends_on:
- prometheus
- jaeger
- loki
networks:
- observability-net

networks:
observability-net:
driver: bridge