scaleapi · danielmillerp · Jun 25, 2026 · Jun 25, 2026 · greptile-apps · Jun 25, 2026
@@ -121,7 +121,13 @@ class EnvironmentVariables(BaseModel):
     MONGODB_DATABASE_NAME: str | None = "agentex"
     MONGODB_MAX_POOL_SIZE: int = 50
     MONGODB_MIN_POOL_SIZE: int = 5
-    REDIS_MAX_CONNECTIONS: int = 50  # Increased for SSE streaming
+    # SSE streaming currently holds one blocking XREAD connection per connected
+    # client, so the pool needs headroom for peak concurrent streams per pod.
+    # NOTE: this is only the in-code default — deployed environments override it
+    # via the REDIS_MAX_CONNECTIONS env var, which is the real cap. Bumping this
+    # buys headroom but does NOT change the 1-connection-per-client scaling; the
+    # durable fix is a shared per-pod reader that fans out to in-process queues.
+    REDIS_MAX_CONNECTIONS: int = 200
     REDIS_CONNECTION_TIMEOUT: int = 60  # Connection timeout in seconds
     REDIS_SOCKET_TIMEOUT: int = 30  # Socket timeout in seconds
     REDIS_STREAM_MAXLEN: int = (
@@ -193,7 +199,7 @@ def refresh(cls, force_refresh: bool = False) -> EnvironmentVariables | None:
                 os.environ.get(EnvVarKeys.MONGODB_MIN_POOL_SIZE, "5")
             ),
             REDIS_MAX_CONNECTIONS=int(
-                os.environ.get(EnvVarKeys.REDIS_MAX_CONNECTIONS, "100")
+                os.environ.get(EnvVarKeys.REDIS_MAX_CONNECTIONS, "200")
             ),
             REDIS_CONNECTION_TIMEOUT=int(
                 os.environ.get(EnvVarKeys.REDIS_CONNECTION_TIMEOUT, "20")

@@ -114,6 +114,11 @@ async def stream_task_events(
         ping_interval = float(
             self.environment_variables.SSE_KEEPALIVE_PING_INTERVAL
         )  # Configurable keepalive ping interval
+        # Track consecutive read failures so we can back off and avoid a
+        # tight error loop. When the Redis pool is exhausted, every connected
+        # client's read fails on each cycle; without backoff this turns into a
+        # log-ingestion firehose (one failure per client per cycle, ~once/sec).
+        consecutive_errors = 0
         try:
             # Application-level control loop
             while True:
@@ -133,6 +138,10 @@ async def stream_task_events(
                         last_message_time = asyncio.get_running_loop().time()
                         await asyncio.sleep(0.02)
 
+                    # A read cycle completed without raising — the stream is
+                    # healthy again, so reset the backoff/error counter.
+                    consecutive_errors = 0
+
                     # If we didn't get any messages, add a small pause
                     # to prevent tight loops and send keepalive ping if needed
                     if message_count == 0:
@@ -151,13 +160,25 @@ async def stream_task_events(
                     )
                     raise
                 except Exception as e:
+                    consecutive_errors += 1
+                    # Always log the full traceback — nothing is swallowed.
+                    # Volume is controlled two ways instead of by dropping
+                    # diagnostics: structured JSON logging keeps each traceback
+                    # to a single log entry (see utils.logging), and the
+                    # exponential backoff below caps how often a sustained
+                    # failure can repeat. The failure counter gives context on
+                    # how long a stream has been erroring.
                     logger.error(
-                        f"Error processing events for task {task_id}: {e}",
+                        f"Error processing events for task {task_id} "
+                        f"(failure #{consecutive_errors}): {e}",
                         exc_info=True,
                     )
                     yield f"data: {TaskStreamErrorEventEntity(type='error', message=str(e)).model_dump_json()}\n\n"
-                    # Add a small delay before continuing
-                    await asyncio.sleep(1)
+                    # Exponential backoff (capped) so a sustained failure (e.g.
+                    # Redis pool exhaustion) doesn't spin a tight per-client
+                    # loop hammering Redis and flooding logs.
+                    backoff = min(2.0 ** min(consecutive_errors - 1, 5), 30.0)
+                    await asyncio.sleep(backoff)
 
         except asyncio.CancelledError:
             # Just exit the generator on cancellation

@@ -1,9 +1,11 @@
 import contextvars
+import json
 import logging
 import os
 import re
 import sys
 from collections.abc import Sequence
+from typing import Any
 
 import ddtrace
 import json_log_formatter
@@ -14,6 +16,17 @@
 # Check if Datadog is configured
 _is_datadog_configured = bool(os.environ.get("DD_AGENT_HOST"))
 
+# Emit structured JSON logs in all deployed environments. JSON keeps a
+# multi-line traceback (exc_info=True) as a single log entry — the newlines
+# live inside the quoted `exc_info` field — instead of fanning out into one
+# cluster-log entry per traceback line. Splitting tracebacks per line was a
+# primary multiplier behind a log-ingestion spike on a plain-text-logging
+# cluster. Local development keeps plain text for readable console output;
+# JSON is the default everywhere else (including when ENVIRONMENT is unset)
+# so a deployed cluster can never silently fall back to per-line tracebacks.
+# Both flags are computed once, when this module is imported. Only the value
+# "development" (compared case-insensitively, with no whitespace stripping)
+# counts as local; any other ENVIRONMENT value selects JSON.
+_is_local_dev = os.environ.get("ENVIRONMENT", "").lower() == "development"
+_use_json_logs = not _is_local_dev
+
 # Include Datadog trace IDs only when Datadog is configured
 if _is_datadog_configured:
     LOG_FORMAT: str = (
@@ -89,6 +102,30 @@ def filter(self, record: logging.LogRecord) -> bool:
 _sensitive_data_filter = SensitiveDataFilter()
 
 
+# Cap the size of individual structured fields in JSON logs. Request logging
+# (LoggedAPIRoute.log_request) attaches the decoded body, headers, and
+# query_params as ``extra``, and the JSON formatter serializes ``extra`` —
+# unlike the plain-text formatter, which drops it. Without a cap, a single
+# large request payload would create a very large per-request log entry on
+# every request, reintroducing the log-volume problem this mode exists to
+# avoid. ``exc_info`` is exempt: a traceback is bounded per error (not per
+# request) and the full stack is worth keeping.
+# The limit is counted in characters of the rendered value (the string itself,
+# or its JSON encoding for non-strings); a truncated field ends up slightly
+# longer than the limit because a "...[truncated N chars]" marker is appended.
+_MAX_JSON_FIELD_CHARS = 4096
+# Field names that bypass the cap entirely.
+_UNCAPPED_JSON_FIELDS = frozenset({"exc_info"})
+
+
+def _truncate_log_value(value: Any) -> Any:
+    """Return ``value`` unchanged if small, else a truncated string marker.
+
+    A string is measured as-is; any other value is measured on its JSON
+    rendering (``json.dumps(value, default=str)``), falling back to
+    ``str(value)`` when it cannot be JSON-encoded. If the rendering fits
+    within ``_MAX_JSON_FIELD_CHARS`` the original object is returned;
+    otherwise the first ``_MAX_JSON_FIELD_CHARS`` characters of the rendering
+    are returned with a ``...[truncated N chars]`` suffix, where ``N`` is the
+    number of characters dropped.
+
+    This is called from the JSON log formatter, so it must not raise: a value
+    that cannot be rendered at all is replaced by an
+    ``<unrenderable TypeName>`` placeholder instead of propagating the error
+    and losing the log record.
+    """
+    if isinstance(value, str):
+        rendered = value
+    else:
+        try:
+            rendered = json.dumps(value, default=str)
+        except Exception:
+            # ``default=str`` runs arbitrary ``__str__`` implementations and
+            # deep nesting can hit the recursion limit, so failures are not
+            # limited to TypeError/ValueError.
+            try:
+                rendered = str(value)
+            except Exception:
+                return f"<unrenderable {type(value).__name__}>"
+    overflow = len(rendered) - _MAX_JSON_FIELD_CHARS
+    if overflow <= 0:
+        return value
+    return rendered[:_MAX_JSON_FIELD_CHARS] + f"...[truncated {overflow} chars]"
+
+
 class CustomJSONFormatter(json_log_formatter.JSONFormatter):
     def json_record(self, message: str, extra: dict, record: logging.LogRecord) -> dict:
         extra = super().json_record(message, extra, record)
@@ -123,7 +160,12 @@ def json_record(self, message: str, extra: dict, record: logging.LogRecord) -> d
         if version_override:
             extra["dd.version"] = version_override
 
-        return extra
+        # Bound per-field size so large request bodies/headers/query_params
+        # logged via `extra` can't create oversized entries on every request.
+        return {
+            k: v if k in _UNCAPPED_JSON_FIELDS else _truncate_log_value(v)
+            for k, v in extra.items()
+        }
 
 
 def make_logger(name: str) -> logging.Logger:
@@ -134,7 +176,7 @@ def make_logger(name: str) -> logging.Logger:
 
     logger = logging.getLogger(name)
     stream_handler = logging.StreamHandler()
-    if _is_datadog_configured:
+    if _use_json_logs:
         stream_handler.setFormatter(CustomJSONFormatter())
     else:
         stream_handler.setFormatter(logging.Formatter(LOG_FORMAT))