30 changes: 24 additions & 6 deletions agent.py
@@ -3,13 +3,22 @@
SuperAgent for Term Challenge - Entry Point (SDK 3.0 Compatible).

This agent accepts --instruction from the validator and runs autonomously.
Uses litellm for LLM calls instead of term_sdk.
Supports multiple LLM providers:
- Chutes API (default): Uses moonshotai/Kimi-K2.5-TEE with thinking mode
- OpenRouter/litellm: Fallback to other providers

Installation:
    pip install .                    # via pyproject.toml
    pip install -r requirements.txt  # via requirements.txt

Usage:
    # With Chutes API (default - requires CHUTES_API_TOKEN)
    export CHUTES_API_TOKEN="your-token"
    python agent.py --instruction "Your task description here..."

    # With OpenRouter (fallback)
    export LLM_PROVIDER="openrouter"
    export OPENROUTER_API_KEY="your-key"
    python agent.py --instruction "Your task description here..."
"""

@@ -29,7 +38,7 @@
def ensure_dependencies():
    """Install dependencies if not present."""
    try:
        import litellm
        import openai
        import httpx
        import pydantic
    except ImportError:
@@ -48,7 +57,7 @@ def ensure_dependencies():
from src.core.loop import run_agent_loop
from src.tools.registry import ToolRegistry
from src.output.jsonl import emit, ErrorEvent
from src.llm.client import LiteLLMClient, CostLimitExceeded
from src.llm.client import get_llm_client, CostLimitExceeded, ChutesClient, LiteLLMClient


class AgentContext:
@@ -130,21 +139,30 @@ def main():
    parser.add_argument("--instruction", required=True, help="Task instruction from validator")
    args = parser.parse_args()

    provider = CONFIG.get("provider", "chutes")

    _log("=" * 60)
    _log("SuperAgent Starting (SDK 3.0 - litellm)")
    _log(f"SuperAgent Starting (SDK 3.0 - {provider})")
    _log("=" * 60)
    _log(f"Provider: {provider}")
    _log(f"Model: {CONFIG['model']}")
    _log(f"Reasoning effort: {CONFIG.get('reasoning_effort', 'default')}")
    _log(f"Thinking mode: {CONFIG.get('enable_thinking', True)}")
    _log(f"Instruction: {args.instruction[:200]}...")
    _log("-" * 60)

    # Initialize components
    start_time = time.time()

    llm = LiteLLMClient(
    # Use factory function to get appropriate client based on provider
    llm = get_llm_client(
        provider=provider,
        model=CONFIG["model"],
        temperature=CONFIG.get("temperature"),
        max_tokens=CONFIG.get("max_tokens", 16384),
        cost_limit=CONFIG.get("cost_limit", 100.0),
        enable_thinking=CONFIG.get("enable_thinking", True),
        cache_extended_retention=CONFIG.get("cache_extended_retention", True),
        cache_key=CONFIG.get("cache_key"),
    )

    tools = ToolRegistry()
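For context, the call site above routes everything through a single factory. The real get_llm_client lives in src/llm/client.py and is not part of this diff; the sketch below is only an illustration of how such a factory could dispatch between the two clients, with the parameter names and defaults assumed from the call site.

# Illustrative sketch only - not the actual src/llm/client.py implementation.
# ChutesClient and LiteLLMClient are assumed to be defined in the same module.
from typing import Optional

def get_llm_client(
    provider: str,
    model: str,
    temperature: Optional[float] = None,
    max_tokens: int = 16384,
    cost_limit: float = 100.0,
    enable_thinking: bool = True,
    cache_extended_retention: bool = True,
    cache_key: Optional[str] = None,
):
    """Return the client matching the configured provider."""
    common = dict(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        cost_limit=cost_limit,
    )
    if provider == "chutes":
        # Chutes path: Kimi K2.5 thinking mode plus cache hints.
        return ChutesClient(
            enable_thinking=enable_thinking,
            cache_extended_retention=cache_extended_retention,
            cache_key=cache_key,
            **common,
        )
    # Anything else falls back to litellm/OpenRouter routing.
    return LiteLLMClient(**common)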
1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
    "rich>=13.0",
    "typer>=0.12.0",
    "litellm>=1.50.0",
    "openai>=1.0.0",
]

[project.optional-dependencies]
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ tomli-w>=1.0
rich>=13.0
typer>=0.12.0
litellm>=1.50.0
openai>=1.0.0
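The new openai dependency suggests the Chutes client is built on the OpenAI SDK pointed at an OpenAI-compatible chat completions endpoint. As a rough, unofficial sketch of that wiring (the base URL below is a placeholder assumption, not taken from this PR):

# Rough sketch: call Kimi K2.5-TEE through an OpenAI-compatible endpoint.
# The base_url is an assumed placeholder, not part of this PR.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["CHUTES_API_TOKEN"],
    base_url="https://llm.chutes.ai/v1",  # assumed Chutes endpoint
)

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5-TEE",
    messages=[{"role": "user", "content": "Summarize this repo in one line."}],
    temperature=1.0,   # thinking-mode setting from src/config/defaults.py
    max_tokens=1024,
)
print(response.choices[0].message.content)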
63 changes: 35 additions & 28 deletions src/config/defaults.py
@@ -1,13 +1,12 @@
"""
Hardcoded benchmark configuration for SuperAgent.

Simulates Codex exec with these flags:
- --model gpt-5.2
- -c model_reasoning_effort=xhigh
- --dangerously-bypass-approvals-and-sandbox
- --skip-git-repo-check
- --enable unified_exec
- --json
Default provider: Chutes API with Kimi K2.5-TEE model.
Supports thinking mode with <think>...</think> reasoning blocks.

Alternative providers available via LLM_PROVIDER environment variable:
- "chutes" (default): Chutes API with Kimi K2.5-TEE
- "openrouter": OpenRouter with Claude or other models

All settings are hardcoded - no CLI arguments needed.
"""
@@ -18,33 +17,40 @@
from typing import Any, Dict


# Main configuration - simulates Codex exec benchmark mode
# Main configuration - default to Chutes API with Kimi K2.5-TEE
CONFIG: Dict[str, Any] = {
    # ==========================================================================
    # Model Settings (simulates --model gpt-5.2 -c model_reasoning_effort=xhigh)
    # Model Settings - Chutes API with Kimi K2.5-TEE
    # ==========================================================================

    # Model to use via OpenRouter (prefix with openrouter/ for litellm)
    "model": os.environ.get("LLM_MODEL", "openrouter/anthropic/claude-sonnet-4-20250514"),
    # Model to use via Chutes API
    # Kimi K2.5-TEE: 1T params (32B activated), 256K context window
    # Supports thinking mode with reasoning_content
    "model": os.environ.get("LLM_MODEL", "moonshotai/Kimi-K2.5-TEE"),

    # Provider
    "provider": "openrouter",
    # Provider: "chutes" for Chutes API, "openrouter" for litellm/OpenRouter
    "provider": os.environ.get("LLM_PROVIDER", "chutes"),

    # Reasoning effort: none, minimal, low, medium, high, xhigh (not used for Claude)
    "reasoning_effort": "none",
    # Enable Kimi K2.5 thinking mode (reasoning in thinking blocks)
    "enable_thinking": True,

    # Token limits
    # Token limits (Kimi K2.5 supports up to 32K output)
    "max_tokens": 16384,

    # Temperature (0 = deterministic)
    "temperature": 0.0,
    # Temperature - Kimi K2.5 best practices:
    # - Thinking mode: 1.0 (with top_p=0.95)
    # - Instant mode: 0.6 (with top_p=0.95)
    "temperature": 1.0,

    # Cost limit in USD
    "cost_limit": 100.0,

    # ==========================================================================
    # Agent Execution Settings
    # ==========================================================================

    # Maximum iterations before stopping
    "max_iterations": 200,
    "max_iterations": 350,

    # Maximum tokens for tool output truncation (middle-out strategy)
    "max_output_tokens": 2500,  # ~10KB
@@ -56,10 +62,10 @@
    # Context Management (like OpenCode/Codex)
    # ==========================================================================

    # Model context window (Claude Opus 4.5 = 200K)
    "model_context_limit": 200_000,
    # Model context window (Kimi K2.5 = 256K)
    "model_context_limit": 256_000,

    # Reserved tokens for output
    # Reserved tokens for output (Kimi K2.5 can output up to 32K)
    "output_token_max": 32_000,

    # Trigger compaction at this % of usable context (85%)
@@ -70,16 +76,17 @@
    "prune_minimum": 20_000,  # Only prune if we can recover at least this many

    # ==========================================================================
    # Prompt Caching (Anthropic via OpenRouter/Bedrock)
    # Prompt Caching
    # ==========================================================================

    # Enable prompt caching
    # Enable prompt caching (Chutes may support server-side caching)
    "cache_enabled": True,

    # Note: Anthropic caching requires minimum tokens per breakpoint:
    # - Claude Opus 4.5 on Bedrock: 4096 tokens minimum
    # - Claude Sonnet/other: 1024 tokens minimum
    # System prompt should be large enough to meet this threshold
    # Chutes API caching notes:
    # - Kimi K2.5 on Chutes uses server-side optimization
    # - Keep system prompt stable for best performance
    "cache_extended_retention": True,
    "cache_key": None,

    # ==========================================================================
    # Simulated Codex Flags (all enabled/bypassed for benchmark)
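The context-management numbers above imply a concrete compaction point. A small back-of-the-envelope check, assuming "usable context" means the model window minus the reserved output budget:

# Back-of-the-envelope check of the compaction settings (assumptions noted).
model_context_limit = 256_000   # Kimi K2.5 window, from defaults.py
output_token_max = 32_000       # reserved for model output
compact_ratio = 0.85            # "85% of usable context" per the comment above

usable_context = model_context_limit - output_token_max   # 224,000 tokens
compact_at = int(usable_context * compact_ratio)          # 190,400 tokens

# Once the prompt (system + history + tool output) approaches compact_at,
# the agent loop would be expected to compact or prune older turns;
# prune_minimum = 20_000 means pruning only runs when it frees >= 20K tokens.
print(f"Compaction would trigger around {compact_at:,} prompt tokens")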
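The defaults also enable Kimi K2.5 thinking mode, and the module docstring mentions <think>...</think> reasoning blocks. How the reasoning is surfaced depends on the Chutes/Kimi response shape (the diff also mentions a reasoning_content field, so it may arrive separately); the sketch below only illustrates the inline-text case and is not taken from this PR.

# Sketch: separating <think>...</think> reasoning from the final answer,
# assuming the thinking block is inlined in the returned text.
import re

def split_thinking(text: str) -> tuple[str, str]:
    """Return (reasoning, answer) from a response that may contain <think> blocks."""
    thoughts = "\n".join(re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL))
    answer = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    return thoughts.strip(), answer

reasoning, answer = split_thinking("<think>plan the steps</think>Final answer here.")
print(reasoning)  # -> "plan the steps"
print(answer)     # -> "Final answer here."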