30 changes: 24 additions & 6 deletions agent.py
@@ -3,13 +3,22 @@
SuperAgent for Term Challenge - Entry Point (SDK 3.0 Compatible).

This agent accepts --instruction from the validator and runs autonomously.
Uses litellm for LLM calls instead of term_sdk.
Supports multiple LLM providers:
- Chutes API (default): Uses moonshotai/Kimi-K2.5-TEE with thinking mode
- OpenRouter/litellm: Fallback to other providers

Installation:
    pip install .                    # via pyproject.toml
    pip install -r requirements.txt  # via requirements.txt

Usage:
    # With Chutes API (default - requires CHUTES_API_TOKEN)
    export CHUTES_API_TOKEN="your-token"
    python agent.py --instruction "Your task description here..."

    # With OpenRouter (fallback)
    export LLM_PROVIDER="openrouter"
    export OPENROUTER_API_KEY="your-key"
    python agent.py --instruction "Your task description here..."
"""

@@ -29,7 +38,7 @@
def ensure_dependencies():
    """Install dependencies if not present."""
    try:
        import litellm
        import openai
        import httpx
        import pydantic
    except ImportError:
@@ -48,7 +57,7 @@ def ensure_dependencies():
from src.core.loop import run_agent_loop
from src.tools.registry import ToolRegistry
from src.output.jsonl import emit, ErrorEvent
from src.llm.client import LiteLLMClient, CostLimitExceeded
from src.llm.client import get_llm_client, CostLimitExceeded, ChutesClient, LiteLLMClient


class AgentContext:
@@ -130,21 +139,30 @@ def main():
    parser.add_argument("--instruction", required=True, help="Task instruction from validator")
    args = parser.parse_args()

    provider = CONFIG.get("provider", "chutes")

    _log("=" * 60)
    _log("SuperAgent Starting (SDK 3.0 - litellm)")
    _log(f"SuperAgent Starting (SDK 3.0 - {provider})")
    _log("=" * 60)
    _log(f"Provider: {provider}")
    _log(f"Model: {CONFIG['model']}")
    _log(f"Reasoning effort: {CONFIG.get('reasoning_effort', 'default')}")
    _log(f"Thinking mode: {CONFIG.get('enable_thinking', True)}")
    _log(f"Instruction: {args.instruction[:200]}...")
    _log("-" * 60)

    # Initialize components
    start_time = time.time()

    llm = LiteLLMClient(
    # Use factory function to get appropriate client based on provider
    llm = get_llm_client(
        provider=provider,
        model=CONFIG["model"],
        temperature=CONFIG.get("temperature"),
        max_tokens=CONFIG.get("max_tokens", 16384),
        cost_limit=CONFIG.get("cost_limit", 100.0),
        enable_thinking=CONFIG.get("enable_thinking", True),
        cache_extended_retention=CONFIG.get("cache_extended_retention", True),
        cache_key=CONFIG.get("cache_key"),
    )

    tools = ToolRegistry()
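For context, the call site above routes everything through a single factory. The real get_llm_client lives in src/llm/client.py and is not part of this diff; the sketch below is only an illustration of how such a factory could dispatch between the two clients, with the parameter names and defaults assumed from the call site.

# Illustrative sketch only - not the actual src/llm/client.py implementation.
# ChutesClient and LiteLLMClient are assumed to be defined in the same module.
from typing import Optional

def get_llm_client(
    provider: str,
    model: str,
    temperature: Optional[float] = None,
    max_tokens: int = 16384,
    cost_limit: float = 100.0,
    enable_thinking: bool = True,
    cache_extended_retention: bool = True,
    cache_key: Optional[str] = None,
):
    """Return the client matching the configured provider."""
    common = dict(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        cost_limit=cost_limit,
    )
    if provider == "chutes":
        # Chutes path: Kimi K2.5 thinking mode plus cache hints.
        return ChutesClient(
            enable_thinking=enable_thinking,
            cache_extended_retention=cache_extended_retention,
            cache_key=cache_key,
            **common,
        )
    # Anything else falls back to litellm/OpenRouter routing.
    return LiteLLMClient(**common)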
1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
    "rich>=13.0",
    "typer>=0.12.0",
    "litellm>=1.50.0",
    "openai>=1.0.0",
]

[project.optional-dependencies]
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,3 +5,4 @@ tomli-w>=1.0
rich>=13.0
typer>=0.12.0
litellm>=1.50.0
openai>=1.0.0
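The new openai dependency suggests the Chutes client is built on the OpenAI SDK pointed at an OpenAI-compatible chat completions endpoint. As a rough, unofficial sketch of that wiring (the base URL below is a placeholder assumption, not taken from this PR):

# Rough sketch: call Kimi K2.5-TEE through an OpenAI-compatible endpoint.
# The base_url is an assumed placeholder, not part of this PR.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["CHUTES_API_TOKEN"],
    base_url="https://llm.chutes.ai/v1",  # assumed Chutes endpoint
)

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5-TEE",
    messages=[{"role": "user", "content": "Summarize this repo in one line."}],
    temperature=1.0,   # thinking-mode setting from src/config/defaults.py
    max_tokens=1024,
)
print(response.choices[0].message.content)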
63 changes: 35 additions & 28 deletions src/config/defaults.py
@@ -1,13 +1,12 @@
"""
Hardcoded benchmark configuration for SuperAgent.

Simulates Codex exec with these flags:
- --model gpt-5.2
- -c model_reasoning_effort=xhigh
- --dangerously-bypass-approvals-and-sandbox
- --skip-git-repo-check
- --enable unified_exec
- --json
Default provider: Chutes API with Kimi K2.5-TEE model.
Supports thinking mode with <think>...</think> reasoning blocks.

Alternative providers available via LLM_PROVIDER environment variable:
- "chutes" (default): Chutes API with Kimi K2.5-TEE
- "openrouter": OpenRouter with Claude or other models

All settings are hardcoded - no CLI arguments needed.
"""
@@ -18,33 +17,40 @@
from typing import Any, Dict


# Main configuration - simulates Codex exec benchmark mode
# Main configuration - default to Chutes API with Kimi K2.5-TEE
CONFIG: Dict[str, Any] = {
    # ==========================================================================
    # Model Settings (simulates --model gpt-5.2 -c model_reasoning_effort=xhigh)
    # Model Settings - Chutes API with Kimi K2.5-TEE
    # ==========================================================================

    # Model to use via OpenRouter (prefix with openrouter/ for litellm)
    "model": os.environ.get("LLM_MODEL", "openrouter/anthropic/claude-sonnet-4-20250514"),
    # Model to use via Chutes API
    # Kimi K2.5-TEE: 1T params (32B activated), 256K context window
    # Supports thinking mode with reasoning_content
    "model": os.environ.get("LLM_MODEL", "moonshotai/Kimi-K2.5-TEE"),

    # Provider
    "provider": "openrouter",
    # Provider: "chutes" for Chutes API, "openrouter" for litellm/OpenRouter
    "provider": os.environ.get("LLM_PROVIDER", "chutes"),

    # Reasoning effort: none, minimal, low, medium, high, xhigh (not used for Claude)
    "reasoning_effort": "none",
    # Enable Kimi K2.5 thinking mode (reasoning in thinking blocks)
    "enable_thinking": True,

    # Token limits
    # Token limits (Kimi K2.5 supports up to 32K output)
    "max_tokens": 16384,

    # Temperature (0 = deterministic)
    "temperature": 0.0,
    # Temperature - Kimi K2.5 best practices:
    # - Thinking mode: 1.0 (with top_p=0.95)
    # - Instant mode: 0.6 (with top_p=0.95)
    "temperature": 1.0,

    # Cost limit in USD
    "cost_limit": 100.0,

    # ==========================================================================
    # Agent Execution Settings
    # ==========================================================================

    # Maximum iterations before stopping
    "max_iterations": 200,
    "max_iterations": 350,

    # Maximum tokens for tool output truncation (middle-out strategy)
    "max_output_tokens": 2500,  # ~10KB
@@ -56,10 +62,10 @@
    # Context Management (like OpenCode/Codex)
    # ==========================================================================

    # Model context window (Claude Opus 4.5 = 200K)
    "model_context_limit": 200_000,
    # Model context window (Kimi K2.5 = 256K)
    "model_context_limit": 256_000,

    # Reserved tokens for output
    # Reserved tokens for output (Kimi K2.5 can output up to 32K)
    "output_token_max": 32_000,

    # Trigger compaction at this % of usable context (85%)
@@ -70,16 +76,17 @@
    "prune_minimum": 20_000,  # Only prune if we can recover at least this many

    # ==========================================================================
    # Prompt Caching (Anthropic via OpenRouter/Bedrock)
    # Prompt Caching
    # ==========================================================================

    # Enable prompt caching
    # Enable prompt caching (Chutes may support server-side caching)
    "cache_enabled": True,

    # Note: Anthropic caching requires minimum tokens per breakpoint:
    # - Claude Opus 4.5 on Bedrock: 4096 tokens minimum
    # - Claude Sonnet/other: 1024 tokens minimum
    # System prompt should be large enough to meet this threshold
    # Chutes API caching notes:
    # - Kimi K2.5 on Chutes uses server-side optimization
    # - Keep system prompt stable for best performance
    "cache_extended_retention": True,
    "cache_key": None,

    # ==========================================================================
    # Simulated Codex Flags (all enabled/bypassed for benchmark)
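The context-management numbers above imply a concrete compaction point. A small back-of-the-envelope check, assuming "usable context" means the model window minus the reserved output budget:

# Back-of-the-envelope check of the compaction settings (assumptions noted).
model_context_limit = 256_000   # Kimi K2.5 window, from defaults.py
output_token_max = 32_000       # reserved for model output
compact_ratio = 0.85            # "85% of usable context" per the comment above

usable_context = model_context_limit - output_token_max   # 224,000 tokens
compact_at = int(usable_context * compact_ratio)          # 190,400 tokens

# Once the prompt (system + history + tool output) approaches compact_at,
# the agent loop would be expected to compact or prune older turns;
# prune_minimum = 20_000 means pruning only runs when it frees >= 20K tokens.
print(f"Compaction would trigger around {compact_at:,} prompt tokens")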
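The defaults also enable Kimi K2.5 thinking mode, and the module docstring mentions <think>...</think> reasoning blocks. How the reasoning is surfaced depends on the Chutes/Kimi response shape (the diff also mentions a reasoning_content field, so it may arrive separately); the sketch below only illustrates the inline-text case and is not taken from this PR.

# Sketch: separating <think>...</think> reasoning from the final answer,
# assuming the thinking block is inlined in the returned text.
import re

def split_thinking(text: str) -> tuple[str, str]:
    """Return (reasoning, answer) from a response that may contain <think> blocks."""
    thoughts = "\n".join(re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL))
    answer = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    return thoughts.strip(), answer

reasoning, answer = split_thinking("<think>plan the steps</think>Final answer here.")
print(reasoning)  # -> "plan the steps"
print(answer)     # -> "Final answer here."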