get-convex · mikecann · Oct 22, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/package.json b/package.json
@@ -26,13 +26,14 @@
     "typescript-eslint": "^8.23.0",
     "vitest": "^3.0.2"
   },
-  "scripts": {   
-    "local:run:fundamentals": "cross-env TEST_FILTER=000-fundamentals npm run local:run",
-    "local:run:one": "cross-env TEST_FILTER=000-fundamentals/000 npm run local:run",
-    "local:run": "cross-env MODELS=gpt-4.1 DISABLE_BRAINTRUST=1 VERBOSE_INFO_LOGS=1 LOCAL_RESULTS=local_results.jsonl npm run run:evals",
-    "braintrust:fundamentals": "cross-env TEST_FILTER=000-fundamentals npm run braintrust:run",
-    "braintrust:run:one": "cross-env TEST_FILTER=000-fundamentals/000 npm run braintrust:run",
-    "braintrust:run": "cross-env MODELS=gpt-4.1 VERBOSE_INFO_LOGS=1 LOCAL_RESULTS=local_results.jsonl npm run run:evals",
+  "scripts": {
+    "local:run:fundamentals": "cross-env MODELS=gpt-5 TEST_FILTER=000-fundamentals npm run local:run",
+    "local:run:one": "cross-env MODELS=gpt-5 TEST_FILTER=000-fundamentals/000 npm run local:run",
+    "local:run": "cross-env DISABLE_BRAINTRUST=1 VERBOSE_INFO_LOGS=1 LOCAL_RESULTS=local_results.jsonl npm run run:evals",
+    "grade:last": "bun run scripts/gradeLast.ts --concise",
+    "braintrust:fundamentals": "cross-env MODELS=gpt-5 TEST_FILTER=000-fundamentals npm run braintrust:run",
+    "braintrust:run:one": "cross-env MODELS=gpt-5 TEST_FILTER=000-fundamentals/000 npm run braintrust:run",
+    "braintrust:run": "cross-env LOCAL_RESULTS=local_results.jsonl VERBOSE_INFO_LOGS=1 npm run run:evals",
     "run:evals": "pdm run python -m runner.eval_convex_coding"
   }
 }
diff --git a/runner/models/__init__.py b/runner/models/__init__.py
@@ -89,6 +89,22 @@ class ModelTemplate(BaseModel):
         uses_system_prompt=False,
         provider=ModelProvider.OPENAI,
     ),
+    ModelTemplate(
+        name="gpt-5-mini",
+        formatted_name="GPT-5 mini",
+        max_concurrency=int(os.getenv("OPENAI_CONCURRENCY", "4")),
+        requires_chain_of_thought=False,
+        uses_system_prompt=False,
+        provider=ModelProvider.OPENAI,
+    ),
+    ModelTemplate(
+        name="gpt-5-nano",
+        formatted_name="GPT-5 nano",
+        max_concurrency=int(os.getenv("OPENAI_CONCURRENCY", "4")),
+        requires_chain_of_thought=False,
+        uses_system_prompt=False,
+        provider=ModelProvider.OPENAI,
+    ),
     ModelTemplate(
         name="deepseek-ai/DeepSeek-V3",
         formatted_name="DeepSeek V3",

diff --git a/runner/models/model_codegen.py b/runner/models/model_codegen.py
@@ -45,14 +45,22 @@ def generate(self, prompt: str):
             system_message = {"role": "system", "content": SYSTEM_PROMPT}
         else:
             system_message = {"role": "user", "content": SYSTEM_PROMPT}
-        response = self.client.chat.completions.create(
-            model=self.model.name,
-            messages=[
+        # Build parameters, selecting the correct token limit key for newer models
+        max_token_limit = 8192 if self.model.name == "claude-3-5-sonnet-latest" else 16384
+        create_params = {
+            "model": self.model.name,
+            "messages": [
                 system_message,
                 {"role": "user", "content": user_prompt},
             ],
-            max_tokens=8192 if self.model.name == "claude-3-5-sonnet-latest" else 16384,
-        )
+        }
+        # Some newer models (e.g., GPT-5 family) expect `max_completion_tokens` instead of `max_tokens`.
+        if self.model.name.startswith("gpt-5"):
+            create_params["max_completion_tokens"] = max_token_limit
+        else:
+            create_params["max_tokens"] = max_token_limit
+
+        response = self.client.chat.completions.create(**create_params)
         return self._parse_response(response.choices[0].message.content)
 
     def _parse_response(self, response: str):