default to stagehand LLM clients for evals #669

Open

wants to merge 9 commits into main
4 changes: 4 additions & 0 deletions evals/args.ts
@@ -9,6 +9,7 @@ const parsedArgs: {
concurrency?: number;
extractMethod?: string;
provider?: string;
useExternalClients?: boolean;
leftover: string[];
} = {
leftover: [],
@@ -31,6 +32,9 @@ for (const arg of rawArgs) {
parsedArgs.extractMethod = arg.split("=")[1];
} else if (arg.startsWith("provider=")) {
parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
} else if (arg.startsWith("useExternalClients=")) {
const val = arg.split("=")[1]?.toLowerCase();
parsedArgs.useExternalClients = val === "true";
} else {
parsedArgs.leftover.push(arg);
}
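For clarity, a minimal sketch of how the new flag is interpreted (the helper name parseUseExternalClients is hypothetical; the actual parsing is the inline branch above):

// Hypothetical helper mirroring the inline parsing above; not part of this diff.
function parseUseExternalClients(arg: string): boolean | undefined {
  if (!arg.startsWith("useExternalClients=")) return undefined;
  // The value is lowercased, so "true" and "TRUE" both enable external clients;
  // any other value (including an empty string) leaves them disabled.
  return arg.split("=")[1]?.toLowerCase() === "true";
}

// parseUseExternalClients("useExternalClients=true")  -> true
// parseUseExternalClients("useExternalClients=TRUE")  -> true
// parseUseExternalClients("useExternalClients=false") -> false
// parseUseExternalClients("provider=openai")          -> undefined (flag not present)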
68 changes: 13 additions & 55 deletions evals/index.eval.ts
@@ -20,25 +20,19 @@ import {
filterByCategory,
filterByEvalName,
useTextExtract,
parsedArgs,
} from "./args";
import { generateExperimentName } from "./utils";
import { createLLMClient, generateExperimentName } from "./utils";
import { exactMatch, errorMatch } from "./scoring";
import { tasksByName, MODELS, tasksConfig } from "./taskConfig";
import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
import { Eval } from "braintrust";
import { EvalFunction, SummaryResult, Testcase } from "@/types/evals";
import { EvalLogger } from "./logger";
import { AvailableModel, LLMClient } from "@/dist";
import { AvailableModel } from "@/dist";
import { env } from "./env";
import dotenv from "dotenv";
import { StagehandEvalError } from "@/types/stagehandErrors";
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
import OpenAI from "openai";
import { initStagehand } from "./initStagehand";
import { AISdkClient } from "@/examples/external_clients/aisdk";
import { google } from "@ai-sdk/google";
import { anthropic } from "@ai-sdk/anthropic";
import { groq } from "@ai-sdk/groq";
import { cerebras } from "@ai-sdk/cerebras";
dotenv.config();

/**
@@ -273,51 +267,15 @@ const generateFilteredTestcases = (): Testcase[] => {
}

// Execute the task
let llmClient: LLMClient;
if (input.modelName.startsWith("gpt")) {
llmClient = new CustomOpenAIClient({
modelName: input.modelName as AvailableModel,
client: wrapOpenAI(
new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
}),
),
});
} else if (input.modelName.startsWith("gemini")) {
llmClient = new AISdkClient({
model: wrapAISDKModel(google(input.modelName)),
});
} else if (input.modelName.startsWith("claude")) {
llmClient = new AISdkClient({
model: wrapAISDKModel(anthropic(input.modelName)),
});
} else if (input.modelName.includes("groq")) {
llmClient = new AISdkClient({
model: wrapAISDKModel(
groq(
input.modelName.substring(input.modelName.indexOf("/") + 1),
),
),
});
} else if (input.modelName.includes("cerebras")) {
llmClient = new AISdkClient({
model: wrapAISDKModel(
cerebras(
input.modelName.substring(input.modelName.indexOf("/") + 1),
),
),
});
} else if (input.modelName.includes("/")) {
llmClient = new CustomOpenAIClient({
modelName: input.modelName as AvailableModel,
client: wrapOpenAI(
new OpenAI({
apiKey: process.env.TOGETHER_AI_API_KEY,
baseURL: "https://api.together.xyz/v1",
}),
),
});
}
const llmClient = createLLMClient({
modelName: input.modelName,
useExternalClients: parsedArgs.useExternalClients === true,
logger: (msg) => logger.log(msg),
Comment on lines +271 to +273
logic: Strict comparison with boolean could cause issues if parsedArgs.useExternalClients is undefined. Consider using !!parsedArgs.useExternalClients instead.
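For illustration, a minimal sketch comparing the two forms (not part of this diff); both treat an omitted flag the same way:

// Both forms evaluate to false when the flag was not passed (undefined).
const flag: boolean | undefined = parsedArgs.useExternalClients;
const strict = flag === true;   // form used in this diff
const coerced = !!flag;         // form suggested by the reviewer
// strict === coerced for true, false, and undefined alike.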

openAiKey: process.env.OPENAI_API_KEY,
googleKey: process.env.GOOGLE_API_KEY,
anthropicKey: process.env.ANTHROPIC_API_KEY,
togetherKey: process.env.TOGETHER_AI_API_KEY,
});
const taskInput = await initStagehand({
logger,
llmClient,
115 changes: 115 additions & 0 deletions evals/utils.ts
@@ -11,6 +11,21 @@
import { LogLine } from "@/dist";
import stringComparison from "string-comparison";
const { jaroWinkler } = stringComparison;
import OpenAI from "openai";
import { wrapAISDKModel, wrapOpenAI } from "braintrust";
import { anthropic } from "@ai-sdk/anthropic";
import { google } from "@ai-sdk/google";
import { groq } from "@ai-sdk/groq";
import { cerebras } from "@ai-sdk/cerebras";
import { LLMClient } from "@/dist";
import { AISdkClient } from "@/examples/external_clients/aisdk";
import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
import { OpenAIClient } from "@/lib/llm/OpenAIClient";
import { AnthropicClient } from "@/lib/llm/AnthropicClient";
import { GoogleClient } from "@/lib/llm/GoogleClient";
import { CreateLLMClientOptions } from "@/types/evals";
import { StagehandEvalError } from "@/types/stagehandErrors";
import { openai } from "@ai-sdk/openai";

/**
* normalizeString:
@@ -119,3 +134,103 @@ export function logLineToString(logLine: LogLine): string {
return "error logging line";
}
}

export function createLLMClient({
modelName,
useExternalClients,
logger,
openAiKey,
googleKey,
anthropicKey,
togetherKey,
}: CreateLLMClientOptions): LLMClient {
const isOpenAIModel =
modelName.startsWith("gpt") || modelName.startsWith("o");
const isGoogleModel = modelName.startsWith("gemini");
const isAnthropicModel = modelName.startsWith("claude");
const isGroqModel = modelName.includes("groq");
const isCerebrasModel = modelName.includes("cerebras");

if (useExternalClients) {
if (isOpenAIModel) {
if (modelName.includes("/")) {
return new CustomOpenAIClient({
modelName,
client: wrapOpenAI(
new OpenAI({
apiKey: togetherKey,
baseURL: "https://api.together.xyz/v1",
}),
),
});
}
return new AISdkClient({
model: wrapAISDKModel(openai(modelName)),
});
} else if (isGoogleModel) {
return new AISdkClient({
model: wrapAISDKModel(google(modelName)),
});
} else if (isAnthropicModel) {
return new AISdkClient({
model: wrapAISDKModel(anthropic(modelName)),
});
} else if (isGroqModel) {
const groqModel = modelName.substring(modelName.indexOf("/") + 1);
logic: Potential error if '/' is not found in modelName. Add null check before substring operation.

Suggested change:
- const groqModel = modelName.substring(modelName.indexOf("/") + 1);
+ const slashIndex = modelName.indexOf("/");
+ const groqModel = slashIndex === -1 ? modelName : modelName.substring(slashIndex + 1);

return new AISdkClient({
model: wrapAISDKModel(groq(groqModel)),
});
} else if (isCerebrasModel) {
const cerebrasModel = modelName.substring(modelName.indexOf("/") + 1);
return new AISdkClient({
model: wrapAISDKModel(cerebras(cerebrasModel)),
});
}
throw new StagehandEvalError(`Unknown modelName: ${modelName}`);
} else {
if (isOpenAIModel) {
if (modelName.includes("/")) {
return new CustomOpenAIClient({
modelName,
client: wrapOpenAI(
new OpenAI({
apiKey: togetherKey,
baseURL: "https://api.together.xyz/v1",
}),
),
});
}
Comment on lines +192 to +202

style: This block is duplicated from the external clients section. Consider extracting Together.ai model handling into a separate function to avoid duplication. (A sketch of such a helper appears after this function, below.)

return new OpenAIClient({
logger,
modelName,
enableCaching: false,
clientOptions: {
apiKey: openAiKey,
},
});
} else if (isGoogleModel) {
return new GoogleClient({
logger,
modelName,
enableCaching: false,
clientOptions: {
apiKey: googleKey,
},
});
} else if (isAnthropicModel) {
return new AnthropicClient({
logger,
modelName,
enableCaching: false,
clientOptions: {
apiKey: anthropicKey,
},
});
} else if (isGroqModel || isCerebrasModel) {
throw new StagehandEvalError(
`${modelName} can only be used when useExternalClients=true`,
);
}
throw new StagehandEvalError(`Unknown modelName: ${modelName}`);
}
}
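As referenced in the style comment above, a hedged sketch of what extracting the duplicated Together.ai handling could look like; the helper name createTogetherClient is hypothetical and not part of this PR:

// Hypothetical helper; both branches of createLLMClient could delegate to it
// whenever modelName.includes("/").
function createTogetherClient({
  modelName,
  togetherKey,
}: Pick<CreateLLMClientOptions, "modelName" | "togetherKey">): LLMClient {
  return new CustomOpenAIClient({
    modelName,
    client: wrapOpenAI(
      new OpenAI({
        apiKey: togetherKey,
        baseURL: "https://api.together.xyz/v1",
      }),
    ),
  });
}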
10 changes: 10 additions & 0 deletions types/evals.ts
@@ -77,3 +77,13 @@ export interface EvalResult {
export type LogLineEval = LogLine & {
parsedAuxiliary?: string | object;
};

export interface CreateLLMClientOptions {
modelName: AvailableModel;
useExternalClients?: boolean;
logger?: (msg: LogLine) => void;
openAiKey?: string;
googleKey?: string;
anthropicKey?: string;
togetherKey?: string;
}
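For context, a hedged usage sketch showing how these options drive createLLMClient (mirroring the call in evals/index.eval.ts above; the model name and logger here are illustrative):

// Imports assumed from the diff above: createLLMClient from evals/utils,
// parsedArgs from evals/args, AvailableModel from "@/dist".
const llmClient = createLLMClient({
  modelName: "gpt-4o-mini" as AvailableModel, // assumption: any supported model name
  useExternalClients: parsedArgs.useExternalClients === true, // defaults to Stagehand's own clients
  logger: (msg) => console.log(msg), // the evals pass an EvalLogger-backed function instead
  openAiKey: process.env.OPENAI_API_KEY,
  googleKey: process.env.GOOGLE_API_KEY,
  anthropicKey: process.env.ANTHROPIC_API_KEY,
  togetherKey: process.env.TOGETHER_AI_API_KEY,
});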