add targeted extract evals (#562)

seanmcguire12 · web-flow · commit 8e944b5f466c · 2025-03-05T15:40:25.000-08:00
* add targeted_extract category

* add targeted_extract evals

* add targeted extract evals to CI

* prettier

* log targeted_extract_score

* i need to go to yaml school
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -24,6 +24,7 @@ jobs:
       run-act: ${{ steps.check-labels.outputs.run-act }}
       run-observe: ${{ steps.check-labels.outputs.run-observe }}
       run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
+      run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
     steps:
       - id: check-labels
         run: |
@@ -34,6 +35,7 @@ jobs:
             echo "run-act=true" >> $GITHUB_OUTPUT
             echo "run-observe=true" >> $GITHUB_OUTPUT
             echo "run-text-extract=true" >> $GITHUB_OUTPUT
+            echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
             exit 0
           fi
 
@@ -42,6 +44,7 @@ jobs:
           echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
           echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
           echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> $GITHUB_OUTPUT
+          echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
 
   run-lint:
     runs-on: ubuntu-latest
@@ -518,3 +521,71 @@ jobs:
             echo "Eval summary not found for observe category. Failing CI."
             exit 1
           fi
+
+  run-targeted-extract-evals: # Runs the targeted_extract eval category and enforces an 80% score
+    needs: [run-observe-evals, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Check for 'targeted-extract' label
+        id: label-check
+        run: |
+          if [ "${{ needs.determine-evals.outputs.run-targeted-extract }}" != "true" ]; then
+            echo "has_label=false" >> $GITHUB_OUTPUT
+            echo "No label for TARGETED-EXTRACT. Exiting with success."
+          else
+            echo "has_label=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Set up Node.js
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        run: |
+          rm -rf node_modules
+          rm -f package-lock.json
+          npm install
+
+      - name: Install Playwright browsers
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        run: npm exec playwright install --with-deps
+
+      - name: Build Stagehand
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        run: npm run build
+
+      - name: Run targeted extract Evals
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        run: npm run evals category targeted_extract -- --extract-method=textExtract
+
+      - name: Log targeted extract Evals Performance
+        if: needs.determine-evals.outputs.run-targeted-extract == 'true'
+        run: |
+          if [ -f eval-summary.json ]; then # check first: jq on a missing file exits non-zero, fatal under the default `bash -e`
+            experimentName=$(jq -r '.experimentName' eval-summary.json)
+            echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
+            targeted_extract_score=$(jq '.categories.targeted_extract' eval-summary.json)
+            echo "Targeted extract category score: $targeted_extract_score%"
+            if (( $(echo "$targeted_extract_score < 80" | bc -l) )); then
+              echo "Targeted extract score is below 80%. Failing CI."
+              exit 1
+            fi
+          else
+            echo "Eval summary not found for targeted_extract category. Failing CI."
+            exit 1
+          fi
diff --git a/evals/args.ts b/evals/args.ts
@@ -62,6 +62,7 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
       "extract",
       "experimental",
       "text_extract",
+      "targeted_extract",
     ];
 
 // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name
diff --git a/evals/evals.config.json b/evals/evals.config.json
@@ -254,11 +254,31 @@
     },
     {
       "name": "extract_hamilton_weather",
-      "categories": ["text_extract"]
+      "categories": ["targeted_extract"]
     },
     {
       "name": "extract_regulations_table",
-      "categories": ["text_extract"]
+      "categories": ["targeted_extract"]
+    },
+    {
+      "name": "extract_recipe",
+      "categories": ["targeted_extract"]
+    },
+    {
+      "name": "extract_aigrant_targeted",
+      "categories": ["targeted_extract"]
+    },
+    {
+      "name": "extract_aigrant_targeted_2",
+      "categories": ["targeted_extract"]
+    },
+    {
+      "name": "extract_geniusee",
+      "categories": ["targeted_extract"]
+    },
+    {
+      "name": "extract_geniusee_2",
+      "categories": ["targeted_extract"]
     }
   ]
 }
diff --git a/evals/tasks/extract_aigrant_targeted.ts b/evals/tasks/extract_aigrant_targeted.ts
@@ -0,0 +1,69 @@
+import { z } from "zod";
+import { initStagehand } from "@/evals/initStagehand";
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Eval: targeted extract on aigrant.com.
+ *
+ * Navigates to the page and calls `extract` with an XPath `selector`, then
+ * checks that the returned `company_name` equals "Coframe".
+ *
+ * @returns `_success` plus logs and the session/debug URLs from init.
+ */
+export const extract_aigrant_targeted: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
+  const { stagehand, initResponse } = await initStagehand({
+    modelName,
+    logger,
+    domSettleTimeoutMs: 3000,
+  });
+
+  const { debugUrl, sessionUrl } = initResponse;
+  const expectedCompanyName = "Coframe";
+
+  let companyName: string;
+  try {
+    await stagehand.page.goto("https://aigrant.com/");
+    const company = await stagehand.page.extract({
+      instruction: "Extract the company name.",
+      schema: z.object({
+        company_name: z.string(),
+      }),
+      modelName,
+      useTextExtract,
+      selector: "/html/body/div/ul[5]/li[28]",
+    });
+    companyName = company.company_name;
+  } finally {
+    // Close the session even when goto/extract throws, so it is not leaked.
+    await stagehand.close();
+  }
+
+  if (companyName !== expectedCompanyName) {
+    logger.error({
+      message: "extracted company name does not match expected",
+      level: 0,
+      auxiliary: {
+        expected: { value: expectedCompanyName, type: "string" },
+        actual: { value: companyName, type: "string" },
+      },
+    });
+    return {
+      _success: false,
+      error: "Company name does not match expected",
+      logs: logger.getLogs(),
+      debugUrl,
+      sessionUrl,
+    };
+  }
+
+  return {
+    _success: true,
+    logs: logger.getLogs(),
+    debugUrl,
+    sessionUrl,
+  };
+};
diff --git a/evals/tasks/extract_aigrant_targeted_2.ts b/evals/tasks/extract_aigrant_targeted_2.ts
@@ -0,0 +1,76 @@
+import { z } from "zod";
+import { initStagehand } from "@/evals/initStagehand";
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Eval: targeted extract must not see content outside the selector.
+ *
+ * Asks for the company that comes after 'Coframe' while the selector passed
+ * to `extract` targets a single list item; fails only on "OpusClip".
+ *
+ * @returns `_success` plus logs and the session/debug URLs from init.
+ */
+export const extract_aigrant_targeted_2: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
+  const { stagehand, initResponse } = await initStagehand({
+    modelName,
+    logger,
+    domSettleTimeoutMs: 3000,
+  });
+
+  const { debugUrl, sessionUrl } = initResponse;
+
+  // Name of the company listed after Coframe on the site. The selector does
+  // NOT contain it, so a correct targeted extract gives the LLM no
+  // visibility into it and this name must never come back.
+  const nameWeShouldNotGet = "OpusClip";
+
+  let companyName: string;
+  try {
+    await stagehand.page.goto("https://aigrant.com/");
+    const company = await stagehand.page.extract({
+      instruction:
+        "Extract the name of the company that comes after 'Coframe'.",
+      schema: z.object({
+        company_name: z.string(),
+      }),
+      modelName,
+      useTextExtract,
+      selector: "/html/body/div/ul[5]/li[28]",
+    });
+    companyName = company.company_name;
+  } finally {
+    // Close the session even when goto/extract throws, so it is not leaked.
+    await stagehand.close();
+  }
+
+  if (companyName === nameWeShouldNotGet) {
+    logger.error({
+      message:
+        "extracted company name matches the company name that we SHOULD NOT get",
+      level: 0,
+      auxiliary: {
+        expected: { value: nameWeShouldNotGet, type: "string" },
+        actual: { value: companyName, type: "string" },
+      },
+    });
+    return {
+      _success: false,
+      error:
+        "extracted company name matches the company name that we SHOULD NOT get",
+      logs: logger.getLogs(),
+      debugUrl,
+      sessionUrl,
+    };
+  }
+
+  return {
+    _success: true,
+    logs: logger.getLogs(),
+    debugUrl,
+    sessionUrl,
+  };
+};
diff --git a/evals/tasks/extract_geniusee.ts b/evals/tasks/extract_geniusee.ts
@@ -0,0 +1,71 @@
+import { z } from "zod";
+import { initStagehand } from "@/evals/initStagehand";
+import { EvalFunction } from "@/types/evals";
+
+/**
+ * Eval: targeted extract on a table of the Geniusee blog page.
+ *
+ * Calls `extract` with an XPath `selector` ending in a `table` element and
+ * checks the scalability comment returned for Gemini (Google).
+ *
+ * @returns `_success` plus logs and the session/debug URLs from init.
+ */
+export const extract_geniusee: EvalFunction = async ({
+  modelName,
+  logger,
+  useTextExtract,
+}) => {
+  const { stagehand, initResponse } = await initStagehand({
+    modelName,
+    logger,
+    domSettleTimeoutMs: 3000,
+  });
+
+  const { debugUrl, sessionUrl } = initResponse;
+
+  const expectedComment = "Scalable architecture with API access";
+
+  let scalabilityComment: string;
+  try {
+    await stagehand.page.goto("https://geniusee-blog.surge.sh/single-blog/");
+    const extraction = await stagehand.page.extract({
+      instruction:
+        "Extract the scalability comment in the table for Gemini (Google)",
+      schema: z.object({
+        scalability: z.string(),
+      }),
+      modelName,
+      useTextExtract,
+      selector: "/html/body/main/div[2]/div[2]/div[2]/table",
+    });
+    scalabilityComment = extraction.scalability;
+  } finally {
+    // Close the session even when goto/extract throws, so it is not leaked.
+    await stagehand.close();
+  }
+
+  if (scalabilityComment !== expectedComment) {
+    logger.error({
+      message: "extracted scalability comment does not match expected",
+      level: 0,
+      auxiliary: {
+        expected: { value: expectedComment, type: "string" },
+        actual: { value: scalabilityComment, type: "string" },
+      },
+    });
+    return {
+      _success: false,
+      error: "extracted scalability comment does not match expected",
+      logs: logger.getLogs(),
+      debugUrl,
+      sessionUrl,
+    };
+  }
+
+  return {
+    _success: true,
+    logs: logger.getLogs(),
+    debugUrl,
+    sessionUrl,
+  };
+};
diff --git a/evals/tasks/extract_geniusee_2.ts b/evals/tasks/extract_geniusee_2.ts
diff --git a/evals/tasks/extract_recipe.ts b/evals/tasks/extract_recipe.ts
diff --git a/types/evals.ts b/types/evals.ts