Skip to content

Commit 8e944b5

Browse files
add targeted extract evals (#562)
* add targeted_extract category * add targeted_extract evals * add targeted extract evals to CI * prettier * log targeted_extract_score * i need to go to yaml school
1 parent 65ac3dc commit 8e944b5

File tree

9 files changed

+492
-2
lines changed

9 files changed

+492
-2
lines changed

.github/workflows/ci.yml

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ jobs:
2424
run-act: ${{ steps.check-labels.outputs.run-act }}
2525
run-observe: ${{ steps.check-labels.outputs.run-observe }}
2626
run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
27+
run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }}
2728
steps:
2829
- id: check-labels
2930
run: |
@@ -34,6 +35,7 @@ jobs:
3435
echo "run-act=true" >> $GITHUB_OUTPUT
3536
echo "run-observe=true" >> $GITHUB_OUTPUT
3637
echo "run-text-extract=true" >> $GITHUB_OUTPUT
38+
echo "run-targeted-extract=true" >> $GITHUB_OUTPUT
3739
exit 0
3840
fi
3941
@@ -42,6 +44,7 @@ jobs:
4244
echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
4345
echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
4446
echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> $GITHUB_OUTPUT
47+
echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT
4548
4649
run-lint:
4750
runs-on: ubuntu-latest
@@ -518,3 +521,71 @@ jobs:
518521
echo "Eval summary not found for observe category. Failing CI."
519522
exit 1
520523
fi
524+
525+
# Runs the `targeted_extract` eval category and fails the job when the
# category score reported in eval-summary.json is below 80.
# Every step after the label check is gated on the `run-targeted-extract`
# output of the `determine-evals` job, so the job succeeds as a no-op when
# that output is not "true".
run-targeted-extract-evals:
  needs: [run-observe-evals, determine-evals]
  runs-on: ubuntu-latest
  timeout-minutes: 60
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
    BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
    BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
    HEADLESS: true
    EVAL_ENV: browserbase
  steps:
    - name: Check out repository code
      uses: actions/checkout@v4

    # Records whether the evals were requested; the steps below re-check the
    # determine-evals output directly rather than this step's output.
    - name: Check for 'targeted-extract' label
      id: label-check
      run: |
        if [ "${{ needs.determine-evals.outputs.run-targeted-extract }}" != "true" ]; then
          echo "has_label=false" >> $GITHUB_OUTPUT
          echo "No label for TARGETED-EXTRACT. Exiting with success."
        else
          echo "has_label=true" >> $GITHUB_OUTPUT
        fi

    - name: Set up Node.js
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      uses: actions/setup-node@v4
      with:
        node-version: "20"

    - name: Install dependencies
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      run: |
        rm -rf node_modules
        rm -f package-lock.json
        npm install

    - name: Install Playwright browsers
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      run: npm exec playwright install --with-deps

    - name: Build Stagehand
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      run: npm run build

    - name: Run targeted extract Evals
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      run: npm run evals category targeted_extract -- --extract-method=textExtract

    - name: Log targeted extract Evals Performance
      if: needs.determine-evals.outputs.run-targeted-extract == 'true'
      run: |
        # Check that the summary exists BEFORE reading it with jq. Reading it
        # first makes a missing file abort the script on the jq call, so the
        # explicit "not found" message below would never be printed.
        if [ -f eval-summary.json ]; then
          experimentName=$(jq -r '.experimentName' eval-summary.json)
          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
          targeted_extract_score=$(jq '.categories.targeted_extract' eval-summary.json)
          echo "Targeted extract category score: $targeted_extract_score%"
          # bc -l handles non-integer scores; it prints 1 when the comparison holds.
          if (( $(echo "$targeted_extract_score < 80" | bc -l) )); then
            echo "Targeted extract score is below 80%. Failing CI."
            exit 1
          fi
        else
          echo "Eval summary not found for targeted_extract category. Failing CI."
          exit 1
        fi

evals/args.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
6262
"extract",
6363
"experimental",
6464
"text_extract",
65+
"targeted_extract",
6566
];
6667

6768
// Finally, interpret leftover arguments to see if user typed "category X" or a single eval name

evals/evals.config.json

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,31 @@
254254
},
255255
{
256256
"name": "extract_hamilton_weather",
257-
"categories": ["text_extract"]
257+
"categories": ["targeted_extract"]
258258
},
259259
{
260260
"name": "extract_regulations_table",
261-
"categories": ["text_extract"]
261+
"categories": ["targeted_extract"]
262+
},
263+
{
264+
"name": "extract_recipe",
265+
"categories": ["targeted_extract"]
266+
},
267+
{
268+
"name": "extract_aigrant_targeted",
269+
"categories": ["targeted_extract"]
270+
},
271+
{
272+
"name": "extract_aigrant_targeted_2",
273+
"categories": ["targeted_extract"]
274+
},
275+
{
276+
"name": "extract_geniusee",
277+
"categories": ["targeted_extract"]
278+
},
279+
{
280+
"name": "extract_geniusee_2",
281+
"categories": ["targeted_extract"]
262282
}
263283
]
264284
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import { z } from "zod";
2+
import { initStagehand } from "@/evals/initStagehand";
3+
import { EvalFunction } from "@/types/evals";
4+
5+
/**
 * Eval: targeted extraction of a single company name from aigrant.com.
 *
 * Navigates to the page, runs `extract` restricted to one XPath selector, and
 * succeeds only when the extracted `company_name` is exactly "Coframe".
 *
 * @param modelName - Model forwarded to `initStagehand` and to `extract`.
 * @param logger - Eval logger; its collected logs are returned in the result.
 * @param useTextExtract - Forwarded unchanged to `extract`.
 * @returns An object with `_success`, the collected `logs`, `debugUrl` and
 *   `sessionUrl`; on a mismatch it also carries an `error` message.
 */
export const extract_aigrant_targeted: EvalFunction = async ({
  modelName,
  logger,
  useTextExtract,
}) => {
  const { stagehand, initResponse } = await initStagehand({
    modelName,
    logger,
    domSettleTimeoutMs: 3000,
  });

  const { debugUrl, sessionUrl } = initResponse;

  const expectedCompanyName = "Coframe";

  let companyName: string;
  try {
    await stagehand.page.goto("https://aigrant.com/");
    // XPath of the single list item the extraction is restricted to.
    const selector = "/html/body/div/ul[5]/li[28]";
    const company = await stagehand.page.extract({
      instruction: "Extract the company name.",
      schema: z.object({
        company_name: z.string(),
      }),
      modelName,
      useTextExtract,
      selector,
    });
    companyName = company.company_name;
  } finally {
    // Always release the browser session, even when navigation or
    // extraction throws; the error itself still propagates to the caller.
    await stagehand.close();
  }

  if (companyName !== expectedCompanyName) {
    logger.error({
      message: "extracted company name does not match expected",
      level: 0,
      auxiliary: {
        expected: {
          value: expectedCompanyName,
          type: "string",
        },
        actual: {
          value: companyName,
          type: "string",
        },
      },
    });
    return {
      _success: false,
      error: "Company name does not match expected",
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: true,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
  };
};
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import { z } from "zod";
2+
import { initStagehand } from "@/evals/initStagehand";
3+
import { EvalFunction } from "@/types/evals";
4+
5+
/**
 * Eval: negative check for targeted extraction on aigrant.com.
 *
 * The instruction asks for the company listed after "Coframe", but `extract`
 * is restricted to a selector that does NOT contain that company. If targeted
 * extraction is scoping correctly, the model cannot see the following entry,
 * so the eval FAILS only when the extracted name equals "OpusClip" and passes
 * for any other value.
 *
 * @param modelName - Model forwarded to `initStagehand` and to `extract`.
 * @param logger - Eval logger; its collected logs are returned in the result.
 * @param useTextExtract - Forwarded unchanged to `extract`.
 * @returns An object with `_success`, the collected `logs`, `debugUrl` and
 *   `sessionUrl`; when the forbidden name is extracted it also carries an
 *   `error` message.
 */
export const extract_aigrant_targeted_2: EvalFunction = async ({
  modelName,
  logger,
  useTextExtract,
}) => {
  const { stagehand, initResponse } = await initStagehand({
    modelName,
    logger,
    domSettleTimeoutMs: 3000,
  });

  const { debugUrl, sessionUrl } = initResponse;

  // Name of the company that comes after Coframe on the website. It lies
  // outside the selector passed to extract, so seeing it in the result means
  // the extraction was not limited to the targeted element.
  const nameWeShouldNotGet = "OpusClip";

  let companyName: string;
  try {
    await stagehand.page.goto("https://aigrant.com/");
    // XPath of the single list item the extraction is restricted to.
    const selector = "/html/body/div/ul[5]/li[28]";
    const company = await stagehand.page.extract({
      instruction: "Extract the name of the company that comes after 'Coframe'.",
      schema: z.object({
        company_name: z.string(),
      }),
      modelName,
      useTextExtract,
      selector,
    });
    companyName = company.company_name;
  } finally {
    // Always release the browser session, even when navigation or
    // extraction throws; the error itself still propagates to the caller.
    await stagehand.close();
  }

  if (companyName === nameWeShouldNotGet) {
    logger.error({
      message:
        "extracted company name matches the company name that we SHOULD NOT get",
      level: 0,
      auxiliary: {
        // NOTE: in this eval "expected" holds the forbidden value, not a
        // desired one; the key is kept as-is for log compatibility.
        expected: {
          value: nameWeShouldNotGet,
          type: "string",
        },
        actual: {
          value: companyName,
          type: "string",
        },
      },
    });
    return {
      _success: false,
      error:
        "extracted company name matches the company name that we SHOULD NOT get",
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: true,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
  };
};

evals/tasks/extract_geniusee.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import { z } from "zod";
2+
import { initStagehand } from "@/evals/initStagehand";
3+
import { EvalFunction } from "@/types/evals";
4+
5+
/**
 * Eval: targeted extraction of one table cell from the Geniusee blog page.
 *
 * Navigates to the page, runs `extract` restricted to the table's XPath
 * selector, and succeeds only when the extracted scalability comment for
 * "Gemini (Google)" is exactly "Scalable architecture with API access".
 *
 * @param modelName - Model forwarded to `initStagehand` and to `extract`.
 * @param logger - Eval logger; its collected logs are returned in the result.
 * @param useTextExtract - Forwarded unchanged to `extract`.
 * @returns An object with `_success`, the collected `logs`, `debugUrl` and
 *   `sessionUrl`; on a mismatch it also carries an `error` message.
 */
export const extract_geniusee: EvalFunction = async ({
  modelName,
  logger,
  useTextExtract,
}) => {
  const { stagehand, initResponse } = await initStagehand({
    modelName,
    logger,
    domSettleTimeoutMs: 3000,
  });

  const { debugUrl, sessionUrl } = initResponse;

  const expectedScalabilityComment = "Scalable architecture with API access";

  let scalabilityComment: string;
  try {
    await stagehand.page.goto("https://geniusee-blog.surge.sh/single-blog/");
    // XPath of the table the extraction is restricted to.
    const selector = "/html/body/main/div[2]/div[2]/div[2]/table";
    const extracted = await stagehand.page.extract({
      instruction:
        "Extract the scalability comment in the table for Gemini (Google)",
      schema: z.object({
        scalability: z.string(),
      }),
      modelName,
      useTextExtract,
      selector,
    });
    scalabilityComment = extracted.scalability;
  } finally {
    // Always release the browser session, even when navigation or
    // extraction throws; the error itself still propagates to the caller.
    await stagehand.close();
  }

  if (scalabilityComment !== expectedScalabilityComment) {
    logger.error({
      message: "extracted scalability comment does not match expected",
      level: 0,
      auxiliary: {
        expected: {
          value: expectedScalabilityComment,
          type: "string",
        },
        actual: {
          value: scalabilityComment,
          type: "string",
        },
      },
    });
    return {
      _success: false,
      error: "extracted scalability comment does not match expected",
      logs: logger.getLogs(),
      debugUrl,
      sessionUrl,
    };
  }

  return {
    _success: true,
    logs: logger.getLogs(),
    debugUrl,
    sessionUrl,
  };
};

0 commit comments

Comments
 (0)