Skip to content

test removing verifyActCompletion #530

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion evals/tasks/simple_google_search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ export const simple_google_search: EvalFunction = async ({
await stagehand.page.goto("https://www.google.com");

await stagehand.page.act({
action: 'Search for "OpenAI"',
action: 'Type "OpenAI" into the search bar',
});

await stagehand.page.act({
action: "Click the search button",
});

const expectedUrl = "https://www.google.com/search?q=OpenAI";
Expand Down
1 change: 1 addition & 0 deletions examples/example.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ async function example() {
});
await stagehand.init();
await stagehand.page.goto("https://docs.stagehand.dev");
await stagehand.page.act("click on the quickstart");
}

(async () => {
Expand Down
186 changes: 15 additions & 171 deletions lib/handlers/actHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import {
PlaywrightCommandMethodNotSupportedException,
} from "../../types/playwright";
import { ActionCache } from "../cache/ActionCache";
import { act, fillInVariables, verifyActCompletion } from "../inference";
import { act, fillInVariables } from "../inference";
import { LLMClient } from "../llm/LLMClient";
import { LLMProvider } from "../llm/LLMProvider";
import { StagehandContext } from "../StagehandContext";
Expand Down Expand Up @@ -164,90 +164,6 @@ export class StagehandActHandler {
return id;
}

/**
 * Double-checks, with a text-only LLM pass over the current DOM, that an
 * action reported as finished really is finished.
 *
 * @param completed - Completion flag reported for the step. When false the
 *   method returns false immediately, without touching the page or the LLM.
 * @param requestId - Forwarded to the verification call.
 * @param action - The goal being verified; also attached to both log lines.
 * @param steps - Steps taken so far, forwarded to the verifier.
 * @param llmClient - Client used for verification, unless its model name
 *   starts with "o1" or "o3", in which case a "gpt-4o" client is used instead.
 * @param domSettleTimeoutMs - Optional timeout handed to `_waitForSettledDom`.
 * @returns The verdict from `verifyActCompletion`, or false when `completed`
 *   is false.
 *
 * Nothing is caught here: a rejection from the DOM wait, the page evaluation
 * or the verification call propagates to the caller.
 */
private async _verifyActionCompletion({
  completed,
  requestId,
  action,
  steps,
  llmClient,
  domSettleTimeoutMs,
}: {
  completed: boolean;
  requestId: string;
  action: string;
  steps: string;
  llmClient: LLMClient;
  domSettleTimeoutMs?: number;
}): Promise<boolean> {
  // Nothing to verify unless the step claimed completion. Past this guard
  // `completed` is always true, so no second check is needed below.
  if (!completed) {
    return false;
  }

  await this.stagehandPage._waitForSettledDom(domSettleTimeoutMs);

  // o1/o3 are overkill for this task and it uses a lot of tokens, so we
  // switch to 4o for the verification call only.
  const isReasoningModel =
    llmClient.modelName.startsWith("o1") ||
    llmClient.modelName.startsWith("o3");
  const verifierClient: LLMClient = isReasoningModel
    ? this.llmProvider.getClient("gpt-4o", llmClient.clientOptions)
    : llmClient;

  const { outputString: domElements } =
    await this.stagehandPage.page.evaluate(() => {
      return window.processAllOfDom();
    });

  this.logger({
    category: "action",
    message: "action marked as completed, verifying if this is true...",
    level: 1,
    auxiliary: {
      action: {
        value: action,
        type: "string",
      },
    },
  });

  // Always use text-based DOM verification (no vision).
  const actionCompleted = await verifyActCompletion({
    goal: action,
    steps,
    llmProvider: this.llmProvider,
    llmClient: verifierClient,
    domElements,
    logger: this.logger,
    requestId,
  });

  this.logger({
    category: "action",
    message: "action completion verification result",
    level: 1,
    auxiliary: {
      action: {
        value: action,
        type: "string",
      },
      result: {
        value: actionCompleted.toString(),
        type: "boolean",
      },
    },
  });

  return actionCompleted;
}

private async _performPlaywrightMethod(
method: string,
args: unknown[],
Expand Down Expand Up @@ -992,35 +908,11 @@ export class StagehandActHandler {
);

if (cachedStep.completed) {
// Verify the action was completed successfully
const actionCompleted = await this._verifyActionCompletion({
completed: true,
llmClient,
steps,
requestId,
return {
success: true,
message: "action completed successfully using cached step",
action,
domSettleTimeoutMs,
});

this.logger({
category: "action",
message: "action completion verification result from cache",
level: 1,
auxiliary: {
actionCompleted: {
value: actionCompleted.toString(),
type: "boolean",
},
},
});

if (actionCompleted) {
return {
success: true,
message: "action completed successfully using cached step",
action,
};
}
};
}

return this.act({
Expand Down Expand Up @@ -1393,65 +1285,17 @@ export class StagehandActHandler {
}
}

const actionCompleted = await this._verifyActionCompletion({
completed: response.completed,
requestId,
action,
steps,
llmClient,
domSettleTimeoutMs,
}).catch((error) => {
this.logger({
category: "action",
message:
"error verifying action completion. Assuming action completed.",
level: 1,
auxiliary: {
error: {
value: error.message,
type: "string",
},
trace: {
value: error.stack,
type: "string",
},
},
});

return true;
this.logger({
category: "action",
message: "action completed successfully",
level: 1,
});

if (!actionCompleted) {
this.logger({
category: "action",
message: "continuing to next action step",
level: 1,
});

return this.act({
action,
steps,
llmClient,
chunksSeen,
requestId,
variables,
previousSelectors: [...previousSelectors, foundXpath],
skipActionCacheForThisStep: false,
domSettleTimeoutMs,
});
} else {
this.logger({
category: "action",
message: "action completed successfully",
level: 1,
});
await this._recordAction(action, response.step);
return {
success: true,
message: `Action completed successfully: ${steps}${response.step}`,
action: action,
};
}
await this._recordAction(action, response.step);
return {
success: true,
message: `Action completed successfully: ${steps}${response.step}`,
action: action,
};
} catch (error) {
this.logger({
category: "action",
Expand Down
55 changes: 0 additions & 55 deletions lib/inference.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { z } from "zod";
import { ActCommandParams, ActCommandResult } from "../types/act";
import { VerifyActCompletionParams } from "../types/inference";
import { LogLine } from "../types/log";
import { ChatMessage, LLMClient } from "./llm/LLMClient";
import {
Expand All @@ -15,62 +14,8 @@ import {
buildObserveUserMessage,
buildRefineSystemPrompt,
buildRefineUserPrompt,
buildVerifyActCompletionSystemPrompt,
buildVerifyActCompletionUserPrompt,
} from "./prompt";

/**
 * Asks the LLM whether `goal` has been accomplished, given the steps taken so
 * far and (optionally) a text dump of the page's DOM elements.
 *
 * The model is asked for a structured `{ completed: boolean }` answer.
 *
 * @returns The model's `completed` value. Returns false, after logging under
 *   the "VerifyAct" category, when the response is not an object or has no
 *   `completed` field.
 */
export async function verifyActCompletion({
  goal,
  steps,
  llmClient,
  domElements,
  logger,
  requestId,
}: VerifyActCompletionParams): Promise<boolean> {
  const verificationSchema = z.object({
    completed: z.boolean().describe("true if the goal is accomplished"),
  });

  type VerificationResponse = z.infer<typeof verificationSchema>;

  // System prompt first, then the user prompt carrying goal / steps / DOM.
  const messages = [
    buildVerifyActCompletionSystemPrompt(),
    buildVerifyActCompletionUserPrompt(goal, steps, domElements),
  ];

  const verdict = await llmClient.createChatCompletion<VerificationResponse>({
    options: {
      messages,
      temperature: 0.1,
      top_p: 1,
      frequency_penalty: 0,
      presence_penalty: 0,
      response_model: {
        name: "Verification",
        schema: verificationSchema,
      },
      requestId,
    },
    logger,
  });

  // Anything that is not a non-null object counts as "not completed".
  if (!verdict || typeof verdict !== "object") {
    logger({
      category: "VerifyAct",
      message: "Unexpected response format: " + JSON.stringify(verdict),
    });
    return false;
  }

  // An object without the expected field also counts as "not completed".
  if (typeof verdict.completed === "undefined") {
    logger({
      category: "VerifyAct",
      message: "Missing 'completed' field in response",
    });
    return false;
  }

  return verdict.completed;
}

export function fillInVariables(
text: string,
variables: Record<string, string>,
Expand Down
71 changes: 0 additions & 71 deletions lib/prompt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,77 +25,6 @@ Note 2: Sometimes what your are looking for is hidden behind and element you nee
Again, if the user's goal will be accomplished after running the playwright action, set completed to true. Also, if the user provides custom instructions, it is imperative that you follow them no matter what.
`;

// System prompt for the act-completion verifier: tells the model how to decide
// whether the user's goal has been reached.
// NOTE(review): the "# Output" section says to answer false on any
// uncertainty, while "# Important Considerations" says to answer true unless
// there is evidence of an error. The wording is left as-is; confirm which bias
// is intended.
const verifyActCompletionSystemPrompt = `
You are a browser automation assistant. The user has given you a goal and a list of steps that have been taken so far. Your job is to determine if the user's goal has been completed based on the provided information.

# Input
You will receive:
1. The user's goal: A clear description of what the user wants to achieve.
2. Steps taken so far: A list of actions that have been performed up to this point.
3. Active DOM elements on the current page (when available): A text representation of the elements currently on the page.

# Your Task
Analyze the provided information to determine if the user's goal has been fully completed.

# Output
Return a boolean value:
- true: If the goal has been definitively completed based on the steps taken and the current page.
- false: If the goal has not been completed or if there's any uncertainty about its completion.

# Important Considerations
- False positives are okay. False negatives are not okay.
- Look for evidence of errors on the page or something having gone wrong in completing the goal. If one does not exist, return true.
`;

// ## Examples for completion check
// ### Example 1
// 1. User's goal: "input data scientist into role"
// 2. Steps you've taken so far: "The role input field was filled with 'data scientist'."
// 3. Active DOM elements: ["<input id="c9" class="VfPpkd-fmcmS-wGMbrd " aria-expanded="false" data-axe="mdc-autocomplete">data scientist</input>", "<button class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-INsAgc lJ9FBc nDgy9d" type="submit">Search</button>"]

// Output: Will need to have completed set to true. Nothing else matters.
// Reasoning: The goal the user set has already been accomplished. We should not take any extra actions outside of the scope of the goal (for example, clicking on the search button is an invalid action - ie: not acceptable).

// ### Example 2
// 1. User's goal: "Sign up for the newsletter"
// 2. Steps you've taken so far: ["The email input field was filled with 'test@example.com'."]
// 3. Active DOM elements: ["<input type='email' id='newsletter-email' placeholder='Enter your email'></input>", "<button id='subscribe-button'>Subscribe</button>"]

// Output: The action must be a click on the subscribe button, with completed set to false.
// Reasoning: There might be an error when trying to submit the form and you need to make sure the goal is accomplished properly. So you set completed to false.

/**
 * Wraps the verifier's system prompt text in a chat message.
 *
 * @returns A message with role "system" whose content is
 *   `verifyActCompletionSystemPrompt`.
 */
export function buildVerifyActCompletionSystemPrompt(): ChatMessage {
  const systemMessage: ChatMessage = {
    role: "system",
    content: verifyActCompletionSystemPrompt,
  };
  return systemMessage;
}

/**
 * Builds the user message for the act-completion verifier.
 *
 * @param goal - The goal being verified, placed under "# My Goal".
 * @param steps - Steps taken so far; defaults to "None" when `undefined`.
 * @param domElements - Text describing the page's DOM elements. The
 *   "# Active DOM Elements on the current page" section is appended only when
 *   this is a non-empty string.
 * @returns A message with role "user" containing the assembled sections.
 */
export function buildVerifyActCompletionUserPrompt(
  goal: string,
  steps = "None",
  domElements: string | undefined,
): ChatMessage {
  // Collect the sections in order and join them once at the end.
  const sections: string[] = [
    `
# My Goal
${goal}

# Steps You've Taken So Far
${steps}
`,
  ];

  if (domElements) {
    sections.push(`
# Active DOM Elements on the current page
${domElements}
`);
  }

  return {
    role: "user",
    content: sections.join(""),
  };
}

export function buildUserInstructionsString(
userProvidedInstructions?: string,
): string {
Expand Down
12 changes: 0 additions & 12 deletions types/inference.ts

This file was deleted.