From aebb5837793cd821c47772880c7e73f5d3426541 Mon Sep 17 00:00:00 2001 From: Aditya Bharat Soni Date: Thu, 23 Jan 2025 15:18:30 -0500 Subject: [PATCH] Support for VisualWebArena evaluation in OpenHands (#4773) Co-authored-by: Xingyao Wang Co-authored-by: openhands Co-authored-by: Graham Neubig --- .github/workflows/integration-runner.yml | 36 ++- .../benchmarks/visualwebarena/README.md | 50 +++ .../benchmarks/visualwebarena/__init__.py | 0 .../visualwebarena/get_success_rate.py | 40 +++ .../benchmarks/visualwebarena/run_infer.py | 254 +++++++++++++++ .../visualwebarena/scripts/run_infer.sh | 48 +++ evaluation/integration_tests/run_infer.py | 1 + openhands/agenthub/__init__.py | 2 + .../agenthub/visualbrowsing_agent/README.md | 7 + .../agenthub/visualbrowsing_agent/__init__.py | 6 + .../visualbrowsing_agent.py | 306 ++++++++++++++++++ openhands/events/observation/browse.py | 2 + openhands/runtime/browser/browser_env.py | 56 +++- openhands/runtime/browser/utils.py | 4 + poetry.lock | 268 ++++++++++++++- pyproject.toml | 2 + 16 files changed, 1063 insertions(+), 19 deletions(-) create mode 100644 evaluation/benchmarks/visualwebarena/README.md create mode 100644 evaluation/benchmarks/visualwebarena/__init__.py create mode 100644 evaluation/benchmarks/visualwebarena/get_success_rate.py create mode 100644 evaluation/benchmarks/visualwebarena/run_infer.py create mode 100755 evaluation/benchmarks/visualwebarena/scripts/run_infer.sh create mode 100644 openhands/agenthub/visualbrowsing_agent/README.md create mode 100644 openhands/agenthub/visualbrowsing_agent/__init__.py create mode 100644 openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index 9af41bbdecdb..00d7c45957ef 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -160,7 +160,6 @@ jobs: echo "api_key = \"$LLM_API_KEY\"" >> config.toml echo "base_url = \"$LLM_BASE_URL\"" >> config.toml echo "temperature = 0.0" >> config.toml - - name: Run integration test evaluation for DelegatorAgent (DeepSeek) env: SANDBOX_FORCE_REBUILD_RUNTIME: True @@ -174,12 +173,42 @@ jobs: cat $REPORT_FILE_DELEGATOR_DEEPSEEK >> $GITHUB_ENV echo >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV + # ------------------------------------------------------------- + # Run VisualBrowsingAgent tests for DeepSeek, limited to t05 and t06 + - name: Wait a little bit (again) + run: sleep 5 + + - name: Configure config.toml for testing VisualBrowsingAgent (DeepSeek) + env: + LLM_MODEL: "litellm_proxy/deepseek-chat" + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + MAX_ITERATIONS: 15 + run: | + echo "[llm.eval]" > config.toml + echo "model = \"$LLM_MODEL\"" >> config.toml + echo "api_key = \"$LLM_API_KEY\"" >> config.toml + echo "base_url = \"$LLM_BASE_URL\"" >> config.toml + echo "temperature = 0.0" >> config.toml + - name: Run integration test evaluation for VisualBrowsingAgent (DeepSeek) + env: + SANDBOX_FORCE_REBUILD_RUNTIME: True + run: | + poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD VisualBrowsingAgent '' 15 $N_PROCESSES "t05_simple_browsing,t06_github_pr_browsing.py" 'visualbrowsing_deepseek_run' + + # Find and export the visual browsing agent test results + REPORT_FILE_VISUALBROWSING_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/VisualBrowsingAgent/deepseek*_maxiter_15_N* -name "report.md" -type f | head -n 1) + echo 
"REPORT_FILE_VISUALBROWSING_DEEPSEEK: $REPORT_FILE_VISUALBROWSING_DEEPSEEK" + echo "INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK<> $GITHUB_ENV + cat $REPORT_FILE_VISUALBROWSING_DEEPSEEK >> $GITHUB_ENV + echo >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV - name: Create archive of evaluation outputs run: | TIMESTAMP=$(date +'%y-%m-%d-%H-%M') cd evaluation/evaluation_outputs/outputs # Change to the outputs directory - tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* # Only include the actual result directories + tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* integration_tests/DelegatorAgent/* integration_tests/VisualBrowsingAgent/* # Only include the actual result directories - name: Upload evaluation results as artifact uses: actions/upload-artifact@v4 @@ -227,4 +256,7 @@ jobs: **Integration Tests Report Delegator (DeepSeek)** ${{ env.INTEGRATION_TEST_REPORT_DELEGATOR_DEEPSEEK }} --- + **Integration Tests Report VisualBrowsing (DeepSeek)** + ${{ env.INTEGRATION_TEST_REPORT_VISUALBROWSING_DEEPSEEK }} + --- Download testing outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }}) diff --git a/evaluation/benchmarks/visualwebarena/README.md b/evaluation/benchmarks/visualwebarena/README.md new file mode 100644 index 000000000000..340550493d7c --- /dev/null +++ b/evaluation/benchmarks/visualwebarena/README.md @@ -0,0 +1,50 @@ +# VisualWebArena Evaluation with OpenHands Browsing Agents + +This folder contains evaluation for [VisualWebArena](https://github.com/web-arena-x/visualwebarena) benchmark, powered by [BrowserGym](https://github.com/ServiceNow/BrowserGym) for easy evaluation of how well an agent capable of browsing can perform on realistic web browsing tasks. + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM. + +## Setup VisualWebArena Environment + +VisualWebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents. +Follow [this document](https://github.com/web-arena-x/visualwebarena/blob/main/environment_docker/README.md) to set up your own VisualWebArena environment through local servers or AWS EC2 instances. +Take note of the base URL (`$VISUALWEBARENA_BASE_URL`) of the machine where the environment is installed. + +## Test if your environment works + +Access with browser the above VisualWebArena website URLs and see if they load correctly. +If you cannot access the website, make sure the firewall allows public access of the aforementioned ports on your server +Check the network security policy if you are using an AWS machine. +Follow the VisualWebArena environment setup guide carefully, and make sure the URL fields are populated with the correct base URL of your server. 

## Run Evaluation

```bash
export VISUALWEBARENA_BASE_URL=
export OPENAI_API_KEY="yourkey" # this OpenAI API key is required for some VisualWebArena validators that utilize LLMs
export OPENAI_BASE_URL="https://api.openai.com/v1/" # base URL for the OpenAI model used for VisualWebArena evaluation
bash evaluation/benchmarks/visualwebarena/scripts/run_infer.sh llm.claude HEAD VisualBrowsingAgent
```

Results will be in `evaluation/evaluation_outputs/outputs/visualwebarena/`.

To calculate the success rate, run:

```sh
poetry run python evaluation/benchmarks/visualwebarena/get_success_rate.py evaluation/evaluation_outputs/outputs/visualwebarena/SOME_AGENT/EXP_NAME/output.jsonl
```

## Submit your evaluation results

You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

## VisualBrowsingAgent V1.0 result

Tested on VisualBrowsingAgent V1.0.

VisualWebArena, 910 tasks (high cost; a single run, since the tasks are fixed), max 15 steps. Resolve rates are:

- GPT-4o: 26.15%
- Claude-3.5 Sonnet: 25.27%
diff --git a/evaluation/benchmarks/visualwebarena/__init__.py b/evaluation/benchmarks/visualwebarena/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/evaluation/benchmarks/visualwebarena/get_success_rate.py b/evaluation/benchmarks/visualwebarena/get_success_rate.py
new file mode 100644
index 000000000000..7b8d2542d008
--- /dev/null
+++ b/evaluation/benchmarks/visualwebarena/get_success_rate.py
@@ -0,0 +1,40 @@
import argparse
import json

import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
import gymnasium as gym

parser = argparse.ArgumentParser(description='Calculate average reward.')
parser.add_argument('output_path', type=str, help='path to output.jsonl')

args = parser.parse_args()

if __name__ == '__main__':
    env_ids = [
        id
        for id in gym.envs.registry.keys()
        if id.startswith('browsergym/visualwebarena')
    ]
    total_num = len(env_ids)
    print('Total number of tasks: ', total_num)
    total_reward = 0
    total_cost = 0
    actual_num = 0
    with open(args.output_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            actual_num += 1
            total_cost += data['metrics']['accumulated_cost']
            reward = data['test_result']['reward']
            if reward >= 0:
                total_reward += data['test_result']['reward']
            else:
                actual_num -= 1

    avg_reward = total_reward / total_num
    print('Total reward: ', total_reward)
    print('Success Rate: ', avg_reward)

    avg_cost = total_cost / actual_num
    print('Avg Cost: ', avg_cost)
    print('Total Cost: ', total_cost)
    print('Actual number of tasks finished: ', actual_num)
diff --git a/evaluation/benchmarks/visualwebarena/run_infer.py b/evaluation/benchmarks/visualwebarena/run_infer.py
new file mode 100644
index 000000000000..5010daa42ef2
--- /dev/null
+++ b/evaluation/benchmarks/visualwebarena/run_infer.py
@@ -0,0 +1,254 @@
import asyncio
import json
import os
from typing import Any

import browsergym.visualwebarena  # noqa F401 register visualwebarena tasks as gym environments
import gymnasium as gym
import pandas as pd

from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    compatibility_for_eval_history_pairs,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
update_llm_config_for_completions_logging, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + parse_arguments, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import ( + BrowseInteractiveAction, + CmdRunAction, + MessageAction, +) +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.runtime.browser.browser_env import ( + BROWSER_EVAL_GET_GOAL_ACTION, + BROWSER_EVAL_GET_REWARDS_ACTION, +) +from openhands.utils.async_utils import call_async_from_sync + +SUPPORTED_AGENT_CLS = {'VisualBrowsingAgent'} +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'VisualBrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task', +} + + +def get_config( + metadata: EvalMetadata, + env_id: str, +) -> AppConfig: + base_url = os.environ.get('VISUALWEBARENA_BASE_URL', None) + openai_api_key = os.environ.get('OPENAI_API_KEY', None) + openai_base_url = os.environ.get('OPENAI_BASE_URL', None) + assert base_url is not None, 'VISUALWEBARENA_BASE_URL must be set' + assert openai_api_key is not None, 'OPENAI_API_KEY must be set' + assert openai_base_url is not None, 'OPENAI_BASE_URL must be set' + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime='docker', + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='python:3.12-bookworm', + enable_auto_lint=True, + use_host_network=False, + browsergym_eval_env=env_id, + runtime_startup_env_vars={ + 'BASE_URL': base_url, + 'OPENAI_API_KEY': openai_api_key, + 'OPENAI_BASE_URL': openai_base_url, + 'VWA_CLASSIFIEDS': f'{base_url}:9980', + 'VWA_CLASSIFIEDS_RESET_TOKEN': '4b61655535e7ed388f0d40a93600254c', + 'VWA_SHOPPING': f'{base_url}:7770', + 'VWA_SHOPPING_ADMIN': f'{base_url}:7780/admin', + 'VWA_REDDIT': f'{base_url}:9999', + 'VWA_GITLAB': f'{base_url}:8023', + 'VWA_WIKIPEDIA': f'{base_url}:8888', + 'VWA_HOMEPAGE': f'{base_url}:4399', + }, + timeout=300, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + attach_to_existing=True, + ) + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, + metadata.eval_output_dir, + env_id, + ) + ) + return config + + +def initialize_runtime( + runtime: Runtime, +) -> tuple[str, list]: + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}") + obs: CmdOutputObservation + + # Set instance id + action = CmdRunAction(command='mkdir -p /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + goal = obs.content + goal_image_urls = [] + if hasattr(obs, 'goal_image_urls'): + goal_image_urls = obs.goal_image_urls + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") + return goal, goal_image_urls + + +def complete_runtime( + runtime: Runtime, +) -> dict[str, Any]: + """Complete the runtime for the agent. 

    This function is called after the agent has run.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
    obs: CmdOutputObservation

    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return {
        'rewards': json.loads(obs.content),
    }


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    env_id = instance.instance_id

    config = get_config(metadata, env_id)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, env_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    task_str, goal_image_urls = initialize_runtime(runtime)
    initial_user_action = MessageAction(content=task_str, image_urls=goal_image_urls)
    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=initial_user_action,
            runtime=runtime,
        )
    )
    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None

    # Instruction obtained from the first message from the USER
    instruction = ''
    for event in state.history:
        if isinstance(event, MessageAction):
            instruction = event.content
            break

    try:
        return_val = complete_runtime(runtime)
        logger.info(f'Return value from complete_runtime: {return_val}')
        reward = max(return_val['rewards'])
    except Exception:
        reward = -1.0  # kept -1 to identify instances for which evaluation failed.
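    # A non-negative reward comes from the BrowserGym evaluator (retrieved via
    # BROWSER_EVAL_GET_REWARDS_ACTION above); get_success_rate.py excludes negative
    # rewards from the success-rate numerator and from the average-cost denominator.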
+ + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = compatibility_for_eval_history_pairs(state.history) + + # Save the output + output = EvalOutput( + instance_id=env_id, + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result={ + 'reward': reward, + }, + ) + runtime.close() + return output + + +if __name__ == '__main__': + args = parse_arguments() + + dataset = pd.DataFrame( + { + 'instance_id': [ + id + for id in gym.envs.registry.keys() + if id.startswith('browsergym/visualwebarena') + ] + } + ) + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + metadata = make_metadata( + llm_config, + 'visualwebarena', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + instances = prepare_dataset(dataset, output_file, args.eval_n_limit) + + run_evaluation( + instances, + metadata, + output_file, + args.eval_num_workers, + process_instance, + ) diff --git a/evaluation/benchmarks/visualwebarena/scripts/run_infer.sh b/evaluation/benchmarks/visualwebarena/scripts/run_infer.sh new file mode 100755 index 000000000000..9c1f6dc3e8d6 --- /dev/null +++ b/evaluation/benchmarks/visualwebarena/scripts/run_infer.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + +# configure browsing agent +export USE_NAV="true" +export USE_CONCISE_ANSWER="true" + +MODEL_CONFIG=$1 +COMMIT_HASH=$2 +AGENT=$3 +EVAL_LIMIT=$4 +NUM_WORKERS=$5 + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default VisualBrowsingAgent" + AGENT="VisualBrowsingAgent" +fi + +get_openhands_version + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $OPENHANDS_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +EVAL_NOTE="${OPENHANDS_VERSION}" + +COMMAND="poetry run python evaluation/benchmarks/visualwebarena/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --max-iterations 15 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $EVAL_NOTE" + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/evaluation/integration_tests/run_infer.py b/evaluation/integration_tests/run_infer.py index b7018d0b04d1..f240d2e2333d 100644 --- a/evaluation/integration_tests/run_infer.py +++ b/evaluation/integration_tests/run_infer.py @@ -35,6 +35,7 @@ FAKE_RESPONSES = { 'CodeActAgent': fake_user_response, 'DelegatorAgent': fake_user_response, + 'VisualBrowsingAgent': fake_user_response, } diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py index 892c0d682d2e..bad9f81a1c69 100644 --- a/openhands/agenthub/__init__.py +++ b/openhands/agenthub/__init__.py @@ -12,6 +12,7 @@ codeact_agent, delegator_agent, dummy_agent, + visualbrowsing_agent, ) __all__ = [ @@ -19,6 +20,7 @@ 'delegator_agent', 'dummy_agent', 'browsing_agent', + 'visualbrowsing_agent', ] for agent in all_microagents.values(): 
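
For context on how the registration above is consumed (this example is not part of the patch): importing `openhands.agenthub` triggers the `Agent.register(...)` call in the new package's `__init__.py` below, after which the agent can be looked up by name. A minimal sketch, assuming `Agent.get_cls` keeps its usual OpenHands signature:

```python
import openhands.agenthub  # noqa: F401  # import side effect: registers all bundled agents
from openhands.controller.agent import Agent

# The name used in configs ('VisualBrowsingAgent') now resolves to the class.
agent_cls = Agent.get_cls('VisualBrowsingAgent')
print(agent_cls.VERSION)  # '1.0' per the class definition in this patch
```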
diff --git a/openhands/agenthub/visualbrowsing_agent/README.md b/openhands/agenthub/visualbrowsing_agent/README.md
new file mode 100644
index 000000000000..60ce2c6d098b
--- /dev/null
+++ b/openhands/agenthub/visualbrowsing_agent/README.md
@@ -0,0 +1,7 @@
# VisualBrowsing Agent Framework

This folder implements the AgentLab [generic agent](https://github.com/ServiceNow/AgentLab/tree/main/src/agentlab/agents/generic_agent), which enables full-featured web browsing. The observations given to the agent include a set-of-marks annotated webpage screenshot, the accessibility tree of the webpage, and all thoughts and actions from previous steps.

## Test run

Note that for browsing tasks, GPT-4/Claude is usually a requirement to get reasonable results, due to the complexity of the web page structures. This agent has been evaluated on the VisualWebArena benchmark; the CodeAct agent does not call this VisualBrowsingAgent, since it has its own built-in support for browsing (e.g., via the browse_url and browser tools).
diff --git a/openhands/agenthub/visualbrowsing_agent/__init__.py b/openhands/agenthub/visualbrowsing_agent/__init__.py
new file mode 100644
index 000000000000..834e81a64316
--- /dev/null
+++ b/openhands/agenthub/visualbrowsing_agent/__init__.py
@@ -0,0 +1,6 @@
from openhands.agenthub.visualbrowsing_agent.visualbrowsing_agent import (
    VisualBrowsingAgent,
)
from openhands.controller.agent import Agent

Agent.register('VisualBrowsingAgent', VisualBrowsingAgent)
diff --git a/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
new file mode 100644
index 000000000000..76bc8ba42726
--- /dev/null
+++ b/openhands/agenthub/visualbrowsing_agent/visualbrowsing_agent.py
@@ -0,0 +1,306 @@
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.utils.obs import flatten_axtree_to_str

from openhands.agenthub.browsing_agent.response_parser import BrowsingResponseParser
from openhands.controller.agent import Agent
from openhands.controller.state.state import State
from openhands.core.config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.message import ImageContent, Message, TextContent
from openhands.events.action import (
    Action,
    AgentFinishAction,
    BrowseInteractiveAction,
    MessageAction,
)
from openhands.events.event import EventSource
from openhands.events.observation import BrowserOutputObservation
from openhands.events.observation.observation import Observation
from openhands.llm.llm import LLM
from openhands.runtime.plugins import (
    PluginRequirement,
)


def get_error_prefix(obs: BrowserOutputObservation) -> str:
    # temporary fix for OneStopMarket to ignore timeout errors
    if 'timeout' in obs.last_browser_action_error:
        return ''
    return f'## Error from previous action:\n{obs.last_browser_action_error}\n'


def create_goal_prompt(goal: str, image_urls: list[str] | None):
    goal_txt: str = f"""\
# Instructions
Review the current state of the page and all other information to find the best possible next action to accomplish your goal. Your answer will be interpreted and executed by a program; make sure to follow the formatting instructions.

## Goal:
{goal}
"""
    goal_image_urls = []
    if image_urls is not None:
        for idx, url in enumerate(image_urls):
            goal_txt = goal_txt + f'Images: Goal input image ({idx+1})\n'
            goal_image_urls.append(url)
    goal_txt += '\n'
    return goal_txt, goal_image_urls


def create_observation_prompt(
    axtree_txt: str,
    tabs: str,
    focused_element: str,
    error_prefix: str,
    som_screenshot: str | None,
):
    txt_observation = f"""
# Observation of current step:
{tabs}{axtree_txt}{focused_element}{error_prefix}
"""

    # screenshot + som: will be a non-empty string if present in observation
    screenshot_url = None
    if (som_screenshot is not None) and (len(som_screenshot) > 0):
        txt_observation += 'Image: Current page screenshot (Note that only the visible portion of the webpage is present in the screenshot. You may need to scroll to view the remaining portion of the webpage.)\n'
        screenshot_url = som_screenshot
    else:
        logger.info('SOM Screenshot not present in observation!')
    txt_observation += '\n'
    return txt_observation, screenshot_url


def get_tabs(obs: BrowserOutputObservation) -> str:
    prompt_pieces = ['\n## Currently open tabs:']
    for page_index, page_url in enumerate(obs.open_pages_urls):
        active_or_not = ' (active tab)' if page_index == obs.active_page_index else ''
        prompt_piece = f"""\
Tab {page_index}{active_or_not}:
URL: {page_url}
"""
        prompt_pieces.append(prompt_piece)
    return '\n'.join(prompt_pieces) + '\n'


def get_axtree(axtree_txt: str) -> str:
    bid_info = """\
Note: [bid] is the unique alpha-numeric identifier at the beginning of lines for each element in the AXTree. Always use bid to refer to elements in your actions.

"""
    visible_tag_info = """\
Note: You can only interact with visible elements. If the "visible" tag is not present, the element is not visible on the page.

"""
    return f'\n## AXTree:\n{bid_info}{visible_tag_info}{axtree_txt}\n'


def get_action_prompt(action_set: HighLevelActionSet) -> str:
    action_set_generic_info = """\
Note: This action set allows you to interact with your environment. Most of them are Python functions executing Playwright code. The primary way of referring to elements in the page is through bids, which are specified in your observations.

"""
    action_description = action_set.describe(
        with_long_description=False,
        with_examples=False,
    )
    action_prompt = f'# Action space:\n{action_set_generic_info}{action_description}\n'
    return action_prompt


def get_history_prompt(prev_actions: list[BrowseInteractiveAction]) -> str:
    history_prompt = ['# History of all previous interactions with the task:\n']
    for i in range(len(prev_actions)):
        history_prompt.append(f'## step {i+1}')
        history_prompt.append(
            f'\nOutput thought and action: {prev_actions[i].thought} ```{prev_actions[i].browser_actions}```\n'
        )
    return '\n'.join(history_prompt) + '\n'


class VisualBrowsingAgent(Agent):
    VERSION = '1.0'
    """
    VisualBrowsing Agent that can use webpage screenshots during browsing.
    """

    sandbox_plugins: list[PluginRequirement] = []
    response_parser = BrowsingResponseParser()

    def __init__(
        self,
        llm: LLM,
        config: AgentConfig,
    ) -> None:
        """Initializes a new instance of the VisualBrowsingAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
        """
        super().__init__(llm, config)
        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
        action_subsets = [
            'chat',
            'bid',
            'nav',
            'tab',
            'infeas',
        ]
        self.action_space = HighLevelActionSet(
            subsets=action_subsets,
            strict=False,  # less strict on the parsing of the actions
            multiaction=False,
        )
        self.action_prompt = get_action_prompt(self.action_space)
        self.abstract_example = f"""
# Abstract Example

Here is an abstract version of the answer with description of the content of each tag. Make sure you follow this structure, but replace the content with your answer:

You must mandatorily think step by step. If you need to make calculations such as coordinates, write them here. Describe the effect that your previous action had on the current content of the page. In summary the next action I will perform is ```{self.action_space.example_action(abstract=True)}```
"""
        self.concrete_example = """
# Concrete Example

Here is a concrete example of how to format your answer. Make sure to generate the action in the correct format, ensuring that the action is present inside ``````:

Let's think step-by-step. From previous action I tried to set the value of year to "2022", using select_option, but it doesn't appear to be in the form. It may be a dynamic dropdown, I will try using click with the bid "324" and look at the response from the page. In summary the next action I will perform is ```click('324')```
"""
        self.hints = """
Note:
* Make sure to use bid to identify elements when using commands.
* Interacting with comboboxes, dropdowns and auto-complete fields can be tricky; sometimes you need to use select_option, while other times you need to use fill or click and wait for the reaction of the page.

"""
        self.reset()

    def reset(self) -> None:
        """Resets the VisualBrowsingAgent."""
        super().reset()
        self.cost_accumulator = 0
        self.error_accumulator = 0

    def step(self, state: State) -> Action:
        """Performs one step using the VisualBrowsingAgent.

        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.

        Parameters:
        - state (State): used to get updated info

        Returns:
        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
        messages: list[Message] = []
        prev_actions = []
        cur_axtree_txt = ''
        error_prefix = ''
        focused_element = ''
        tabs = ''
        last_obs = None
        last_action = None

        if len(state.history) == 1:
            # for visualwebarena, webarena and miniwob++ eval, we need to retrieve the initial observation already available in the browser env
            # initialize and retrieve the first observation by issuing a noop action
            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
            return BrowseInteractiveAction(browser_actions='noop(1000)')

        for event in state.history:
            if isinstance(event, BrowseInteractiveAction):
                prev_actions.append(event)
                last_action = event
            elif isinstance(event, MessageAction) and event.source == EventSource.AGENT:
                # agent has responded, task finished.
                return AgentFinishAction(outputs={'content': event.content})
            elif isinstance(event, Observation):
                last_obs = event

        if len(prev_actions) >= 1:  # ignore noop()
            prev_actions = prev_actions[1:]  # remove the first noop action

        # if the final BrowseInteractiveAction executed BrowserGym's send_msg_to_user,
        # we should also send a message back to the user in OpenHands and call it a day
        if (
            isinstance(last_action, BrowseInteractiveAction)
            and last_action.browsergym_send_msg_to_user
        ):
            return MessageAction(last_action.browsergym_send_msg_to_user)

        history_prompt = get_history_prompt(prev_actions)
        if isinstance(last_obs, BrowserOutputObservation):
            if last_obs.error:
                # add error recovery prompt prefix
                error_prefix = get_error_prefix(last_obs)
                if len(error_prefix) > 0:
                    self.error_accumulator += 1
                    if self.error_accumulator > 5:
                        return MessageAction(
                            'Too many errors encountered. Task failed.'
                        )
            focused_element = '## Focused element:\nNone\n'
            if last_obs.focused_element_bid is not None:
                focused_element = (
                    f"## Focused element:\nbid='{last_obs.focused_element_bid}'\n"
                )
            tabs = get_tabs(last_obs)
            try:
                # IMPORTANT: keep AX Tree of full webpage, add visible and clickable tags
                cur_axtree_txt = flatten_axtree_to_str(
                    last_obs.axtree_object,
                    extra_properties=last_obs.extra_element_properties,
                    with_visible=True,
                    with_clickable=True,
                    with_center_coords=False,
                    with_bounding_box_coords=False,
                    filter_visible_only=False,
                    filter_with_bid_only=False,
                    filter_som_only=False,
                )
                cur_axtree_txt = get_axtree(axtree_txt=cur_axtree_txt)
            except Exception as e:
                logger.error(
                    'Error when trying to process the accessibility tree: %s', e
                )
                return MessageAction('Error encountered when browsing.')
        set_of_marks = last_obs.set_of_marks
        goal, image_urls = state.get_current_user_intent()

        if goal is None:
            goal = state.inputs['task']
        goal_txt, goal_images = create_goal_prompt(goal, image_urls)
        observation_txt, som_screenshot = create_observation_prompt(
            cur_axtree_txt, tabs, focused_element, error_prefix, set_of_marks
        )
        human_prompt = [TextContent(type='text', text=goal_txt)]
        if len(goal_images) > 0:
            human_prompt.append(ImageContent(image_urls=goal_images))
        human_prompt.append(TextContent(type='text', text=observation_txt))
        if som_screenshot is not None:
            human_prompt.append(ImageContent(image_urls=[som_screenshot]))
        remaining_content = f"""
{history_prompt}\
{self.action_prompt}\
{self.hints}\
{self.abstract_example}\
{self.concrete_example}\
"""
        human_prompt.append(TextContent(type='text', text=remaining_content))

        system_msg = """\
You are an agent trying to solve a web task based on the content of the page and user instructions. You can interact with the page and explore, and send messages to the user when you finish the task. Each time you submit an action it will be sent to the browser and you will receive a new page.
+""".strip() + + messages.append(Message(role='system', content=[TextContent(text=system_msg)])) + messages.append(Message(role='user', content=human_prompt)) + + flat_messages = self.llm.format_messages_for_llm(messages) + + response = self.llm.completion( + messages=flat_messages, + temperature=0.0, + stop=[')```', ')\n```'], + ) + + return self.response_parser.parse(response) diff --git a/openhands/events/observation/browse.py b/openhands/events/observation/browse.py index bc347d657473..c8a4c2cb6276 100644 --- a/openhands/events/observation/browse.py +++ b/openhands/events/observation/browse.py @@ -13,8 +13,10 @@ class BrowserOutputObservation(Observation): url: str trigger_by_action: str screenshot: str = field(repr=False, default='') # don't show in repr + set_of_marks: str = field(default='', repr=False) # don't show in repr error: bool = False observation: str = ObservationType.BROWSE + goal_image_urls: list = field(default_factory=list) # do not include in the memory open_pages_urls: list = field(default_factory=list) active_page_index: int = -1 diff --git a/openhands/runtime/browser/browser_env.py b/openhands/runtime/browser/browser_env.py index e7752a701236..f12565341efa 100644 --- a/openhands/runtime/browser/browser_env.py +++ b/openhands/runtime/browser/browser_env.py @@ -11,7 +11,7 @@ import html2text import numpy as np import tenacity -from browsergym.utils.obs import flatten_dom_to_str +from browsergym.utils.obs import flatten_dom_to_str, overlay_som from PIL import Image from openhands.core.exceptions import BrowserInitException @@ -65,15 +65,22 @@ def init_browser(self): logger.error(f'Failed to start browser process: {e}') raise - if not self.check_alive(): + if not self.check_alive(timeout=200): self.close() raise BrowserInitException('Failed to start browser environment.') def browser_process(self): if self.eval_mode: assert self.browsergym_eval_env is not None - logger.debug('Initializing browser env for web browsing evaluation.') - if 'webarena' in self.browsergym_eval_env: + logger.info('Initializing browser env for web browsing evaluation.') + if not self.browsergym_eval_env.startswith('browsergym/'): + self.browsergym_eval_env = 'browsergym/' + self.browsergym_eval_env + if 'visualwebarena' in self.browsergym_eval_env: + import browsergym.visualwebarena # noqa F401 register visualwebarena tasks as gym environments + import nltk + + nltk.download('punkt_tab') + elif 'webarena' in self.browsergym_eval_env: import browsergym.webarena # noqa F401 register webarena tasks as gym environments elif 'miniwob' in self.browsergym_eval_env: import browsergym.miniwob # noqa F401 register miniwob tasks as gym environments @@ -81,10 +88,7 @@ def browser_process(self): raise ValueError( f'Unsupported browsergym eval env: {self.browsergym_eval_env}' ) - env = gym.make( - self.browsergym_eval_env, - tags_to_mark='all', - ) + env = gym.make(self.browsergym_eval_env, tags_to_mark='all', timeout=100000) else: env = gym.make( 'browsergym/openended', @@ -94,17 +98,27 @@ def browser_process(self): disable_env_checker=True, tags_to_mark='all', ) - obs, info = env.reset() + logger.info('Successfully called env.reset') # EVAL ONLY: save the goal into file for evaluation self.eval_goal = None + self.goal_image_urls = [] self.eval_rewards: list[float] = [] if self.eval_mode: - logger.debug(f"Browsing goal: {obs['goal']}") self.eval_goal = obs['goal'] + if 'goal_object' in obs: + if len(obs['goal_object']) > 0: + self.eval_goal = obs['goal_object'][0]['text'] + for message in 
obs['goal_object']: + if message['type'] == 'image_url': + image_src = message['image_url'] + if isinstance(image_src, dict): + image_src = image_src['url'] + self.goal_image_urls.append(image_src) + logger.debug(f'Browsing goal: {self.eval_goal}') + logger.info('Browser env started.') - logger.debug('Browser env started.') while should_continue(): try: if self.browser_side.poll(timeout=0.01): @@ -122,7 +136,13 @@ def browser_process(self): # EVAL ONLY: Get evaluation info if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION: self.browser_side.send( - (unique_request_id, {'text_content': self.eval_goal}) + ( + unique_request_id, + { + 'text_content': self.eval_goal, + 'image_content': self.goal_image_urls, + }, + ) ) continue elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION: @@ -145,7 +165,15 @@ def browser_process(self): html_str = flatten_dom_to_str(obs['dom_object']) obs['text_content'] = self.html_text_converter.handle(html_str) # make observation serializable - obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot']) + obs['set_of_marks'] = self.image_to_png_base64_url( + overlay_som( + obs['screenshot'], obs.get('extra_element_properties', {}) + ), + add_data_prefix=True, + ) + obs['screenshot'] = self.image_to_png_base64_url( + obs['screenshot'], add_data_prefix=True + ) obs['active_page_index'] = obs['active_page_index'].item() obs['elapsed_time'] = obs['elapsed_time'].item() self.browser_side.send((unique_request_id, obs)) @@ -157,7 +185,7 @@ def browser_process(self): pass return - def step(self, action_str: str, timeout: float = 30) -> dict: + def step(self, action_str: str, timeout: float = 100) -> dict: """Execute an action in the browser environment and return the observation.""" unique_request_id = str(uuid.uuid4()) self.agent_side.send((unique_request_id, {'action': action_str})) diff --git a/openhands/runtime/browser/utils.py b/openhands/runtime/browser/utils.py index 45e098c9f7d8..b029ac08412d 100644 --- a/openhands/runtime/browser/utils.py +++ b/openhands/runtime/browser/utils.py @@ -35,6 +35,10 @@ async def browse( content=obs['text_content'], # text content of the page url=obs.get('url', ''), # URL of the page screenshot=obs.get('screenshot', None), # base64-encoded screenshot, png + set_of_marks=obs.get( + 'set_of_marks', None + ), # base64-encoded Set-of-Marks annotated screenshot, png, + goal_image_urls=obs.get('image_content', []), open_pages_urls=obs.get('open_pages_urls', []), # list of open pages active_page_index=obs.get( 'active_page_index', -1 diff --git a/poetry.lock b/poetry.lock index f9206380ac69..77920b3c3b94 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -588,6 +588,43 @@ urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version > [package.extras] crt = ["awscrt (==0.23.4)"] +[[package]] +name = "browsergym" +version = "0.10.2" +description = "BrowserGym: a gym environment for web task automation in the Chromium browser" +optional = false +python-versions = ">3.7" +files = [ + {file = "browsergym-0.10.2-py3-none-any.whl", hash = "sha256:9581d1d1f1fcd1cf35266cf30c881d60c147a0d374b3491eeaebb07d9690f868"}, + {file = "browsergym-0.10.2.tar.gz", hash = "sha256:3cdd7520cca857421aa7ec0a965968df4bcef721299a424397f86d7cad078ab0"}, +] + +[package.dependencies] +browsergym-assistantbench = "0.10.2" +browsergym-core = "0.10.2" +browsergym-experiments = "0.10.2" +browsergym-miniwob = "0.10.2" +browsergym-visualwebarena = "0.10.2" +browsergym-webarena = "0.10.2" +browsergym-workarena = ">=0.4.1" + +[[package]] +name = "browsergym-assistantbench" +version = "0.10.2" +description = "AssistantBench benchmark for BrowserGym" +optional = false +python-versions = ">3.7" +files = [ + {file = "browsergym_assistantbench-0.10.2-py3-none-any.whl", hash = "sha256:af0d3a3e23686066b070feca38f8740262bed6d65ccf9098f393334a005987c0"}, + {file = "browsergym_assistantbench-0.10.2.tar.gz", hash = "sha256:de18eb7c010403d5d467b927b4713b56f6e97a59493bee4c42599d4d7cb54dce"}, +] + +[package.dependencies] +browsergym-core = "0.10.2" +datasets = "*" +numpy = "*" +scipy = "*" + [[package]] name = "browsergym-core" version = "0.10.2" @@ -608,6 +645,22 @@ pillow = ">=10.1" playwright = ">=1.39,<2.0" pyparsing = ">=3" +[[package]] +name = "browsergym-experiments" +version = "0.10.2" +description = "Experimentation tools for BrowserGym" +optional = false +python-versions = ">3.7" +files = [ + {file = "browsergym_experiments-0.10.2-py3-none-any.whl", hash = "sha256:60a626b3159ef63b5ff72a6c8156c8f3cf82a9278dfc5a9d3ece39c2b1913595"}, + {file = "browsergym_experiments-0.10.2.tar.gz", hash = "sha256:b49bc27f315ad12014ff21580c7c7aca6489ca4106e7ab46502f716674efa236"}, +] + +[package.dependencies] +browsergym-core = "0.10.2" +dataclasses-json = "*" +tiktoken = ">=0.4" + [[package]] name = "browsergym-miniwob" version = "0.10.2" @@ -622,6 +675,22 @@ files = [ [package.dependencies] browsergym-core = "0.10.2" +[[package]] +name = "browsergym-visualwebarena" +version = "0.10.2" +description = "VisualWebArena benchmark for BrowserGym" +optional = false +python-versions = ">3.7" +files = [ + {file = "browsergym_visualwebarena-0.10.2-py3-none-any.whl", hash = "sha256:87c913ccd4d12a79c625b5c4d9ead7e0bc50b298d19e413204bb586a67736d83"}, + {file = "browsergym_visualwebarena-0.10.2.tar.gz", hash = "sha256:5f84a4f33a21106c9b650cecb0362b78af2546d9927255828c273fe800d776a1"}, +] + +[package.dependencies] +browsergym-core = "0.10.2" +libvisualwebarena = "0.0.14" +requests = "*" + [[package]] name = "browsergym-webarena" version = "0.10.2" @@ -637,6 +706,26 @@ files = [ browsergym-core = "0.10.2" libwebarena = "0.0.3" +[[package]] +name = "browsergym-workarena" +version = "0.4.1" +description = "WorkArena benchmark for BrowserGym" +optional = false +python-versions = ">3.7" +files = [ + {file = "browsergym_workarena-0.4.1-py3-none-any.whl", hash = "sha256:b8f04b2e3801fd32962b7d99f0685c507b258841e2b4bfdb46d041091d2f1b89"}, + {file = "browsergym_workarena-0.4.1.tar.gz", hash = "sha256:ba2958d804b80836c7f81360d66b99c6c655c5070eddc5fae9c1c88306a23403"}, +] + +[package.dependencies] +browsergym-core = ">=0.2" +english-words = ">=2.0.1" +faker = 
">=24.8.0" +numpy = ">=1.14" +requests = ">=2.31" +tenacity = ">=8.2.3" +tqdm = ">=4.66.2" + [[package]] name = "build" version = "1.2.2.post1" @@ -1542,6 +1631,16 @@ protobuf = ">=3.20.0,<6.0.0" python-dateutil = ">=2.8.2" typing-extensions = ">=4.1.0" +[[package]] +name = "english-words" +version = "2.0.1" +description = "Generate sets of english words by combining different word lists" +optional = false +python-versions = "*" +files = [ + {file = "english-words-2.0.1.tar.gz", hash = "sha256:a4105c57493bb757a3d8973fcf8e1dc05e7ca09c836dff467c3fb445f84bc43d"}, +] + [[package]] name = "evaluate" version = "0.4.3" @@ -1605,6 +1704,21 @@ files = [ [package.extras] tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +[[package]] +name = "faker" +version = "33.1.0" +description = "Faker is a Python package that generates fake data for you." +optional = false +python-versions = ">=3.8" +files = [ + {file = "Faker-33.1.0-py3-none-any.whl", hash = "sha256:d30c5f0e2796b8970de68978365247657486eb0311c5abe88d0b895b68dff05d"}, + {file = "faker-33.1.0.tar.gz", hash = "sha256:1c925fc0e86a51fc46648b504078c88d0cd48da1da2595c4e712841cab43a1e4"}, +] + +[package.dependencies] +python-dateutil = ">=2.4" +typing-extensions = "*" + [[package]] name = "farama-notifications" version = "0.0.4" @@ -2959,6 +3073,39 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "imageio" +version = "2.36.1" +description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." +optional = false +python-versions = ">=3.9" +files = [ + {file = "imageio-2.36.1-py3-none-any.whl", hash = "sha256:20abd2cae58e55ca1af8a8dcf43293336a59adf0391f1917bf8518633cfc2cdf"}, + {file = "imageio-2.36.1.tar.gz", hash = "sha256:e4e1d231f47f9a9e16100b0f7ce1a86e8856fb4d1c0fa2c4365a316f1746be62"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=8.3.2" + +[package.extras] +all-plugins = ["astropy", "av", "imageio-ffmpeg", "numpy (>2)", "pillow-heif", "psutil", "rawpy", "tifffile"] +all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] +build = ["wheel"] +dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"] +docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"] +ffmpeg = ["imageio-ffmpeg", "psutil"] +fits = ["astropy"] +full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpy (>2)", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "rawpy", "sphinx (<6)", "tifffile", "wheel"] +gdal = ["gdal"] +itk = ["itk"] +linting = ["black", "flake8"] +pillow-heif = ["pillow-heif"] +pyav = ["av"] +rawpy = ["numpy (>2)", "rawpy"] +test = ["fsspec[github]", "pytest", "pytest-cov"] +tifffile = ["tifffile"] + [[package]] name = "importlib-metadata" version = "7.1.0" @@ -3668,6 +3815,25 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] +[[package]] +name = "lazy-loader" +version = "0.4" +description = "Makes it easy to load subpackages and functions on demand." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, +] + +[package.dependencies] +packaging = "*" + +[package.extras] +dev = ["changelist (==0.5)"] +lint = ["pre-commit (==3.7.0)"] +test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] + [[package]] name = "libtmux" version = "0.39.0" @@ -3679,6 +3845,33 @@ files = [ {file = "libtmux-0.39.0.tar.gz", hash = "sha256:59346aeef3c0d6017f3bc5e23248d43cdf50f32b775b9cb5d9ff5e2e5f3059f4"}, ] +[[package]] +name = "libvisualwebarena" +version = "0.0.14" +description = "This is an unofficial, use-at-your-own risks port of the visualwebarena benchmark, for use as a standalone library package." +optional = false +python-versions = "<4,>=3.7" +files = [ + {file = "libvisualwebarena-0.0.14-py3-none-any.whl", hash = "sha256:636b06ca1d52f1a363503b5b563492e83f2482efaf85bb26b69744565a499f0f"}, + {file = "libvisualwebarena-0.0.14.tar.gz", hash = "sha256:7e660179f60f1df8d884204f2b742a2117e7fe050823d839ca5744ea1c0709a7"}, +] + +[package.dependencies] +aiolimiter = "*" +beartype = "0.12.0" +evaluate = "*" +flask = "*" +gymnasium = "*" +nltk = "*" +openai = ">=1" +Pillow = "*" +playwright = ">=1.32,<1.40" +scikit-image = ">=0.16" +text-generation = "*" +tiktoken = "*" +transformers = "*" +types-tqdm = "*" + [[package]] name = "libwebarena" version = "0.0.3" @@ -5276,7 +5469,6 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, ] @@ -7740,6 +7932,54 @@ files = [ attrs = ">=18.0.0" pathspec = ">=0.10.1" +[[package]] +name = "scikit-image" +version = "0.24.0" +description = "Image processing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"}, + {file = "scikit_image-0.24.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:9c7a52e20cdd760738da38564ba1fed7942b623c0317489af1a598a8dedf088b"}, + {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93f46e6ce42e5409f4d09ce1b0c7f80dd7e4373bcec635b6348b63e3c886eac8"}, + {file = "scikit_image-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39ee0af13435c57351a3397eb379e72164ff85161923eec0c38849fecf1b4764"}, + {file = "scikit_image-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:7ac7913b028b8aa780ffae85922894a69e33d1c0bf270ea1774f382fe8bf95e7"}, + {file = "scikit_image-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:272909e02a59cea3ed4aa03739bb88df2625daa809f633f40b5053cf09241831"}, + {file = "scikit_image-0.24.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:190ebde80b4470fe8838764b9b15f232a964f1a20391663e31008d76f0c696f7"}, + {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:59c98cc695005faf2b79904e4663796c977af22586ddf1b12d6af2fa22842dc2"}, + {file = "scikit_image-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa27b3a0dbad807b966b8db2d78da734cb812ca4787f7fbb143764800ce2fa9c"}, + {file = "scikit_image-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:dacf591ac0c272a111181afad4b788a27fe70d213cfddd631d151cbc34f8ca2c"}, + {file = "scikit_image-0.24.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6fccceb54c9574590abcddc8caf6cefa57c13b5b8b4260ab3ff88ad8f3c252b3"}, + {file = "scikit_image-0.24.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ccc01e4760d655aab7601c1ba7aa4ddd8b46f494ac46ec9c268df6f33ccddf4c"}, + {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18836a18d3a7b6aca5376a2d805f0045826bc6c9fc85331659c33b4813e0b563"}, + {file = "scikit_image-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8579bda9c3f78cb3b3ed8b9425213c53a25fa7e994b7ac01f2440b395babf660"}, + {file = "scikit_image-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:82ab903afa60b2da1da2e6f0c8c65e7c8868c60a869464c41971da929b3e82bc"}, + {file = "scikit_image-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef04360eda372ee5cd60aebe9be91258639c86ae2ea24093fb9182118008d009"}, + {file = "scikit_image-0.24.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e9aadb442360a7e76f0c5c9d105f79a83d6df0e01e431bd1d5757e2c5871a1f3"}, + {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e37de6f4c1abcf794e13c258dc9b7d385d5be868441de11c180363824192ff7"}, + {file = "scikit_image-0.24.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4688c18bd7ec33c08d7bf0fd19549be246d90d5f2c1d795a89986629af0a1e83"}, + {file = "scikit_image-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:56dab751d20b25d5d3985e95c9b4e975f55573554bd76b0aedf5875217c93e69"}, + {file = "scikit_image-0.24.0.tar.gz", hash = "sha256:5d16efe95da8edbeb363e0c4157b99becbd650a60b77f6e3af5768b66cf007ab"}, +] + +[package.dependencies] +imageio = ">=2.33" +lazy-loader = ">=0.4" +networkx = ">=2.8" +numpy = ">=1.23" +packaging = ">=21" +pillow = ">=9.1" +scipy = ">=1.9" +tifffile = ">=2022.8.12" + +[package.extras] +build = ["Cython (>=3.0.4)", "build", "meson-python (>=0.15)", "ninja", "numpy (>=2.0.0rc1)", "packaging (>=21)", "pythran", "setuptools (>=67)", "spin (==0.8)", "wheel"] +data = ["pooch (>=1.6.0)"] +developer = ["ipython", "pre-commit", "tomli"] +docs = ["PyWavelets (>=1.1.1)", "dask[array] (>=2022.9.2)", "ipykernel", "ipywidgets", "kaleido", "matplotlib (>=3.6)", "myst-parser", "numpydoc (>=1.7)", "pandas (>=1.5)", "plotly (>=5.10)", "pooch (>=1.6)", "pydata-sphinx-theme (>=0.15.2)", "pytest-doctestplus", "pytest-runner", "scikit-learn (>=1.1)", "seaborn (>=0.11)", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-gallery (>=0.14)", "sphinx_design (>=0.5)", "tifffile (>=2022.8.12)"] +optional = ["PyWavelets (>=1.1.1)", "SimpleITK", "astropy (>=5.0)", "cloudpickle (>=0.2.1)", "dask[array] (>=2021.1.0)", "matplotlib (>=3.6)", "pooch (>=1.6.0)", "pyamg", "scikit-learn (>=1.1)"] +test = ["asv", "numpydoc (>=1.7)", "pooch (>=1.6.0)", "pytest (>=7.0)", "pytest-cov (>=2.11.0)", "pytest-doctestplus", "pytest-faulthandler", "pytest-localserver"] + [[package]] name = "scikit-learn" version = "1.6.0" @@ -8451,6 +8691,28 @@ files = [ {file = "threadpoolctl-3.5.0.tar.gz", hash = 
"sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, ] +[[package]] +name = "tifffile" +version = "2024.9.20" +description = "Read and write TIFF files" +optional = false +python-versions = ">=3.10" +files = [ + {file = "tifffile-2024.9.20-py3-none-any.whl", hash = "sha256:c54dc85bc1065d972cb8a6ffb3181389d597876aa80177933459733e4ed243dd"}, + {file = "tifffile-2024.9.20.tar.gz", hash = "sha256:3fbf3be2f995a7051a8ae05a4be70c96fc0789f22ed6f1c4104c973cf68a640b"}, +] + +[package.dependencies] +numpy = "*" + +[package.extras] +all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib", "zarr"] +codecs = ["imagecodecs (>=2023.8.12)"] +plot = ["matplotlib"] +test = ["cmapfile", "czifile", "dask", "defusedxml", "fsspec", "imagecodecs", "lfdfiles", "lxml", "ndtiff", "oiffile", "psdtags", "pytest", "roifile", "xarray", "zarr"] +xml = ["defusedxml", "lxml"] +zarr = ["fsspec", "zarr"] + [[package]] name = "tiktoken" version = "0.8.0" @@ -9857,4 +10119,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "f0fdb1fa00337a3fdda425cbfb9af7020d7460fdca8eb9dcfbe4817cf60d0a05" +content-hash = "6b74056694bdc84a4583c2f93a5b218f15688827cb59e289eb83331045a1582e" diff --git a/pyproject.toml b/pyproject.toml index 7c292255fb6e..8e191f9d53d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,8 +142,10 @@ gdown = "*" matplotlib = "*" seaborn = "*" tabulate = "*" +browsergym = "0.10.2" browsergym-webarena = "0.10.2" browsergym-miniwob = "0.10.2" +browsergym-visualwebarena = "0.10.2" [tool.poetry-dynamic-versioning] enable = true