Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

A few fixes for TAC evaluation harness #6586

Merged
10 commits merged on Feb 15, 2025 (source and target branch names were not captured in this copy)
19 changes: 18 additions & 1 deletion evaluation/benchmarks/the_agent_company/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
get_llm_config_arg,
get_parser,
)
from openhands.core.config.agent_config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
Expand All @@ -34,6 +35,7 @@ def get_config(
task_short_name: str,
mount_path_on_host: str,
llm_config: LLMConfig,
agent_config: AgentConfig,
) -> AppConfig:
config = AppConfig(
run_as_openhands=False,
Expand All @@ -57,6 +59,7 @@ def get_config(
workspace_mount_path_in_sandbox='/outputs',
)
config.set_llm_config(llm_config)
config.set_agent_config(agent_config)
return config


Expand Down Expand Up @@ -152,6 +155,12 @@ def run_solver(
os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
) as file:
file.write(image_data)
if obs.set_of_marks:
som_image_data = base64.b64decode(obs.set_of_marks)
with open(
os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
) as file:
file.write(som_image_data)

if save_final_state:
os.makedirs(state_dir, exist_ok=True)
Expand Down Expand Up @@ -214,6 +223,14 @@ def run_evaluator(
)
args, _ = parser.parse_known_args()

#### TODO: parse this from cli and toml ####
agent_config: AgentConfig | None = None
agent_config = AgentConfig(
enable_som_visual_browsing=True,
disabled_microagents=['github'],
# [review thread on the line above: marked as resolved by enyst]
)
############################################

agent_llm_config: LLMConfig | None = None
if args.agent_llm_config:
agent_llm_config = get_llm_config_arg(args.agent_llm_config)
Expand Down Expand Up @@ -254,7 +271,7 @@ def run_evaluator(
else:
temp_dir = tempfile.mkdtemp()
config: AppConfig = get_config(
- args.task_image_name, task_short_name, temp_dir, agent_llm_config
+ args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config
)
runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
Expand Down