From ef12bc5381d74dd53e9f20354264593716e91f8e Mon Sep 17 00:00:00 2001 From: Boxuan Li Date: Thu, 13 Feb 2025 12:05:03 -0800 Subject: [PATCH] Evaluation harness: Add agent config option (#6662) --- evaluation/benchmarks/gaia/run_infer.py | 14 ++++- .../benchmarks/gaia/scripts/run_infer.sh | 5 ++ .../benchmarks/the_agent_company/run_infer.py | 17 +++++- .../the_agent_company/scripts/run_infer.sh | 26 +++++--- evaluation/utils/shared.py | 4 ++ openhands/core/config/__init__.py | 2 + openhands/core/config/utils.py | 60 ++++++++++++++++++- tests/unit/test_arg_parser.py | 3 +- tests/unit/test_config.py | 30 ++++++++++ 9 files changed, 149 insertions(+), 12 deletions(-) diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index a8b442819267..2fdab0b2927a 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -25,6 +25,7 @@ get_llm_config_arg, get_parser, ) +from openhands.core.config.utils import get_agent_config_arg from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction @@ -63,8 +64,12 @@ def get_config( workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) - agent_config = config.get_agent_config(metadata.agent_class) - agent_config.enable_prompt_extensions = False + if metadata.agent_config: + config.set_agent_config(metadata.agent_config, metadata.agent_class) + else: + logger.info('Agent config not provided, using default settings') + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False return config @@ -238,6 +243,10 @@ def process_instance( ) args, _ = parser.parse_known_args() + agent_config = None + if args.agent_config: + agent_config = get_agent_config_arg(args.agent_config) + llm_config = None if args.llm_config: llm_config = 
get_llm_config_arg(args.llm_config) @@ -256,6 +265,7 @@ def process_instance( eval_output_dir=args.eval_output_dir, data_split=args.data_split, details={'gaia-level': args.level}, + agent_config=agent_config, ) dataset = load_dataset('gaia-benchmark/GAIA', args.level) diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index 4b2f8f73dffa..217809880d40 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -9,6 +9,7 @@ AGENT=$3 EVAL_LIMIT=$4 LEVELS=$5 NUM_WORKERS=$6 +AGENT_CONFIG=$7 if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -49,5 +50,9 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi +if [ -n "$AGENT_CONFIG" ]; then + echo "AGENT_CONFIG: $AGENT_CONFIG" + COMMAND="$COMMAND --agent-config $AGENT_CONFIG" +fi # Run the command eval $COMMAND diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 5cd7c027e20f..cbfbb386fdde 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -18,9 +18,11 @@ AppConfig, LLMConfig, SandboxConfig, + get_agent_config_arg, get_llm_config_arg, get_parser, ) +from openhands.core.config.agent_config import AgentConfig from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction @@ -34,6 +36,7 @@ def get_config( task_short_name: str, mount_path_on_host: str, llm_config: LLMConfig, + agent_config: AgentConfig, ) -> AppConfig: config = AppConfig( run_as_openhands=False, @@ -58,6 +61,14 @@ def get_config( workspace_mount_path_in_sandbox='/outputs', ) config.set_llm_config(llm_config) + if agent_config: + config.set_agent_config(agent_config) + else: + logger.info('Agent config not provided, using default settings') + 
agent_config = AgentConfig( + enable_prompt_extensions=False, + ) + config.set_agent_config(agent_config) return config @@ -215,6 +226,10 @@ def run_evaluator( ) args, _ = parser.parse_known_args() + agent_config: AgentConfig | None = None + if args.agent_config: + agent_config = get_agent_config_arg(args.agent_config) + agent_llm_config: LLMConfig | None = None if args.agent_llm_config: agent_llm_config = get_llm_config_arg(args.agent_llm_config) @@ -255,7 +270,7 @@ def run_evaluator( else: temp_dir = tempfile.mkdtemp() config: AppConfig = get_config( - args.task_image_name, task_short_name, temp_dir, agent_llm_config + args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config ) runtime: Runtime = create_runtime(config) call_async_from_sync(runtime.connect) diff --git a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh index b5bc7874c12e..3366c9826005 100755 --- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh +++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh @@ -44,6 +44,10 @@ while [[ $# -gt 0 ]]; do ENV_LLM_CONFIG="$2" shift 2 ;; + --agent-config) + AGENT_CONFIG="$2" + shift 2 + ;; --outputs-path) OUTPUTS_PATH="$2" shift 2 @@ -140,13 +144,21 @@ while IFS= read -r task_image; do continue fi - export PYTHONPATH=evaluation/benchmarks/the_agent_company:\$PYTHONPATH && \ - poetry run python run_infer.py \ - --agent-llm-config "$AGENT_LLM_CONFIG" \ - --env-llm-config "$ENV_LLM_CONFIG" \ - --outputs-path "$OUTPUTS_PATH" \ - --server-hostname "$SERVER_HOSTNAME" \ - --task-image-name "$task_image" + # Build the Python command + COMMAND="poetry run python run_infer.py \ + --agent-llm-config \"$AGENT_LLM_CONFIG\" \ + --env-llm-config \"$ENV_LLM_CONFIG\" \ + --outputs-path \"$OUTPUTS_PATH\" \ + --server-hostname \"$SERVER_HOSTNAME\" \ + --task-image-name \"$task_image\"" + + # Add agent-config if it's defined + if [ -n "$AGENT_CONFIG" ]; then + 
COMMAND="$COMMAND --agent-config $AGENT_CONFIG" + fi + + export PYTHONPATH=evaluation/benchmarks/the_agent_company:$PYTHONPATH && \ + eval "$COMMAND" # Prune unused images and volumes docker image rm "$task_image" diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0f8ac8fa8332..7035d56e41ef 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -17,6 +17,7 @@ from openhands.controller.state.state import State from openhands.core.config import LLMConfig +from openhands.core.config.agent_config import AgentConfig from openhands.core.config.condenser_config import ( CondenserConfig, NoOpCondenserConfig, @@ -43,6 +44,7 @@ class EvalMetadata(BaseModel): agent_class: str llm_config: LLMConfig + agent_config: AgentConfig | None = None max_iterations: int eval_output_dir: str start_time: str @@ -167,6 +169,7 @@ def make_metadata( eval_output_dir: str, data_split: str | None = None, details: dict[str, Any] | None = None, + agent_config: AgentConfig | None = None, condenser_config: CondenserConfig | None = None, ) -> EvalMetadata: model_name = llm_config.model.split('/')[-1] @@ -189,6 +192,7 @@ def make_metadata( metadata = EvalMetadata( agent_class=agent_class, llm_config=llm_config, + agent_config=agent_config, max_iterations=max_iterations, eval_output_dir=eval_output_path, start_time=time.strftime('%Y-%m-%d %H:%M:%S'), diff --git a/openhands/core/config/__init__.py b/openhands/core/config/__init__.py index 2e0f87e32143..d653f3e70ac4 100644 --- a/openhands/core/config/__init__.py +++ b/openhands/core/config/__init__.py @@ -10,6 +10,7 @@ from openhands.core.config.security_config import SecurityConfig from openhands.core.config.utils import ( finalize_config, + get_agent_config_arg, get_llm_config_arg, get_parser, load_app_config, @@ -31,6 +32,7 @@ 'load_from_env', 'load_from_toml', 'finalize_config', + 'get_agent_config_arg', 'get_llm_config_arg', 'get_field_info', 'get_parser', diff --git a/openhands/core/config/utils.py 
b/openhands/core/config/utils.py index b6bf4b77abad..e0b1ee71adc3 100644 --- a/openhands/core/config/utils.py +++ b/openhands/core/config/utils.py @@ -298,7 +298,59 @@ def finalize_config(cfg: AppConfig): ) -# Utility function for command line --group argument +def get_agent_config_arg( + agent_config_arg: str, toml_file: str = 'config.toml' +) -> AgentConfig | None: + """Get a group of agent settings from the config file. + + A group in config.toml can look like this: + + ``` + [agent.default] + enable_prompt_extensions = false + ``` + + The user-defined group name, like "default", is the argument to this function. The function will load the AgentConfig object + with the settings of this group, from the config file, and return it to the caller. + + Note that the group must be under "agent" group, or in other words, the group name must start with "agent.". + + Args: + agent_config_arg: The group of agent settings to get from the config.toml file. + toml_file: Path to the configuration file to read from. Defaults to 'config.toml'. + + Returns: + AgentConfig: The AgentConfig object with the settings from the config file. + """ + # keep only the name, just in case + agent_config_arg = agent_config_arg.strip('[]') + + # truncate the prefix, just in case + if agent_config_arg.startswith('agent.'): + agent_config_arg = agent_config_arg[6:] + + logger.openhands_logger.debug(f'Loading agent config from {agent_config_arg}') + + # load the toml file + try: + with open(toml_file, 'r', encoding='utf-8') as toml_contents: + toml_config = toml.load(toml_contents) + except FileNotFoundError as e: + logger.openhands_logger.error(f'Config file not found: {e}') + return None + except toml.TomlDecodeError as e: + logger.openhands_logger.error( + f'Cannot parse agent group from {agent_config_arg}. 
Exception: {e}' + ) + return None + + # update the agent config with the specified section + if 'agent' in toml_config and agent_config_arg in toml_config['agent']: + return AgentConfig(**toml_config['agent'][agent_config_arg]) + logger.openhands_logger.debug(f'Loading from toml failed for {agent_config_arg}') + return None + + def get_llm_config_arg( llm_config_arg: str, toml_file: str = 'config.toml' ) -> LLMConfig | None: @@ -443,6 +495,12 @@ def get_parser() -> argparse.ArgumentParser: type=str, help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml', ) + parser.add_argument( + '--agent-config', + default=None, + type=str, + help='Replace default Agent ([agent] section in config.toml) config with the specified Agent config, e.g. "CodeAct" for [agent.CodeAct] section in config.toml', + ) parser.add_argument( '-n', '--name', diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py index 51c736f19c05..b71cd5e2c18b 100644 --- a/tests/unit/test_arg_parser.py +++ b/tests/unit/test_arg_parser.py @@ -128,6 +128,7 @@ def test_help_message(capsys): '--eval-note EVAL_NOTE', '--eval-ids EVAL_IDS', '-l LLM_CONFIG, --llm-config LLM_CONFIG', + '--agent-config AGENT_CONFIG', '-n NAME, --name NAME', '--config-file CONFIG_FILE', '--no-auto-continue', @@ -137,4 +138,4 @@ def test_help_message(capsys): assert element in help_output, f"Expected '{element}' to be in the help message" option_count = help_output.count(' -') - assert option_count == 17, f'Expected 17 options, found {option_count}' + assert option_count == 18, f'Expected 18 options, found {option_count}' diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 5edfd64cda90..7aab02c0e019 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -9,6 +9,7 @@ AppConfig, LLMConfig, finalize_config, + get_agent_config_arg, get_llm_config_arg, load_from_env, load_from_toml, @@ -781,3 
+782,32 @@ def test_get_agent_configs(default_config, temp_toml_file): assert codeact_config.memory_enabled is True browsing_config = default_config.get_agent_configs().get('BrowsingAgent') assert browsing_config.memory_max_threads == 10 + + +def test_get_agent_config_arg(temp_toml_file): + temp_toml = """ +[core] +max_iterations = 100 +max_budget_per_task = 4.0 + +[agent.CodeActAgent] +memory_enabled = true +enable_prompt_extensions = false + +[agent.BrowsingAgent] +memory_enabled = false +enable_prompt_extensions = true +memory_max_threads = 10 +""" + + with open(temp_toml_file, 'w') as f: + f.write(temp_toml) + + agent_config = get_agent_config_arg('CodeActAgent', temp_toml_file) + assert agent_config.memory_enabled + assert not agent_config.enable_prompt_extensions + + agent_config2 = get_agent_config_arg('BrowsingAgent', temp_toml_file) + assert not agent_config2.memory_enabled + assert agent_config2.enable_prompt_extensions + assert agent_config2.memory_max_threads == 10