From ef12bc5381d74dd53e9f20354264593716e91f8e Mon Sep 17 00:00:00 2001 From: Boxuan Li Date: Thu, 13 Feb 2025 12:05:03 -0800 Subject: [PATCH] Evaluation harness: Add agent config option (#6662) --- evaluation/benchmarks/gaia/run_infer.py | 14 ++++- .../benchmarks/gaia/scripts/run_infer.sh | 5 ++ .../benchmarks/the_agent_company/run_infer.py | 17 +++++- .../the_agent_company/scripts/run_infer.sh | 26 +++++--- evaluation/utils/shared.py | 4 ++ openhands/core/config/__init__.py | 2 + openhands/core/config/utils.py | 60 ++++++++++++++++++- tests/unit/test_arg_parser.py | 3 +- tests/unit/test_config.py | 30 ++++++++++ 9 files changed, 149 insertions(+), 12 deletions(-) diff --git a/evaluation/benchmarks/gaia/run_infer.py b/evaluation/benchmarks/gaia/run_infer.py index a8b442819267..2fdab0b2927a 100644 --- a/evaluation/benchmarks/gaia/run_infer.py +++ b/evaluation/benchmarks/gaia/run_infer.py @@ -25,6 +25,7 @@ get_llm_config_arg, get_parser, ) +from openhands.core.config.utils import get_agent_config_arg from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction @@ -63,8 +64,12 @@ def get_config( workspace_mount_path=None, ) config.set_llm_config(metadata.llm_config) - agent_config = config.get_agent_config(metadata.agent_class) - agent_config.enable_prompt_extensions = False + if metadata.agent_config: + config.set_agent_config(metadata.agent_config, metadata.agent_class) + else: + logger.info('Agent config not provided, using default settings') + agent_config = config.get_agent_config(metadata.agent_class) + agent_config.enable_prompt_extensions = False return config @@ -238,6 +243,10 @@ def process_instance( ) args, _ = parser.parse_known_args() + agent_config = None + if args.agent_config: + agent_config = get_agent_config_arg(args.agent_config) + llm_config = None if args.llm_config: llm_config = 
get_llm_config_arg(args.llm_config) @@ -256,6 +265,7 @@ def process_instance( eval_output_dir=args.eval_output_dir, data_split=args.data_split, details={'gaia-level': args.level}, + agent_config=agent_config, ) dataset = load_dataset('gaia-benchmark/GAIA', args.level) diff --git a/evaluation/benchmarks/gaia/scripts/run_infer.sh b/evaluation/benchmarks/gaia/scripts/run_infer.sh index 4b2f8f73dffa..217809880d40 100755 --- a/evaluation/benchmarks/gaia/scripts/run_infer.sh +++ b/evaluation/benchmarks/gaia/scripts/run_infer.sh @@ -9,6 +9,7 @@ AGENT=$3 EVAL_LIMIT=$4 LEVELS=$5 NUM_WORKERS=$6 +AGENT_CONFIG=$7 if [ -z "$NUM_WORKERS" ]; then NUM_WORKERS=1 @@ -49,5 +50,9 @@ if [ -n "$EVAL_LIMIT" ]; then COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" fi +if [ -n "$AGENT_CONFIG" ]; then + echo "AGENT_CONFIG: $AGENT_CONFIG" + COMMAND="$COMMAND --agent-config $AGENT_CONFIG" +fi # Run the command eval $COMMAND diff --git a/evaluation/benchmarks/the_agent_company/run_infer.py b/evaluation/benchmarks/the_agent_company/run_infer.py index 5cd7c027e20f..cbfbb386fdde 100644 --- a/evaluation/benchmarks/the_agent_company/run_infer.py +++ b/evaluation/benchmarks/the_agent_company/run_infer.py @@ -18,9 +18,11 @@ AppConfig, LLMConfig, SandboxConfig, + get_agent_config_arg, get_llm_config_arg, get_parser, ) +from openhands.core.config.agent_config import AgentConfig from openhands.core.logger import openhands_logger as logger from openhands.core.main import create_runtime, run_controller from openhands.events.action import CmdRunAction, MessageAction @@ -34,6 +36,7 @@ def get_config( task_short_name: str, mount_path_on_host: str, llm_config: LLMConfig, + agent_config: AgentConfig, ) -> AppConfig: config = AppConfig( run_as_openhands=False, @@ -58,6 +61,14 @@ def get_config( workspace_mount_path_in_sandbox='/outputs', ) config.set_llm_config(llm_config) + if agent_config: + config.set_agent_config(agent_config) + else: + logger.info('Agent config not provided, using default settings') + 
agent_config = AgentConfig( + enable_prompt_extensions=False, + ) + config.set_agent_config(agent_config) return config @@ -215,6 +226,10 @@ def run_evaluator( ) args, _ = parser.parse_known_args() + agent_config: AgentConfig | None = None + if args.agent_config: + agent_config = get_agent_config_arg(args.agent_config) + agent_llm_config: LLMConfig | None = None if args.agent_llm_config: agent_llm_config = get_llm_config_arg(args.agent_llm_config) @@ -255,7 +270,7 @@ def run_evaluator( else: temp_dir = tempfile.mkdtemp() config: AppConfig = get_config( - args.task_image_name, task_short_name, temp_dir, agent_llm_config + args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config ) runtime: Runtime = create_runtime(config) call_async_from_sync(runtime.connect) diff --git a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh index b5bc7874c12e..3366c9826005 100755 --- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh +++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh @@ -44,6 +44,10 @@ while [[ $# -gt 0 ]]; do ENV_LLM_CONFIG="$2" shift 2 ;; + --agent-config) + AGENT_CONFIG="$2" + shift 2 + ;; --outputs-path) OUTPUTS_PATH="$2" shift 2 @@ -140,13 +144,21 @@ while IFS= read -r task_image; do continue fi - export PYTHONPATH=evaluation/benchmarks/the_agent_company:\$PYTHONPATH && \ - poetry run python run_infer.py \ - --agent-llm-config "$AGENT_LLM_CONFIG" \ - --env-llm-config "$ENV_LLM_CONFIG" \ - --outputs-path "$OUTPUTS_PATH" \ - --server-hostname "$SERVER_HOSTNAME" \ - --task-image-name "$task_image" + # Build the Python command + COMMAND="poetry run python run_infer.py \ + --agent-llm-config \"$AGENT_LLM_CONFIG\" \ + --env-llm-config \"$ENV_LLM_CONFIG\" \ + --outputs-path \"$OUTPUTS_PATH\" \ + --server-hostname \"$SERVER_HOSTNAME\" \ + --task-image-name \"$task_image\"" + + # Add agent-config if it's defined + if [ -n "$AGENT_CONFIG" ]; then + 
COMMAND="$COMMAND --agent-config $AGENT_CONFIG" + fi + + export PYTHONPATH=evaluation/benchmarks/the_agent_company:$PYTHONPATH && \ + eval "$COMMAND" # Prune unused images and volumes docker image rm "$task_image" diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py index 0f8ac8fa8332..7035d56e41ef 100644 --- a/evaluation/utils/shared.py +++ b/evaluation/utils/shared.py @@ -17,6 +17,7 @@ from openhands.controller.state.state import State from openhands.core.config import LLMConfig +from openhands.core.config.agent_config import AgentConfig from openhands.core.config.condenser_config import ( CondenserConfig, NoOpCondenserConfig, @@ -43,6 +44,7 @@ class EvalMetadata(BaseModel): agent_class: str llm_config: LLMConfig + agent_config: AgentConfig | None = None max_iterations: int eval_output_dir: str start_time: str @@ -167,6 +169,7 @@ def make_metadata( eval_output_dir: str, data_split: str | None = None, details: dict[str, Any] | None = None, + agent_config: AgentConfig | None = None, condenser_config: CondenserConfig | None = None, ) -> EvalMetadata: model_name = llm_config.model.split('/')[-1] @@ -189,6 +192,7 @@ def make_metadata( metadata = EvalMetadata( agent_class=agent_class, llm_config=llm_config, + agent_config=agent_config, max_iterations=max_iterations, eval_output_dir=eval_output_path, start_time=time.strftime('%Y-%m-%d %H:%M:%S'), diff --git a/openhands/core/config/__init__.py b/openhands/core/config/__init__.py index 2e0f87e32143..d653f3e70ac4 100644 --- a/openhands/core/config/__init__.py +++ b/openhands/core/config/__init__.py @@ -10,6 +10,7 @@ from openhands.core.config.security_config import SecurityConfig from openhands.core.config.utils import ( finalize_config, + get_agent_config_arg, get_llm_config_arg, get_parser, load_app_config, @@ -31,6 +32,7 @@ 'load_from_env', 'load_from_toml', 'finalize_config', + 'get_agent_config_arg', 'get_llm_config_arg', 'get_field_info', 'get_parser', diff --git a/openhands/core/config/utils.py 
b/openhands/core/config/utils.py index b6bf4b77abad..e0b1ee71adc3 100644 --- a/openhands/core/config/utils.py +++ b/openhands/core/config/utils.py @@ -298,7 +298,59 @@ def finalize_config(cfg: AppConfig): ) -# Utility function for command line --group argument +def get_agent_config_arg( + agent_config_arg: str, toml_file: str = 'config.toml' +) -> AgentConfig | None: + """Get a group of agent settings from the config file. + + A group in config.toml can look like this: + + ``` + [agent.default] + enable_prompt_extensions = false + ``` + + The user-defined group name, like "default", is the argument to this function. The function will load the AgentConfig object + with the settings of this group, from the config file, and return it to the caller. + + Note that the group must be under "agent" group, or in other words, the group name must start with "agent.". + + Args: + agent_config_arg: The group of agent settings to get from the config.toml file. + toml_file: Path to the configuration file to read from. Defaults to 'config.toml'. + + Returns: + AgentConfig: The AgentConfig object with the settings from the config file. + """ + # keep only the name, just in case + agent_config_arg = agent_config_arg.strip('[]') + + # truncate the prefix, just in case + if agent_config_arg.startswith('agent.'): + agent_config_arg = agent_config_arg[6:] + + logger.openhands_logger.debug(f'Loading agent config from {agent_config_arg}') + + # load the toml file + try: + with open(toml_file, 'r', encoding='utf-8') as toml_contents: + toml_config = toml.load(toml_contents) + except FileNotFoundError as e: + logger.openhands_logger.error(f'Config file not found: {e}') + return None + except toml.TomlDecodeError as e: + logger.openhands_logger.error( + f'Cannot parse agent group from {agent_config_arg}. 
Exception: {e}' + ) + return None + + # update the agent config with the specified section + if 'agent' in toml_config and agent_config_arg in toml_config['agent']: + return AgentConfig(**toml_config['agent'][agent_config_arg]) + logger.openhands_logger.debug(f'Loading from toml failed for {agent_config_arg}') + return None + + def get_llm_config_arg( llm_config_arg: str, toml_file: str = 'config.toml' ) -> LLMConfig | None: @@ -443,6 +495,12 @@ def get_parser() -> argparse.ArgumentParser: type=str, help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml', ) + parser.add_argument( + '--agent-config', + default=None, + type=str, + help='Replace default Agent ([agent] section in config.toml) config with the specified Agent config, e.g. "CodeAct" for [agent.CodeAct] section in config.toml', + ) parser.add_argument( '-n', '--name', diff --git a/tests/unit/test_arg_parser.py b/tests/unit/test_arg_parser.py index 51c736f19c05..b71cd5e2c18b 100644 --- a/tests/unit/test_arg_parser.py +++ b/tests/unit/test_arg_parser.py @@ -128,6 +128,7 @@ def test_help_message(capsys): '--eval-note EVAL_NOTE', '--eval-ids EVAL_IDS', '-l LLM_CONFIG, --llm-config LLM_CONFIG', + '--agent-config AGENT_CONFIG', '-n NAME, --name NAME', '--config-file CONFIG_FILE', '--no-auto-continue', @@ -137,4 +138,4 @@ def test_help_message(capsys): assert element in help_output, f"Expected '{element}' to be in the help message" option_count = help_output.count(' -') - assert option_count == 17, f'Expected 17 options, found {option_count}' + assert option_count == 18, f'Expected 18 options, found {option_count}' diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 5edfd64cda90..7aab02c0e019 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -9,6 +9,7 @@ AppConfig, LLMConfig, finalize_config, + get_agent_config_arg, get_llm_config_arg, load_from_env, load_from_toml, @@ -781,3 
+782,32 @@ def test_get_agent_configs(default_config, temp_toml_file): assert codeact_config.memory_enabled is True browsing_config = default_config.get_agent_configs().get('BrowsingAgent') assert browsing_config.memory_max_threads == 10 + + +def test_get_agent_config_arg(temp_toml_file): + temp_toml = """ +[core] +max_iterations = 100 +max_budget_per_task = 4.0 + +[agent.CodeActAgent] +memory_enabled = true +enable_prompt_extensions = false + +[agent.BrowsingAgent] +memory_enabled = false +enable_prompt_extensions = true +memory_max_threads = 10 +""" + + with open(temp_toml_file, 'w') as f: + f.write(temp_toml) + + agent_config = get_agent_config_arg('CodeActAgent', temp_toml_file) + assert agent_config.memory_enabled + assert not agent_config.enable_prompt_extensions + + agent_config2 = get_agent_config_arg('BrowsingAgent', temp_toml_file) + assert not agent_config2.memory_enabled + assert agent_config2.enable_prompt_extensions + assert agent_config2.memory_max_threads == 10