From fcba1af563aeec80bece6fe1e4ce9bdff816218a Mon Sep 17 00:00:00 2001
From: openhands
Date: Thu, 13 Feb 2025 17:20:32 +0000
Subject: [PATCH] Refactor SandboxConfig to use
 get_default_sandbox_config_for_eval

---
 evaluation/benchmarks/bird/run_infer.py       |  8 ++----
 .../benchmarks/commit0_bench/run_infer.py     | 28 +++++++------------
 evaluation/benchmarks/mint/run_infer.py       | 13 ++++-----
 evaluation/benchmarks/toolqa/run_infer.py     |  8 ++----
 4 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 02d92aa3ee3e..55c8f2ffdf31 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -76,12 +77,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index 2e0fc528f7c3..783d623d4e19 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -105,9 +106,7 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
     assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
@@ -115,28 +114,21 @@
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    # else:
-    #     raise
-    # base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    # logger.info(f'Using swe-bench container image: {base_container_image}')
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.timeout = 300  # large enough timeout, since some testcases take very long to run
+    sandbox_config.api_key = os.environ.get('ALLHANDS_API_KEY', None)
+    sandbox_config.remote_runtime_api_url = os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL')
+    sandbox_config.keep_runtime_alive = False
+    sandbox_config.remote_runtime_init_timeout = 3600

     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index 4c356f26d944..14a6988d8756 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -25,6 +25,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -103,18 +104,16 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index 45b9febed27b..cc15c6595c71 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -21,6 +21,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -46,12 +47,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,