From fcba1af563aeec80bece6fe1e4ce9bdff816218a Mon Sep 17 00:00:00 2001
From: openhands
Date: Thu, 13 Feb 2025 17:20:32 +0000
Subject: [PATCH] Refactor SandboxConfig to use
 get_default_sandbox_config_for_eval

---
 evaluation/benchmarks/bird/run_infer.py       |  8 ++----
 .../benchmarks/commit0_bench/run_infer.py     | 28 +++++++------------
 evaluation/benchmarks/mint/run_infer.py       | 13 ++++-----
 evaluation/benchmarks/toolqa/run_infer.py     |  8 ++----
 4 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
index 02d92aa3ee3e..55c8f2ffdf31 100644
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -76,12 +77,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
index 2e0fc528f7c3..783d623d4e19 100644
--- a/evaluation/benchmarks/commit0_bench/run_infer.py
+++ b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -105,9 +106,7 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
     assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
@@ -115,28 +114,21 @@
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    # else:
-    #     raise
-    # base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    # logger.info(f'Using swe-bench container image: {base_container_image}')
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.timeout = 300  # large enough timeout, since some testcases take very long to run
+    sandbox_config.api_key = os.environ.get('ALLHANDS_API_KEY', None)
+    sandbox_config.remote_runtime_api_url = os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL')
+    sandbox_config.keep_runtime_alive = False
+    sandbox_config.remote_runtime_init_timeout = 3600

     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
index 4c356f26d944..14a6988d8756 100644
--- a/evaluation/benchmarks/mint/run_infer.py
+++ b/evaluation/benchmarks/mint/run_infer.py
@@ -25,6 +25,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -103,18 +104,16 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
index 45b9febed27b..cc15c6595c71 100644
--- a/evaluation/benchmarks/toolqa/run_infer.py
+++ b/evaluation/benchmarks/toolqa/run_infer.py
@@ -21,6 +21,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -46,12 +47,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,