Refactor SandboxConfig to use get_default_sandbox_config_for_eval

All-Hands-AI committed on Feb 13, 2025
commit fcba1af (1 parent: 8242721)
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 37 deletions.
diff --git a/evaluation/benchmarks/bird/run_infer.py b/evaluation/benchmarks/bird/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     parse_arguments,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -76,12 +77,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

diff --git a/evaluation/benchmarks/commit0_bench/run_infer.py b/evaluation/benchmarks/commit0_bench/run_infer.py
@@ -28,6 +28,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -105,38 +106,29 @@ def get_config(
     instance: pd.Series,
     metadata: EvalMetadata,
 ) -> AppConfig:
-    # COMMIT0_CONTAINER_IMAGE = 'wentingzhao/'
     assert USE_INSTANCE_IMAGE
-    # We use a different instance image for the each instance of commit0 eval
     repo_name = instance['repo'].split('/')[1]
     base_container_image = get_instance_docker_image(repo_name)
     logger.info(
         f'Using instance container image: {base_container_image}. '
         f'Please make sure this image exists. '
         f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
     )
-    # else:
-    #     raise
-    # base_container_image = SWE_BENCH_CONTAINER_IMAGE
-    # logger.info(f'Using swe-bench container image: {base_container_image}')
+
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = base_container_image
+    sandbox_config.timeout = 300  # large enough timeout, since some testcases take very long to run
+    sandbox_config.api_key = os.environ.get('ALLHANDS_API_KEY', None)
+    sandbox_config.remote_runtime_api_url = os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL')
+    sandbox_config.keep_runtime_alive = False
+    sandbox_config.remote_runtime_init_timeout = 3600
 
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         max_iterations=metadata.max_iterations,
         runtime=os.environ.get('RUNTIME', 'docker'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            enable_auto_lint=True,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=300,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
-            keep_runtime_alive=False,
-            remote_runtime_init_timeout=3600,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

diff --git a/evaluation/benchmarks/mint/run_infer.py b/evaluation/benchmarks/mint/run_infer.py
@@ -25,6 +25,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -103,18 +104,16 @@ def load_incontext_example(task_name: str, with_tool: bool = True):
 def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
+    sandbox_config = get_default_sandbox_config_for_eval()
+    sandbox_config.base_container_image = 'xingyaoww/od-eval-mint:v1.0'
+    sandbox_config.runtime_extra_deps = f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}'
+
     config = AppConfig(
         default_agent=metadata.agent_class,
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='xingyaoww/od-eval-mint:v1.0',
-            enable_auto_lint=True,
-            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=sandbox_config,
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,

diff --git a/evaluation/benchmarks/toolqa/run_infer.py b/evaluation/benchmarks/toolqa/run_infer.py
@@ -21,6 +21,7 @@
     SandboxConfig,
     get_llm_config_arg,
     get_parser,
+    get_default_sandbox_config_for_eval,
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
@@ -46,12 +47,7 @@ def get_config(
         run_as_openhands=False,
         runtime='docker',
         max_iterations=metadata.max_iterations,
-        sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
-            enable_auto_lint=True,
-            use_host_network=False,
-            remote_runtime_enable_retries=True,
-        ),
+        sandbox=get_default_sandbox_config_for_eval(),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,