From 6a6dc93e0379bfdf96096fce06b21a8e51871aec Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 10 Feb 2025 22:21:11 -0500 Subject: [PATCH] feat(runtime): use `prlimit` to limit resource usage of command to avoid OOM Runtime Kill (#6338) Co-authored-by: openhands Co-authored-by: Engel Nyst Co-authored-by: Graham Neubig --- openhands/runtime/action_execution_server.py | 19 +++ .../runtime/impl/remote/remote_runtime.py | 10 +- openhands/runtime/utils/bash.py | 16 ++- tests/runtime/test_runtime_resource.py | 113 ++++++++++++++++++ tests/runtime/test_stress_docker_runtime.py | 36 ------ 5 files changed, 151 insertions(+), 43 deletions(-) create mode 100644 tests/runtime/test_runtime_resource.py delete mode 100644 tests/runtime/test_stress_docker_runtime.py diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py index 2148ab2267d1..f2d12196fede 100644 --- a/openhands/runtime/action_execution_server.py +++ b/openhands/runtime/action_execution_server.py @@ -21,6 +21,7 @@ from pathlib import Path from zipfile import ZipFile +import psutil from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse, StreamingResponse @@ -108,6 +109,22 @@ def __init__( self.last_execution_time = self.start_time self._initialized = False + if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None): + self.max_memory_gb = int(_override_max_memory_gb) + logger.info( + f'Setting max memory to {self.max_memory_gb}GB (according to the RUNTIME_MAX_MEMORY_GB environment variable)' + ) + else: + # Get available system memory + total_memory_gb = psutil.virtual_memory().total / ( + 1024 * 1024 * 1024 + ) # Convert to GB + self.max_memory_gb = max(0.5, int(total_memory_gb - 1.0)) + # Reserve 1GB as head room; clamp AFTER truncating so the 0.5GB minimum is not rounded down to 0 + logger.info( + f'Total memory: {total_memory_gb}GB, setting limit to {self.max_memory_gb}GB 
(reserved 1GB for action execution server, minimum 0.5GB)' + ) + @property def initial_cwd(self): return self._initial_cwd @@ -120,8 +137,10 @@ async def ainit(self): no_change_timeout_seconds=int( os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30) ), + max_memory_mb=int(self.max_memory_gb * 1024), # max_memory_gb may be 0.5; keep the MB value a whole number ) self.bash_session.initialize() + await wait_all( (self._init_plugin(plugin) for plugin in self.plugins_to_load), timeout=30, diff --git a/openhands/runtime/impl/remote/remote_runtime.py b/openhands/runtime/impl/remote/remote_runtime.py index cb10b2c15b78..f33acedbf4ad 100644 --- a/openhands/runtime/impl/remote/remote_runtime.py +++ b/openhands/runtime/impl/remote/remote_runtime.py @@ -212,13 +212,17 @@ def _start_runtime(self): plugins=self.plugins, app_config=self.config, ) + environment = ( + {'DEBUG': 'true'} # the whole dict is conditional: DEBUG is omitted, never set to {} + if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true' + else {} + ) + environment.update(self.config.sandbox.runtime_startup_env_vars) start_request = { 'image': self.container_image, 'command': command, 'working_dir': '/openhands/code/', - 'environment': {'DEBUG': 'true'} - if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true' - else {}, + 'environment': environment, 'session_id': self.sid, 'resource_factor': self.config.sandbox.remote_runtime_resource_factor, } diff --git a/openhands/runtime/utils/bash.py b/openhands/runtime/utils/bash.py index 5fda883d4d01..419573d7546d 100644 --- a/openhands/runtime/utils/bash.py +++ b/openhands/runtime/utils/bash.py @@ -175,25 +175,32 @@ def __init__( work_dir: str, username: str | None = None, no_change_timeout_seconds: int = 30, + max_memory_mb: int | None = None, ): self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds self.work_dir = work_dir self.username = username self._initialized = False + self.max_memory_mb = max_memory_mb def initialize(self): self.server = libtmux.Server() - window_command = '/bin/bash' + _shell_command = '/bin/bash' if self.username in ['root', 
'openhands']: # This starts a non-login (new) shell for the given user - window_command = f'su {self.username} -' + _shell_command = f'su {self.username} -' # otherwise, we are running as the CURRENT USER (e.g., when running LocalRuntime) + if self.max_memory_mb is not None: + window_command = ( + f'prlimit --as={self.max_memory_mb * 1024 * 1024} {_shell_command}' + ) + else: + window_command = _shell_command + logger.debug(f'Initializing bash session with command: {window_command}') session_name = f'openhands-{self.username}-{uuid.uuid4()}' self.session = self.server.new_session( session_name=session_name, - window_name='bash', - window_command=window_command, start_directory=self.work_dir, kill_session=True, x=1000, @@ -207,6 +214,7 @@ def initialize(self): # We need to create a new pane because the initial pane's history limit is (default) 2000 _initial_window = self.session.attached_window self.window = self.session.new_window( + window_name='bash', window_shell=window_command, start_directory=self.work_dir, ) diff --git a/tests/runtime/test_runtime_resource.py b/tests/runtime/test_runtime_resource.py new file mode 100644 index 000000000000..2873939f132d --- /dev/null +++ b/tests/runtime/test_runtime_resource.py @@ -0,0 +1,113 @@ +"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" + +from conftest import _close_test_runtime, _load_runtime + +from openhands.core.logger import openhands_logger as logger +from openhands.events.action import CmdRunAction + + +def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1): + runtime, config = _load_runtime( + temp_dir, + runtime_cls, + docker_runtime_kwargs={ + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + }, + ) + + action = CmdRunAction( + command='sudo apt-get update && sudo apt-get install -y stress-ng' + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = 
runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + for _ in range(repeat): + # run stress-ng stress tests for 1 minute + action = CmdRunAction(command='stress-ng --all 1 -t 30s') + action.set_hard_timeout(120) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + + _close_test_runtime(runtime) + + +def test_stress_docker_runtime_hit_memory_limits(temp_dir, runtime_cls): + """Test runtime behavior under resource constraints.""" + runtime, config = _load_runtime( + temp_dir, + runtime_cls, + docker_runtime_kwargs={ + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + 'memswap_limit': '0', # No swap + 'mem_swappiness': 0, # Disable swapping + 'oom_kill_disable': False, # Enable OOM killer + }, + runtime_startup_env_vars={ + 'RUNTIME_MAX_MEMORY_GB': '3', + }, + ) + + action = CmdRunAction( + command='sudo apt-get update && sudo apt-get install -y stress-ng' + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + action = CmdRunAction( + command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics' + ) + action.set_hard_timeout(120) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert 'aborted early, out of system resources' in obs.content + assert obs.exit_code == 3 # OOM killed! 
+ + _close_test_runtime(runtime) + + +def test_stress_docker_runtime_within_memory_limits(temp_dir, runtime_cls): + """Test runtime behavior under resource constraints.""" + runtime, config = _load_runtime( + temp_dir, + runtime_cls, + docker_runtime_kwargs={ + 'cpu_period': 100000, # 100ms + 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) + 'mem_limit': '4G', # 4 GB of memory + 'memswap_limit': '0', # No swap + 'mem_swappiness': 0, # Disable swapping + 'oom_kill_disable': False, # Enable OOM killer + }, + runtime_startup_env_vars={ + 'RUNTIME_MAX_MEMORY_GB': '7', + }, + ) + + action = CmdRunAction( + command='sudo apt-get update && sudo apt-get install -y stress-ng' + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + action = CmdRunAction( + command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics' + ) + action.set_hard_timeout(120) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + + _close_test_runtime(runtime) diff --git a/tests/runtime/test_stress_docker_runtime.py b/tests/runtime/test_stress_docker_runtime.py deleted file mode 100644 index b679a0836253..000000000000 --- a/tests/runtime/test_stress_docker_runtime.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox.""" - -from conftest import _close_test_runtime, _load_runtime - -from openhands.core.logger import openhands_logger as logger -from openhands.events.action import CmdRunAction - - -def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1): - runtime, config = _load_runtime( - temp_dir, - runtime_cls, - docker_runtime_kwargs={ - 'cpu_period': 100000, # 100ms - 'cpu_quota': 100000, # Can use 100ms out of each 100ms period (1 CPU) - 'mem_limit': 
'4G', # 4 GB of memory - }, - ) - - action = CmdRunAction( - command='sudo apt-get update && sudo apt-get install -y stress-ng' - ) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - assert obs.exit_code == 0 - - for _ in range(repeat): - # run stress-ng stress tests for 1 minute - action = CmdRunAction(command='stress-ng --all 1 -t 1m') - action.set_hard_timeout(120) - logger.info(action, extra={'msg_type': 'ACTION'}) - obs = runtime.run_action(action) - logger.info(obs, extra={'msg_type': 'OBSERVATION'}) - - _close_test_runtime(runtime)