Skip to content

Commit

Permalink
feat(runtime): use prlimit to limit resource usage of command to av…
Browse files Browse the repository at this point in the history
…oid OOM Runtime Kill (#6338)

Co-authored-by: openhands <[email protected]>
Co-authored-by: Engel Nyst <[email protected]>
Co-authored-by: Graham Neubig <[email protected]>
  • Loading branch information
4 people authored Feb 11, 2025
1 parent 1a715d2 commit 6a6dc93
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 43 deletions.
19 changes: 19 additions & 0 deletions openhands/runtime/action_execution_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pathlib import Path
from zipfile import ZipFile

import psutil
from fastapi import Depends, FastAPI, HTTPException, Request, UploadFile
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse, StreamingResponse
Expand Down Expand Up @@ -108,6 +109,22 @@ def __init__(
self.last_execution_time = self.start_time
self._initialized = False

if _override_max_memory_gb := os.environ.get('RUNTIME_MAX_MEMORY_GB', None):
self.max_memory_gb = int(_override_max_memory_gb)
logger.info(
f'Setting max memory to {self.max_memory_gb}GB (according to the RUNTIME_MAX_MEMORY_GB environment variable)'
)
else:
# Get available system memory
total_memory_gb = psutil.virtual_memory().total / (
1024 * 1024 * 1024
) # Convert to GB
self.max_memory_gb = int(max(0.5, total_memory_gb - 1.0))
# Reserve 1GB as head room, minimum of 0.5GB
logger.info(
f'Total memory: {total_memory_gb}GB, setting limit to {self.max_memory_gb}GB (reserved 1GB for action execution server, minimum 0.5GB)'
)

@property
def initial_cwd(self):
return self._initial_cwd
Expand All @@ -120,8 +137,10 @@ async def ainit(self):
no_change_timeout_seconds=int(
os.environ.get('NO_CHANGE_TIMEOUT_SECONDS', 30)
),
max_memory_mb=self.max_memory_gb * 1024,
)
self.bash_session.initialize()

await wait_all(
(self._init_plugin(plugin) for plugin in self.plugins_to_load),
timeout=30,
Expand Down
10 changes: 7 additions & 3 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,17 @@ def _start_runtime(self):
plugins=self.plugins,
app_config=self.config,
)
# Build the environment passed to the remote runtime at startup.
# 'DEBUG' is only included when debugging is requested, either through the
# app config or through the DEBUG environment variable of this process.
# (A conditional expression placed inside the dict literal would apply to
# the value only, sending 'DEBUG': {} when debugging is off.)
environment = {}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true':
    environment['DEBUG'] = 'true'
# User-configured startup variables are applied last, so they take
# precedence over the defaults set above.
environment.update(self.config.sandbox.runtime_startup_env_vars)
start_request = {
'image': self.container_image,
'command': command,
'working_dir': '/openhands/code/',
'environment': {'DEBUG': 'true'}
if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
else {},
'environment': environment,
'session_id': self.sid,
'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
}
Expand Down
16 changes: 12 additions & 4 deletions openhands/runtime/utils/bash.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,25 +175,32 @@ def __init__(
work_dir: str,
username: str | None = None,
no_change_timeout_seconds: int = 30,
max_memory_mb: int | None = None,
):
self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds
self.work_dir = work_dir
self.username = username
self._initialized = False
self.max_memory_mb = max_memory_mb

def initialize(self):
self.server = libtmux.Server()
window_command = '/bin/bash'
_shell_command = '/bin/bash'
if self.username in ['root', 'openhands']:
# This starts a non-login (new) shell for the given user
window_command = f'su {self.username} -'
_shell_command = f'su {self.username} -'
# otherwise, we are running as the CURRENT USER (e.g., when running LocalRuntime)
if self.max_memory_mb is not None:
window_command = (
f'prlimit --as={self.max_memory_mb * 1024 * 1024} {_shell_command}'
)
else:
window_command = _shell_command

logger.debug(f'Initializing bash session with command: {window_command}')
session_name = f'openhands-{self.username}-{uuid.uuid4()}'
self.session = self.server.new_session(
session_name=session_name,
window_name='bash',
window_command=window_command,
start_directory=self.work_dir,
kill_session=True,
x=1000,
Expand All @@ -207,6 +214,7 @@ def initialize(self):
# We need to create a new pane because the initial pane's history limit is (default) 2000
_initial_window = self.session.attached_window
self.window = self.session.new_window(
window_name='bash',
window_shell=window_command,
start_directory=self.work_dir,
)
Expand Down
113 changes: 113 additions & 0 deletions tests/runtime/test_runtime_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Stress tests for the DockerRuntime, which connects to the ActionExecutor running in the sandbox."""

from conftest import _close_test_runtime, _load_runtime

from openhands.core.logger import openhands_logger as logger
from openhands.events.action import CmdRunAction


def test_stress_docker_runtime(temp_dir, runtime_cls, repeat=1):
    """Run the full stress-ng suite inside a CPU- and memory-capped runtime.

    The runtime is loaded with a one-CPU quota and a 4G memory limit,
    stress-ng is installed, and ``stress-ng --all 1 -t 30s`` is executed
    ``repeat`` times. Only the installation step's exit code is asserted;
    the stress runs themselves are logged but not checked.

    Args:
        temp_dir: Passed through to ``_load_runtime``.
        runtime_cls: Passed through to ``_load_runtime``.
        repeat: Number of times the stress-ng command is run.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
        },
    )

    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        for _ in range(repeat):
            # Run every stress-ng stressor for 30 seconds; the hard timeout
            # of 120 seconds leaves head room for start-up and tear-down.
            action = CmdRunAction(command='stress-ng --all 1 -t 30s')
            action.set_hard_timeout(120)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    finally:
        # Close the runtime even when an assertion above fails, so a failing
        # test does not leave the runtime behind.
        _close_test_runtime(runtime)


def test_stress_docker_runtime_hit_memory_limits(temp_dir, runtime_cls):
    """Check that a command exceeding the runtime memory limit is aborted.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=3`` and a 4G
    container memory limit with swap disabled. A stress-ng VM stressor then
    requests 6G, and the test expects stress-ng to report that it aborted
    for lack of system resources and to exit with code 3.

    Args:
        temp_dir: Passed through to ``_load_runtime``.
        runtime_cls: Passed through to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '3',
        },
    )

    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Request twice the configured 3GB limit so the allocation must fail.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert 'aborted early, out of system resources' in obs.content
        assert obs.exit_code == 3  # OOM killed!
    finally:
        # Close the runtime even when an assertion above fails, so a failing
        # test does not leave the runtime behind.
        _close_test_runtime(runtime)


def test_stress_docker_runtime_within_memory_limits(temp_dir, runtime_cls):
    """Check that a command below the runtime memory limit runs to completion.

    The runtime is started with ``RUNTIME_MAX_MEMORY_GB=7``, which is above
    the 6G requested by the stress-ng VM stressor, and the test expects the
    command to exit with code 0.

    NOTE(review): the 6G request is larger than the 4G container
    ``mem_limit`` configured here; confirm this test is meant to exercise
    only the RUNTIME_MAX_MEMORY_GB limit.

    Args:
        temp_dir: Passed through to ``_load_runtime``.
        runtime_cls: Passed through to ``_load_runtime``.
    """
    runtime, _config = _load_runtime(
        temp_dir,
        runtime_cls,
        docker_runtime_kwargs={
            'cpu_period': 100000,  # 100ms
            'cpu_quota': 100000,  # Can use 100ms out of each 100ms period (1 CPU)
            'mem_limit': '4G',  # 4 GB of memory
            'memswap_limit': '0',  # No swap
            'mem_swappiness': 0,  # Disable swapping
            'oom_kill_disable': False,  # Enable OOM killer
        },
        runtime_startup_env_vars={
            'RUNTIME_MAX_MEMORY_GB': '7',
        },
    )

    try:
        action = CmdRunAction(
            command='sudo apt-get update && sudo apt-get install -y stress-ng'
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # The 6G request stays under the configured 7GB limit.
        action = CmdRunAction(
            command='stress-ng --vm 1 --vm-bytes 6G --timeout 30s --metrics'
        )
        action.set_hard_timeout(120)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
    finally:
        # Close the runtime even when an assertion above fails, so a failing
        # test does not leave the runtime behind.
        _close_test_runtime(runtime)
36 changes: 0 additions & 36 deletions tests/runtime/test_stress_docker_runtime.py

This file was deleted.

0 comments on commit 6a6dc93

Please sign in to comment.