Skip to content

Commit c2aa9e1

Browse files
Merge pull request #800 from mlcommons/dev
Dev -> main
2 parents ce1003e + 5ce9e5a commit c2aa9e1

File tree

5 files changed

+96
-412
lines changed

5 files changed

+96
-412
lines changed

docker/scripts/startup.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ while [ "$1" != "" ]; do
132132
shift
133133
TEST=$1
134134
;;
135+
--additional_requirements_path)
136+
shift
137+
ADDITIONAL_REQUIREMENTS_PATH=$1
138+
;;
135139
*)
136140
usage
137141
exit 1
@@ -140,6 +144,16 @@ while [ "$1" != "" ]; do
140144
shift
141145
done
142146

147+
148+
# Optionally install additional dependencies
149+
if [[ -n ${ADDITIONAL_REQUIREMENTS_PATH+x} ]]; then
150+
echo "Installing additional requirements..."
151+
COMMAND="cd algorithmic-efficiency && pip install -r ${ADDITIONAL_REQUIREMENTS_PATH}"
152+
echo $COMMAND
153+
eval $COMMAND
154+
fi
155+
156+
143157
if [[ ${TEST} == "true" ]]; then
144158
cd algorithmic-efficiency
145159
COMMAND="python3 tests/test_traindiffs.py"

scoring/run_workloads.py

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99
--tuning_search_space <path_to_tuning_search_space_json>
1010
"""
1111

12+
import datetime
1213
import json
1314
import os
1415
import struct
16+
import subprocess
1517
import time
1618

1719
from absl import app
@@ -26,9 +28,11 @@
2628
'docker_image_url',
2729
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
2830
'URL to docker image')
29-
flags.DEFINE_integer('run_percentage',
30-
100,
31-
'Percentage of max num steps to run for.')
31+
flags.DEFINE_integer(
32+
'run_percentage',
33+
100,
34+
'Percentage of max num steps to run for.'
35+
'Must set the flag enable_step_budget to True for this to take effect.')
3236
flags.DEFINE_string('experiment_name',
3337
'my_experiment',
3438
'Name of top sub directory in experiment dir.')
@@ -83,10 +87,24 @@
8387
'If your algorithm has a smaller per step time than our baselines '
8488
'you may want to increase the number of steps per workload.')
8589
flags.DEFINE_string(
86-
'workload',
90+
'workloads',
8791
None,
92+
'String representing a comma separated list of workload names.'
8893
'If not None, only run this workload, else run all workloads in workload_metadata_path.'
8994
)
95+
flags.DEFINE_string('additional_requirements_path',
96+
None,
97+
'Path to requirements.txt if any.')
98+
flags.DEFINE_integer(
99+
'max_steps',
100+
None,
101+
'Maximum number of steps to run. Must set flag enable_step_budget.'
102+
'This flag takes precedence over the run_percentage flag.')
103+
flags.DEFINE_bool(
104+
'enable_step_budget',
105+
False,
106+
'Flag that has to be explicitly set to override time budgets to step budget percentage.'
107+
)
90108

91109
FLAGS = flags.FLAGS
92110

@@ -106,15 +124,40 @@ def container_running():
106124
return True
107125

108126

127+
def kill_containers():
128+
docker_client = docker.from_env()
129+
containers = docker_client.containers.list()
130+
for container in containers:
131+
container.kill()
132+
133+
134+
def gpu_is_active():
135+
output = subprocess.check_output([
136+
'nvidia-smi',
137+
'--query-gpu=utilization.gpu',
138+
'--format=csv,noheader,nounits'
139+
])
140+
return any(int(x) > 0 for x in output.decode().splitlines())
141+
142+
109143
def wait_until_container_not_running(sleep_interval=5 * 60):
144+
# check gpu util
145+
# if the GPU has not been utilized for 45 minutes, kill the container
146+
gpu_last_active = datetime.datetime.now().timestamp()
147+
110148
while container_running():
149+
# check if gpus have been inactive > 45 min and if so terminate container
150+
if gpu_is_active():
151+
gpu_last_active = datetime.datetime.now().timestamp()
152+
if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60:
153+
kill_containers(
154+
"Killing container: GPUs have been inactive > 45 minutes...")
111155
time.sleep(sleep_interval)
112156
return
113157

114158

115159
def main(_):
116160
framework = FLAGS.framework
117-
run_fraction = FLAGS.run_percentage / 100.
118161
experiment_name = FLAGS.experiment_name
119162
docker_image_url = FLAGS.docker_image_url
120163
submission_path = FLAGS.submission_path
@@ -132,7 +175,13 @@ def main(_):
132175
study_end_index = FLAGS.study_end_index
133176
else:
134177
study_end_index = num_studies - 1
178+
179+
additional_requirements_path_flag = ''
180+
if FLAGS.additional_requirements_path:
181+
additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '
182+
135183
submission_id = FLAGS.submission_id
184+
136185
rng_seed = FLAGS.seed
137186

138187
if not rng_seed:
@@ -144,17 +193,22 @@ def main(_):
144193
with open(FLAGS.workload_metadata_path) as f:
145194
workload_metadata = json.load(f)
146195

196+
# Get list of all possible workloads
147197
workloads = [w for w in workload_metadata.keys()]
148198

149-
# Read held-out workloads
199+
# Read heldout workloads
150200
if FLAGS.held_out_workloads_config_path:
151201
held_out_workloads = read_held_out_workloads(
152202
FLAGS.held_out_workloads_config_path)
153203
workloads = workloads + held_out_workloads
154204

155-
# Filter for single workload
156-
if FLAGS.workload and (FLAGS.workload in workloads):
157-
workloads = [FLAGS.workload]
205+
# Filter workloads if explicit workloads specified
206+
if FLAGS.workloads is not None:
207+
workloads = list(
208+
filter(lambda x: x in FLAGS.workloads.split(','), workloads))
209+
if len(workloads) != len(FLAGS.workloads.split(',')):
210+
unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
211+
raise ValueError(f'Invalid workload name {unmatched_workloads}')
158212

159213
rng_subkeys = prng.split(rng_key, num_studies)
160214

@@ -174,14 +228,22 @@ def main(_):
174228
"sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'") # clear caches
175229
print('=' * 100)
176230
dataset = workload_metadata[base_workload_name]['dataset']
177-
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
178-
run_fraction)
231+
max_steps_flag = ''
232+
if FLAGS.enable_step_budget:
233+
run_fraction = FLAGS.run_percentage / 100.
234+
if FLAGS.max_steps is None:
235+
max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
236+
run_fraction)
237+
else:
238+
max_steps = FLAGS.max_steps
239+
max_steps_flag = f'-m {max_steps}'
240+
179241
mount_repo_flag = ''
180242
if FLAGS.local:
181-
mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
182-
command = ('docker run -t -d -v $HOME/data/:/data/ '
183-
'-v $HOME/experiment_runs/:/experiment_runs '
184-
'-v $HOME/experiment_runs/logs:/logs '
243+
mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
244+
command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
245+
'-v /home/kasimbeg/experiment_runs/:/experiment_runs '
246+
'-v /home/kasimbeg/experiment_runs/logs:/logs '
185247
f'{mount_repo_flag}'
186248
'--gpus all --ipc=host '
187249
f'{docker_image_url} '
@@ -190,9 +252,10 @@ def main(_):
190252
f'-s {submission_path} '
191253
f'-w {workload} '
192254
f'-e {study_dir} '
193-
f'-m {max_steps} '
255+
f'{max_steps_flag} '
194256
f'--num_tuning_trials {num_tuning_trials} '
195257
f'--rng_seed {run_seed} '
258+
f'{additional_requirements_path_flag}'
196259
'-c false '
197260
'-o true '
198261
'-i true ')
@@ -235,4 +298,4 @@ def main(_):
235298

236299
if __name__ == '__main__':
237300
flags.mark_flag_as_required('workload_metadata_path')
238-
app.run(main)
301+
app.run(main)

setup.cfg

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ jax_core_deps =
121121
chex==0.1.7
122122
ml_dtypes==0.2.0
123123
protobuf==4.25.3
124+
scipy==1.11.4
125+
124126

125127
# JAX CPU
126128
jax_cpu =

0 commit comments

Comments
 (0)