--tuning_search_space <path_to_tuning_search_space_json>
"""

+import datetime
import json
import os
import struct
+import subprocess
import time

from absl import app
@@ -26,9 +28,11 @@
    'docker_image_url',
    'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
    'URL to docker image')
-flags.DEFINE_integer('run_percentage',
-                     100,
-                     'Percentage of max num steps to run for.')
+flags.DEFINE_integer(
+    'run_percentage',
+    100,
+    'Percentage of max num steps to run for. '
+    'Must set the flag enable_step_budget to True for this to take effect.')
flags.DEFINE_string('experiment_name',
                    'my_experiment',
                    'Name of top sub directory in experiment dir.')
@@ -83,10 +87,24 @@
    'If your algorithm has a smaller per step time than our baselines '
    'you may want to increase the number of steps per workload.')
flags.DEFINE_string(
-    'workload',
+    'workloads',
    None,
+    'String representing a comma separated list of workload names. '
    'If not None, only run this workload, else run all workloads in workload_metadata_path.'
)
+flags.DEFINE_string('additional_requirements_path',
+                    None,
+                    'Path to requirements.txt if any.')
+flags.DEFINE_integer(
+    'max_steps',
+    None,
+    'Maximum number of steps to run. Must set flag enable_step_budget. '
+    'This flag takes precedence over the run_percentage flag.')
+flags.DEFINE_bool(
+    'enable_step_budget',
+    False,
+    'Flag that has to be explicitly set to override time budgets to step budget percentage.'
+)

FLAGS = flags.FLAGS

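Taken together, the three new flags above form a step-budget override: nothing changes unless --enable_step_budget is set, and --max_steps wins over --run_percentage. A condensed sketch of that interaction (not part of the diff; it mirrors the logic added to main() further down, with an assumed helper name and an example workload budget):

def resolve_max_steps_flag(enable_step_budget, max_steps, run_percentage,
                           workload_max_steps):
  # Sketch only: names are illustrative, logic mirrors the diff below.
  if not enable_step_budget:
    return ''  # time/step budgets stay untouched
  if max_steps is not None:
    return f'-m {max_steps}'  # explicit --max_steps takes precedence
  return f'-m {int(workload_max_steps * run_percentage / 100.)}'

# e.g. resolve_max_steps_flag(True, None, 50, 10000) == '-m 5000'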
@@ -106,15 +124,40 @@ def container_running():
    return True


+def kill_containers():
+  docker_client = docker.from_env()
+  containers = docker_client.containers.list()
+  for container in containers:
+    container.kill()
+
+
+def gpu_is_active():
+  output = subprocess.check_output([
+      'nvidia-smi',
+      '--query-gpu=utilization.gpu',
+      '--format=csv,noheader,nounits'
+  ])
+  return any(int(x) > 0 for x in output.decode().splitlines())
+
+
def wait_until_container_not_running(sleep_interval=5 * 60):
+  # Check GPU utilization; if the GPUs have been idle for more than
+  # 45 minutes, kill the container.
+  gpu_last_active = datetime.datetime.now().timestamp()
+
  while container_running():
+    # Check if GPUs have been inactive > 45 min and if so terminate container.
+    if gpu_is_active():
+      gpu_last_active = datetime.datetime.now().timestamp()
+    if (datetime.datetime.now().timestamp() - gpu_last_active) > 45 * 60:
+      print('Killing container: GPUs have been inactive > 45 minutes...')
+      kill_containers()
    time.sleep(sleep_interval)
  return


def main(_):
  framework = FLAGS.framework
-  run_fraction = FLAGS.run_percentage / 100.
  experiment_name = FLAGS.experiment_name
  docker_image_url = FLAGS.docker_image_url
  submission_path = FLAGS.submission_path
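The watchdog added in this hunk polls GPU utilization via nvidia-smi and kills all running containers once the GPUs have sat idle for 45 minutes while a container is still marked as running. A minimal self-contained sketch of the same pattern (poll interval and idle threshold copied from the diff; the function names here are illustrative, not the script's own):

import datetime
import subprocess
import time


def gpu_utilizations():
  # Per-GPU utilization percentages reported by nvidia-smi.
  output = subprocess.check_output([
      'nvidia-smi',
      '--query-gpu=utilization.gpu',
      '--format=csv,noheader,nounits'
  ])
  return [int(x) for x in output.decode().splitlines()]


def idle_watchdog(poll_interval=5 * 60, idle_limit=45 * 60):
  # Track the last timestamp at which any GPU reported non-zero utilization.
  last_active = datetime.datetime.now().timestamp()
  while True:
    if any(u > 0 for u in gpu_utilizations()):
      last_active = datetime.datetime.now().timestamp()
    if datetime.datetime.now().timestamp() - last_active > idle_limit:
      print('GPUs inactive > 45 minutes, stopping containers...')
      break  # the script calls kill_containers() here instead
    time.sleep(poll_interval)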
@@ -132,7 +175,13 @@ def main(_):
    study_end_index = FLAGS.study_end_index
  else:
    study_end_index = num_studies - 1
+
+  additional_requirements_path_flag = ''
+  if FLAGS.additional_requirements_path:
+    additional_requirements_path_flag = f'--additional_requirements_path {FLAGS.additional_requirements_path} '
+
  submission_id = FLAGS.submission_id
+
  rng_seed = FLAGS.seed

  if not rng_seed:
@@ -144,17 +193,22 @@ def main(_):
  with open(FLAGS.workload_metadata_path) as f:
    workload_metadata = json.load(f)

+  # Get list of all possible workloads
  workloads = [w for w in workload_metadata.keys()]

-  # Read held-out workloads
+  # Read heldout workloads
  if FLAGS.held_out_workloads_config_path:
    held_out_workloads = read_held_out_workloads(
        FLAGS.held_out_workloads_config_path)
    workloads = workloads + held_out_workloads

-  # Filter for single workload
-  if FLAGS.workload and (FLAGS.workload in workloads):
-    workloads = [FLAGS.workload]
+  # Filter workloads if explicit workloads specified
+  if FLAGS.workloads is not None:
+    workloads = list(
+        filter(lambda x: x in FLAGS.workloads.split(','), workloads))
+    if len(workloads) != len(FLAGS.workloads.split(',')):
+      unmatched_workloads = set(FLAGS.workloads.split(',')) - set(workloads)
+      raise ValueError(f'Invalid workload name {unmatched_workloads}')

  rng_subkeys = prng.split(rng_key, num_studies)

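The new --workloads filter keeps only the requested names and raises on anything it cannot match, rather than silently running everything. A toy illustration of the same filtering logic (the workload names here are only examples):

available = ['mnist', 'ogbg', 'wmt']
requested = 'ogbg,cifar'.split(',')

matched = list(filter(lambda w: w in requested, available))
if len(matched) != len(requested):
  unmatched = set(requested) - set(matched)
  raise ValueError(f'Invalid workload name {unmatched}')
# -> ValueError: Invalid workload name {'cifar'}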
@@ -174,14 +228,22 @@ def main(_):
          "sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'")  # clear caches
      print('=' * 100)
      dataset = workload_metadata[base_workload_name]['dataset']
-      max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
-                      run_fraction)
+      max_steps_flag = ''
+      if FLAGS.enable_step_budget:
+        run_fraction = FLAGS.run_percentage / 100.
+        if FLAGS.max_steps is None:
+          max_steps = int(workload_metadata[base_workload_name]['max_steps'] *
+                          run_fraction)
+        else:
+          max_steps = FLAGS.max_steps
+        max_steps_flag = f'-m {max_steps}'
+
      mount_repo_flag = ''
      if FLAGS.local:
-        mount_repo_flag = '-v $HOME/algorithmic-efficiency:/algorithmic-efficiency '
-      command = ('docker run -t -d -v $HOME/data/:/data/ '
-                 '-v $HOME/experiment_runs/:/experiment_runs '
-                 '-v $HOME/experiment_runs/logs:/logs '
+        mount_repo_flag = '-v /home/kasimbeg/algorithmic-efficiency:/algorithmic-efficiency '
+      command = ('docker run -t -d -v /home/kasimbeg/data/:/data/ '
+                 '-v /home/kasimbeg/experiment_runs/:/experiment_runs '
+                 '-v /home/kasimbeg/experiment_runs/logs:/logs '
                 f'{mount_repo_flag}'
                 '--gpus all --ipc=host '
                 f'{docker_image_url} '
@@ -190,9 +252,10 @@ def main(_):
                 f'-s {submission_path} '
                 f'-w {workload} '
                 f'-e {study_dir} '
-                 f'-m {max_steps} '
+                 f'{max_steps_flag} '
                 f'--num_tuning_trials {num_tuning_trials} '
                 f'--rng_seed {run_seed} '
+                 f'{additional_requirements_path_flag}'
                 '-c false '
                 '-o true '
                 '-i true ')
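Because max_steps_flag and additional_requirements_path_flag default to empty strings, the optional pieces simply drop out of the assembled docker command when their flags are unused. A rough sketch of the tail of the command string under an assumed 5000-step budget and no extra requirements file (submission path and workload name are placeholders):

max_steps_flag = '-m 5000'
additional_requirements_path_flag = ''
command_tail = (f'-s submission.py '
                f'-w mnist '
                f'{max_steps_flag} '
                f'--num_tuning_trials 1 '
                f'--rng_seed 0 '
                f'{additional_requirements_path_flag}'
                '-c false '
                '-o true '
                '-i true ')
# -> '-s submission.py -w mnist -m 5000 --num_tuning_trials 1 --rng_seed 0 -c false -o true -i true '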
@@ -235,4 +298,4 @@ def main(_):

if __name__ == '__main__':
  flags.mark_flag_as_required('workload_metadata_path')
-  app.run(main)
+  app.run(main)