File tree Expand file tree Collapse file tree 1 file changed +8
-3
lines changed Expand file tree Collapse file tree 1 file changed +8
-3
lines changed Original file line number Diff line number Diff line change @@ -162,13 +162,18 @@ def run_trainer(trainer_config: SpmdTrainer.Config) -> Any:
162
162
raise
163
163
try :
164
164
logging .info ("Trying to clean up ongoing traces" )
165
- jax .stop_trace ()
165
+ jax .profiler . stop_trace ()
166
166
logging .info ("Successfully cleaned up ongoing traces" )
167
- except ValueError as e :
168
- logging .info ("No ongoing traces to clean up" , exc_info = True )
167
+ except ( RuntimeError , ValueError ) as e :
168
+ logging .info ("No ongoing traces to clean up" )
169
169
except Exception as e :
170
170
logging .exception ("Error trying to clean up ongoing traces" )
171
171
raise
172
+
173
+ jax .clear_caches ()
174
+ for array in jax .live_arrays ():
175
+ array .delete ()
176
+
172
177
ten_minutes = 10 * 60
173
178
elastic_manager .wait_for_slices (timeout = ten_minutes )
174
179
else :
You can’t perform that action at this time.
0 commit comments