Skip to content

Commit 759d6f4

Browse files
committed
Checking in
1 parent cdcb140 commit 759d6f4

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

axlearn/common/launch_trainer.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -162,13 +162,18 @@ def run_trainer(trainer_config: SpmdTrainer.Config) -> Any:
162162
raise
163163
try:
164164
logging.info("Trying to clean up ongoing traces")
165-
jax.stop_trace()
165+
jax.profiler.stop_trace()
166166
logging.info("Successfully cleaned up ongoing traces")
167-
except ValueError as e:
168-
logging.info("No ongoing traces to clean up", exc_info=True)
167+
except (RuntimeError, ValueError) as e:
168+
logging.info("No ongoing traces to clean up")
169169
except Exception as e:
170170
logging.exception("Error trying to clean up ongoing traces")
171171
raise
172+
173+
jax.clear_caches()
174+
for array in jax.live_arrays():
175+
array.delete()
176+
172177
ten_minutes = 10 * 60
173178
elastic_manager.wait_for_slices(timeout=ten_minutes)
174179
else:

0 commit comments

Comments
 (0)