diff --git a/src/nanotron/helpers.py b/src/nanotron/helpers.py
index a82f0294..e7712b47 100644
--- a/src/nanotron/helpers.py
+++ b/src/nanotron/helpers.py
@@ -481,7 +481,15 @@ def get_profiler(config: Config):
             on_trace_ready = None
         prof = profile(
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-            schedule=torch.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=3),
+            # schedule=torch.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=3),
+            # With wait=1, warmup=1, active=3, repeat=1, skip_first=2,
+            # the profiler skips the first two steps/iterations,
+            # waits on the third, warms up on the fourth,
+            # and records the fifth, sixth and seventh iterations,
+            # after which the trace becomes available
+            # and on_trace_ready (when set) is called;
+            # repeat=1 means the cycle runs only once
+            schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1, skip_first=2),
             on_trace_ready=on_trace_ready,
             # record_shapes=True,
             # profile_memory=True,
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index 6b257098..de910b49 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -417,7 +417,7 @@ def train(
         torch.cuda.empty_cache()
         with prof:
             for self.iteration_step in range(self.metadata.last_train_step + 1, self.config.tokens.train_steps + 1):
-                if isinstance(prof, torch.profiler.profile) and self.iteration_step < 6:  # Only profile the first 6 steps
+                if isinstance(prof, torch.profiler.profile):
                     prof.step()

                 self.iteration_start_time = time.time()
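
A minimal sketch (not part of the patch) of how the new schedule maps step indices to profiler phases: torch.profiler.schedule returns a plain callable from a zero-based step count to a ProfilerAction, so the phase sequence can be checked standalone, without running the trainer.

    from torch.profiler import schedule

    # Same arguments as the patched get_profiler above.
    sched = schedule(wait=1, warmup=1, active=3, repeat=1, skip_first=2)

    for step in range(9):
        # Expected: steps 0-1 -> NONE (skip_first=2), step 2 -> NONE (wait),
        # step 3 -> WARMUP, steps 4-5 -> RECORD, step 6 -> RECORD_AND_SAVE
        # (the last active step saves the trace and fires on_trace_ready),
        # then NONE from step 7 on, since repeat=1 allows a single cycle.
        print(step, sched(step))

This is also why the trainer change is needed: the old `self.iteration_step < 6` cap matched the old schedule's 3+1+1+1 = 6 steps, while the new schedule needs 2+1+1+3 = 7 calls to prof.step() before the trace is saved, so the cap is removed and prof.step() is called on every iteration.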