diff --git a/src/nanotron/helpers.py b/src/nanotron/helpers.py
index a82f0294..e7712b47 100644
--- a/src/nanotron/helpers.py
+++ b/src/nanotron/helpers.py
@@ -481,7 +481,15 @@ def get_profiler(config: Config):
             on_trace_ready = None
         prof = profile(
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-            schedule=torch.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=3),
+            # schedule=torch.profiler.schedule(wait=1, warmup=1, active=1, repeat=1, skip_first=3),
+            # With wait=1, warmup=1, active=3, repeat=1, skip_first=2,
+            # the profiler skips the first two steps/iterations,
+            # waits on the third, warms up on the fourth,
+            # and records the fifth, sixth and seventh iterations,
+            # after which the trace becomes available
+            # and on_trace_ready (when set) is called;
+            # repeat=1 means the cycle runs only once
+            schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1, skip_first=2),
             on_trace_ready=on_trace_ready,
             # record_shapes=True,
             # profile_memory=True,
diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py
index 6b257098..de910b49 100644
--- a/src/nanotron/trainer.py
+++ b/src/nanotron/trainer.py
@@ -417,7 +417,7 @@ def train(
         torch.cuda.empty_cache()
         with prof:
             for self.iteration_step in range(self.metadata.last_train_step + 1, self.config.tokens.train_steps + 1):
-                if isinstance(prof, torch.profiler.profile) and self.iteration_step < 6:  # Only profile the first 6 steps
+                if isinstance(prof, torch.profiler.profile):
                     prof.step()

                 self.iteration_start_time = time.time()
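
A minimal sketch (not part of the patch) of how the new schedule maps step indices to profiler phases: torch.profiler.schedule returns a plain callable from a zero-based step count to a ProfilerAction, so the phase sequence can be checked standalone, without running the trainer.

    from torch.profiler import schedule

    # Same arguments as the patched get_profiler above.
    sched = schedule(wait=1, warmup=1, active=3, repeat=1, skip_first=2)

    for step in range(9):
        # Expected: steps 0-1 -> NONE (skip_first=2), step 2 -> NONE (wait),
        # step 3 -> WARMUP, steps 4-5 -> RECORD, step 6 -> RECORD_AND_SAVE
        # (the last active step saves the trace and fires on_trace_ready),
        # then NONE from step 7 on, since repeat=1 allows a single cycle.
        print(step, sched(step))

This is also why the trainer change is needed: the old `self.iteration_step < 6` cap matched the old schedule's 3+1+1+1 = 6 steps, while the new schedule needs 2+1+1+3 = 7 calls to prof.step() before the trace is saved, so the cap is removed and prof.step() is called on every iteration.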