Hooks

mydatascience · mydatascience · commit 2198d88e6b5f · 2025-10-09T20:03:27.000+04:00
Signed-off-by: Vladimir Suvorov <suvorovv@google.com>
diff --git a/src/MaxText/examples/grpo_llama3_demo.py b/src/MaxText/examples/grpo_llama3_demo.py
@@ -17,16 +17,7 @@
 This tutorial demonstrates training the Llama3.1 8B-IT model on
  the GSM8K math reasoning benchmark using Group Relative Policy Optimization (GRPO).
    GRPO can enhance your model's problem-solving skills on mathematical word problems,
-     coding problems, etc. 
-
-GOODPUT MONITORING FEATURES:
-- Automatic goodput measurement and tracking
-- Badput breakdown analysis (non-productive time tracking)
-- Step time deviation monitoring
-- TensorBoard and Google Cloud Monitoring integration
-- Performance metrics upload to GCM
-- Real-time training efficiency monitoring
-"""
+     coding problems, etc. """
 
 # This tutorial demonstrates training the Llama3.1 8B-IT model on the GSM8K math
 # reasoning benchmark using Group Relative Policy Optimization (GRPO). GRPO can
@@ -89,10 +80,6 @@
 from MaxText import pyconfig
 from MaxText.integration.tunix.tunix_adapter import TunixMaxTextAdapter
 
-# MaxText goodput monitoring imports
-from MaxText.utils.goodput_utils import maybe_monitor_goodput, create_goodput_recorder, maybe_record_goodput, GoodputEvent
-from MaxText import max_logging
-
 # This is for running the script in a colab or notebook environment.
 # import nest_asyncio
 # nest_asyncio.apply()  # To fix "This event loop is already running" error in Colab
@@ -144,14 +131,6 @@
 # ====== Reproducibility ======
 SEED = 42
 
-# ====== Goodput Monitoring ======
-# Enable goodput monitoring for performance tracking
-ENABLE_GOODPUT_RECORDING = True
-MONITOR_GOODPUT = True
-ENABLE_GCP_GOODPUT_METRICS = True
-ENABLE_GCP_STEP_DEVIATION_METRICS = True
-GOODPUT_UPLOAD_INTERVAL_SECONDS = 30
-
 
 # ====== GRPO ======
 # === Generation during GRPO training ===
@@ -929,30 +908,6 @@ def evaluate(
 # Let's set up all the configs first - checkpointing, metric logging and training.
 # We then train the model.
 def main():
-  # Create a mock config object for goodput monitoring
-  class MockConfig:
-
-    def __init__(self):
-      self.monitor_goodput = MONITOR_GOODPUT
-      self.enable_goodput_recording = ENABLE_GOODPUT_RECORDING
-      self.enable_gcp_goodput_metrics = ENABLE_GCP_GOODPUT_METRICS
-      self.enable_gcp_step_deviation_metrics = ENABLE_GCP_STEP_DEVIATION_METRICS
-      self.goodput_upload_interval_seconds = GOODPUT_UPLOAD_INTERVAL_SECONDS
-      self.run_name = "grpo_llama3_demo"
-      self.tensorboard_dir = LOG_DIR
-      self.enable_pathways_goodput = False
-      self.monitor_step_time_deviation = True
-      self.step_deviation_interval_seconds = 60
-      self.report_performance_metric_for_gcp_monitoring = False
-
-  config = MockConfig()
-
-  # Initialize goodput monitoring
-  maybe_monitor_goodput(config)
-  recorder = create_goodput_recorder(config)
-
-  max_logging.log("GRPO training with goodput monitoring started")
-
   # Ckpt saving
   checkpointing_options = ocp.CheckpointManagerOptions(save_interval_steps=SAVE_INTERVAL_STEPS, max_to_keep=MAX_TO_KEEP)
 
@@ -1057,55 +1012,37 @@ def __init__(self):
 
   # ## Evaluate before training
   #
-  max_logging.log("Starting pre-training evaluation...")
-
-  with maybe_record_goodput(recorder, GoodputEvent.DATA_LOADING):
-    # pylint: disable=unbalanced-tuple-unpacking
-    (corr, total, accuracy, partial_accuracy, format_accuracy) = evaluate(
-        test_dataset,
-        rl_cluster,
-        **GENERATION_CONFIGS["greedy"],
-    )
-  print(f"Pre GRPO Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
-  max_logging.log(f"Pre-training evaluation completed: {accuracy}% accuracy")
+  # pylint: disable=unbalanced-tuple-unpacking
+  (corr, total, accuracy, partial_accuracy, format_accuracy) = evaluate(
+      test_dataset,
+      rl_cluster,
+      **GENERATION_CONFIGS["greedy"],
+  )
+  print(f"Pre GRPO Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
   # ## Start training
   #
-  max_logging.log("Starting GRPO training with goodput monitoring...")
 
   jax.profiler.start_trace(PROFILE_DIR)
   with mesh, nn_partitioning.axis_rules(config_policy.logical_axis_rules):
-    with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
-      max_logging.log("Training preparation phase recorded")
-
-    # Record the main training phase
-    with maybe_record_goodput(recorder, GoodputEvent.STEP):
-      grpo_trainer.train(DATASET)
-
+    grpo_trainer.train(DATASET)
   jax.profiler.stop_trace()
 
-  max_logging.log("GRPO training completed")
-
   print("HBM usage after training:")
   show_hbm_usage()
 
   # ## Evaluate
   #
   # Let's evaluate our model!
-  max_logging.log("Starting post-training evaluation...")
-
-  with maybe_record_goodput(recorder, GoodputEvent.DATA_LOADING):
-    # pylint: disable=unbalanced-tuple-unpacking
-    (corr, total, accuracy, partial_accuracy, format_accuracy) = evaluate(
-        test_dataset,
-        rl_cluster,
-        **GENERATION_CONFIGS["greedy"],
-    )
-  print(f"Post GRPO Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
-  max_logging.log(f"Post-training evaluation completed: {accuracy}% accuracy")
-  max_logging.log("GRPO training with goodput monitoring finished successfully")
+  # pylint: disable=unbalanced-tuple-unpacking
+  (corr, total, accuracy, partial_accuracy, format_accuracy) = evaluate(
+      test_dataset,
+      rl_cluster,
+      **GENERATION_CONFIGS["greedy"],
+  )
+  print(f"Post GRPO Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
 
 if __name__ == "__main__":
diff --git a/src/MaxText/experimental/rl/grpo_trainer.py b/src/MaxText/experimental/rl/grpo_trainer.py
@@ -79,6 +79,7 @@
 from MaxText.data_loader import DataLoader
 from MaxText.experimental.rl import grpo_input_pipeline
 from MaxText.experimental.rl import grpo_utils
+from MaxText.experimental.rl.hooks import GRPOTrainingHooks, GRPODataHooks
 from MaxText.globals import EPS
 from MaxText.metric_logger import MetricLogger
 from MaxText.train import get_first_step
@@ -708,9 +709,18 @@ def train_loop(config, config_inference, recorder, state=None):
   data_loader = DataLoader(config_inference, inference_mesh, data_iterator, recorder)
   metric_logger = MetricLogger(config=config, learning_rate_schedule=learning_rate_schedule)
 
+  # Initialize GRPO training hooks
+  training_hooks = GRPOTrainingHooks(
+      config=config, mesh=mesh, learning_rate_schedule=learning_rate_schedule, goodput_recorder=recorder
+  )
+  data_hooks = GRPODataHooks(config=config, data_iterator=data_iterator, eval_data_iterator=eval_data_iterator)
+
   # Write train config params, num model params, and XLA flags to tensorboard
   metric_logger.write_setup_info_to_tensorboard(state.params["params"])
 
+  # Call on_train_start hook
+  training_hooks.on_train_start(state, start_step)
+
   def generation_worker_fn(
       worker_inference_engine,
       worker_tokenizer_model,
@@ -765,6 +775,9 @@ def generation_worker_fn(
   inference_engine_lock = threading.Lock()
 
   max_logging.log("Inference Rollout")
+  # Track initial generation
+  training_hooks.on_generation_start(start_step)
+  gen_start_time = time.time()
   generate_completions(
       data_loader,
       inference_engine,
@@ -776,6 +789,10 @@ def generation_worker_fn(
       data_sharding,
       inference_engine_lock,
   )
+  gen_time = time.time() - gen_start_time
+  with data_buffer_lock:
+    num_completions = sum(batch[config.train_data_columns].shape[0] for batch in data_buffer)
+  training_hooks.on_generation_end(start_step, num_completions, gen_time)
 
   required_batch_size = int(config.per_device_batch_size * config.num_generations * mesh.size)
   generation_thread = threading.Thread(
@@ -798,6 +815,9 @@ def generation_worker_fn(
   try:
     last_step_completion = datetime.datetime.now()
     for step in np.arange(start_step, config.steps):
+      # Call on_train_step_start hook
+      training_hooks.on_train_step_start(step)
+
       prof.maybe_activate_profiler(step, state)
 
       with jax.profiler.StepTraceAnnotation("train", step_num=step):
@@ -837,7 +857,11 @@ def generation_worker_fn(
       last_step_completion = datetime.datetime.now()
 
       state_to_save = _split_grpo_state(state)[0]
-      checkpointing.maybe_save_checkpoint(checkpoint_manager, state_to_save, config, data_iterator, step)
+      checkpoint_saved = checkpointing.maybe_save_checkpoint(
+          checkpoint_manager, state_to_save, config, data_iterator, step
+      )
+      if checkpoint_saved:
+        training_hooks.on_checkpoint_save(step, config.checkpoint_dir)
 
       if config.dump_hlo and step == start_step:
         jax.block_until_ready(state)  # Ensure compilation has finished.
@@ -851,17 +875,23 @@ def generation_worker_fn(
 
       if config.eval_interval > 0 and step > start_step and (step + 1) % config.eval_interval == 0:
         assert eval_data_iterator
+        # Call on_eval_start hook
+        training_hooks.on_eval_start(step)
         eval_step_count = 0
         # pylint: disable=not-callable
         for eval_batch in eval_data_iterator:
           if 0 < config.eval_steps <= eval_step_count:
             break
           with mesh, nn_partitioning.axis_rules(config.logical_axis_rules):
             eval_metrics = p_eval_step(state, eval_batch, rng)
+          # Call on_eval_step hook
+          training_hooks.on_eval_step(eval_metrics)
           metric_logger.record_eval_metrics(step, metrics=eval_metrics)
           max_logging.log(f"Completed eval step {eval_step_count}")
           eval_step_count += 1
         metric_logger.record_eval_metrics(step, eval_step_count=eval_step_count)
+        # Call on_eval_end hook
+        training_hooks.on_eval_end(step)
         if metric_logger.cumulative_eval_metrics["scalar"]["eval/avg_loss"] <= config.target_eval_loss:
           prof.deactivate()
           raise exceptions.StopTraining(f"Target loss {config.target_eval_loss=} is achieved.")
@@ -872,11 +902,17 @@ def generation_worker_fn(
         max_utils.print_mem_stats("After params initialized")
 
       metric_logger.buffer_and_write_train_metrics(metrics, step, step_time_delta)
+
+      # Call on_train_step_end hook
+      training_hooks.on_train_step_end(step, metrics, step_time_delta.total_seconds())
+
       state_to_save = _split_grpo_state(state)[0]
       checkpointing.maybe_save_checkpoint(checkpoint_manager, state_to_save, config, data_iterator)
   except exceptions.StopTraining as e:
     max_logging.log(f"Training stopped: {str(e)}")
   finally:
+    # Call on_train_end hook
+    training_hooks.on_train_end(step)
     metric_logger.flush_metrics_and_cleanup()
     max_logging.log("Training loop finished or exited. Signaling generation worker to stop.")
     stop_event.set()
diff --git a/src/MaxText/experimental/rl/hooks.py b/src/MaxText/experimental/rl/hooks.py