  dataset: ifeval # choices: [gsm8k, ifeval]
  # Number of environments to run in parallel. This determines the batch size passed to vLLM.
  # More envs consume more GPU memory.
-  num_envs: 2
+  num_envs: 4
  # Number of times to repeat the same prompt for GRPO. This does not affect the GPU memory usage.
  repeats: 16

# Base model configuration
model:
-  name: Qwen/Qwen2.5-3B
+  # A 7B model works well for this task.
+  name: Qwen/Qwen2.5-7B
  compile: false

# Base training configuration - will be merged with mode-specific settings
train:
-  # Fields defined in mode configs (async.yaml and sync.yaml)
-  # mixed_precision: true # Whether to use mixed precision training
-  # epochs: 1 # Number of training epochs
-  # steps_per_batch: 32 # Number of steps per batch
-  # total_dialog_turns: 1_000_000 # Total number of dialog turns to collect
-  # optim_batch_size: 2 # Batch size for optimization
-  # gradient_accumulation_steps: 1 # Number of gradient accumulation steps
-  # kl_coef_in_loss: true # Whether to include KL coefficient in loss
-  # sync: false # Default to async, will be overridden by mode configs
-  # buffer_size: 128 # Size of replay buffer
+  # Some fields are defined in mode configs (async.yaml and sync.yaml).
+  # The following fields are task-specific:
  exp_name: "grpo-ifeval"

+  # Whether to use mixed precision training.
+  mixed_precision: true
+
+  # Total number of dialog turns to collect during training.
+  total_dialog_turns: 100_000
+
+  # Number of steps in each batch. Higher values make the inference step slower but do not use more GPU memory.
+  steps_per_batch: 64
+
+  # Number of gradient accumulation steps. Higher values use less GPU memory (compared with larger batches and
+  # lower gradient_accumulation_steps) but make the optimization step slower.
+  gradient_accumulation_steps: 4
+
  # Fields used by both scripts but with different semantics
  checkpoint_frequency: 100 # Save checkpoint every N steps/batches

+  # Batch size for optimization. Higher values use more GPU memory.
+  optim_batch_size: 2
+
+  # Whether to include the KL coefficient in the loss function. Alternatively, the ref-to-train KL is added to the reward.
+  kl_coef_in_loss: false
+
  # KL coefficients for the KL divergence to the reference and inference policies
-  kl_to_ref_coeff: 1e-2
-  kl_to_inference_coeff: 0.0
+  kl_to_ref_coeff: 1e-1
+  kl_to_inference_coeff: 1e-1
  entropy_coeff: 0.01

  # Fields used only by grpo-async.py / grpo-sync.py
-  logging_frequency: 10 # Log metrics every N steps
+  logging_frequency: 1 # Log metrics every N steps - here, at each optimization step

# Training model configuration
train_model:
  gradient_checkpointing: true # Enabled for memory efficiency
-  num_devices: 1 # Number of devices to use
+  num_devices: 4 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
@@ -60,7 +72,7 @@ train_model:

# Inference model configuration
inference_model:
-  num_devices: 1 # Number of devices to use
+  num_devices: 2 # Number of devices to use
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Using flash attention for memory efficiency
@@ -74,7 +86,7 @@ inference_model:
# Reference model configuration
ref_model:
  gradient_checkpointing: false # Always false, no backprop
-  num_devices: 1 # Number of devices to use
+  num_devices: 2 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
@@ -89,7 +101,7 @@ ref_model:
optimizer:
  name: AdamW
  lr: 1e-5
-  clip_grad_norm: 1.0
+  clip_grad_norm: 10.0
  weight_decay: 0.0

# Ray configuration
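
For context, the sketch below shows one way the base config above could be combined with a mode-specific file (async.yaml or sync.yaml), as the "will be merged with mode-specific settings" comment describes. It assumes OmegaConf and hypothetical file paths; the actual grpo-sync.py / grpo-async.py scripts may perform this merge differently (e.g. via Hydra).

# Minimal sketch: load the base GRPO config and overlay a mode-specific file.
# OmegaConf usage is an assumption; file names are hypothetical.
from omegaconf import OmegaConf

base = OmegaConf.load("grpo_ifeval.yaml")    # the base config edited in this diff
mode = OmegaConf.load("async.yaml")          # mode-specific fields (sync, buffer_size, epochs, ...)
cfg = OmegaConf.merge(base, mode)            # later values override earlier ones

print(cfg.train.steps_per_batch)             # 64
print(cfg.train.gradient_accumulation_steps) # 4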