
Commit c7e34fa

[Algorithm] Expert Iteration and SFT
1 parent e36d562 commit c7e34fa

File tree

20 files changed: +2523 −27 lines

docs/source/reference/llms.rst

Lines changed: 24 additions & 0 deletions
@@ -256,6 +256,9 @@ LLM post training require some appropriate versions of the losses implemented in
 GRPO
 ~~~~
 
+The :class:`~torchrl.objectives.llm.GRPOLoss` class is a thin wrapper around the :class:`~torchrl.objectives.PPOLoss` class
+that implements the LLM-specific functionality.
+
 .. currentmodule:: torchrl.objectives.llm
 
 .. autosummary::
@@ -265,3 +268,24 @@ GRPO
     GRPOLoss
     GRPOLossOutput
     MCAdvantage
+
+
+SFT
+~~~
+
+.. currentmodule:: torchrl.objectives.llm
+
+.. autosummary::
+    :toctree: generated/
+    :template: rl_template.rst
+
+    SFTLoss
+    SFTLossOutput
+
+.. currentmodule:: torchrl.data.llm
+
+.. autosummary::
+    :toctree: generated/
+    :template: rl_template.rst
+
+    TopKRewardSelector
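The `TopKRewardSelector` entry added above corresponds to the top-k filtering described in the config comments below: each prompt is sampled `env.repeats` times and only the `train.topk_size` highest-reward completions are written to the replay buffer. A minimal, illustrative sketch of that selection idea in plain PyTorch follows; it is not the actual class API, and `rewards`, `completions`, and `select_topk` are placeholder names.

# Illustrative sketch only (not the TopKRewardSelector API): keep the
# `topk_size` highest-reward completions out of the `repeats` completions
# generated for a single prompt.
import torch

def select_topk(rewards: torch.Tensor, completions: list[str], topk_size: int) -> list[str]:
    # rewards: shape [repeats], one scalar reward per completion of the same prompt
    topk_idx = torch.topk(rewards, k=topk_size).indices
    return [completions[i] for i in topk_idx.tolist()]

rewards = torch.tensor([0.1, 0.9, 0.4, 0.7])
completions = ["a", "b", "c", "d"]
print(select_topk(rewards, completions, topk_size=2))  # ['b', 'd']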
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
# @package _global_
defaults:
  - mode: ${mode:async} # Default to async mode, can be overridden by scripts
  - _self_
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

# Environment configuration
env:
  dataset: gsm8k # choices: [gsm8k, ifeval]
  # Number of environments to run in parallel. This determines the batch size passed to vLLM.
  # More envs consume more GPU memory.
  num_envs: 8
  # Number of times to repeat the same prompt for GRPO. This does not affect the GPU memory usage.
  repeats: 32

# Base model configuration
model:
  # A 3B model is sufficient for this task:
  name: Qwen/Qwen2.5-3B
  compile: false

# Base training configuration - will be merged with mode-specific settings
train:
  # Some fields are defined in mode configs (async.yaml and sync.yaml)
  # The following fields are task-specific:
  exp_name: "grpo-gsm8k"

  # Whether to use mixed precision training.
  mixed_precision: true

  # Number of top-k rewards to select for training.
  topk_size: 8

  # Total number of dialog turns to collect during training.
  total_dialog_turns: 100_000

  # Number of steps in each batch. Higher values make the inference step slower, but do not use more GPU memory.
  steps_per_batch: 256

  # Replay buffer size. For a given prompt, the LLM is queried a total of `env.repeats` times,
  # and the top-k rewards are then selected from these `env.repeats` rewards.
  # A single collected batch has size `train.steps_per_batch`, and the fraction written to the replay buffer is `train.topk_size / env.repeats`.
  # If `buffer_size` is not set, it defaults to `train.steps_per_batch * train.topk_size / env.repeats`.
  buffer_size:

  # Number of gradient accumulation steps. Higher values use less GPU memory (compared with larger batches and fewer accumulation steps),
  # but make the optimization step slower.
  gradient_accumulation_steps: 16

  # Fields used by both scripts but with different semantics
  checkpoint_frequency: 100 # Save checkpoint every N steps/batches

  # Batch size for optimization. Higher values will use more GPU memory.
  optim_batch_size: 2

  # KL coefficients for the KL divergence to the reference and inference policies
  kl_to_ref_coeff: 1e-1

  # Fields used only by grpo-async.py / grpo-sync.py
  logging_frequency: 10 # Log metrics every N steps

# Training model configuration
train_model:
  gradient_checkpointing: true # Enabled for memory efficiency
  num_devices: 1 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
    alpha: 16 # LoRA alpha - scales the adaptations
    dropout: 0.1 # Dropout probability for LoRA layers
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16

# Inference model configuration
inference_model:
  num_devices: 1 # Number of devices to use
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16
  gpu_memory_utilization: 0.5 # Limit GPU memory usage
  temperature: 0.8
  max_tokens: 1024
  include_stop_str_in_output: true
  enforce_eager: false

# Reference model configuration
ref_model:
  gradient_checkpointing: false # Always false, no backprop
  num_devices: 1 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
    alpha: 16 # LoRA alpha - scales the adaptations
    dropout: 0.1 # Dropout probability for LoRA layers
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16

# Optimizer configuration
optimizer:
  name: AdamW
  lr: 2e-5
  clip_grad_norm: 100.0
  weight_decay: 0.0

# Ray configuration
ray:
  init_config:
    num_cpus: 96 # Total available CPUs
    num_gpus: 8 # Explicitly set number of GPUs
    runtime_env:
      working_dir: "."
    _temp_dir: "/tmp/ray_grpo" # Custom temp directory
    _system_config:
      object_spilling_threshold: 0.8 # Spill when 80% full
      max_direct_memory_size: 10737418240 # 10 GB limit (literal bytes; YAML does not evaluate arithmetic)
      object_store_full_delay_ms: 100 # Delay when store is full
      object_store_full_max_retries: 3 # Max retries when store is full
  collector_config:
    num_cpus: 24 # CPUs for inference and ref model
  train_handler_config:
    num_cpus: 24 # Dedicated CPUs for training
  replay_buffer_config:
    num_cpus: 24 # CPUs for replay buffer
    num_gpus: 0.0 # No GPU needed for replay buffer

# Logging configuration
logging:
  experiment_name: null # Will be auto-generated if not provided
  checkpoint_dir: "checkpoints"
  checkpoint_frequency: 10 # Save checkpoint every N batches

hydra:
  run:
    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
    subdir: ${hydra.job.num}
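A quick check of the replay-buffer sizing documented in the comments of this config, using its own values; the arithmetic mirrors the stated default `train.steps_per_batch * train.topk_size / env.repeats`.

# Worked example of the documented buffer_size default (values from the config above).
steps_per_batch = 256
topk_size = 8
repeats = 32

write_fraction = topk_size / repeats                           # 8 / 32 = 0.25 of each collected batch is kept
default_buffer_size = steps_per_batch * topk_size // repeats   # 256 * 8 / 32 = 64
print(write_fraction, default_buffer_size)                     # 0.25 64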
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
# @package _global_
defaults:
  - mode: ${mode:async} # Default to async mode, can be overridden by scripts
  - _self_
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

# Environment configuration
env:
  dataset: ifeval # choices: [gsm8k, ifeval]
  # Number of environments to run in parallel. This determines the batch size passed to vLLM.
  # More envs consume more GPU memory.
  num_envs: 4
  # Number of times to repeat the same prompt for GRPO. This does not affect the GPU memory usage.
  repeats: 32

# Base model configuration
model:
  # A 7B model works well for this task.
  name: Qwen/Qwen2.5-7b
  compile: false

# Base training configuration - will be merged with mode-specific settings
train:
  # Some fields are defined in mode configs (async.yaml and sync.yaml)
  # The following fields are task-specific:
  exp_name: "grpo-ifeval"

  # Whether to use mixed precision training.
  mixed_precision: true

  # Number of top-k rewards to select for training.
  topk_size: 8

  # Total number of dialog turns to collect during training.
  total_dialog_turns: 100_000

  # Number of steps in each batch. Higher values make the inference step slower, but do not use more GPU memory.
  steps_per_batch: 64

  # Number of gradient accumulation steps. Higher values use less GPU memory (compared with larger batches and fewer accumulation steps),
  # but make the optimization step slower.
  gradient_accumulation_steps: 4

  # Fields used by both scripts but with different semantics
  checkpoint_frequency: 100 # Save checkpoint every N steps/batches

  # Batch size for optimization. Higher values will use more GPU memory.
  optim_batch_size: 2

  # KL coefficients for the KL divergence to the reference and inference policies
  kl_to_ref_coeff:

  # Fields used only by grpo-async.py / grpo-sync.py
  logging_frequency: 1 # Log metrics every N steps - here at each optimization step

# Training model configuration
train_model:
  gradient_checkpointing: true # Enabled for memory efficiency
  num_devices: 4 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
    alpha: 16 # LoRA alpha - scales the adaptations
    dropout: 0.1 # Dropout probability for LoRA layers
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16

# Inference model configuration
inference_model:
  num_devices: 2 # Number of devices to use
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16
  gpu_memory_utilization: 0.5 # Limit GPU memory usage
  temperature: 0.8
  max_tokens: 2048
  include_stop_str_in_output: true
  enforce_eager: false

# Reference model configuration
ref_model:
  gradient_checkpointing: false # Always false, no backprop
  num_devices: 2 # Number of devices to use
  lora:
    enabled: true # Using LoRA for memory efficiency
    r: 8 # LoRA rank - controls capacity of adaptations
    alpha: 16 # LoRA alpha - scales the adaptations
    dropout: 0.1 # Dropout probability for LoRA layers
  quantization:
    enabled: false # Enable 4-bit quantization for base model
  attn_implementation: sdpa # Scaled dot-product attention for memory efficiency
  torch_dtype: bfloat16

# Optimizer configuration
optimizer:
  name: AdamW
  lr: 1e-5
  clip_grad_norm: 10.0
  weight_decay: 0.0

# Ray configuration
ray:
  init_config:
    num_cpus: 96 # Total available CPUs
    num_gpus: 8 # Explicitly set number of GPUs
    runtime_env:
      working_dir: "."
    _temp_dir: "/tmp/ray_grpo" # Custom temp directory
    _system_config:
      object_spilling_threshold: 0.8 # Spill when 80% full
      max_direct_memory_size: 10737418240 # 10 GB limit (literal bytes; YAML does not evaluate arithmetic)
      object_store_full_delay_ms: 100 # Delay when store is full
      object_store_full_max_retries: 3 # Max retries when store is full
  collector_config:
    num_cpus: 24 # CPUs for inference and ref model (co-located)
  train_handler_config:
    num_cpus: 24 # Dedicated CPUs for training
  replay_buffer_config:
    num_cpus: 24 # CPUs for replay buffer
    num_gpus: 0.0 # No GPU needed for replay buffer

# Logging configuration
logging:
  experiment_name: null # Will be auto-generated if not provided
  checkpoint_dir: "checkpoints"
  checkpoint_frequency: 10 # Save checkpoint every N batches

hydra:
  run:
    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
    subdir: ${hydra.job.num}
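For reference, a hypothetical sketch of how a config like the one above can be loaded and overridden with OmegaConf, the library Hydra builds on. The file name "grpo_ifeval.yaml" is an assumption; the actual training scripts resolve the config through Hydra's `defaults` list and command-line overrides.

# Hypothetical loading sketch - not the project's entry point. Assumes the
# YAML above is saved as "grpo_ifeval.yaml" next to this script.
from omegaconf import OmegaConf

cfg = OmegaConf.load("grpo_ifeval.yaml")
# Dotlist overrides mimic Hydra-style command-line overrides.
overrides = OmegaConf.from_dotlist(["train.topk_size=4", "model.compile=true"])
cfg = OmegaConf.merge(cfg, overrides)
print(cfg.env.dataset, cfg.train.topk_size)  # ifeval 4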
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# @package _global_
train:
  # Mode-specific setting
  sync: false # Force asynchronous mode

  # Number of epochs to train for each time a batch is collected. Not directly used in async mode, aside from computing the total number of steps.
  epochs: 1
  # Replay buffer size.
  buffer_size: 128
  # Update policy weights every N steps - can be set to any positive integer in async mode
  weight_update_frequency: 10
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# @package _global_
train:
  # Mode-specific setting
  sync: true # Force synchronous mode

  # Number of epochs to train for, every time a batch is collected.
  epochs: 1
  # Update policy weights every N steps - must be left empty in sync mode
  weight_update_frequency:
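The two mode files above encode a constraint stated in their comments: `weight_update_frequency` can be any positive integer in async mode but must be left empty in sync mode. A small illustrative check of that rule, not taken from the training scripts:

# Illustrative validation of the mode constraint described in the comments above.
def check_mode(train_cfg: dict) -> None:
    wuf = train_cfg.get("weight_update_frequency")
    if train_cfg["sync"]:
        assert wuf is None, "weight_update_frequency must be left empty in sync mode"
    else:
        assert isinstance(wuf, int) and wuf > 0, "async mode expects a positive integer"

check_mode({"sync": True, "weight_update_frequency": None})  # ok (sync mode)
check_mode({"sync": False, "weight_update_frequency": 10})   # ok (async mode)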
