Commits (22)
494657f
add 10 papers (+trainer cross-links) for #4407
SSusantAchary Nov 3, 2025
70a26e6
Add links to GRPOTrainer and CPOTrainer in paper index
SSusantAchary Nov 3, 2025
85054a4
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 3, 2025
2e01916
Clean up paper_index.md
SSusantAchary Nov 3, 2025
4e2f10b
Add DeepSeekMath section with GRPO configuration
SSusantAchary Nov 3, 2025
0cf0649
Enhance paper index with code examples and updates
SSusantAchary Nov 3, 2025
be590d6
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 3, 2025
5f2e5e9
Add PEFT section with LoRA implementation example
SSusantAchary Nov 3, 2025
597a397
Remove beta parameter from GRPOConfig
SSusantAchary Nov 6, 2025
f1d98b0
Refactor training_args by removing unused parameters
SSusantAchary Nov 6, 2025
9348298
Add Direct Policy Optimization section to paper index
SSusantAchary Nov 6, 2025
a9d9467
Change loss_type from 'sigmoid' to 'hinged'
SSusantAchary Nov 6, 2025
66cc83d
Revise RSO example with model loading and DPO config
SSusantAchary Nov 6, 2025
e35c02d
removed explicit defaults from DPOConfig (loss_type, beta) per review
SSusantAchary Nov 6, 2025
98d3086
simplify example; rely on defaults and keep only essential PEFT task_…
SSusantAchary Nov 6, 2025
627db69
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 6, 2025
33f7f42
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 7, 2025
1efce43
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 7, 2025
e50ed53
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 9, 2025
0856286
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 11, 2025
e0e0e39
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 14, 2025
f204d80
Merge branch 'main' into Paper-Index-with-10-papers
SSusantAchary Nov 15, 2025
238 changes: 236 additions & 2 deletions docs/source/paper_index.md
@@ -5,8 +5,25 @@

## Group Relative Policy Optimization

Papers relating to the [`GRPOTrainer`]

### DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models

**📜 Paper**: https://huggingface.co/papers/2402.03300

Introduces **GRPO** and shows strong math-reasoning gains from math-centric pretraining plus group-relative PPO-style optimization.

**Used in TRL via:** [`GRPOTrainer`]

```python
# Minimal GRPO setup (mirrors the style used for other papers on this page).
from trl import GRPOConfig

training_args = GRPOConfig(
    loss_type="grpo",
    epsilon=2e-4,  # clip range (use paper/experiment settings if you mirror them)
    gradient_accumulation_steps=1,
    num_generations=8,  # completions per prompt (adjust to your compute)
    max_prompt_length=1024,
    max_completion_length=1024,
)
```
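
This config can be dropped into [`GRPOTrainer`] together with a reward function. A minimal sketch, assuming a placeholder model, a prompt dataset, and a toy length-based reward (all illustrative, not from the paper):

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")  # any prompt dataset

def reward_len(completions, **kwargs):
    # toy reward: prefer completions close to 20 characters
    return [-abs(20 - len(completion)) for completion in completions]

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",  # placeholder model id
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```
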
### Group Sequence Policy Optimization

**📜 Paper**: https://huggingface.co/papers/2507.18071
@@ -256,7 +273,7 @@ config = GRPOConfig(

## Direct Policy Optimization

Papers relating to the [`DPOTrainer`]

### Direct Preference Optimization (DPO): Your Language Model is Secretly a Reward Model

@@ -479,6 +496,91 @@ training_args = DPOConfig(

These parameters only appear in the [published version](https://aclanthology.org/2025.tacl-1.22.pdf)

### Statistical Rejection Sampling Improves Preference Optimization

**📜 Paper**: https://huggingface.co/papers/2309.06657

Proposes **RSO**, which selects stronger preference pairs via statistical rejection sampling to improve offline preference optimization; complements DPO/SLiC.
```python
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1) Model & tokenizer (keep the example self-contained)
model = AutoModelForCausalLM.from_pretrained("...")
tokenizer = AutoTokenizer.from_pretrained("...")

# 2) Load and (optionally) filter your training data
train_dataset = load_dataset(...)  # e.g., ("json", data_files="rso_pairs.jsonl")

def rso_accept(example):  # replace with your actual filter/score logic
    return example.get("rso_keep", True)

train_dataset = train_dataset.filter(rso_accept)

# 3) Configure DPO
training_args = DPOConfig(
    loss_type="sigmoid",
    beta=0.1,
)

# 4) Train
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```
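
The filter above is a stub. The paper's "statistical rejection sampling" accepts candidates drawn from the SFT policy with probability `exp((r(y) - r_max) / beta)`, so the kept samples approximate the reward-tilted optimal policy. A hedged sketch of that acceptance step, assuming you already have reward scores for a pool of candidate responses (names and fields are illustrative):

```python
import math
import random

def rso_sample(candidates, rewards, beta=0.5, num_keep=8, max_tries=10_000):
    """Rejection-sample responses so the kept set approximates pi_sft * exp(r / beta)."""
    r_max = max(rewards)
    kept, tries = [], 0
    while len(kept) < num_keep and tries < max_tries:
        tries += 1
        i = random.randrange(len(candidates))
        accept_prob = math.exp((rewards[i] - r_max) / beta)  # in (0, 1]
        if random.random() < accept_prob:
            kept.append(candidates[i])
    return kept
```

Accepted responses can then be ranked by reward to build the `chosen`/`rejected` pairs fed to [`DPOTrainer`].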

### Nash Learning from Human Feedback

**📜 Paper**: https://huggingface.co/papers/2312.00886

Frames alignment as a **two-player game** over a learned preference model and trains toward its Nash-equilibrium policy; connects to multi-objective and competitive preference training.
```python
# Train a DPO policy (one side of the game-theoretic setup)
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("...")
tok = AutoTokenizer.from_pretrained("...")

args = DPOConfig()
trainer = DPOTrainer(model=model, args=args, processing_class=tok, train_dataset=...)
trainer.train()
```
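
For the game-theoretic objective itself, TRL provides [`NashMDTrainer`] (Nash-MD), which trains online against a pairwise preference judge. A minimal sketch, assuming the judge-based interface and placeholder model/judge/dataset names:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import NashMDConfig, NashMDTrainer, PairRMJudge

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
judge = PairRMJudge()  # any pairwise preference judge
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

training_args = NashMDConfig(output_dir="nash-md-model")
trainer = NashMDTrainer(
    model=model,
    judge=judge,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```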


### Direct Language Model Alignment from Online AI Feedback

**📜 Paper**: https://huggingface.co/papers/2402.04792

Uses **online AI feedback (OAIF)** to supply real-time preference signals, improving direct alignment beyond purely offline pairs.
```python
# Sketch: collect online AI feedback -> extend the DPO dataset -> train
from datasets import Dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

policy = AutoModelForCausalLM.from_pretrained("...")
tok = AutoTokenizer.from_pretrained("...")

def ai_feedback_score(prompt, response) -> float:
    # plug a judge / RM / heuristic here (return a scalar)
    ...

new_pairs = []
for ex in prompts_ds:  # prompts_ds: your prompt dataset with a baseline response per prompt
    out = policy.generate(**tok(ex["prompt"], return_tensors="pt").to(policy.device), max_new_tokens=256)
    resp = tok.decode(out[0], skip_special_tokens=True)
    # build a (chosen, rejected) pair from the feedback scores (policy response vs. baseline)
    if ai_feedback_score(ex["prompt"], resp) >= ai_feedback_score(ex["prompt"], ex["baseline"]):
        chosen, rejected = resp, ex["baseline"]
    else:
        chosen, rejected = ex["baseline"], resp
    new_pairs.append({"prompt": ex["prompt"], "chosen": chosen, "rejected": rejected})

# dpo_pairs: your existing offline preference dataset
augmented_pairs = concatenate_datasets([dpo_pairs, Dataset.from_list(new_pairs)])

training_args = DPOConfig()
trainer = DPOTrainer(model=policy, args=training_args, processing_class=tok, train_dataset=augmented_pairs)
trainer.train()
```
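
For truly online feedback (pairs scored as they are generated during training rather than pre-built), TRL's [`OnlineDPOTrainer`] covers this setting with a pluggable judge. A minimal sketch, assuming placeholder model/judge/dataset names:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import OnlineDPOConfig, OnlineDPOTrainer, PairRMJudge

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
judge = PairRMJudge()  # AI feedback source; swap in an LLM judge if preferred
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

training_args = OnlineDPOConfig(output_dir="online-dpo-model")
trainer = OnlineDPOTrainer(
    model=model,
    judge=judge,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```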

## Supervised Fine-Tuning

Papers relating to the [`SFTTrainer`]
@@ -530,6 +632,43 @@ SFTConfig(
per_device_train_batch_size=8,
gradient_accumulation_steps=32,
)
```

## Parameter-Efficient Fine-Tuning (PEFT)

### LoRA: Low-Rank Adaptation of Large Language Models

**📜 Paper**: https://huggingface.co/papers/2106.09685

Parameter-efficient fine-tuning via **low-rank adapters**, cutting trainable parameters and memory while preserving quality (see PEFT integration in TRL).
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

# minimal, self-contained setup
model_id = "meta-llama/Llama-3.1-8B-Instruct"  # any causal LM
tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# only the essential PEFT info; keep the rest at defaults
peft_cfg = LoraConfig(task_type="CAUSAL_LM")
# If a model needs explicit targets, uncomment and adjust:
# peft_cfg = LoraConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "k_proj", "v_proj", "o_proj"])

training_args = SFTConfig(  # lean config; defaults otherwise
    max_seq_length=2048,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tok,
    args=training_args,
    peft_config=peft_cfg,
    train_dataset=...,  # your dataset
)
trainer.train()
```
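
After training, the LoRA adapter can be saved on its own or merged back into the base model for adapter-free inference. A short follow-up continuing the snippet above (output paths are illustrative):

```python
trainer.save_model("llama-3.1-8b-sft-lora")        # saves only the adapter weights + config

merged = trainer.model.merge_and_unload()          # fold the LoRA deltas into the base weights
merged.save_pretrained("llama-3.1-8b-sft-merged")
tok.save_pretrained("llama-3.1-8b-sft-merged")
```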

## Reinforce Leave-One-Out
@@ -627,6 +766,101 @@ def add_margin(example):

dataset = dataset.map(add_margin)
```
### Solving Math Word Problems with Process- and Outcome-Based Feedback

**📜 Paper**: https://huggingface.co/papers/2211.14275

Shows benefits of **process supervision** (step-level) alongside outcome labels for math reasoning, motivating richer feedback signals for alignment.
```python
# Combine step-level (process) and final-answer (outcome) rewards.
# In TRL, a custom GRPO reward function receives the batch of completions
# (plus any extra dataset columns via **kwargs) and returns one score per completion.
from trl import GRPOConfig, GRPOTrainer

def process_reward(completions, num_correct_steps=None, num_steps=None, **kwargs):
    # e.g., fraction of verified-correct reasoning steps (step labels come from your dataset)
    return [c / max(1, n) for c, n in zip(num_correct_steps, num_steps)]

def outcome_reward(completions, is_correct=None, **kwargs):
    return [1.0 if ok else 0.0 for ok in is_correct]

def fused_reward(completions, **kwargs):
    process = process_reward(completions, **kwargs)
    outcome = outcome_reward(completions, **kwargs)
    return [0.5 * p + 0.5 * o for p, o in zip(process, outcome)]

args = GRPOConfig(loss_type="grpo", beta=0.0, steps_per_generation=4, num_generations=4)
trainer = GRPOTrainer(model=..., args=args, processing_class=..., train_dataset=..., reward_funcs=[fused_reward])
trainer.train()
```

## Distillation / Post-training (Background)

### On-Policy Distillation of Language Models

**📜 Paper**: https://huggingface.co/papers/2306.13649

Introduces **GKD** (Generalized Knowledge Distillation), which distills the teacher into the student on the student's own **on-policy** generations, stabilizing instruction tuning and integrating cleanly with RLHF pipelines.
```python
# Simplified sketch: distill by imitating teacher generations with SFT.
# (GKD proper samples sequences from the *student* and uses the teacher as the target
# distribution; see the GKDTrainer sketch below. This is the off-policy baseline.)
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig

# 1) Have the TEACHER generate responses for your prompts
teacher = AutoModelForCausalLM.from_pretrained("teacher-model")
tok = AutoTokenizer.from_pretrained("teacher-model")

def teacher_label(prompt):
    out = teacher.generate(**tok(prompt, return_tensors="pt").to(teacher.device), max_new_tokens=256)
    return tok.decode(out[0], skip_special_tokens=True)

distill_ds = prompts_ds.map(lambda ex: {"prompt": ex["prompt"], "completion": teacher_label(ex["prompt"])})

# 2) Train the STUDENT with SFT on those teacher-labeled pairs
student = AutoModelForCausalLM.from_pretrained("student-model")
args = SFTConfig(max_seq_length=2048, per_device_train_batch_size=4, learning_rate=5e-5)
trainer = SFTTrainer(model=student, args=args, processing_class=tok, train_dataset=distill_ds)
trainer.train()
```
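
The on-policy objective from the paper is available in TRL as [`GKDTrainer`]. A minimal sketch, assuming placeholder student/teacher ids and a conversational (`messages`-format) dataset:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GKDConfig, GKDTrainer

student = AutoModelForCausalLM.from_pretrained("student-model")  # placeholder ids
teacher = AutoModelForCausalLM.from_pretrained("teacher-model")
tokenizer = AutoTokenizer.from_pretrained("student-model")
train_dataset = load_dataset(...)  # conversational "messages"-format dataset

training_args = GKDConfig(
    output_dir="gkd-model",
    lmbda=0.5,  # fraction of batches generated on-policy by the student
    beta=0.5,   # interpolation coefficient of the generalized Jensen-Shannon divergence
)
trainer = GKDTrainer(
    model=student,
    teacher_model=teacher,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)
trainer.train()
```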

## Foundations & Systems (Background)

### Proximal Policy Optimization Algorithms

**📜 Paper**: https://huggingface.co/papers/1707.06347

Foundational **PPO** objective with clipped ratios and minibatch epochs; the baseline for many RL/RLHF variants used in TRL.
```python
# PPO-style clipping expressed through GRPOConfig (one of several PPO descendants in TRL)
from trl import GRPOConfig, GRPOTrainer

args = GRPOConfig(
    loss_type="grpo",
    epsilon=0.2, epsilon_high=0.2,  # classic PPO-style symmetric clip
    beta=0.01,                      # KL coefficient if you want an explicit KL term
    steps_per_generation=4, num_generations=8,
)
trainer = GRPOTrainer(model=..., reward_funcs=..., args=args, processing_class=..., train_dataset=...)
trainer.train()
```
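
For reference, the clipped surrogate objective itself is only a few lines; a minimal PyTorch sketch (tensor names are illustrative and not tied to any TRL internals):

```python
import torch

def ppo_clip_loss(logp_new, logp_old, advantages, clip_eps=0.2):
    # probability ratio r_t = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t), computed in log space
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # PPO maximizes the element-wise minimum of the two terms; negate to get a loss to minimize
    return -torch.min(unclipped, clipped).mean()
```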

### ZeRO: Memory Optimizations Toward Training Trillion-Parameter Models

**📜 Paper**: https://huggingface.co/papers/1910.02054

**ZeRO** partitions optimizer states/gradients/params to scale training efficiently; relevant when configuring DeepSpeed/Accelerate with TRL.
```python
# Most TRL configs inherit from HF TrainingArguments under the hood.
# Supply a DeepSpeed ZeRO config via the "deepspeed" argument (a dict or a path to a JSON file),
# then launch with `accelerate launch` or `deepspeed` so the config is picked up.
from trl import DPOConfig, DPOTrainer

args = DPOConfig(
    loss_type="sigmoid",
    beta=0.1,
    deepspeed="ds_zero_stage2.json",  # path to your ZeRO config
    bf16=True,
)

# ds_zero_stage2.json (very small sketch)
# {
#   "zero_optimization": { "stage": 2 },
#   "train_micro_batch_size_per_gpu": 4,
#   "gradient_accumulation_steps": 8
# }

trainer = DPOTrainer(model=..., args=args, processing_class=..., train_dataset=...)
trainer.train()
```

## Distillation
Papers relating to training a student model with the help of a teacher model.