Initial GRPO exps on the Numina dataset

huggingface · Feb 10, 2025 · c4d3043 · c4d3043
1 parent 0da0f7c
commit c4d3043
Show file tree

Hide file tree

Showing 21 changed files with 1,136 additions and 0 deletions.
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.00.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.00.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 4096
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 2
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, checkpoint saving, Hub upload and eval callbacks
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.00
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.00
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.01.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.01.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 4096
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 2
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, saving, Hub upload, eval callbacks (only the names differ from config_v00.00.yaml)
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.01
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.01
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.02.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.02.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 4096
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 8  # 2 in config_v00.00.yaml
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, checkpoint saving, Hub upload and eval callbacks
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.02
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.02
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.03.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.03.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 9192  # 4096 in config_v00.00.yaml; NOTE(review): 8192 intended? confirm
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 8  # 2 in config_v00.00.yaml
+per_device_train_batch_size: 4  # 8 in config_v00.00.yaml
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, checkpoint saving, Hub upload and eval callbacks
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.03
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.03
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.04.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.04.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 16368  # 4096 in config_v00.00.yaml; NOTE(review): 16384 intended? confirm
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 8  # 2 in config_v00.00.yaml
+per_device_train_batch_size: 2  # 8 in config_v00.00.yaml
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, checkpoint saving, Hub upload and eval callbacks
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.04
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.04
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.05.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.05.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 4096
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 2
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, saving, Hub upload, eval callbacks (only the names differ from config_v00.00.yaml)
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.05
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.05
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa
diff --git a/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.06.yaml b/recipes/DeepSeek-R1-Distill-Qwen-1.5B/grpo/config_v00.06.yaml
@@ -0,0 +1,53 @@
+# Model arguments: base checkpoint, revision, dtype and attention backend
+model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data and process-count arguments
+dataset_name: AI-MO/NuminaMath-TIR
+dataset_configs: [all]
+num_processes: 7  # same value as num_generations below
+ddp_find_unused_parameters: false
+
+# GRPO trainer config: vLLM generation and sequence-length limits
+use_vllm: true
+vllm_device: auto
+vllm_gpu_memory_utilization: 0.7
+num_generations: 7
+max_prompt_length: 1024
+max_completion_length: 4096
+
+# GRPO trainer config: precision, batch sizes, optimiser schedule, eval switches
+bf16: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs: {use_reentrant: false}
+gradient_accumulation_steps: 8
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 4
+learning_rate: 1.0e-06  # dot + signed exponent keeps this a float under YAML 1.1
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+max_steps: -1
+num_train_epochs: 0.1
+do_eval: false
+eval_strategy: 'no'  # quoted: a bare no is a boolean in YAML 1.1
+seed: 42
+
+# Logging, saving, Hub upload, eval callbacks (only the names differ from config_v00.02.yaml)
+log_level: info
+logging_strategy: steps
+logging_steps: 1
+report_to: [wandb]
+output_dir: data/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.06
+overwrite_output_dir: true
+save_strategy: steps
+save_steps: 25
+push_to_hub: true
+hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-1.5-GRPO-v00.06
+hub_strategy: every_save
+callbacks: [push_to_hub_revision]
+benchmarks:
+  - math_500
+  - aime24
+  - gpqa