
Commit 29d2bb0

fix grpo zero3 (#3104)
1 parent 43ee77c commit 29d2bb0

File tree: 14 files changed (+19, -11 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -117,7 +117,7 @@ Running Environment:
| trl | >=0.13,<0.16 | 0.14.0 | RLHF |
| vllm | >=0.5.1 | 0.6.5 | Inference/Deployment/Evaluation |
| lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation |
-| deepspeed | | 0.14.5 | Training |
+| deepspeed | >=0.14 | | Training |

For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).

README_CN.md

Lines changed: 1 addition & 1 deletion
@@ -112,7 +112,7 @@ pip install -e .
| trl | >=0.13,<0.16 | 0.14.0 | RLHF |
| vllm | >=0.5.1 | 0.6.5 | Inference/Deployment/Evaluation |
| lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation |
-| deepspeed | | 0.14.5 | Training |
+| deepspeed | >=0.14 | | Training |

For more optional dependencies, refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)

docs/source/GetStarted/SWIFT安装.md

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ pip install ms-swift==2.*
| trl | >=0.13,<0.16 | 0.14.0 | RLHF |
| vllm | >=0.5.1 | 0.6.5 | Inference/Deployment/Evaluation |
| lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation |
-| deepspeed | | 0.14.5 | Training |
+| deepspeed | >=0.14 | | Training |

For more optional dependencies, refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh)

docs/source/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions
@@ -84,6 +84,7 @@ A conversation between User and Assistant. The user asks a question, and the Ass
Hyperparameters
- num_generations: the number of samples per prompt, the G value in the paper; it needs to be divisible by per_device_eval_batch_size * nproc_per_node
- max_completion_length: the maximum generation length when sampling, default 512
+- ds3_gather_for_generation: applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, which speeds up generation. Disabling it allows training models that exceed a single GPU's VRAM, though generation becomes slower; disabling it is incompatible with vLLM generation. Default is True
- reward_funcs: reward functions that score the model's generations; four rule-based functions are built in (accuracy, format, cosine and repetition), see swift/plugin/orm.py
- reward_weights: the weight of each reward function. Must match the number of reward functions. If None, all rewards are weighted equally with `1.0`
- Note: if `--reward_model` is included in GRPO training, it is appended after the reward functions

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 0 deletions
@@ -365,6 +365,7 @@ The reward model arguments are used in PPO and GRPO.
#### GRPO Arguments
- num_generations: the G value in the GRPO algorithm, default 8
- max_completion_length: the maximum generation length in the GRPO algorithm, default 512
+- ds3_gather_for_generation: applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, which speeds up generation. Disabling it allows training models that exceed a single GPU's VRAM, though generation becomes slower; disabling it is incompatible with vLLM generation. Default is True
- reward_funcs: reward functions for the GRPO algorithm; options are `accuracy`, `format`, `cosine` and `repetition`, see swift/plugin/orm.py. You can also define your own reward function in the plugin. Default is `[]`
- reward_weights: the weight of each reward function. Must match the number of reward functions. If None, all rewards are weighted equally with `1.0`
- Note: if `--reward_model` is included in GRPO training, it is appended after the reward functions

docs/source/Instruction/预训练与微调.md

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@
pip install ms-swift -U

# If using deepspeed zero2/zero3
-pip install deepspeed==0.14.5
+pip install deepspeed -U
```

## Pre-training
@@ -73,7 +73,7 @@ ms-swift uses a layered design; users can use the command-line interface,
- Merge LoRA cannot be applied to models trained with QLoRA, so QLoRA fine-tuning is not recommended: vLLM/LMDeploy inference acceleration cannot be used for inference and deployment. It is recommended to fine-tune with LoRA or full parameters, merge into complete weights, and then use GPTQ/AWQ/BNB for [quantization](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize)
- SWIFT sets `--gradient_checkpointing true` by default during training to save GPU memory, which slightly slows down training.
- If DDP training fails with `RuntimeError: Expected to mark a variable ready only once.`, additionally set `--gradient_checkpointing_kwargs '{"use_reentrant": false}'` or train with DeepSpeed instead.
-- To use deepspeed, you need to install it: `pip install deepspeed==0.14.5`. DeepSpeed saves GPU memory but slightly slows down training.
+- To use deepspeed, you need to install it: `pip install deepspeed -U`. DeepSpeed saves GPU memory but slightly slows down training.
- If your machine has high-performance GPUs such as A100 and the model supports flash-attn, it is recommended to install [flash-attn](https://github.com/Dao-AILab/flash-attention/releases) and set `--attn_impl flash_attn`, which speeds up training and inference and slightly reduces GPU memory usage.

**How to debug:**

docs/source_en/GetStarted/SWIFT-installation.md

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ You can view the image [here](https://modelscope.cn/docs/intro/environment-setup
| trl | >=0.13,<0.16 | 0.14.0 | RLHF |
| vllm | >=0.5.1 | 0.6.5 | Inference/Deployment/Evaluation |
| lmdeploy | lmdeploy>=0.5,<0.6.5 | 0.6.4 | Inference/Deployment/Evaluation |
-| deepspeed | | 0.14.5 | Training |
+| deepspeed | >=0.14 | | Training |

For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions
@@ -376,6 +376,7 @@ The meanings of the following parameters can be referenced [here](https://huggin
#### GRPO Arguments
- num_generations: The G value in the GRPO algorithm, default is 8.
- max_completion_length: The maximum generation length in the GRPO algorithm, default is 512.
+- ds3_gather_for_generation: This parameter applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible with vLLM generation. The default is True.
- reward_funcs: Reward functions in the GRPO algorithm; options include `accuracy`, `format`, `cosine` and `repetition`, as seen in `swift/plugin/orm.py`. You can also customize your own reward functions in the plugin. Default is `[]`.
- reward_weights: Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are weighted equally with weight `1.0`.
- Note: If `--reward_model` is included in GRPO training, it is added to the end of the reward functions.

docs/source_en/Instruction/GRPO.md

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ Hyperparameters

- num_generations: The number of samples for each prompt, referred to as the G value in the paper; it needs to be divisible by per_device_eval_batch_size * nproc_per_node.
- max_completion_length: The maximum length for sampling generation, default is 512.
+- ds3_gather_for_generation: This parameter applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible with vLLM generation. The default is True.
- reward_funcs: Reward functions to score the results generated by the model. Includes built-in accuracy, format, cosine and repetition rule-based functions, detailed in the swift/plugin/orm.py file.
- reward_weights: Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are weighted equally with weight `1.0`.
- Note: If `--reward_model` is included in GRPO training, it is added to the end of the reward functions.
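For intuition about what `ds3_gather_for_generation` controls, here is a minimal, hedged sketch of the underlying mechanism rather than ms-swift's actual code path: under ZeRO-3 every parameter is sharded across ranks, and DeepSpeed's `GatheredParameters` context can temporarily rebuild the full tensors so generation runs at full speed; skipping the gather keeps the weights partitioned, which is slower but never requires the whole model on one GPU. The `ds_engine` argument and the direct `generate` call below are illustrative assumptions.

```python
import deepspeed


def generate_with_optional_gather(ds_engine, input_ids, gather: bool = True):
    """Illustrative sketch of ds3_gather_for_generation for a ZeRO-3 engine.

    `ds_engine` is assumed to be a deepspeed.initialize(...) engine wrapping a
    Hugging Face causal LM; this is not the code used by the GRPO trainer.
    """
    module = ds_engine.module  # the underlying transformers model
    if gather:
        # Temporarily materialize the full (unsharded) weights, then generate.
        with deepspeed.zero.GatheredParameters(list(module.parameters())):
            return module.generate(input_ids, max_new_tokens=512)
    # Without gathering, ZeRO-3 fetches shards layer by layer during the forward
    # pass: much slower generation, but the full model never sits on one GPU.
    return module.generate(input_ids, max_new_tokens=512)
```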

docs/source_en/Instruction/Pre-training-and-Fine-tuning.md

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ Refer to the [SWIFT installation documentation](../GetStarted/SWIFT-installation
pip install ms-swift -U

# If using deepspeed zero2/zero3
-pip install deepspeed==0.14.5
+pip install deepspeed -U
```

## Pre-training
@@ -77,7 +77,7 @@ Additionally, we offer a series of scripts to help you understand the training c
- Merging LoRA for models trained with QLoRA is not possible, so it is not recommended to use QLoRA for fine-tuning, as it cannot utilize vLLM/LMDeploy for inference acceleration during inference and deployment. It is recommended to use LoRA or full parameter fine-tuning, merge them into complete weights, and then use GPTQ/AWQ/BNB for [quantization](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize).
- By default, SWIFT sets `--gradient_checkpointing true` during training to save memory, which may slightly slow down the training speed.
- If you are using DDP for training and encounter the error: `RuntimeError: Expected to mark a variable ready only once.`, please additionally set the parameter `--gradient_checkpointing_kwargs '{"use_reentrant": false}'` or use DeepSpeed for training.
-- To use DeepSpeed, you need to install it: `pip install deepspeed==0.14.5`. Using DeepSpeed can save memory but may slightly reduce training speed.
+- To use DeepSpeed, you need to install it: `pip install deepspeed -U`. Using DeepSpeed can save memory but may slightly reduce training speed.
- If your machine has high-performance GPUs like A100 and the model supports flash-attn, it is recommended to install [flash-attn](https://github.com/Dao-AILab/flash-attention/releases) and set `--attn_impl flash_attn`, as this will accelerate training and inference while slightly reducing memory usage.

**How to debug:**

swift/llm/argument/rlhf_args.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ class PPOArguments:
class GRPOArguments(GRPOArgumentsMixin):
    num_generations: int = 8  # G in the GRPO paper
    max_completion_length: int = 512
+    ds3_gather_for_generation: bool = True
    reward_funcs: List[str] = field(default_factory=list)
    reward_weights: List[float] = None
    log_completions: bool = False
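As a usage illustration (a stand-alone sketch, not ms-swift's actual argument parser), a boolean dataclass field like the one added above is typically exposed as a command-line flag through `HfArgumentParser`-style parsing, so a launch script could pass `--ds3_gather_for_generation false` when ZeRO-3 weights should stay partitioned during generation. The `GRPOArgsSketch` class and `launch.py` name below are hypothetical stand-ins.

```python
from dataclasses import dataclass, field
from typing import List, Optional

from transformers import HfArgumentParser


@dataclass
class GRPOArgsSketch:
    """Stand-in for the fields shown above; not the real GRPOArguments class."""
    num_generations: int = 8
    max_completion_length: int = 512
    ds3_gather_for_generation: bool = True
    reward_funcs: List[str] = field(default_factory=list)
    reward_weights: Optional[List[float]] = None
    log_completions: bool = False


if __name__ == '__main__':
    # e.g. `python launch.py --ds3_gather_for_generation false --num_generations 4`
    (args,) = HfArgumentParser(GRPOArgsSketch).parse_args_into_dataclasses()
    print(args.ds3_gather_for_generation)
```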

swift/llm/sampling/vanilla_sampler.py

Lines changed: 1 addition & 1 deletion
@@ -35,8 +35,8 @@ def __init__(self, *args, **kwargs):
            raise ValueError(f'Cannot find engine name: {self.args.sampler_engine}')
        self.infer_engine = None
        if _Engine:
-            self.template = self.args.get_model_processor(model=self.args.model, load_model=False)
            self.infer_engine = _Engine(self.args.model, model_type=self.args.model_type, **self.args.engine_kwargs)
+            self.infer_engine.default_template = self.template
        self.caches = self.read_cache()

    def read_cache(self):

swift/trainers/rlhf_trainer/grpo_trainer.py

Lines changed: 4 additions & 1 deletion
@@ -11,6 +11,7 @@
from accelerate.utils import broadcast_object_list, gather, gather_object
from transformers import PreTrainedModel
from trl import GRPOTrainer as HFGRPOTrainer
+from trl.models import unwrap_model_for_generation

from swift.llm import InferRequest, RequestConfig, to_device
from swift.plugin.orm import orms
@@ -201,7 +202,9 @@ def _prepare_inputs(self, inputs) -> Dict[str, Union[torch.Tensor, Any]]:
        is_multimodal = self.model.model_meta.is_multimodal
        if is_multimodal:
            models = self.template.remove_post_encode_hook()
-        outputs = self.engine.infer(inputs, self.request_config, use_tqdm=False)
+        with unwrap_model_for_generation(self.model, self.accelerator):
+            # same reference
+            outputs = self.engine.infer(inputs, self.request_config, use_tqdm=False)
        if is_multimodal:
            self.template.register_post_encode_hook(models)
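The core of this fix is the pattern below, shown as a hedged stand-alone sketch rather than the trainer's actual method: the inference engine wraps the same module object as the policy model (the "same reference" comment in the diff), so gathering the ZeRO-3 shards with trl's `unwrap_model_for_generation` for the duration of the `infer` call is enough for generation to see the full weights, and leaving the context re-partitions them before the training step. The `engine` and `infer_requests` names are taken from the diff; the function itself is illustrative.

```python
from trl.models import unwrap_model_for_generation


def infer_with_zero3_gather(model, accelerator, engine, infer_requests, request_config):
    """Illustrative sketch, assuming `engine` wraps the same module object as `model`."""
    # Gather the ZeRO-3 sharded policy weights; because the engine holds a
    # reference to the same module, its forward passes use the gathered weights.
    with unwrap_model_for_generation(model, accelerator):
        outputs = engine.infer(infer_requests, request_config, use_tqdm=False)
    # Exiting the context restores the sharded state before training continues.
    return outputs
```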

swift/trainers/rlhf_trainer/rlhf_mixin.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def _save_load_context(trainer):
    finally:
        deepspeed_model.__dict__['module'] = _old_model
        deepspeed_model._modules['module'] = _old_model
-        trainer.model = deepspeed_model
+        trainer.model = _old_model


class RLHFTrainerMixin:
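To make the one-line change easier to read, here is a hedged, simplified reconstruction of the save/load-context pattern it belongs to, not the exact `_save_load_context` body: the DeepSpeed engine's inner `module` is swapped for the save/load step, and the `finally` block must restore `trainer.model` to the original unwrapped model rather than leave it pointing at the DeepSpeed wrapper. The `tmp_module` argument and the body of the `try` block are assumptions for illustration.

```python
from contextlib import contextmanager


@contextmanager
def save_load_context_sketch(trainer, deepspeed_model, tmp_module):
    """Simplified stand-in: swap the engine's module, then restore everything."""
    _old_model = deepspeed_model.module  # the original, unwrapped model
    try:
        deepspeed_model.__dict__['module'] = tmp_module
        deepspeed_model._modules['module'] = tmp_module
        trainer.model = tmp_module
        yield
    finally:
        deepspeed_model.__dict__['module'] = _old_model
        deepspeed_model._modules['module'] = _old_model
        # The fix: point trainer.model back at the plain model, not the wrapper.
        trainer.model = _old_model
```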
