Merge branch 'main' into release/3.2

Jintao-Huang · Jintao-Huang · commit 47a9b76c42e7 · 2025-03-25T22:49:43.000+08:00
diff --git a/README.md b/README.md
@@ -125,7 +125,7 @@ Running Environment:
 | peft | >=0.11,<0.16 | ||
 | trl | >=0.13,<0.17 | 0.16 |RLHF|
 | deepspeed    | >=0.14       | 0.14.5 | Training                                  |
-| vllm         | >=0.5.1      | 0.7.3       | Inference/Deployment/Evaluation           |
+| vllm         | >=0.5.1,<0.8      | 0.7.3       | Inference/Deployment/Evaluation           |
 | lmdeploy     | >=0.5        | 0.7.2.post1       | Inference/Deployment/Evaluation           |
 | evalscope | >=0.11       |  | Evaluation |
 
diff --git a/README_CN.md b/README_CN.md
@@ -120,7 +120,7 @@ pip install -e .
 | peft | >=0.11,<0.16 | ||
 | trl | >=0.13,<0.17 | 0.16 |RLHF|
 | deepspeed | >=0.14       | 0.14.5 |训练|
-| vllm | >=0.5.1      | 0.7.3 |推理/部署/评测|
+| vllm | >=0.5.1,<0.8      | 0.7.3 |推理/部署/评测|
 | lmdeploy | >=0.5        | 0.7.2.post1 |推理/部署/评测|
 | evalscope | >=0.11       | |评测|
 
diff --git a/docs/source/BestPractices/Embedding训练.md b/docs/source/BestPractices/Embedding训练.md
@@ -63,6 +63,8 @@ loss的源代码可以在[这里](https://github.com/modelscope/ms-swift/blob/ma
 {"query": "<image>sentence1", "response":  "sentence2", "images": "/some/images.jpg", "label": 0}
 ```
 
+评测的指标分别是两个embedding的欧式距离、点积等的pearson系数以及spearman系数，共八个指标。
+
 ### infonce 格式
 
 ```json lines
@@ -82,6 +84,11 @@ infonce loss支持几个环境变量：
 > 也可以在数据集中将hard negatives数量设置为数量相等，这样即使不设置也不会使用for循环方式，加快计算速度
 > rejected_response也可以没有，这种情况下INFONCE_USE_BATCH保持为True，会使用一个batch内部的其他samples作为rejected responses
 
+infonce loss的评测会有下面几个指标：
+- mean_neg 所有hard_negative的平均值
+- mean_pos 所有positive的平均值
+- margin positive-max_hard_negative的平均值
+
 ## 脚手架
 
 SWIFT提供了两个脚手架训练脚本：
diff --git a/docs/source/GetStarted/SWIFT安装.md b/docs/source/GetStarted/SWIFT安装.md
@@ -65,7 +65,7 @@ pip install ms-swift==2.*
 | peft | >=0.11,<0.16 | ||
 | trl | >=0.13,<0.17 | 0.16 |RLHF|
 | deepspeed | >=0.14       | 0.14.5 |训练|
-| vllm | >=0.5.1      | 0.7.3 |推理/部署/评测|
+| vllm | >=0.5.1,<0.8      | 0.7.3 |推理/部署/评测|
 | lmdeploy | >=0.5        | 0.7.2.post1 |推理/部署/评测|
 | evalscope | >=0.11       | |评测|
 
diff --git a/docs/source/Instruction/GRPO.md b/docs/source/Instruction/GRPO.md
@@ -116,8 +116,8 @@ A conversation between User and Assistant. The user asks a question, and the Ass
 - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数，默认为False
 - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身，默认为False
 - gc_collect_after_offload: 是否在offload结束时进行gc（python gc和GPU gc），默认为False
-- mini_batch_size：用于将每个设备上的批次大小（per_device_batch）进一步切分为更小的子批次。为确保切分有效，per_device_batch 需要能够被 mini_batch_size 整除。
-
+- multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现
+- mini_batch_size：用于将每个设备上的批次大小（per_device_batch）进一步切分为更小的子批次。为确保切分有效，per_device_batch 需要能够被 mini_batch_size 整除
 
 奖励函数超参，见[内置奖励函数](#内置奖励函数)
 
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -411,7 +411,8 @@ reward模型参数将在PPO、GRPO中使用。
 - offload_optimizer: 是否在vLLM/LMDeploy推理时offload optimizer参数，默认为False
 - offload_model: 是否在vLLM/LMDeploy推理时offload 模型本身，默认为False
 - gc_collect_after_offload: 是否在offload结束时进行gc（python gc和GPU gc），默认为False
-- mini_batch_size：用于将每个设备上的批次大小（per_device_batch）进一步切分为更小的子批次。为确保切分有效，per_device_train_batch_size 需要能够被 mini_batch_size 整除。
+- multi_turn_func: 多轮GRPO参数, 传入对应的plugin名称, 同时在plugin/multi_turn.py中添加好对应的实现
+- mini_batch_size：用于将每个设备上的批次大小（per_device_batch）进一步切分为更小的子批次。为确保切分有效，per_device_train_batch_size 需要能够被 mini_batch_size 整除
 
 cosine 奖励参数
 - cosine_min_len_value_wrong：cosine 奖励函数参数，生成错误答案时，最小长度对应的奖励值。默认值为0.0
diff --git a/docs/source_en/BestPractices/Embedding.md b/docs/source_en/BestPractices/Embedding.md
@@ -52,6 +52,8 @@ The source code for the loss functions can be found [here](https://github.com/mo
 {"query": "sentence1", "response": "<image>sentence2", "images": ["/some/images1.jpg"], "label": 0.7}
 ```
 
+The evaluation metrics are the Pearson and Spearman rank correlation coefficients between the labels and the embeddings' cosine similarity, Euclidean distance, Manhattan distance, and dot product, for a total of eight values.
+
 ### Format for Contrastive/Online Contrastive Loss
 
 ```json lines
@@ -82,6 +84,10 @@ InfoNCE loss supports the following environment variables:
 >
 > `rejected_response` can also be omitted. In this case, `INFONCE_USE_BATCH` remains `True` and will use other samples within the batch as rejected responses.
 
+The evaluation of InfoNCE loss includes the following metrics:
+- mean_neg: The average similarity score of all negative pairs
+- mean_pos: The average similarity score of all positive pairs
+- margin: The average of (positive score - highest negative score)
 
 ## Scaffolding
 
diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md
@@ -66,7 +66,7 @@ You can view the image [here](https://modelscope.cn/docs/intro/environment-setup
 | peft         | >=0.11,<0.16 |             |                                           |
 | trl          | >=0.13,<0.17 | 0.16      | RLHF                                      |
 | deepspeed    | >=0.14       | 0.14.5 | Training                                  |
-| vllm         | >=0.5.1      | 0.7.3       | Inference/Deployment/Evaluation           |
+| vllm         | >=0.5.1,<0.8      | 0.7.3       | Inference/Deployment/Evaluation           |
 | lmdeploy     | >=0.5        | 0.7.2.post1       | Inference/Deployment/Evaluation           |
 | evalscope | >=0.11       | | Evaluation |
 
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -422,7 +422,9 @@ The meanings of the following parameters can be referenced [here](https://huggin
 - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`.
 - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`.
 - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`.
-- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs be divisible by mini_batch_size.
+- multi_turn_func: The name of the multi-turn GRPO plugin to use. Add the corresponding multi-turn implementation in plugin/multi_turn.py.
+- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs to be divisible by mini_batch_size.
+
 cosine reward function arguments
 - `cosine_min_len_value_wrong` (default: 0.0): Reward value corresponding to the minimum length when the answer is incorrect. Default is 0.0
 - `cosine_max_len_value_wrong` (default: -0.5): Reward value corresponding to the maximum length when the answer is incorrect. Default is -0.5
diff --git a/docs/source_en/Instruction/GRPO.md b/docs/source_en/Instruction/GRPO.md
@@ -118,8 +118,8 @@ Hyperparameters
 - offload_optimizer: Whether to offload optimizer parameters during inference with vLLM/LMDeploy. The default is `False`.
 - offload_model: Whether to offload the model itself during inference with vLLM/LMDeploy. The default is `False`.
 - gc_collect_after_offload: Whether to perform garbage collection (both Python GC and GPU GC) after offloading. The default is `False`.
-- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs be divisible by mini_batch_size.
-
+- multi_turn_func: The name of the multi-turn GRPO plugin to use. Add the corresponding multi-turn implementation in plugin/multi_turn.py.
+- mini_batch_size: Used to further split the batch size on each device (per_device_batch) into smaller sub-batches. To ensure the split is valid, per_device_train_batch_size needs to be divisible by mini_batch_size.
 
 The hyperparameters for the reward function can be found in the [Built-in Reward Functions section](#built-in-reward-functions).
 
diff --git a/requirements/install_all.sh b/requirements/install_all.sh
@@ -1,6 +1,6 @@
 # please use python=3.10, cuda12.*
 # sh requirements/install_all.sh
-pip install "vllm>=0.5.1" -U
+pip install "vllm>=0.5.1,<0.8" -U
 pip install "lmdeploy>=0.5" -U --no-deps
 pip install autoawq -U --no-deps
 pip install auto_gptq optimum bitsandbytes -U
diff --git a/swift/llm/infer/infer_engine/utils.py b/swift/llm/infer/infer_engine/utils.py
@@ -470,12 +470,12 @@ def new_group_context():
 
 @contextmanager
 def set_device_context(device: Union[str, int]):
-    original_device = torch.cuda.current_device()
-    torch.cuda.set_device(device)
+    origin_device = get_current_device()
+    set_device(device)
     try:
         yield
     finally:
-        torch.cuda.set_device(original_device)
+        set_device(origin_device)
 
 
 @contextmanager
diff --git a/swift/plugin/loss.py b/swift/plugin/loss.py
@@ -83,7 +83,10 @@ def loss_scale_func(outputs, labels, loss_scale=None, num_items_in_batch=None) -
 
 
 def _parse_pair_sentence(outputs):
-    last_hidden_state = outputs['last_hidden_state']
+    if isinstance(outputs, dict):
+        last_hidden_state = outputs['last_hidden_state']
+    else:
+        last_hidden_state = outputs
     batch_size = last_hidden_state.shape[0]
     shape_len = len(last_hidden_state.shape)
     first_sentence = list(range(0, batch_size, 2))
@@ -126,6 +129,114 @@ def contrastive_loss(outputs, labels, loss_scale=None, num_items_in_batch=None)
     return losses.mean()
 
 
+def calculate_paired_metrics(embeddings, labels):
+    """Correlate the similarity of each sentence pair with its gold label.
+
+    Args:
+        embeddings: Stacked sentence embeddings (or a dict holding them under 'last_hidden_state');
+            `_parse_pair_sentence` splits them into the two sides of every pair.
+        labels: One gold score per sentence pair.
+
+    Returns:
+        Dict with `pearson_<m>` and `spearman_<m>` for m in cosine, euclidean, manhattan, dot_product.
+    """
+    from scipy.stats import pearsonr, spearmanr
+    from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, \
+        paired_manhattan_distances
+
+    embeddings1, embeddings2 = _parse_pair_sentence(embeddings)
+    # Distances are negated so that, as with the similarities, larger means "more alike".
+    scores = {
+        'cosine': 1 - paired_cosine_distances(embeddings1, embeddings2),
+        'euclidean': -paired_euclidean_distances(embeddings1, embeddings2),
+        'manhattan': -paired_manhattan_distances(embeddings1, embeddings2),
+        'dot_product': [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)],
+    }
+
+    # Key every coefficient by the measure it was actually computed from; the euclidean and
+    # manhattan results must not be reported under each other's names.
+    metrics = {}
+    for prefix, correlate in (('pearson', pearsonr), ('spearman', spearmanr)):
+        for name, values in scores.items():
+            coefficient, _ = correlate(labels, values)
+            metrics[f'{prefix}_{name}'] = coefficient
+
+    return metrics
+
+
+def calculate_infonce_metrics(embeddings, labels):
+    """Summarise positive/negative similarity scores for an InfoNCE evaluation run.
+
+    `_parse_multi_negative_sentences` splits the embeddings into groups; row 0 of a group is used as
+    the anchor, row 1 as its positive and later rows as hard negatives. Scores are raw dot products,
+    and with INFONCE_USE_BATCH enabled (the default) other groups' candidates count as negatives too.
+
+    Returns:
+        Dict with `margin` (mean of positive minus strongest negative), `mean_neg` and `mean_pos`.
+    """
+    # NOTE(review): the env value reaches the helper as a str -- confirm it does not expect an int.
+    hard_negatives = os.environ.get('INFONCE_HARD_NEGATIVES', None)
+    use_batch = strtobool(os.environ.get('INFONCE_USE_BATCH', 'True'))
+    split_tensors = _parse_multi_negative_sentences(torch.tensor(embeddings), torch.tensor(labels), hard_negatives)
+    split_tensors = [t.numpy() for t in split_tensors]
+    can_batched = hard_negatives is not None or len({s.shape[0] for s in split_tensors}) == 1
+    all_similarity_matrix, all_labels = [], []
+    pos_neg_margins = []
+    if not use_batch:
+        if can_batched:
+            sentences = np.stack(split_tensors, axis=0)
+            similarity_matrix = np.matmul(sentences[:, 0:1], sentences[:, 1:].transpose((0, 2, 1))).squeeze(1)
+            all_similarity_matrix.append(similarity_matrix)
+            labels = np.zeros_like(similarity_matrix)
+            labels[:, 0] = 1
+            all_labels.append(labels)
+        else:
+            for tensor in split_tensors:
+                similarity_matrix = np.matmul(tensor[0], tensor[1:].T)
+                all_similarity_matrix.append(similarity_matrix)
+                labels = np.zeros_like(similarity_matrix)
+                labels[0] = 1
+                all_labels.append(labels)
+                max_neg_scores = np.max(similarity_matrix[labels == 0], axis=-1)
+                pos_neg_margins.append(np.mean(similarity_matrix[labels == 1] - max_neg_scores).item())
+    else:
+        if can_batched:
+            sentences = np.stack(split_tensors, axis=0)
+            similarity_matrix = np.matmul(sentences[:, 0], sentences[:, 1:].reshape(-1, sentences.shape[2]).T)
+            all_similarity_matrix.append(similarity_matrix)
+            labels = np.zeros_like(similarity_matrix)
+            for row, col in enumerate(range(0, sentences.shape[0] * (sentences.shape[1] - 1), sentences.shape[1] - 1)):
+                labels[row, col] = 1
+            all_labels.append(labels)
+        else:
+            sentences = np.concatenate([tensor[1:] for tensor in split_tensors], axis=0)
+            length = 0
+            for tensor in split_tensors:
+                similarity_matrix = np.matmul(tensor[0], sentences.T)
+                all_similarity_matrix.append(similarity_matrix)
+                labels = np.zeros_like(similarity_matrix)
+                labels[length] = 1
+                all_labels.append(labels)
+                length += tensor.shape[0] - 1
+                max_neg_scores = np.max(similarity_matrix[labels == 0], axis=-1)
+                pos_neg_margins.append(np.mean(similarity_matrix[labels == 1] - max_neg_scores).item())
+
+    similarity_matrix = np.concatenate(all_similarity_matrix, axis=0)
+    labels = np.concatenate(all_labels, axis=0)
+    if can_batched:
+        pos_scores = similarity_matrix[labels == 1].reshape(similarity_matrix.shape[0], -1)
+        neg_scores = similarity_matrix[labels == 0].reshape(similarity_matrix.shape[0], -1)
+        max_neg_scores = np.max(neg_scores, axis=-1)
+        # One positive per row: index it so (N, 1) - (N,) does not broadcast into an N x N matrix.
+        pos_neg_margin = np.mean(pos_scores[:, 0] - max_neg_scores).item()
+    else:
+        pos_scores = similarity_matrix[labels == 1]
+        neg_scores = similarity_matrix[labels == 0]
+        pos_neg_margin = np.mean(pos_neg_margins)
+
+    return {'margin': pos_neg_margin, 'mean_neg': np.mean(neg_scores), 'mean_pos': np.mean(pos_scores)}
+
+
 def _parse_multi_negative_sentences(sentences, labels, hard_negatives=None):
     split_indices = torch.nonzero(labels, as_tuple=False).squeeze().tolist()
     if isinstance(split_indices, int):
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -630,7 +630,7 @@ def reorder_outputs(outputs, distributed_idx):
 
         return [index_to_output[idx] for idx in sorted(index_to_output.keys())]
 
-    def _infer_multi_turn(self, inputs_slice, request_config) -> List[List[Dict[str, Any]]]:
+    def _infer_multi_turn(self, inputs_slice, request_config) -> List[List[List[Dict[str, Any]]]]:
         from swift.llm.infer.protocol import ChatCompletionResponse
         rank, _, _, _ = get_dist_setting()
         request_config = copy(request_config)
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
@@ -71,48 +71,11 @@ def __init__(self, *args, **kwargs):
         self.label_names = ['labels']
 
     def calculate_metric(self, eval_prediction: EvalPrediction) -> Dict[str, float]:
-        from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, \
-            paired_manhattan_distances
-        from scipy.stats import pearsonr, spearmanr
-
-        embeddings = eval_prediction.predictions
-        labels = eval_prediction.label_ids
-        batch_size = 2 * self.args.per_device_eval_batch_size
-        half_batch_size = self.args.per_device_eval_batch_size
-        embeddings1 = []
-        embeddings2 = []
-        for i in range(embeddings.shape[0] // batch_size):
-            embeddings1.append(embeddings[i * batch_size:i * batch_size + half_batch_size])
-            embeddings2.append(embeddings[i * batch_size + half_batch_size:(i + 1) * batch_size])
-
-        embeddings1 = np.concatenate(embeddings1)
-        embeddings2 = np.concatenate(embeddings2)
-        if len(embeddings1.shape) == 3:
-            embeddings1 = embeddings1[:, 0]
-            embeddings2 = embeddings2[:, 0]
-        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
-        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
-        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
-        dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
-
-        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
-        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)
-
-        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
-        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)
-
-        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
-        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)
-
-        eval_pearson_dot, _ = pearsonr(labels, dot_products)
-        eval_spearman_dot, _ = spearmanr(labels, dot_products)
-
-        return {
-            'cosine': eval_spearman_cosine,
-            'euclidean': eval_pearson_euclidean,
-            'manhattan': eval_pearson_manhattan,
-            'dot_product': eval_spearman_dot,
-        }
+        from swift.plugin.loss import infonce_loss, calculate_paired_metrics, calculate_infonce_metrics
+        if self.compute_loss_func is infonce_loss:
+            return calculate_infonce_metrics(eval_prediction.predictions, eval_prediction.label_ids)
+        else:
+            return calculate_paired_metrics(eval_prediction.predictions, eval_prediction.label_ids)
 
 
 class Seq2SeqTrainer(SwiftMixin, HfSeq2SeqTrainer):