Trainer: support eval_accumulation_steps (PaddlePaddle#5426)
ZHUI authored Mar 27, 2023
1 parent f98fb04 commit 07a5874
Showing 3 changed files with 38 additions and 0 deletions.
10 changes: 10 additions & 0 deletions docs/trainer.md
@@ -249,6 +249,16 @@ Trainer is a simple but feature-complete Paddle training and evaluation module, and
Number of update steps to accumulate before
performing a backward/update pass. (default: 1)

--eval_accumulation_steps
The number of prediction steps for which to accumulate output tensors before moving the results to the CPU.
If unset, all predictions are accumulated on the GPU before being moved to the CPU (faster, but uses more GPU memory).
(`int`, optional, defaults to None, i.e. unset; a usage sketch follows this excerpt)

Number of prediction steps to accumulate the output tensors for
before moving the results to the CPU. If left unset, the whole set of predictions is
accumulated on GPU before being moved to the CPU (faster but requires more memory)
(default: None)

--learning_rate
The initial learning rate for the optimizer. (`float`, optional, defaults to 5e-05)

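For reference, a minimal sketch of setting this flag through PdArgumentParser, the dataclass argument parser used by PaddleNLP example scripts; the script name and values here are illustrative, not from this commit:

```python
from paddlenlp.trainer import PdArgumentParser, TrainingArguments

# Invoked e.g. as: python run_eval.py --output_dir ./out --eval_accumulation_steps 4
parser = PdArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses()

# None (the default) accumulates the whole prediction set on GPU;
# an integer N flushes the accumulated tensors to the CPU every N eval steps.
print(training_args.eval_accumulation_steps)
```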
20 changes: 20 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -1746,8 +1746,28 @@ def evaluation_loop(
                    logits = self.preprocess_logits_for_metrics(logits, labels)
                preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
            self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)

            # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
            if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
                if losses_host is not None:
                    losses = nested_numpify(losses_host)
                    all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
                if preds_host is not None:
                    logits = nested_numpify(preds_host)
                    all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)

                if labels_host is not None:
                    labels = nested_numpify(labels_host)
                    all_labels = (
                        labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
                    )

                # Set back to None to begin a new accumulation
                losses_host, preds_host, labels_host = None, None, None

            if max_eval_iters > 0 and step >= max_eval_iters - 1:
                break

        # Gather all remaining tensors and put them back on the CPU
        if losses_host is not None:
            losses = nested_numpify(losses_host)
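To see why the periodic flush matters, a back-of-envelope sketch (all sizes hypothetical) of the device memory held by accumulated fp32 logits with and without the flag:

```python
# Hypothetical eval setup: 10,000 examples, batch size 8, logits of shape
# [batch, seq_len, vocab] in fp32 (4 bytes per element).
batch_size, seq_len, vocab = 8, 512, 50000
bytes_per_batch = batch_size * seq_len * vocab * 4  # 819.2 MB of logits per batch

num_batches = 10000 // batch_size  # 1250 eval steps
# eval_accumulation_steps unset: every batch stays on device until the loop ends.
print(f"unset:    {bytes_per_batch * num_batches / 2**30:.1f} GiB")  # ~953.7 GiB
# eval_accumulation_steps=10: at most 10 batches live on device at once.
print(f"steps=10: {bytes_per_batch * 10 / 2**30:.1f} GiB")           # ~7.6 GiB
```

Either way the CPU-side numpy arrays grow to the full eval-set size; the flag only bounds the device-side buffers between flushes.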
8 changes: 8 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -109,6 +109,10 @@ class TrainingArguments:
            </Tip>
        eval_accumulation_steps (`int`, *optional*):
            Number of prediction steps to accumulate the output tensors for, before moving the results to the CPU. If
            left unset, the whole set of predictions is accumulated on GPU before being moved to the CPU (faster but
            requires more memory).
        learning_rate (`float`, *optional*, defaults to 5e-5):
            The initial learning rate for the [`AdamW`] optimizer.
        weight_decay (`float`, *optional*, defaults to 0):
@@ -302,6 +306,10 @@ class TrainingArguments:
        default=1,
        metadata={"help": "Number of update steps to accumulate before performing a backward/update pass."},
    )
    eval_accumulation_steps: Optional[int] = field(
        default=None,
        metadata={"help": "Number of prediction steps to accumulate before moving the tensors to the CPU."},
    )

    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
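A minimal end-to-end sketch of how the new field reaches evaluation_loop; my_model and my_eval_dataset are placeholders, not objects from this repository:

```python
from paddlenlp.trainer import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./out",
    eval_accumulation_steps=10,  # flush device tensors to the CPU every 10 prediction steps
)
# my_model / my_eval_dataset are hypothetical stand-ins for a real model and dataset.
trainer = Trainer(model=my_model, args=args, eval_dataset=my_eval_dataset)
metrics = trainer.evaluate()  # runs the evaluation_loop shown above with periodic CPU offload
```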
