[QEff. Finetune]: Added support to sync gradients across devices during optimizer step only. (#477)

quic-meetkuma · web-flow · commit 3aaa2d8986ad · 2025-07-09T15:47:25.000+05:30
Disabling gradient is necessary when using gradient_accumulation_step &gt;
1 with ddp enabled.
Currently, we are syncing gradient at every loss.backward() call, which
is called at all steps. When using gradient accumulation, the weight
update during opt.step() step. Only during that step, the gradients
across each devices should sync with each other.

with model.no_sync() --&gt; context manager solves this issue.

Here, we are not using it but instead setting
ddp_model.require_backward_grad_sync to True or False depending on which
step we are.

---------

Signed-off-by: Meet Patel &lt;meetkuma@qti.qualcomm.com&gt;
Signed-off-by: meetkuma &lt;meetkuma@qti.qualcomm.com&gt;
diff --git a/QEfficient/finetune/utils/helper.py b/QEfficient/finetune/utils/helper.py
@@ -5,6 +5,15 @@
 #
 # -----------------------------------------------------------------------------
 import os
+from contextlib import nullcontext
+
+import torch
+
+try:
+    import torch_qaic.debug as qaic_debug  # noqa: F401
+except ImportError as e:
+    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+
 
 TASK_TYPE = ["generation", "seq_classification"]
 PEFT_METHOD = ["lora"]
@@ -14,3 +23,34 @@
 
 def get_num_ddp_devices():
     return int(os.getenv("WORLD_SIZE", 1))
+
+
+def get_autocast_ctx(use_autocast, device_type, dtype=torch.float16):
+    return torch.autocast(device_type=device_type, dtype=dtype) if use_autocast else nullcontext()
+
+
+def get_op_verifier_ctx(
+    use_op_by_op_verifier,
+    train_device,
+    dump_dir,
+    step,
+    ref_device="cpu",
+    ref_dtype=torch.float32,
+    atol=1e-1,
+    rtol=1e-5,
+    use_ref_output_on_mismatch=True,
+):
+    if not use_op_by_op_verifier:
+        return nullcontext()
+
+    filter_config = qaic_debug.DispatchFilterConfig.default(train_device)
+    dump_dir = dump_dir + "_" + str(step)
+    return qaic_debug.OpByOpVerifierMode(
+        ref_device=ref_device,
+        ref_dtype=ref_dtype,
+        atol=atol,
+        rtol=rtol,
+        use_ref_output_on_mismatch=use_ref_output_on_mismatch,
+        filter_config=filter_config,
+        dump_root_dir=dump_dir,
+    )
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
@@ -8,8 +8,8 @@
 import json
 import os
 import time
-from contextlib import nullcontext
 from datetime import datetime
+from functools import partial
 from typing import Dict, List, Tuple
 
 import torch
@@ -19,6 +19,7 @@
 from tqdm import tqdm
 
 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.helper import get_autocast_ctx, get_op_verifier_ctx
 
 try:
     import torch_qaic  # noqa: F401
@@ -110,6 +111,9 @@ def train(
             num_classes = model.classifier.out_features
         acc_helper = torchmetrics.classification.MulticlassAccuracy(num_classes=num_classes).to(device)
 
+    autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16)
+    op_verifier_ctx = partial(get_op_verifier_ctx, train_config.opByOpVerifier, device, train_config.dump_root_dir)
+
     # Start the training loop
     for epoch in range(train_config.num_epochs):
         if loss_0_counter.item() == train_config.convergence_counter:
@@ -174,60 +178,38 @@ def train(
                 break
             batch = {k: v.to(device) for k, v in batch.items()}  # move the batch elements to qaic device
 
-            with (
-                torch.autocast(device_type=device_type, dtype=torch.float16)
-                if train_config.use_autocast
-                else nullcontext()
-            ):
-                # an additional condition can be put here to avoid opByOpVerifier getting triggered for each step
-                if train_config.opByOpVerifier:
-                    with qaic_debug.OpByOpVerifierMode(
-                        ref_device="cpu",
-                        ref_dtype=torch.float32,
-                        # adjust atol & rtol this as required
-                        atol=1e-1,
-                        use_ref_output_on_mismatch=True,
-                        filter_config=qaic_debug.DispatchFilterConfig.default(device),
-                        dump_root_dir=train_config.dump_root_dir + str(step),
-                    ) as verifier:
-                        model_outputs = model(**batch)
-                        loss = model_outputs.loss  # Forward call
-                        if (batch["labels"] != -100).sum() == 0:
-                            loss = loss.nan_to_num(nan=0.0)
-                            num_dummy_samples += train_config.train_batch_size
-                        else:
-                            num_dummy_samples_per_batch = (
-                                (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
-                            )
-                            if num_dummy_samples_per_batch > 0:
-                                num_dummy_samples += num_dummy_samples_per_batch
-                                loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
-
-                        if train_config.task_type == "seq_classification":
-                            logits = model_outputs.logits
-                            labels = batch["labels"][:, 0]
-                            preds = torch.nn.functional.softmax(logits, dim=-1)
-                            acc_helper.forward(preds, labels)
-                    print("Mismatches detected:", verifier.get_perop_mismatch_count())
+            is_optimizer_step = (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len(
+                train_dataloader
+            ) - 1
+            if train_config.enable_ddp:
+                # Below block derived from : https://github.com/karpathy/nanoGPT/blob/93a43d9a5c22450bbf06e78da2cb6eeef084b717/train.py#L293
+                # in DDP training we only need to sync gradients at the last micro step.
+                # the official way to do this is with model.no_sync() context manager, but
+                # using too many context managers may bloat the code and forces us to repeat code
+                # looking at the source of that context manager, it just toggles this variable
+                model.require_backward_grad_sync = is_optimizer_step
+
+            with autocast_ctx, op_verifier_ctx(step) as verifier:
+                model_outputs = model(**batch)
+                loss = model_outputs.loss  # Forward call
+                if (batch["labels"] != -100).sum() == 0:
+                    loss = loss.nan_to_num(nan=0.0)
+                    num_dummy_samples += train_config.train_batch_size
                 else:
-                    model_outputs = model(**batch)
-                    loss = model_outputs.loss  # Forward call
-                    if (batch["labels"] != -100).sum() == 0:
-                        loss = loss.nan_to_num(nan=0.0)
-                        num_dummy_samples += train_config.train_batch_size
-                    else:
-                        num_dummy_samples_per_batch = (
-                            (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
-                        )
-                        if num_dummy_samples_per_batch > 0:
-                            num_dummy_samples += num_dummy_samples_per_batch
-                            loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
+                    num_dummy_samples_per_batch = (
+                        (torch.sum(batch["labels"] == -100, dim=1) == batch["labels"].shape[1]).sum().item()
+                    )
+                    if num_dummy_samples_per_batch > 0:
+                        num_dummy_samples += num_dummy_samples_per_batch
+                        loss = loss * train_config.train_batch_size / num_dummy_samples_per_batch
 
-                    if train_config.task_type == "seq_classification":
-                        logits = model_outputs.logits
-                        labels = batch["labels"][:, 0]
-                        preds = torch.nn.functional.softmax(logits, dim=-1)
-                        acc_helper.forward(preds, labels)
+                if train_config.task_type == "seq_classification":
+                    logits = model_outputs.logits
+                    labels = batch["labels"][:, 0]
+                    preds = torch.nn.functional.softmax(logits, dim=-1)
+                    acc_helper.forward(preds, labels)
+            if train_config.opByOpVerifier:
+                print("Mismatches detected:", verifier.get_perop_mismatch_count())
 
             total_loss += loss.detach().float()
 
@@ -274,7 +256,7 @@ def train(
             else:
                 loss.backward()  # backward pass
 
-            if (step + 1) % train_config.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+            if is_optimizer_step:
                 if train_config.grad_scaler:
                     scaler.step(optimizer)
                     scaler.update()
@@ -468,6 +450,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
     device_type = torch.device(device).type
 
     num_dummy_samples = 0
+    autocast_ctx = get_autocast_ctx(train_config.use_autocast, device_type, dtype=torch.float16)
     for step, batch in enumerate(tqdm(eval_dataloader, colour="green", desc="evaluating Epoch", dynamic_ncols=True)):
         #  stop when the maximum number of eval steps is reached
         if train_config.max_eval_step > 0 and step > train_config.max_eval_step:
@@ -478,11 +461,7 @@ def evaluation_helper(model, train_config, eval_dataloader, device):
         # Ensure no gradients are computed for this scope to save memory
         with torch.no_grad():
             # Forward pass and compute loss
-            with (
-                torch.autocast(device_type=device_type, dtype=torch.float16)
-                if train_config.use_autocast
-                else nullcontext()
-            ):
+            with autocast_ctx:
                 outputs = model(**batch)
             loss = outputs.loss