Added torchmetrics as a dependency and fixed loss computation for the DDP case.

Meet Patel · quic-meetkuma · commit 5a7999f0387b · 2025-04-09T16:46:04.000+05:30
Signed-off-by: Meet Patel <quic_meetkuma@quicinc.com>
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
@@ -300,8 +300,18 @@ def train(
             else:
                 train_epoch_loss = total_loss / len(train_dataloader)
 
+        if train_config.enable_ddp:
+            # Get the correct train loss from all the nodes.
+            dist.barrier()
+            dist.all_reduce(train_epoch_loss, op=dist.ReduceOp.SUM)
+            train_epoch_loss /= dist.get_world_size()
+            
         if train_config.task_type == "seq_classification":
-            train_perplexity = acc_helper.compute()
+            accuracy = acc_helper.compute()
+            if train_config.enable_ddp:
+                dist.all_reduce(accuracy, op=dist.ReduceOp.SUM)
+                accuracy /= dist.get_world_size()
+            train_perplexity = accuracy
         else:
             train_perplexity = torch.exp(train_epoch_loss)
 
@@ -319,6 +329,7 @@ def train(
                 )
                 dist.barrier()
                 dist.all_reduce(eval_epoch_loss, op=dist.ReduceOp.SUM)
+                eval_epoch_loss /= dist.get_world_size()
                 if local_rank == 0:
                     tensorboard_updates.add_scalars("loss", {"eval": eval_epoch_loss}, total_train_steps)
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ license = { file = "LICENSE" }
 authors = [{ name = "Qualcomm Cloud AI ML Team" }]
 keywords = ["transformers", "Cloud AI 100", "Inference"]
 classifiers = [
-    "Programming Language :: Python :: 3", 
+    "Programming Language :: Python :: 3",
     "Development Status :: 5 - Development/Unstable",
     "Intended Audience :: Developers",
     "Intended Audience :: Education",
@@ -38,6 +38,7 @@ dependencies = [
     "tensorboard",
     "fire",
     "py7zr",
+    "torchmetrics==1.7.0",
     "torch==2.4.1; platform_machine=='aarch64'",
     # Specifying torch cpu package URL per python version, update the list once pytorch releases whl for python>3.11
     "torch@https://download.pytorch.org/whl/cpu/torch-2.4.1%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' and platform_machine=='x86_64'",
@@ -60,7 +61,7 @@ namespaces = false
 
 [tool.setuptools.dynamic.version]
 attr = "QEfficient.__version__"
- 
+
 [tool.ruff]
 line-length = 120
 # Enable the isort rules.