Commit 23a3cef

nariaki3551 authored and pytorchmergebot committed
[c10d] Add _allgather_base, reduce_scatter, and _reduce_scatter_base into ProcessGroupMPI to enable FSDP with MPI backend (pytorch#150162)
This PR implements _allgather_base, reduce_scatter, and _reduce_scatter_base in the MPI backend (ProcessGroupMPI), enabling support for Fully Sharded Data Parallel (FSDP) in environments that use MPI for distributed communication.

### Context

As noted in pytorch#85628, FSDP currently supports only the NCCL backend. Due to this limitation, FSDP cannot run on legacy HPC environments or clusters that rely on MPI. Implementing just these three collective operations is enough to enable FSDP with the MPI backend. The collectives are implemented in a similar manner to existing operations such as allgather.

### Testing

We validated this PR with pytorch/build/bin/ProcessGroupMPITest under OpenMPI, and all tests passed.

Pull Request resolved: pytorch#150162
Approved by: https://github.com/H-Huang
1 parent 7deed19 commit 23a3cef
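For orientation before the diff: the three new collectives expose the same Work-based calling pattern as the existing MPI collectives, which is the pattern FSDP relies on for its flat-parameter all-gather and gradient reduce-scatter. The snippet below is a minimal standalone sketch, not part of this commit; it assumes a USE_MPI build of PyTorch and an MPI launcher such as mpirun -np 4, and mirrors the shapes used in ProcessGroupMPITest.cpp.

```cpp
// Illustrative sketch only (not from the commit): drive the new MPI-backend
// collectives directly, the way FSDP's flat-parameter path would.
#include <ATen/ATen.h>
#include <torch/csrc/distributed/c10d/ProcessGroupMPI.hpp>

int main() {
  auto pg = c10d::ProcessGroupMPI::createProcessGroupMPI();
  const auto worldSize = pg->getSize();

  // _allgather_base: every rank contributes one shard; the flat output holds
  // worldSize shards, so output.numel() == input.numel() * worldSize.
  auto shard = at::ones({16, 16}) * pg->getRank();
  auto gathered = at::zeros({worldSize, 16, 16});
  pg->_allgather_base(gathered, shard)->wait();

  // _reduce_scatter_base: the inverse shape contract; the flat input holds
  // worldSize blocks and each rank receives the element-wise sum of its block.
  auto grads = at::ones({worldSize, 16, 16});
  auto reduced = at::zeros({16, 16});
  pg->_reduce_scatter_base(reduced, grads)->wait();

  return 0;
}
```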

File tree

3 files changed: +220 -5 lines changed


test/cpp/c10d/ProcessGroupMPITest.cpp

Lines changed: 110 additions & 0 deletions
@@ -185,6 +185,113 @@ void testAllgather(int iter = 10000) {
   }
 }
 
+void testAllgatherBase(int iter = 10000) {
+  auto pg = c10d::ProcessGroupMPI::createProcessGroupMPI();
+  std::vector<c10::intrusive_ptr<::c10d::Work>> works;
+
+  // Get the world size
+  auto worldSize = pg->getSize();
+  auto rank = pg->getRank();
+
+  // Generate inputs
+  for (const auto i : c10::irange(iter)) {
+    auto tensor = at::ones({16, 16}) * i * rank;
+    auto output = at::zeros({worldSize, 16, 16});
+
+    // Queue the work.
+    c10::intrusive_ptr<::c10d::Work> work = pg->_allgather_base(output, tensor);
+    works.push_back(std::move(work));
+  }
+
+  auto outputTensors = waitFuture(pg, works);
+
+  // Verify outputs
+  for (const auto i : c10::irange(iter)) {
+    for (const auto j : c10::irange(worldSize)) {
+      const auto expected = i * j;
+      auto data = outputTensors[i][0][j].data_ptr<float>();
+      for (auto k = 0; k < outputTensors[i][0][j].numel(); ++k) {
+        if (data[k] != static_cast<float>(expected)) {
+          TORCH_CHECK(false, "BOOM!");
+        }
+      }
+    }
+  }
+}
+
+void testReduceScatter(int iter = 10000) {
+  auto pg = c10d::ProcessGroupMPI::createProcessGroupMPI();
+  std::vector<c10::intrusive_ptr<::c10d::Work>> works;
+
+  // Get the world size
+  auto worldSize = pg->getSize();
+  auto rank = pg->getRank();
+
+  // Generate inputs
+  int count = 2;
+  for (const auto i : c10::irange(iter)) {
+    auto tensors = std::vector<std::vector<at::Tensor>>(1);
+    tensors[0].resize(worldSize);
+    for (const auto j : c10::irange(worldSize)) {
+      tensors[0][j] = at::ones({count, count}) * i * rank;
+    }
+    auto output = at::zeros({count, count});
+    auto outputs = std::vector<at::Tensor>({output});
+
+    // Queue the work.
+    c10::intrusive_ptr<::c10d::Work> work =
+        pg->reduce_scatter(outputs, tensors);
+    works.push_back(std::move(work));
+  }
+
+  auto outputTensors = waitFuture(pg, works);
+
+  // Verify outputs
+  for (const auto i : c10::irange(iter)) {
+    const auto expected = i * (worldSize * (worldSize - 1)) / 2.0;
+    auto data = outputTensors[i][0].data_ptr<float>();
+    for (auto j = 0; j < outputTensors[i][0].numel(); ++j) {
+      if (data[j] != static_cast<float>(expected)) {
+        TORCH_CHECK(false, "BOOM!");
+      }
+    }
+  }
+}
+
+void testReduceScatterBase(int iter = 10000) {
+  auto pg = c10d::ProcessGroupMPI::createProcessGroupMPI();
+  std::vector<c10::intrusive_ptr<::c10d::Work>> works;
+
+  // Get the world size
+  auto worldSize = pg->getSize();
+  auto rank = pg->getRank();
+
+  // Generate inputs
+  for (const auto i : c10::irange(iter)) {
+    auto tensor = at::ones({worldSize, 16, 16}) * i * rank;
+    auto output = at::zeros({16, 16});
+    auto outputs = std::vector<at::Tensor>({output});
+
+    // Queue the work.
+    c10::intrusive_ptr<::c10d::Work> work =
+        pg->_reduce_scatter_base(output, tensor);
+    works.push_back(std::move(work));
+  }
+
+  auto outputTensors = waitFuture(pg, works);
+
+  // Verify outputs
+  for (const auto i : c10::irange(iter)) {
+    const auto expected = i * (worldSize * (worldSize - 1)) / 2.0;
+    auto data = outputTensors[i][0].data_ptr<float>();
+    for (auto j = 0; j < outputTensors[i][0].numel(); ++j) {
+      if (data[j] != static_cast<float>(expected)) {
+        TORCH_CHECK(false, "BOOM!");
+      }
+    }
+  }
+}
+
 void testGather(int iter = 10000) {
   auto pg = c10d::ProcessGroupMPI::createProcessGroupMPI();
   std::vector<c10::intrusive_ptr<::c10d::Work>> works;
@@ -355,6 +462,9 @@ int main(int argc, char** argv) {
   testBroadcast();
   testReduce();
   testAllgather();
+  testAllgatherBase();
+  testReduceScatter();
+  testReduceScatterBase();
   testGather();
   testScatter();
   testSendRecv(false);
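Why the two reduce-scatter tests expect i * worldSize * (worldSize - 1) / 2: every rank contributes tensors filled with i * rank, and the default reduce op is SUM, so each element of the scattered result is

    i * (0 + 1 + ... + (worldSize - 1)) = i * worldSize * (worldSize - 1) / 2

which is the `expected` value the verification loops compare against.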

torch/csrc/distributed/c10d/ProcessGroupMPI.cpp

Lines changed: 105 additions & 5 deletions
@@ -695,7 +695,47 @@ c10::intrusive_ptr<Work> ProcessGroupMPI::reduce_scatter(
     std::vector<at::Tensor>& outputTensors,
     std::vector<std::vector<at::Tensor>>& inputTensors,
     const ReduceScatterOptions& opts) {
-  TORCH_CHECK(false, "ProcessGroupMPI does not support reduce_scatter");
+  checkSingleTensor(outputTensors);
+  if (inputTensors.size() != 1) {
+    TORCH_CHECK(
+        false,
+        "MPI process group only supports a single "
+        "tensor op");
+  }
+  if (static_cast<size_t>(size_) != inputTensors[0].size()) {
+    TORCH_CHECK(
+        false,
+        "Reduce scatter: number of input tensors should equal "
+        "to the world size");
+  }
+  checkSameSizeAndType(outputTensors[0], inputTensors[0]);
+
+  std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<WorkEntry>& entry) {
+        auto data = (entry->dst)[0];
+        auto flatInputTensor = newLikeFlat(entry->src);
+        for (const auto i : c10::irange(entry->src.size())) {
+          flatInputTensor[static_cast<int64_t>(i)].copy_(entry->src[i]);
+        }
+        int recvcount = flatInputTensor.numel() / size_;
+
+        c10::DeviceGuard guard(data.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Reduce_scatter_block(
+            flatInputTensor.data_ptr(),
+            data.data_ptr(),
+            recvcount,
+            mpiDatatype.at(data.scalar_type()),
+            mpiOp.at(opts.reduceOp),
+            pgComm_));
+      };
+
+  auto entry = std::make_unique<WorkEntry>(
+      &inputTensors[0], &outputTensors, std::move(runFunc));
+  return enqueue(
+      std::move(entry),
+      "mpi:reduce_scatter",
+      std::optional<std::vector<at::Tensor>>(inputTensors[0]));
 }
 
 c10::intrusive_ptr<Work> ProcessGroupMPI::alltoall_base(
@@ -941,10 +981,70 @@ c10::intrusive_ptr<Work> ProcessGroupMPI::barrier(const BarrierOptions& opts) {
 }
 
 c10::intrusive_ptr<Work> ProcessGroupMPI::_allgather_base(
-    at::Tensor& /*unused */,
-    at::Tensor& /*unused */,
-    const AllgatherOptions& /*unused */) {
-  TORCH_CHECK(false, "no support for _allgather_base in MPI process group");
+    at::Tensor& outputTensor,
+    at::Tensor& inputTensor,
+    const AllgatherOptions& opts) {
+  TORCH_CHECK(
+      outputTensor.numel() == inputTensor.numel() * size_,
+      "All gather: output tensor size must be equal to input tensor size times the world size");
+
+  std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
+      [this](std::unique_ptr<WorkEntry>& entry) {
+        auto dstdata = (entry->dst)[0];
+        auto srcdata = (entry->src)[0];
+        c10::DeviceGuard guard(srcdata.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Allgather(
+            srcdata.data_ptr(),
+            srcdata.numel(),
+            mpiDatatype.at(srcdata.scalar_type()),
+            dstdata.data_ptr(),
+            srcdata.numel(),
+            mpiDatatype.at(dstdata.scalar_type()),
+            pgComm_));
+      };
+
+  auto inputTensors = std::vector<at::Tensor>({inputTensor});
+  auto outputTensors = std::vector<at::Tensor>({outputTensor});
+  auto entry = std::make_unique<WorkEntry>(
+      &inputTensors, &outputTensors, std::move(runFunc));
+  return enqueue(
+      std::move(entry),
+      "mpi:_allgather_base",
+      std::optional<std::vector<at::Tensor>>(inputTensors));
+}
+
+c10::intrusive_ptr<Work> ProcessGroupMPI::_reduce_scatter_base(
+    at::Tensor& outputTensor,
+    at::Tensor& inputTensor,
+    const ReduceScatterOptions& opts) {
+  TORCH_CHECK(
+      outputTensor.numel() * size_ == inputTensor.numel(),
+      "Reduce scatter: input tensor size must be equal to output tensor size times the world size");
+
+  std::function<void(std::unique_ptr<WorkEntry>&)> runFunc =
+      [opts, this](std::unique_ptr<WorkEntry>& entry) {
+        auto dstdata = (entry->dst)[0];
+        auto srcdata = (entry->src)[0];
+        c10::DeviceGuard guard(srcdata.device());
+        std::unique_lock<std::mutex> globalLock(pgGlobalMutex_);
+        MPI_CHECK(MPI_Reduce_scatter_block(
+            srcdata.data_ptr(),
+            dstdata.data_ptr(),
+            dstdata.numel(),
+            mpiDatatype.at(srcdata.scalar_type()),
+            mpiOp.at(opts.reduceOp),
+            pgComm_));
+      };
+
+  auto inputTensors = std::vector<at::Tensor>({inputTensor});
+  auto outputTensors = std::vector<at::Tensor>({outputTensor});
+  auto entry = std::make_unique<WorkEntry>(
+      &inputTensors, &outputTensors, std::move(runFunc));
+  return enqueue(
+      std::move(entry),
+      "mpi:_reduce_scatter_base",
+      std::optional<std::vector<at::Tensor>>(inputTensors));
 }
 
 } // namespace c10d
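Both new reduction paths bottom out in MPI_Reduce_scatter_block, which requires equal-sized per-rank blocks; that is why the vector reduce_scatter above first flattens its worldSize input tensors into one contiguous buffer and passes recvcount = flatInputTensor.numel() / size_. For readers unfamiliar with that MPI primitive, here is a standalone plain-MPI sketch (illustrative only, not from this commit) with shapes chosen to mirror the diff.

```cpp
// Plain-MPI illustration of MPI_Reduce_scatter_block (not from the commit):
// each rank supplies worldSize blocks of `recvcount` floats; the blocks are
// summed element-wise across ranks and block r is delivered to rank r.
#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0, worldSize = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &worldSize);

  const int recvcount = 16 * 16;  // elements each rank receives
  std::vector<float> sendbuf(static_cast<size_t>(worldSize) * recvcount,
                             static_cast<float>(rank));
  std::vector<float> recvbuf(recvcount, 0.0f);

  MPI_Reduce_scatter_block(
      sendbuf.data(), recvbuf.data(), recvcount, MPI_FLOAT, MPI_SUM,
      MPI_COMM_WORLD);
  // Now recvbuf[k] == 0 + 1 + ... + (worldSize - 1) for every k.

  MPI_Finalize();
  return 0;
}
```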

torch/csrc/distributed/c10d/ProcessGroupMPI.hpp

Lines changed: 5 additions & 0 deletions
@@ -197,6 +197,11 @@ class TORCH_API ProcessGroupMPI : public Backend {
       std::vector<std::vector<at::Tensor>>& inputTensors,
       const ReduceScatterOptions& opts = ReduceScatterOptions()) override;
 
+  c10::intrusive_ptr<Work> _reduce_scatter_base(
+      at::Tensor& outputTensor,
+      at::Tensor& inputTensor,
+      const ReduceScatterOptions& opts = ReduceScatterOptions()) override;
+
   c10::intrusive_ptr<Work> alltoall_base(
       at::Tensor& outputTensor,
       at::Tensor& inputTensor,
