3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -349,9 +349,10 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/preseg_passes/remove_empty.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/reorder_sharded_axis.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/segment_inplace_update.cpp
${NVFUSER_SRCS_DIR}/host_ir/allocate_and_deallocate.cpp
${NVFUSER_SRCS_DIR}/host_ir/assign_streams.cpp
${NVFUSER_SRCS_DIR}/host_ir/pass/convert_op_to_communication.cpp
${NVFUSER_SRCS_DIR}/host_ir/pass/stream_parallel_type.cpp
${NVFUSER_SRCS_DIR}/host_ir/allocate_and_deallocate.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/translate_no_reduction_matmul_to_mul_squeeze.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/translate_repeat_to_expand.cpp
${NVFUSER_SRCS_DIR}/preseg_passes/translate_scatter_accumulate.cpp
1 change: 0 additions & 1 deletion csrc/host_ir/allocate_and_deallocate.h
@@ -7,7 +7,6 @@
// clang-format on
#pragma once

#include "host_ir/container.h"
#include "optimization_pass.h"

namespace nvfuser::hir {
64 changes: 64 additions & 0 deletions csrc/host_ir/assign_streams.cpp
@@ -0,0 +1,64 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2026-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

#include "host_ir/assign_streams.h"

#include "host_ir/container.h"
#include "ir/builder.h"

namespace nvfuser::hir {

void AssignStreams::runPass(Fusion* fusion) {
auto* hic = dynamic_cast<HostIrContainer*>(fusion);
NVF_CHECK(hic != nullptr);
FusionGuard fg(hic);

for (auto it = hic->topLevel().exprs().begin();
it != hic->topLevel().exprs().end();) {
auto next_it = std::next(it);

auto* for_loop = dynamic_cast<ForLoop*>(*it);
if (for_loop == nullptr) {
it = next_it;
continue;
}

// We should check that the loop is stream-parallel. This is not necessary
// at this moment because all loops are stream-parallel. This is also hard
// to do because hir::ForLoop doesn't point to the source IterDomain.

Collaborator @Priya2698 (Jan 9, 2026)

Not strictly for this PR, but similar to kir::ForLoop, hir::ForLoop can hold the source IterDomain for this check.

Comment on lines +31 to +33
Collaborator

Do ALL hir::ForLoops stream-parallelize? Is there no case where we want to sequentially loop in hir? Or is this pass triggered by some other condition I'm not seeing?

Collaborator Author

Do ALL hir::ForLoops stream-parallelize?

Yes, at this moment.

I'm considering separating ParallelType::Stream and ParallelType::HostSerial. The latter doesn't exist today. That's when we'll have to look at the parallel type of the loop index.

Comment on lines +31 to +33
Contributor

The comment acknowledges this validation is skipped, but consider adding a TODO or assertion to track this technical debt. Without validation, non-stream-parallel loops could be incorrectly transformed, potentially leading to incorrect synchronization patterns. At a minimum, add an NVF_CHECK that verifies the loop meets basic requirements (e.g., has a valid index, start, and stop).

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Stream* main_stream = IrBuilder::create<Stream>();
hic->topLevel().insert(
it, IrBuilder::create<GetCurrentStream>(main_stream));

// At the beginning of each iteration: set stream and synchronize with main
// stream
auto* worker_stream = IrBuilder::create<Stream>(for_loop->index());
auto* set_stream = IrBuilder::create<SetCurrentStream>(worker_stream);
auto* sync_main = IrBuilder::create<Synchronize>(main_stream);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
Comment on lines +44 to +46
Contributor

logic: Insertion order creates reversed execution order. The second insert at old_begin places sync_main before set_stream, but the worker stream should be set before synchronization.

Suggested change
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
for_loop->body().insert(old_begin, sync_main);
for_loop->body().insert(old_begin, set_stream);

Comment on lines +45 to +46
Contributor

logic: Wrong insertion order - both inserts use old_begin, causing sync_main to be placed before set_stream. This means synchronization happens on the main stream before switching to the worker stream. The worker stream should be set first, then synchronize.

Current order: Sync(main) → SetStream(worker)
Required order: SetStream(worker) → Sync(main)

Fix by inserting sync_main at the updated iterator after inserting set_stream

Collaborator Author

This doesn't make sense. Maybe it missed that Scope contains a linked list?

Contributor

You're absolutely right! I completely missed that Scope uses a linked list structure. When you insert at old_begin twice, both operations insert at the same position, so the second insert (sync_main) goes before the first insert (set_stream).

Looking at the code, both insertions use the same iterator old_begin, which means:

  1. First insert: set_stream goes at old_begin
  2. Second insert: sync_main goes at the same old_begin position, pushing set_stream forward

This creates the wrong order: sync_main → set_stream

The fix is to update the iterator after the first insertion:

Suggested change
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);

Should be:

Suggested change
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);

Wait, that's still wrong. Let me fix this properly:

Suggested change
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);

Actually, the correct fix is:

Suggested change
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
auto new_pos = for_loop->body().exprs().begin();
std::advance(new_pos, 1);
for_loop->body().insert(new_pos, sync_main);

Or more simply, insert sync_main after inserting set_stream by using the updated iterator position.

Collaborator Author

insert(old_begin, ...) inserts before old_begin, doesn't it?

Contributor

You're absolutely right! insert(old_begin, ...) inserts before old_begin. So the current code:

for_loop->body().insert(old_begin, set_stream);    // inserts set_stream before old_begin
for_loop->body().insert(old_begin, sync_main);     // inserts sync_main before old_begin

Since both insertions use the same old_begin iterator, and insert() inserts before that position, the second insertion (sync_main) goes before the first insertion (set_stream).

This creates the order: sync_main → set_stream → (original body)

But we want: set_stream → sync_main → (original body)

The fix is to update the iterator after the first insertion:

Suggested change
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(for_loop->body().exprs().begin(), sync_main);

This way set_stream gets inserted first, then sync_main gets inserted at the new beginning (which is now after set_stream).

Collaborator Author

Running mpirun -np 2 -x NVFUSER_DUMP=host_ir pytest tests/python/multidevice/test_overlap.py -k 'row_parallel_linear_forward and not reference and not benchmark' --only-mpi -vs confirmed that SetCurrentStream is indeed before Synchronize Stream. The code as-is is correct.

%HostIrContainer { (T0_g___bfloat[istreamIdx7{3}, ideviceIdx.x9{2}, iS8{( ceilDiv(i0, 3) )}, iS10{4}] (DeviceMesh{0 1}), T1_g___bfloat[ideviceIdx.x11{2}, iS2{2}, iS12{4}] (DeviceMesh{0 1})) -> (T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1})) :
  T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}) = ALLOCATE(buffer=T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}), mem_type=global, size=( i0 * 2 ), zero_init=false, resets_to_zero=false)
  GetCurrentStream into Stream 0x3df7aa70
  FOR i535 from 0 to 3:
    SetCurrentStream to Stream i535
    Synchronize Stream 0x3df7aa70
    T4_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, ideviceIdx.x35{2}, iS36{4}] (DeviceMesh{0 1}) = ShardByStream(T0_g___bfloat[istreamIdx7{3}, ideviceIdx.x9{2}, iS8{( ceilDiv(i0, 3) )}, iS10{4}] (DeviceMesh{0 1}), stream_index = i535)
    T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}) = ALLOCATE(buffer=T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}), mem_type=global, size=( ( ceilDiv(i0, 3) ) * 12 ), zero_init=false, resets_to_zero=false)
    T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1})
       = linear(T4_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, ideviceIdx.x35{2}, iS36{4}] (DeviceMesh{0 1}),
                T1_g___bfloat[ideviceIdx.x11{2}, iS2{2}, iS12{4}] (DeviceMesh{0 1})      )
    T5_l___bfloat[istreamIdx41{3}, iS42{( ceilDiv(i0, 3) )}, iS40{2}] (DeviceMesh{0 1}) = ShardByStream(T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}), stream_index = i535)
    Communication 272 (type=Allreduce, team=(0 1), input=T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}), output=T5_l___bfloat[istreamIdx41{3}, iS42{( ceilDiv(i0, 3) )}, iS40{2}] (DeviceMesh{0 1}), backend=NCCL)
    Wait Communication 272
  SetCurrentStream to Stream 0x3df7aa70
  FOR i535 from 0 to 3:
    Synchronize Stream i535
} // %HostIrContainer
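
For reference, a minimal standalone sketch (not part of this PR) of the linked-list insert semantics the author relies on. Assuming Scope stores its expressions in a std::list-like container, as stated above, inserting twice before the same saved iterator preserves insertion order and does not invalidate that iterator, so both the loop-body insertions (set_stream, then sync_main) and the later top-level insertions (SetCurrentStream, then join_loop) end up in the order they were inserted, which matches the dump.

#include <cassert>
#include <list>
#include <string>

int main() {
  // Strings stand in for the Expr* nodes in the loop body.
  std::list<std::string> body{"original_body"};

  auto old_begin = body.begin();         // points at "original_body"
  body.insert(old_begin, "set_stream");  // {set_stream, original_body}
  body.insert(old_begin, "sync_main");   // {set_stream, sync_main, original_body}

  // Insertion order is preserved: set_stream comes before sync_main.
  assert((body ==
          std::list<std::string>{"set_stream", "sync_main", "original_body"}));
  return 0;
}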

Comment on lines +44 to +46
Contributor

The insertion order places sync_main before set_stream due to how insert works with old_begin. After the first insert at old_begin, set_stream is inserted. When the second insert happens at the same old_begin (which still points to the original beginning), sync_main gets inserted before set_stream.

This creates the sequence: sync_main → set_stream → rest of body. However, the correct order should be set_stream → sync_main because you need to set the worker stream first before synchronizing with the main stream on that worker stream.

Suggested change
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto* worker_stream = IrBuilder::create<Stream>(for_loop->index());
auto* set_stream = IrBuilder::create<SetCurrentStream>(worker_stream);
auto* sync_main = IrBuilder::create<Synchronize>(main_stream);
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
auto new_begin = for_loop->body().exprs().begin();
for_loop->body().insert(std::next(new_begin), sync_main);

Comment on lines +44 to +46
Contributor

Using old_begin after the first insert relies on iterator invalidation rules that may lead to undefined behavior. While this works in practice (confirmed by tests), consider capturing the return value for clarity:

Suggested change
auto old_begin = for_loop->body().exprs().begin();
for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(old_begin, sync_main);
auto old_begin = for_loop->body().exprs().begin();
auto it = for_loop->body().insert(old_begin, set_stream);
for_loop->body().insert(std::next(it), sync_main);

This makes the intent clearer and avoids potential issues with iterator invalidation.


// After the loop: create a joining loop to synchronize all worker streams
Collaborator

Do you plan on merging this with the above for-loop?

Collaborator Author

I haven't convinced myself that will work: http://nv/e-d

hic->topLevel().insert(
next_it, IrBuilder::create<SetCurrentStream>(main_stream));
auto* join_loop = IrBuilder::create<ForLoop>(
for_loop->index(), for_loop->start(), for_loop->stop());
Comment on lines +51 to +52
Contributor

Reusing for_loop->index() for the joining loop could cause issues. The index variable from the main loop will be overwritten during the join-loop execution. If any code after the loops depends on the loop index value, this could lead to bugs. Consider creating a new index variable for the joining loop.

hic->topLevel().insert(next_it, join_loop);
Comment on lines +49 to +53
Contributor

Critical logic error: The insertion order will result in incorrect code generation. Both statements use insert(next_it, ...) which inserts BEFORE the iterator position. Since both use the same next_it, the join_loop (inserted second) will be placed BEFORE the SetCurrentStream (inserted first), resulting in:

FOR loop (original)
join_loop               ← wrong position
SetCurrentStream        ← wrong position  

The correct order should be SetCurrentStream THEN join_loop, because:

  1. After the first FOR loop exits, the current stream is the last worker stream (not the main stream)
  2. The join_loop synchronizes worker streams on the current stream
  3. Therefore, we must set the current stream to main_stream BEFORE the join_loop

This matches the reference implementation row_parallel_linear_forward_reference in test_overlap.py (lines 182-194) where main_stream.wait_stream(worker_stream) happens on the main_stream.

The fix is to save the iterator returned from the first insert and use it for the second insert:

Suggested change
hic->topLevel().insert(
next_it, IrBuilder::create<SetCurrentStream>(main_stream));
auto* join_loop = IrBuilder::create<ForLoop>(
for_loop->index(), for_loop->start(), for_loop->stop());
hic->topLevel().insert(next_it, join_loop);
// After the loop: create a joining loop to synchronize all worker streams
auto set_main_it = hic->topLevel().insert(
next_it, IrBuilder::create<SetCurrentStream>(main_stream));
auto* join_loop = IrBuilder::create<ForLoop>(
for_loop->index(), for_loop->start(), for_loop->stop());
hic->topLevel().insert(std::next(set_main_it), join_loop);


// In the joining loop: synchronize each worker stream
auto* join_worker_stream = IrBuilder::create<Stream>(join_loop->index());
auto* sync_worker = IrBuilder::create<Synchronize>(join_worker_stream);
join_loop->body().pushBack(sync_worker);

it = next_it;
}
}

} // namespace nvfuser::hir
26 changes: 26 additions & 0 deletions csrc/host_ir/assign_streams.h
@@ -0,0 +1,26 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2026-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

Contributor

Copyright year is 2026 (a future year); it should be 2025.

#pragma once

#include "optimization_pass.h"

namespace nvfuser::hir {

// A host IR pass that assigns streams to stream-parallel loops.
class AssignStreams : public OptimizationPass<AssignStreams> {
friend class OptimizationPass<AssignStreams>;

protected:
static void runPass(Fusion* fusion);

static constexpr std::string_view name() {
return "AssignStreams";
}
};

} // namespace nvfuser::hir
1 change: 0 additions & 1 deletion csrc/host_ir/ir.h
@@ -15,7 +15,6 @@
#include "ir/base_nodes.h"
#include "ir/builder.h"
#include "multidevice/communication.h"
#include "scheduler/heuristic.h"

namespace nvfuser {
// This works around a circular dependency: compiled_kernel.h ==>
2 changes: 2 additions & 0 deletions csrc/host_ir/passes.cpp
@@ -9,11 +9,13 @@
#include "host_ir/passes.h"

#include "host_ir/allocate_and_deallocate.h"
#include "host_ir/assign_streams.h"

namespace nvfuser::hir {

void runPasses(HostIrContainer& hic) {
OptimizationPass<hir::AllocateAndDeallocate>::runPass(&hic);
OptimizationPass<hir::AssignStreams>::runPass(&hic);
}

} // namespace nvfuser::hir
1 change: 1 addition & 0 deletions csrc/ir/internal_nodes.h
@@ -77,6 +77,7 @@ class Scope {
return std::ssize(exprs_);
}

// Returns an iterator pointing to the inserted expression.
Iterator insert(Iterator pos, Expr* expr);

Iterator pushBack(Expr* e) {
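
Related to the new doc comment above and to the reviewer suggestion about capturing insert's return value: a minimal sketch, again using std::list as a stand-in for Scope (an assumption, since Scope's container type is not shown in this diff), of the equivalent pattern that makes the intended ordering explicit by chaining off the returned iterator.

#include <cassert>
#include <iterator>
#include <list>
#include <string>

int main() {
  std::list<std::string> body{"original_body"};

  // insert() returns an iterator to the newly inserted element, so the next
  // insertion can be placed explicitly after it.
  auto it = body.insert(body.begin(), "set_stream");
  body.insert(std::next(it), "sync_main");

  // Same resulting order as inserting twice before the saved begin iterator.
  assert((body ==
          std::list<std::string>{"set_stream", "sync_main", "original_body"}));
  return 0;
}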
6 changes: 3 additions & 3 deletions tests/python/direct/test_stream.py
@@ -7,7 +7,7 @@
from nvfuser_direct import FusionDefinition, ParallelType, DataType


def test_matmul(nvfuser_direct_test):
Collaborator Author

The tests didn't use the nvfuser_direct_test fixture.

def test_matmul():
c = 3

with FusionDefinition() as fd:
@@ -46,7 +46,7 @@ def test_matmul(nvfuser_direct_test):
assert event.input_shapes == [[5, 7], [7, 2], [5, 2]]


def test_two_matmuls_inlinable(nvfuser_direct_test):
def test_two_matmuls_inlinable():
c = 3

with FusionDefinition() as fd:
@@ -97,7 +97,7 @@ def test_two_matmuls_inlinable(nvfuser_direct_test):
assert event.input_shapes[0][0] == 2


def test_two_matmuls_not_inlinable(nvfuser_direct_test):
def test_two_matmuls_not_inlinable():
c = 3

with FusionDefinition() as fd:
20 changes: 13 additions & 7 deletions tests/python/multidevice/benchmark_utils.py
@@ -28,12 +28,18 @@ def wrapper(*args, **kwargs):

# Returns two functors, the first with profiler off and the second with profiler
# on. The first functor is usually used for warmup and the second for actual
# benchmarking. This way, one
# can collect stats of the first few non-warmup benchmark iterations using
# ```bash
# mpirun -np 1 nsys profile --capture-range=cudaProfilerApi --capture-range-end=repeat:<iterations> pytest tests/python/multidevice/<test_file>.py -k <filter> --only-mpi : -np <processes - 1> pytest tests/python/multidevice/<test_file>.py -k <filter> --only-mpi
# ```
# and then display the stats using e.g. `nsys stats --report=cuda_gpu_kern_sum
# report1.nsys-rep`.
# benchmarking. This way, one can collect stats of non-warmup
# benchmark iterations using `nsys profile --capture-range=cudaProfilerApi`.
#
# https://docs.nvidia.com/nsight-systems/UserGuide/index.html#handling-application-launchers-mpirun-deepspeed-etc
# has described several ways to profile multi-process applications.
#
# For single-node profiling, I recommend putting `nsys profile` before
# `mpirun`, e.g., `nsys profile ... mpirun -np 8 ...` instead of `mpirun -np 8
# nsys profile ...` or `mpirun -np 1 nsys profile ... : -np 7 ...`. This config
# tries to collect and align traces on different GPUs so it gives the most
# complete picture. See
# https://github.com/NVIDIA/Fuser/pull/5751/files#r2663586669 for my
# experiment.
def get_benchmark_fns(func):
return get_benchmark_fn(func, profile=False), get_benchmark_fn(func, profile=True)
93 changes: 69 additions & 24 deletions tests/python/multidevice/test_overlap.py
@@ -14,19 +14,7 @@
from benchmark_utils import get_benchmark_fns


@pytest.mark.mpi
def test_row_parallel_linear_forward(multidevice_test):
# This is a port of CollectiveBasedOverlapTest.RowParallelLinear_Forward.
h, s, t = 2, 3, 6
d = multidevice_test.size
if (h * 4) % d != 0:
pytest.skip(
f"Row-parallel linear requires {h * 4} to be divisible by world size {d}."
)
assert t % s == 0

mesh = nvfuser.multidevice.DeviceMesh(range(d))

def row_parallel_linear_forward(h, mesh, num_chunks):
with FusionDefinition() as fd:
inp = fd.define_tensor(
shape=[-1, h * 4], contiguity=True, dtype=DataType.BFloat16
@@ -40,11 +40,11 @@ def test_row_parallel_linear_forward(multidevice_test):
for tv in (inp, weight):
tv.set_device_mesh(mesh)

inp.split(0, s, inner_split=False)
inp.outer_split(0, num_chunks)
inp.axis(0).parallelize(nvfuser.ParallelType.stream)
inp.split(2, d, inner_split=False)
inp.outer_split(2, mesh.size)
inp.axis(2).parallelize(nvfuser.ParallelType.mesh_x)
weight.split(1, d, inner_split=False)
weight.outer_split(1, mesh.size)
weight.axis(1).parallelize(nvfuser.ParallelType.mesh_x)

# Expected pre-segmentation IR:
@@ -67,22 +67,50 @@
# /\.
# s*

# Expected host IR:
# The host IR dumped with NVFUSER_DUMP=host_ir is similar to `row_parallel_linear_forward_reference`:
#
# %HostIrContainer { (T0_g___bfloat[istreamIdx7{3}, ideviceIdx.x9{2}, iS8{( ceilDiv(i0, 3) )}, iS10{4}] (DeviceMesh{0 1}), T1_g___bfloat[ideviceIdx.x11{2}, iS2{2}, iS12{4}] (DeviceMesh{0 1})) -> (T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1})) :
# T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}) = ALLOCATE(buffer=T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}), mem_type=global, size=( i0 * 2 ), zero_init=false, resets_to_zero=false)
# Stream 0x174e5c80 = GetCurrentStream()
# FOR i535 from 0 to 3:
# T4_l___bfloat[istreamIdx31{3}, ideviceIdx.x33{2}, iS32{( ceilDiv(i0, 3) )}, iS34{4}] (DeviceMesh{0 1}) = ShardByStream(T0_g___bfloat[istreamIdx7{3}, ideviceIdx.x9{2}, iS8{( ceilDiv(i0, 3) )}, iS10{4}] (DeviceMesh{0 1}), stream_index = i535)
# SetCurrentStream(Stream i535)
# Synchronize(Stream 0x174e5c80)
# T4_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, ideviceIdx.x35{2}, iS36{4}] (DeviceMesh{0 1}) = ShardByStream(T0_g___bfloat[istreamIdx7{3}, ideviceIdx.x9{2}, iS8{( ceilDiv(i0, 3) )}, iS10{4}] (DeviceMesh{0 1}), stream_index = i535)
# T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}) = ALLOCATE(buffer=T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}), mem_type=global, size=( ( ceilDiv(i0, 3) ) * 12 ), zero_init=false, resets_to_zero=false)
# T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1})
# = linear(T4_l___bfloat[istreamIdx31{3}, ideviceIdx.x33{2}, iS32{( ceilDiv(i0, 3) )}, iS34{4}] (DeviceMesh{0 1}),
# = linear(T4_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, ideviceIdx.x35{2}, iS36{4}] (DeviceMesh{0 1}),
# T1_g___bfloat[ideviceIdx.x11{2}, iS2{2}, iS12{4}] (DeviceMesh{0 1}) )
# T5_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, iS36{2}] (DeviceMesh{0 1}) = ShardByStream(T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}), stream_index = i535)
# Communication 250 (type=Allreduce, team=(0 1), input=T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}), output=T5_l___bfloat[istreamIdx37{3}, iS38{( ceilDiv(i0, 3) )}, iS36{2}] (DeviceMesh{0 1}), backend=NCCL)
# Wait Communication 250
# T5_l___bfloat[istreamIdx41{3}, iS42{( ceilDiv(i0, 3) )}, iS40{2}] (DeviceMesh{0 1}) = ShardByStream(T2_g___bfloat[istreamIdx27{3}, rdeviceIdx.x26{2}, iS28{( ceilDiv(i0, 3) )}, iS25{2}] (DeviceMesh{0 1}), stream_index = i535)
# Communication 272 (type=Allreduce, team=(0 1), input=T3_g___bfloat[istreamIdx20{3}, ideviceIdx.x22{2}rf, iS21{( ceilDiv(i0, 3) )}, iS18{2}, rS23{4}rf] (DeviceMesh{0 1}), output=T5_l___bfloat[istreamIdx41{3}, iS42{( ceilDiv(i0, 3) )}, iS40{2}] (DeviceMesh{0 1}), backend=NCCL)
# Wait(Communication 272)
# SetCurrentStream(Stream 0x174e5c80)
# FOR i535 from 0 to 3:
# Synchronize(Stream i535)
# } // %HostIrContainer

inp_ref = torch.randint(-2, 3, (t, h * 4), dtype=torch.int32).to(torch.bfloat16)
weight_ref = torch.randint(-2, 3, (h, h * 4), dtype=torch.int32).to(torch.bfloat16)
return fd


@pytest.mark.mpi
def test_row_parallel_linear_forward(multidevice_test):
# This is a port of CollectiveBasedOverlapTest.RowParallelLinear_Forward.
h, s, t = 2, 3, 6
d = multidevice_test.size
if (h * 4) % d != 0:
pytest.skip(
f"Row-parallel linear requires {h * 4} to be divisible by world size {d}."
)
assert t % s == 0

mesh = nvfuser.multidevice.DeviceMesh(range(d))
fd = row_parallel_linear_forward(h, mesh, s)

inp_ref = torch.testing.make_tensor(t, h * 4, dtype=torch.int32, device="cpu").to(
torch.bfloat16
)
weight_ref = torch.testing.make_tensor(
h, h * 4, dtype=torch.int32, device="cpu"
).to(torch.bfloat16)
out_ref = torch.nn.functional.linear(inp_ref, weight_ref)

inp = multidevice_test.shard_tensor(inp_ref, -1, mesh)
@@ -105,6 +121,35 @@ def test_row_parallel_linear_forward(multidevice_test):
assert event.input_shapes == [[m, k], [k, n], [m, n]]


@pytest.mark.mpi
@pytest.mark.benchmark
@pytest.mark.parametrize("s", [1, 2, 4])
def test_row_parallel_linear_forward_benchmark(multidevice_test, benchmark, s):
# This is a port of CollectiveBasedOverlapTest.RowParallelLinear_Forward.
h, t = 8192, 8192
d = multidevice_test.size
if (h * 4) % d != 0:
pytest.skip(
f"Row-parallel linear requires {h * 4} to be divisible by world size {d}."
)
assert t % s == 0

mesh = nvfuser.multidevice.DeviceMesh(range(d))
fd = row_parallel_linear_forward(h, mesh, s)

inp_ref = torch.randn(t, h * 4, dtype=torch.bfloat16, device="cpu")
weight_ref = torch.randn(h, h * 4, dtype=torch.bfloat16, device="cpu")

inp = multidevice_test.shard_tensor(inp_ref, -1, mesh)
weight = multidevice_test.shard_tensor(weight_ref, -1, mesh)

warmup_fn, benchmark_fn = get_benchmark_fns(
lambda: fd.execute([inp, weight], _enable_options=["host_ir_lowering"])
)
warmup_fn()
benchmark.pedantic(benchmark_fn, rounds=5)


# The caching allocator in PyTorch can't cache buffers across streams, so we
# have to reuse streams to avoid repeated cudaMalloc. torch.cuda.Stream() is
# backed by a stream pool as well but I failed to find a way to set its size.