Skip to content

Commit 1c7afa4

Browse files
authored
Fix bugs in MSCCL XML generation (Azure#17)
1. Rename reduce_scatter to reducescatter in XML generation to match the naming style of the other collectives and the executor-side implementation. 2. Avoid a valid thread-block channel being overwritten with 0.
1 parent ea828bf commit 1c7afa4

File tree

4 files changed

+47
-2
lines changed

4 files changed

+47
-2
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
import argparse
5+
from msccl.language import *
6+
from msccl.topologies import *
7+
from msccl.language.collectives import ReduceScatter
8+
9+
def allreduce_allpairs(gpus, protocol):
    """Generate an all-pairs ReduceScatter MSCCL program and emit its XML.

    NOTE(review): despite the name, this builds a ReduceScatter (not an
    AllReduce) — the name is kept for backward compatibility with callers.

    Args:
        gpus: number of GPUs (ranks); also used as the chunk factor.
        protocol: transport protocol name ('Simple', 'LL128', or 'LL').

    Side effects: prints the generated MSCCL XML and runs the correctness
    check inside the program context.
    """
    size = gpus
    topology = fully_connected(size)
    collective = ReduceScatter(gpus, gpus, True)
    with MSCCLProgram("reducescatter_pairs", topology, collective, 1, protocol=protocol,
        threadblock_policy=ThreadblockPolicy.manual, dependence_nop=True):

        # Each rank sends the nth chunk to the nth rank into scratch space
        for r1 in range(size):
            for r2 in range(size):
                if r1 != r2:
                    index = r2 * size
                    c = chunk(r1, Buffer.input, index, size=size)
                    c.copy(r2, 'scratch', sendtb=r2, recvtb=r1)

        # Each rank performs a local reduction on its own chunk.
        # Use `size` threadblocks (one per peer slot) for better parallelism.
        for r in range(size):
            for index in range(size * (size - 1)):
                c = chunk(r, Buffer.input, r * size + (index % size))
                c.reduce(chunk(r, 'scratch', index), sendtb=(index % size))

        XML()
        Check()
33+
34+
# Script entry point: parse CLI arguments and generate the program.
# Guarded so that importing this module does not trigger argument parsing.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('num_gpus', type=int, help='number of gpus')
    parser.add_argument('--protocol', type=str, default='LL',
                        choices=['Simple', 'LL128', 'LL'], help='Protocol')

    args = parser.parse_args()

    allreduce_allpairs(args.num_gpus, args.protocol)

msccl/language/collectives.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def get_buffer_index(self, rank, buffer, index):
174174
class ReduceScatter(Collective):
175175
def __init__(self, num_ranks, chunk_factor, inplace):
176176
Collective.__init__(self, num_ranks, chunk_factor, inplace)
177-
self.name = "reduce_scatter"
177+
self.name = "reducescatter"
178178

179179
def init_buffers(self):
180180
rank_buffers = []

msccl/language/tb_assignment.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ def manual_assign_tbs(rank_dag):
3030
tb = rank_dag.tbs[rank][tbid]
3131
if _verify_tb_op_compatible(tb, op):
3232
tb.ops.append(op)
33-
tb.channel = op.channel if op.channel != -1 else 0
33+
if tb.channel == -1:
34+
tb.channel = op.channel if op.channel != -1 else 0
3435
tb.send = op.dst.rank if op.is_send() else tb.send
3536
tb.recv = op.src.rank if op.is_recv() else tb.recv
3637
op.step = len(tb.ops)-1

tests/configs/test-config.json

+4
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@
6363
"filename": "pipeline_a100_ring.py",
6464
"args": ["8", "4", "2"]
6565
},
66+
{
67+
"filename": "reducescatter_allpairs.py",
68+
"args": ["8"]
69+
},
6670
{
6771
"filename": "mscclpp/allreduce_a100_allpairs_packet_mscclpp.py",
6872
"args": ["8", "8"]

0 commit comments

Comments
 (0)