Commit 7eb3ff7

Supporting New Packet Kernel Operations at Executor (#677)
This PR introduces three new operations to enhance flexibility and performance at the executor. One operation can be invoked directly via the DSL API, and two are created through fusion of existing operations, reducing overhead and improving efficiency.

1. Port Channel Put Packet (direct DSL API call): Sends data in packet format to the remote side, also in packet format, via the port channel. Both the source and destination buffers must be scratch buffers.

2. Reduce Copy Packet (fusion): Reduce Packet + Copy Packet = Reduce Copy Packet. Triggered when the destination buffer of Reduce Packet matches the source buffer of Copy Packet. Purpose: combine reduction and copy into a single step for better performance.

3. Reduce Copy Send Packet (fusion):
   - Reduce Copy Packet + Put Packet = Reduce Copy Send Packet (when the destination buffer of Reduce Copy Packet matches the source buffer of Put Packet).
   - Reduce Copy Packet + Read Put Packet = Reduce Copy Send Packet (when the destination packet buffer of Reduce Copy Packet matches the source buffer of Read Put Packet).
   Purpose: combine reduction, copy, and send into one optimized pipeline.

Fusion diagram:
Reduce Packet + Copy Packet → Reduce Copy Packet
Reduce Copy Packet + Put Packet → Reduce Copy Send Packet
Reduce Copy Packet + Read Put Packet → Reduce Copy Send Packet

Beyond this, the PR adjusts the AllReduce 2-node algorithm. Measured latency by message size:

Message Size | Latency (µs)
1K           | 15.34
2K           | 15.88
4K           | 15.71
8K           | 16.01
16K          | 15.88
32K          | 16.21
64K          | 16.90
128K         | 18.24
256K         | 20.39
512K         | 25.26
1M           | 32.74
2M           | 53.64
1 parent eb20278 commit 7eb3ff7
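
For orientation, here is a minimal, hypothetical DSL snippet in the style of the docstring examples further below; program, buffer, and channel setup are omitted, and port_channel, memory_channel, rank, input_buffer, local_scratch, remote_scratch, peer_data, tbg, and n are placeholder names rather than anything defined in this PR. It shows the new port-channel put_packets call and a reduce/copy/read-put chain of the kind the executor can now fuse:

# Sketch only: setup of the program, buffers, channels, and thread block
# groups is omitted; every variable name here is a placeholder.

# 1. Port Channel Put Packet, the new direct DSL call (see channel.py below):
#    writes the source chunk into the remote scratch buffer in packet format.
port_channel.put_packets(remote_scratch[0:n], local_scratch[0:n], tb=0)

# 2. A chain the executor can fuse:
#    reduce peer data into the input buffer in packet format ...
rank.reduce(input_buffer[0:n], peer_data, tb_group=tbg, packet=True)
#    ... copy the reduced packets to scratch; the reduce destination matches
#    the copy source, so Reduce Packet + Copy Packet -> Reduce Copy Packet ...
rank.copy_packets(local_scratch[0:n], input_buffer[0:n], tb_group=tbg)
#    ... then forward over a memory channel; the copy destination matches the
#    read-put source, so Reduce Copy Packet + Read Put Packet -> Reduce Copy Send Packet.
memory_channel.read_put_packets(remote_scratch[0:n], local_scratch[0:n], tb_group=tbg)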

12 files changed: +376 -31 lines changed

include/mscclpp/npkit/npkit_event.hpp

Lines changed: 1 addition & 1 deletion
@@ -41,6 +41,6 @@
 #define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C
 
 #define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D
-#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x37
+#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x39
 
 #endif

python/mscclpp/__main__.py

Lines changed: 28 additions & 7 deletions
@@ -12,10 +12,10 @@
 
 default_algo_configs = [
     {
-        "filename": "allreduce_2nodes.json",
+        "filename": "allreduce_2nodes_1K_64K.json",
         "function": def_algo.allreduce_2nodes,
         "spec": AlgoSpec(
-            name="allreduce_2nodes",
+            name="allreduce_2nodes_1K_64K",
             collective=AllReduce(16, 1, True),
             nranks_per_node=8,
             world_size=16,
@@ -27,11 +27,32 @@
             reuse_resources=True,
             use_double_scratch_buffer=True,
             min_message_size=1 << 10,
+            max_message_size=64 << 10,
+            tags={"default": 1},
+        ),
+        "additional_kwargs": {"thread_block_group_size": 1},
+    },
+    {
+        "filename": "allreduce_2nodes_128K_2M.json",
+        "function": def_algo.allreduce_2nodes,
+        "spec": AlgoSpec(
+            name="allreduce_2nodes_128K_2M",
+            collective=AllReduce(16, 1, True),
+            nranks_per_node=8,
+            world_size=16,
+            in_place=True,
+            instances=1,
+            protocol="LL",
+            auto_sync=False,
+            num_threads_per_block=1024,
+            reuse_resources=True,
+            use_double_scratch_buffer=True,
+            min_message_size=128 << 10,
             max_message_size=2 << 20,
             tags={"default": 1},
         ),
-        "additional_args": [4],
-    }
+        "additional_kwargs": {"thread_block_group_size": 4},
+    },
 ]
 
 
@@ -46,12 +67,12 @@ def create_default_plans():
         filename = config["filename"]
         func = config["function"]
         spec = config["spec"]
-        additional_args = config.get("additional_args", [])
+        additional_kwargs = config.get("additional_kwargs", {})
         plan_path = os.path.join(plan_dir, filename)
 
         try:
-            if additional_args:
-                prog = func(spec, *additional_args)
+            if additional_kwargs:
+                prog = func(spec, **additional_kwargs)
             else:
                 prog = func(spec)
 
python/mscclpp/language/channel.py

Lines changed: 49 additions & 0 deletions
@@ -682,6 +682,55 @@ def put_with_signal_and_flush(self, dst_chunk: Chunk, src_chunk: Chunk, tb: int)
 
         get_program().add_operation(self.src_rank, tb, op)
 
+    def put_packets(self, dst_chunk: Chunk, src_chunk: Chunk, tb: int):
+        """Transfer data from local buffer to remote scratch buffer in packet format.
+
+        Performs a specialized put operation that transfers data from the source rank's buffer
+        to the destination rank's scratch buffer in packet format through the port channel.
+        The destination chunk must be a scratch buffer.
+
+        Args:
+            dst_chunk (Chunk): The destination scratch chunk on the destination rank.
+            src_chunk (Chunk): The source chunk on the source rank (any buffer type).
+            tb (int): The thread block ID that will execute this operation.
+
+        Raises:
+            RuntimeError: If chunk ranks don't match channel configuration, if destination
+                chunk is not a scratch buffer, or if chunk sizes don't match.
+
+        Example:
+            >>> channel.put_packets(dst_chunk, src_chunk, tb=0)
+        """
+        if src_chunk.rank != self.src_rank:
+            raise RuntimeError(
+                f"Source chunk rank {src_chunk.rank} does not match current channel source rank {self.src_rank}."
+            )
+        if dst_chunk.rank != self.dst_rank:
+            raise RuntimeError(
+                f"Dst chunk rank {dst_chunk.rank} does not match current channel dst rank {self.dst_rank}."
+            )
+        if dst_chunk.buffer != BufferType.scratch:
+            raise RuntimeError(f"Destination chunk must be of type scratch.")
+        if dst_chunk.size != src_chunk.size:
+            raise RuntimeError(
+                f"Destination chunk size {dst_chunk.size} does not match source chunk size {src_chunk.size}."
+            )
+
+        remote_chunk = RemoteBuffer(src_chunk.rank, dst_chunk.rank, dst_chunk.buffer, self.channel_type)
+        tb_chunk_id = get_program().setup_remote_chunk(self.src_rank, tb, remote_chunk, self.channel_type)
+        tb_channel_ids = get_program().setup_channel(tb, self)
+
+        op = PutOperation(
+            src_buff=[LocalChunk(src_chunk.buffer, src_chunk.index, src_chunk.size)],
+            dst_buff=[RemoteChunk(dst_chunk.buffer, dst_chunk.index, dst_chunk.size, tb_chunk_id)],
+            channel_ids=tb_channel_ids,
+            channel_type=self.channel_type,
+            from_packet=False,
+            to_packet=True,
+        )
+
+        get_program().add_operation(self.src_rank, tb, op)
+
     def read_put_packets(self, dst_chunk: Chunk, src_chunk: Chunk, tb: int):
         """Transfer data in packet format from local to remote scratch buffer.
 
python/mscclpp/language/default_algos/allreduce_2nodes.py

Lines changed: 43 additions & 14 deletions
@@ -34,15 +34,31 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
     inter_node_port_channels = {}
     scratch_buffers = []
     thread_block_offset = 1
-    thread_block_group = ThreadBlockGroup(
-        tb_list=[i for i in range(thread_block_offset, thread_block_offset + thread_block_group_size)]
+    thread_block_groups = []
+    global_intra_node_tbg = ThreadBlockGroup(
+        tb_list=[
+            i
+            for i in range(thread_block_offset, thread_block_offset + (gpus_per_node - 1) * thread_block_group_size)
+        ]
     )
+    for i in range(gpus_per_node - 1):
+        thread_block_groups.append(
+            ThreadBlockGroup(
+                tb_list=[
+                    i
+                    for i in range(
+                        thread_block_offset + i * thread_block_group_size,
+                        thread_block_offset + (i + 1) * thread_block_group_size,
+                    )
+                ]
+            )
+        )
 
+    scratch_buffer_size = packets_per_gpu * (total_gpus + 1)
     for node_id in range(num_nodes):
        for local_gpu_id in range(gpus_per_node):
            current_rank_id = local_gpu_id + gpus_per_node * node_id
            next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus
-            scratch_buffer_size = 2 * total_gpus
            scratch_buffers.append(Buffer(current_rank_id, scratch_buffer_size))
            for peer_gpu_id in range(gpus_per_node):
                if peer_gpu_id != local_gpu_id:
@@ -64,13 +80,14 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
             for peer_gpu_id in range(gpus_per_node):
                 peer_rank_id = peer_gpu_id + gpus_per_node * node_id
                 peer_data_offset = peer_gpu_id * packets_per_gpu
+                tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1
                 if peer_gpu_id != local_gpu_id:
                     intra_node_memory_channels[(peer_rank_id, current_rank_id)].put_packets(
                         scratch_buffers[peer_rank_id][
                             local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
                         ],
                         input_buffer[peer_data_offset : peer_data_offset + packets_per_gpu],
-                        tb_group=thread_block_group,
+                        tb_group=thread_block_groups[tbg_id],
                     )
 
             # Intra Node Reduce
@@ -84,20 +101,24 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
             current_rank.reduce(
                 input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
                 other_gpu_data,
-                tb_group=thread_block_group,
+                tb_group=global_intra_node_tbg,
                 packet=True,
             )
 
-            # Copy Reduced Data to Scratch Buffer and send to Next Node
             current_rank.copy_packets(
                 scratch_buffers[current_rank_id][
                     local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
                 ],
                 input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
-                tb_group=thread_block_group,
+                tb_group=global_intra_node_tbg,
             )
+
+            current_rank.barrier(
+                tb_list=[i for i in range(thread_block_offset + (gpus_per_node - 1) * thread_block_group_size)]
+            )
+
             inter_node_offset = total_gpus
-            inter_node_port_channels[current_rank_id].read_put_packets(
+            inter_node_port_channels[current_rank_id].put_packets(
                 scratch_buffers[next_node_rank_id][
                     inter_node_offset
                     + local_gpu_id * packets_per_gpu : inter_node_offset
@@ -122,31 +143,39 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
             current_rank.reduce(
                 input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
                 inter_node_data,
-                tb_group=thread_block_group,
+                tb_group=global_intra_node_tbg,
                 packet=True,
             )
 
+            current_rank.copy_packets(
+                scratch_buffers[current_rank_id][scratch_buffer_size - packets_per_gpu : scratch_buffer_size],
+                input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
+                tb_group=global_intra_node_tbg,
+            )
+
             # Broadcast Reduced Data
             for peer_gpu_id in range(gpus_per_node):
                 peer_rank_id = peer_gpu_id + gpus_per_node * node_id
 
                 if peer_gpu_id != local_gpu_id:
-                    intra_node_memory_channels[(peer_rank_id, current_rank_id)].put_packets(
+                    tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1
+                    intra_node_memory_channels[(peer_rank_id, current_rank_id)].read_put_packets(
                         scratch_buffers[peer_rank_id][
                             inter_node_offset
                             + local_gpu_id * packets_per_gpu : inter_node_offset
                             + local_gpu_id * packets_per_gpu
                             + packets_per_gpu
                         ],
-                        input_buffer[
-                            local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
+                        scratch_buffers[current_rank_id][
+                            scratch_buffer_size - packets_per_gpu : scratch_buffer_size
                         ],
-                        tb_group=thread_block_group,
+                        tb_group=thread_block_groups[tbg_id],
                     )
 
             # Unpack Data Received from other GPUs in the same node
             for peer_gpu_id in range(gpus_per_node):
                 if peer_gpu_id != local_gpu_id:
+                    tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1
                     current_rank.unpack_packets(
                         input_buffer[
                             peer_gpu_id * packets_per_gpu : peer_gpu_id * packets_per_gpu + packets_per_gpu
@@ -157,7 +186,7 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                             + peer_gpu_id * packets_per_gpu
                             + packets_per_gpu
                         ],
-                        tb_group=thread_block_group,
+                        tb_group=thread_block_groups[tbg_id],
                     )
 
     return prog
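
To make the new thread-block layout above concrete, the following standalone sketch (assumed values: gpus_per_node = 8, thread_block_group_size = 4, as in the 128K-2M default plan; it is not part of the diff) prints the thread blocks that global_intra_node_tbg and thread_block_groups would cover:

# Standalone sketch of the thread-block partitioning used above.
# Assumed values; not part of the commit.
gpus_per_node = 8
thread_block_group_size = 4
thread_block_offset = 1

# One global group spanning every intra-node worker block: tb 1..28.
global_tbs = list(
    range(thread_block_offset, thread_block_offset + (gpus_per_node - 1) * thread_block_group_size)
)

# One slice of thread_block_group_size blocks per peer GPU (gpus_per_node - 1 slices).
per_peer_tbs = [
    list(
        range(
            thread_block_offset + i * thread_block_group_size,
            thread_block_offset + (i + 1) * thread_block_group_size,
        )
    )
    for i in range(gpus_per_node - 1)
]

# A peer GPU is mapped to its slice with the same formula as in the algorithm,
# tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1,
# which simply skips the local GPU.
print(global_tbs)        # [1, 2, ..., 28]
print(per_peer_tbs[0])   # [1, 2, 3, 4]
print(per_peer_tbs[-1])  # [25, 26, 27, 28]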

python/mscclpp/language/internal/operations.py

Lines changed: 56 additions & 2 deletions
@@ -604,6 +604,7 @@ def __init__(
         self,
         local_src_buff: List[LocalChunk],
         local_dst_buff: List[LocalChunk],
+        local_pkt_dst_buff: List[LocalChunk] = None,
         remote_src_buff: List[RemoteChunk] = None,
         remote_dst_buff: List[RemoteChunk] = None,
         channel_ids: List[int] = None,
@@ -613,19 +614,26 @@ def __init__(
         tbg_info: ThreadBlockGroupInfo = None,
         packet: bool = False,
     ):
+        local_pkt_dst_buff = local_pkt_dst_buff if local_pkt_dst_buff is not None else []
         remote_src_buff = remote_src_buff if remote_src_buff is not None else []
         remote_dst_buff = remote_dst_buff if remote_dst_buff is not None else []
         channel_ids = channel_ids if channel_ids is not None else []
         put_channel_ids = put_channel_ids if put_channel_ids is not None else []
 
         if len(remote_src_buff) == 0 and len(remote_dst_buff) == 0:
             if packet:
-                super().__init__(Instruction.reduce_packet)
+                if len(local_pkt_dst_buff) == 0:
+                    super().__init__(Instruction.reduce_packet)
+                else:
+                    super().__init__(Instruction.reduce_copy_packet)
             else:
                 super().__init__(Instruction.reduce)
         elif len(remote_src_buff) == 0:
             if packet:
-                super().__init__(Instruction.reduce_send_packet)
+                if len(local_pkt_dst_buff) == 0:
+                    super().__init__(Instruction.reduce_send_packet)
+                else:
+                    super().__init__(Instruction.reduce_copy_send_packet)
             else:
                 super().__init__(Instruction.reduce_send)
         elif len(remote_dst_buff) == 0 and not packet:
@@ -637,6 +645,7 @@ def __init__(
 
         self.local_src_buff = local_src_buff
         self.local_dst_buff = local_dst_buff
+        self.local_pkt_dst_buff = local_pkt_dst_buff
         self.remote_src_buff = remote_src_buff
         self.remote_dst_buff = remote_dst_buff
         self.channel_ids = channel_ids
@@ -741,6 +750,49 @@ def __add__(self, other):
                 tbg_info=self.tbg_info,
                 packet=self.packet,
             )
+        if (
+            isinstance(other, CopyOperation)
+            and self.name == Instruction.reduce_packet
+            and other.name == Instruction.copy_packet
+            and self.local_dst_buff[0] == other.src_buff[0]
+            and self.tbg_info == other.tbg_info
+        ):
+            fused_operation = ReduceOperation(
+                self.local_src_buff,
+                self.local_dst_buff,
+                local_pkt_dst_buff=other.dst_buff,
+                remote_src_buff=self.remote_src_buff,
+                remote_dst_buff=self.remote_dst_buff,
+                channel_ids=self.channel_ids,
+                put_channel_ids=self.put_channel_ids,
+                channel_type=self.channel_type,
+                reduce_operation=self.reduce_operation,
+                tbg_info=self.tbg_info,
+                packet=self.packet,
+            )
+        if (
+            isinstance(other, PutOperation)
+            and (self.name == Instruction.reduce_copy_packet or self.name == Instruction.reduce_copy_send_packet)
+            and (
+                (other.name == Instruction.put_packet and self.local_dst_buff[0] == other.src_buff[0])
+                or (other.name == Instruction.read_put_packet and self.local_pkt_dst_buff[0] == other.src_buff[0])
+            )
+            and other.channel_type == ChannelType.memory
+            and self.tbg_info == other.tbg_info
+        ):
+            fused_operation = ReduceOperation(
+                self.local_src_buff,
+                self.local_dst_buff,
+                local_pkt_dst_buff=self.local_pkt_dst_buff,
+                remote_src_buff=self.remote_src_buff,
+                remote_dst_buff=self.remote_dst_buff + other.dst_buff,
+                channel_ids=self.channel_ids,
+                put_channel_ids=self.put_channel_ids + other.channel_ids,
+                channel_type=other.channel_type,
+                reduce_operation=self.reduce_operation,
+                tbg_info=self.tbg_info,
+                packet=self.packet,
+            )
 
         return fused_operation
 
@@ -752,6 +804,8 @@ def to_dict(self):
         result["dst_buff"] = []
         for chunk in self.local_dst_buff:
             result["dst_buff"].append(chunk.to_dict())
+        for chunk in self.local_pkt_dst_buff:
+            result["dst_buff"].append(chunk.to_dict())
 
         if len(self.remote_src_buff) > 0:
             for chunk in self.remote_src_buff:
python/mscclpp/language/internal/types.py

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,7 @@ class Instruction(Enum):
     unpack_packet = "upkt"
     reduce = "re"
     reduce_packet = "repkt"
+    reduce_copy_packet = "recpkt"
     sem_acquire = "sem_acquire"
     sem_release = "sem_release"
     signal = "signal"
@@ -85,6 +86,7 @@ class Instruction(Enum):
     put_with_signal_and_flush = "pwsf"
     reduce_send = "res"
     reduce_send_packet = "respkt"
+    reduce_copy_send_packet = "recspkt"
     read_reduce = "rre"
     read_reduce_send = "rres"
     group_store = "gstore"
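
Putting operations.py and types.py together, the branch structure added above can be distilled into a small standalone helper (an illustrative sketch, not the library API) that returns the instruction tag a ReduceOperation would now carry:

# Distilled sketch of the instruction selection added in ReduceOperation.__init__;
# illustrative only, not the real class.
def pick_instruction(packet, local_pkt_dst_buff, remote_src_buff, remote_dst_buff):
    if not remote_src_buff and not remote_dst_buff:
        if packet:
            # reduce_copy_packet when a packet copy destination is attached
            return "recpkt" if local_pkt_dst_buff else "repkt"
        return "re"
    if not remote_src_buff:
        if packet:
            # reduce_copy_send_packet when the fused op also sends to a remote buffer
            return "recspkt" if local_pkt_dst_buff else "respkt"
        return "res"
    return "(unchanged by this PR)"

assert pick_instruction(True, [], [], []) == "repkt"                         # Reduce Packet
assert pick_instruction(True, ["pkt_dst"], [], []) == "recpkt"               # Reduce Copy Packet
assert pick_instruction(True, ["pkt_dst"], [], ["remote_dst"]) == "recspkt"  # Reduce Copy Send Packet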
