Address review comments: add example to docs, add docstrings, restructure test, and add a real/imag extraction helper to remove hardcoding

apbose · apbose · commit 109e5c2f73b2 · 2025-06-27T23:41:52.000-07:00
diff --git a/examples/distributed_inference/rotary_embedding.py b/examples/distributed_inference/rotary_embedding.py
@@ -1,3 +1,14 @@
+"""
+.. _rotary_embedding:
+
+Rotary Embedding Implementation for Tensor Parallel Attention
+=============================================================
+
+This module provides an implementation of rotary positional embeddings (RoPE) for transformer models
+with support for tensor parallel distributed inference. Rotary embeddings are used to encode positional
+information in transformer attention mechanisms.
+"""
+
 import time
 
 import tensorrt as trt
@@ -49,7 +60,7 @@ def rotary_embedding(xq, xk, dim, freqs_cis=None):
     Returns:
         tuple: Tuple containing the rotated query and key tensors.
     """
-
+    freqs_cis = freqs_cis[None, :, None, :]
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
 
diff --git a/examples/distributed_inference/tensor_parallel_initialize_dist.py b/examples/distributed_inference/tensor_parallel_initialize_dist.py
@@ -1,3 +1,11 @@
+"""
+.. _tensor_parallel_initialize_dist:
+Tensor Parallel Initialize Distributed Environment
+==================================================
+
+This module provides functions to initialize and clean up the distributed environment for tensor parallel distributed inference.
+"""
+
 import logging
 import os
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
diff --git a/examples/distributed_inference/tensor_parallel_rotary_embedding.py b/examples/distributed_inference/tensor_parallel_rotary_embedding.py
@@ -1,3 +1,14 @@
+"""
+.. _tensor_parallel_rotary_embedding:
+Tensor Parallel Rotary Embedding Example
+========================================
+
+This example demonstrates how to use Torch-TensorRT with tensor parallel distributed inference
+for models that use rotary positional embeddings (RoPE). It lowers the complex
+operations in attention models with rotary embeddings across multiple GPUs.
+
+"""
+
 import logging
 import os
 import time
@@ -17,6 +28,7 @@
 
 """
 This example covers the rotary embedding in Llama3 model and is derived from https://lightning.ai/lightning-ai/studios/tensor-parallelism-supercharging-large-model-training-with-pytorch-lightning
+Command to run with single GPU: mpirun -n 1 --allow-run-as-root python tensor_parallel_rotary_embedding.py
 """
 
 BATCH = 2
@@ -35,7 +47,7 @@
 
     logger.info("Torch-tensorrt compilation for rotary embedding")
 
-    model = torch.compile(model, backend="torch_tensorrt", options={"debug": True})
+    model = torch.compile(model, backend="torch_tensorrt")
 
     try:
         for i in range(15):
diff --git a/examples/distributed_inference/tensor_parallel_simple_example.py b/examples/distributed_inference/tensor_parallel_simple_example.py
@@ -1,3 +1,24 @@
+"""
+.. _tensor_parallel_simple_example:
+
+Torch Parallel Distributed example for simple model
+===================================================
+
+The example below shows how to use the Torch-TensorRT backend for distributed inference with tensor parallelism.
+
+This example demonstrates:
+    - Setting up distributed environment for tensor parallelism
+    - Model sharding across multiple GPUs
+    - Compilation with Torch-TensorRT
+    - Distributed inference execution
+
+Usage
+-----
+.. code-block:: bash
+
+    mpirun -n 2 --allow-run-as-root python tensor_parallel_simple_example.py
+"""
+
 import time
 
 import tensorrt as trt
@@ -21,7 +42,7 @@
 )
 
 """
-This example copies some code from https://github.com/pytorch/examples/blob/main/distributed/tensor_parallelism/tensor_parallel_example.py
+This example takes some code from https://github.com/pytorch/examples/blob/main/distributed/tensor_parallelism/tensor_parallel_example.py
 """
 
 
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/complex_graph_rewrite.py b/py/torch_tensorrt/dynamo/lowering/passes/complex_graph_rewrite.py
@@ -1,11 +1,10 @@
 import logging
-import operator
-from typing import Callable, List, Optional, Set, Tuple
+from typing import Callable, List, Set, Tuple
 
 import torch
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.fx import GraphModule, Node
-from torch.fx.subgraph_rewriter import Match
+from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     clean_up_graph_after_modifications,
@@ -25,7 +24,7 @@ def __init__(
         self.subgraph_nodes = subgraph_nodes
         self.input_nodes = input_nodes
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return (
             f"ComplexOpSubGraphInfo(anchor_nodes={[n.name for n in self.anchor_nodes]}, "
             f"subgraph={[n.name for n in self.subgraph_nodes]}, "
@@ -34,7 +33,7 @@ def __repr__(self):
 
 
 class ComplexOpDetector:
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def is_complex_dtype(self, node: Node) -> bool:
@@ -106,16 +105,18 @@ def find_complex_op_subgraphs(
 
 
 class ComplexGraphRewriter:
-    def __init__(self, gm: GraphModule, truncate_double: bool = False):
+    def __init__(self, gm: GraphModule, truncate_double: bool = False) -> None:
         self.gm = gm
         self.truncate_double = truncate_double
 
-    def extract_shape_dtype_device(self, input_node):
+    def extract_shape_dtype_device(
+        self, input_node: Node
+    ) -> Tuple[Tuple[int, ...], torch.dtype, torch.device]:
         if input_node.op == "placeholder":
             tensor_val = input_node.meta["val"]
 
         elif input_node.op == "get_attr":
-            tensor_val = self.get_attr_tensor(input_node.target)
+            tensor_val = self.get_attr_tensor(input_node.target)  # type: ignore
 
         else:
             raise ValueError(f"Unsupported node type: {input_node.op}")
@@ -134,7 +135,7 @@ def extract_shape_dtype_device(self, input_node):
 
         return new_node_shape, new_node_dtype, device
 
-    def get_attr_tensor(self, target):
+    def get_attr_tensor(self, target):  # type: ignore
         # Check if target is param or buffer
         if target in dict(self.gm.named_parameters()):
             return self.gm.get_parameter(target)
@@ -145,7 +146,7 @@ def get_attr_tensor(self, target):
                 f"Attribute {target} not found in gm parameters or buffers."
             )
 
-    def replace_input_node(self, input_node):
+    def replace_input_node(self, input_node: Node) -> None:
         modified = False
         logger.debug(f"Replacing input node: {input_node.name}")
         new_shape, new_dtype, device = self.extract_shape_dtype_device(input_node)
@@ -160,10 +161,8 @@ def replace_input_node(self, input_node):
 
         elif input_node.op == "get_attr":
             new_attr_name = input_node.target + "_reshaped"
-            from torch._subclasses.fake_tensor import unset_fake_temporarily
-
             with unset_fake_temporarily():
-                original_tensor = self.get_attr_tensor(input_node.target)
+                original_tensor = self.get_attr_tensor(input_node.target)  # type: ignore
                 stacked_tensor = torch.stack(
                     [original_tensor.real, original_tensor.imag], dim=-1
                 )
@@ -181,7 +180,7 @@ def replace_input_node(self, input_node):
         self.gm.graph.erase_node(input_node)
         clean_up_graph_after_modifications(self.gm)
 
-    def rewrite_subgraph_nodes(self, subgraphs):
+    def rewrite_subgraph_nodes(self, subgraphs: List[ComplexSubGraphInfo]) -> None:
         modified = False
         for subgraph in subgraphs:
             for input_node in subgraph.input_nodes:
@@ -196,11 +195,20 @@ def rewrite_subgraph_nodes(self, subgraphs):
                 elif node.target == torch.ops.aten.mul.Tensor:
                     # this is complex mul where inputs = a+ib and output = c+id.
                     # complex mul returns (ac - bd) + (ad + bc)i
-                    # which is then view_as_real as (ac-bd), ad+bc stacked along the last dimension with last dimension size 2
+                    # which is then view_as_real as (ac-bd), (ad+bc) stacked along the last dimension with last dimension size 2
+                    x_placeholder_or_func = (
+                        True if node.args[0].op != "get_attr" else False
+                    )
+                    y_placeholder_or_func = (
+                        True if node.args[1].op != "get_attr" else False
+                    )
+
                     replaced_nodes = []
-                    original_mul, replacement = complex_mul_replacement()
+                    original_mul, replacement = complex_mul_replacement(
+                        x_placeholder_or_func, y_placeholder_or_func
+                    )
 
-                    def match_complex_mul(
+                    def match_complex_mul(  # type: ignore[no-untyped-def]
                         match: torch.fx.subgraph_rewriter.Match,
                         original_graph,
                         pattern_graph,
@@ -233,7 +241,7 @@ def match_complex_mul(
             self.gm.graph.lint()
             self.gm.recompile()
 
-    def propagate_metadata(self):
+    def propagate_metadata(self) -> None:
         fake_inputs = []
         from torch._subclasses.fake_tensor import FakeTensorMode
         from torch.fx.passes.fake_tensor_prop import FakeTensorProp
@@ -260,7 +268,34 @@ def propagate_metadata(self):
         ).propagate(*fake_inputs)
 
 
-def complex_mul_replacement() -> Tuple[
+def extract_real_imag(input, placeholder_or_func: bool = True):  # type: ignore
+    """Extract real and imaginary parts from a tensor.
+    This function handles different tensor types based on whether they are placeholder/function
+    tensors or get_attr tensors. For placeholder/function tensors, it uses select operations,
+    while for get_attr tensors, it uses indexing.
+    Args:
+        input: Input tensor to extract real and imaginary parts from
+        placeholder_or_func: Boolean flag indicating if the input is a placeholder/function tensor (True)
+                           or a get_attr tensor (False). Defaults to True.
+    Returns:
+        Tuple of (real_part, imaginary_part) where both parts have the same type as the input
+    Note:
+        - When placeholder_or_func=True: Uses torch.ops.aten.select.int operations
+        - When placeholder_or_func=False: Uses tensor indexing [..., 0] and [..., 1]
+    """
+    if placeholder_or_func:
+        # For placeholder/function tensors, use select operations
+        real_part = torch.ops.aten.select.int(input, -1, 0)
+        imag_part = torch.ops.aten.select.int(input, -1, 1)
+        return real_part, imag_part
+    else:
+        # For get_attr, use indexing
+        return input[..., 0], input[..., 1]
+
+
+def complex_mul_replacement(
+    x_placeholder_or_func: bool = True, y_placeholder_or_func: bool = True
+) -> Tuple[
     Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
     Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
 ]:
@@ -280,9 +315,8 @@ def original_mul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 
     # Replacement function: manual complex multiplication on real/imag stacked tensors
     def replacement(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-        x_real = torch.ops.aten.select.int(x, -1, 0)
-        x_imag = torch.ops.aten.select.int(x, -1, 1)  # x is reshape tensor
-        y_real, y_imag = y[..., 0], y[..., 1]  # y is frozen param
+        x_real, x_imag = extract_real_imag(x, x_placeholder_or_func)
+        y_real, y_imag = extract_real_imag(y, y_placeholder_or_func)
 
         real_part1 = torch.ops.aten.mul.Tensor(x_real, y_real)
         real_part2 = torch.ops.aten.mul.Tensor(x_imag, y_imag)
@@ -304,10 +338,18 @@ def replacement(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
 
 
 # This lowering pass is used to detect and rewrite complex subgraphs in the graph
-# This lowering pass works for complex tensor in mul which are parameter or buffers in the graph
 def complex_graph_detection(
     gm: GraphModule, settings: CompilationSettings
-) -> List[ComplexSubGraphInfo]:
+) -> GraphModule:
+    """Detect and rewrite complex subgraphs in the graph.
+    This lowering pass is used to detect and rewrite complex subgraphs in the graph.
+    This lowering pass works for complex tensors in mul that are parameters or buffers in the graph.
+    Args:
+        gm: The GraphModule to process
+        settings: Compilation settings
+    Returns:
+        The modified GraphModule with complex subgraphs rewritten
+    """
     complex_op_detector = ComplexOpDetector()
     complex_subgraphs = complex_op_detector.find_complex_op_subgraphs(
         gm, anchor_target=torch.ops.aten.view_as_real.default
diff --git a/tests/py/dynamo/lowering/test_aten_lowering_passes.py b/tests/py/dynamo/lowering/test_aten_lowering_passes.py
@@ -237,12 +237,6 @@ def forward(self, input, mat1, mat2):
         torch._dynamo.reset()
 
 
-def rotary_embedding(x, dim, freqs_cis=None):
-    x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-    x_out_flatten = torch.view_as_real(x_ * freqs_cis)
-    return x_out_flatten.type_as(x)
-
-
 class TestComplexSubgraph(TestCase):
     def test_complex_subgraph(self):
         BATCH = 1
@@ -263,6 +257,11 @@ def __init__(self):
                     persistent=True,
                 )
 
+            def rotary_embedding(self, x, dim, freqs_cis=None):
+                x_ = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+                x_out_flatten = torch.view_as_real(x_ * freqs_cis)
+                return x_out_flatten.type_as(x)
+
             def _freqs_ex_tensor(self):
                 real = torch.tensor([[[[1.0000]], [[2.0000]]]], device="cuda")
                 imag = torch.tensor([[[[0.0000]], [[3.0000]]]], device="cuda")
@@ -273,14 +272,13 @@ def _freqs_ex_tensor(self):
             def forward(self, x):
                 q = self.wq(x)
                 freqs_cis = self._freqs_ex_tensor().to(q.device)
-                q_out = rotary_embedding(q, self.dim, freqs_cis=freqs_cis)
+                q_out = self.rotary_embedding(q, self.dim, freqs_cis=freqs_cis)
                 return q_out
 
         inputs = [torch.randn(BATCH, SEQ_LEN, HEADS, DIM).cuda()]
         model = RotaryAttention()
         model = model.cuda()
 
-        fx_graph = torch.fx.symbolic_trace(RotaryAttention().cuda())
         expected_ops = {torch.ops.aten.mul.Tensor}
         unexpected_ops = {
             torch.ops.aten.view_as_complex.default,