pytorch · jerryzh168 · Jun 22, 2025
diff --git a/test/dtypes/test_fbgemm_fp8.py b/test/dtypes/test_fbgemm_fp8.py
@@ -128,6 +128,8 @@ def forward(self, x):
         weight = torch.randn(10, 128, 256, dtype=dtype, device=device)
         m = M(weight).eval()
         original = m(input)
+        # we need to transpose the weight first for bmm
+        m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
         quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 20)

diff --git a/test/dtypes/test_fbgemm_int4.py b/test/dtypes/test_fbgemm_int4.py
@@ -39,7 +39,6 @@ def setUp(self):
             weight_dtype=torch.int4,
             output_dtype=torch.bfloat16,
             block_size=[1, 1, 128],
-            transpose_input=True,
         )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 
@@ -134,6 +133,8 @@ def forward(self, x):
         weight = torch.randn(10, 128, 256, dtype=dtype, device=device)
         m = M(weight).eval()
         original = m(input)
+        # we need to transpose the weight first for bmm
+        m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
         quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)

diff --git a/torchao/dtypes/fbgemm_fp8_tensor.py b/torchao/dtypes/fbgemm_fp8_tensor.py
@@ -90,7 +90,6 @@ def from_float(
         cls,
         w: torch.Tensor,
         activation_scale_ub: Optional[float] = None,
-        transpose_input: bool = False,
     ):
         if activation_scale_ub is None:
             activation_scale_ub = 1200.0
@@ -100,12 +99,6 @@ def from_float(
             dtype=torch.float,
             device=w.device,
         )
-        if transpose_input:
-            if w.ndim == 3:
-                w = w.transpose(-1, -2)
-            else:
-                w = w.t()
-
         wq, w_scale = torch.ops.triton.quantize_fp8_row(w)
         # wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
         dtype = w.dtype

diff --git a/torchao/dtypes/fbgemm_int4_tensor.py b/torchao/dtypes/fbgemm_int4_tensor.py
@@ -93,20 +93,13 @@ def from_float(
         cls,
         w: torch.Tensor,
         block_size: List[int],
-        transpose_input: bool = False,
     ):
         assert len(block_size) == w.ndim, (
             f"Expecting the length of block_size to be equal to the dimension of the weight, got {block_size=} and {w.ndim=}"
         )
         if int4_row_quantize_zp is None:
             raise ImportError("Requires fbgemm-gpu-genai >= 1.2.0")
 
-        if transpose_input:
-            if w.ndim == 3:
-                w = w.transpose(-1, -2)
-            else:
-                w = w.t()
-
         group_size = block_size[-1]
         original_shape = w.shape
 

diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
@@ -2046,7 +2046,6 @@ class FbgemmConfig(AOBaseConfig):
     output_dtype: torch.dtype
     block_size: Optional[List[int]] = None
     activation_scale_ub: Optional[float] = None
-    transpose_input: bool = False
     preshuffle: bool = False
 
 
@@ -2080,7 +2079,6 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
             weight = to_fbgemm_int4(
                 module.weight,
                 config.block_size,
-                config.transpose_input,
             )
         module.weight = torch.nn.Parameter(weight, requires_grad=False)
         module.extra_repr = types.MethodType(_linear_extra_repr, module)