
Commit 9a56a1d

Fix aqt implementation for aten.mm/aten.addmm fallback path (#2072)
Summary: Previously, the try/except block in the aten.mm/aten.addmm overrides had a side effect: weight_tensor was rebound to its transpose even when the dispatch failed, which could cause errors in the fallback path. This PR fixes it by binding the transpose to a new name so the fallback sees the original tensor.

Test Plan: python test/dtypes/test_affine_quantized.py -k test_matmul
1 parent a96eeb1 commit 9a56a1d
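The bug class fixed here is worth spelling out. Below is a minimal, self-contained sketch of the anti-pattern and the fix, using hypothetical dispatch/fallback callables rather than torchao's actual API: rebinding a name inside try survives a raised exception, so the except fallback sees the mutated value.

import torch

# Hypothetical helpers, for illustration only.
def dispatch(x, w):
    raise NotImplementedError("no quantized kernel matched")

def fallback(x, w):
    return torch.nn.functional.linear(x, w)

def linear_buggy(x, w):
    try:
        w = w.t()                     # rebinding survives the raise below
        return dispatch(x, w)
    except NotImplementedError:
        return fallback(x, w)         # BUG: w is already transposed here

def linear_fixed(x, w):
    try:
        transposed_w = w.t()          # new name: w itself is untouched
        return dispatch(x, transposed_w)
    except NotImplementedError:
        return fallback(x, w)         # w still has its original orientation

x, w = torch.randn(4, 8), torch.randn(16, 8)
print(linear_fixed(x, w).shape)       # torch.Size([4, 16])
# linear_buggy(x, w) raises a RuntimeError: the fallback multiplies
# (4, 8) by the transpose of (8, 16), a shape mismatch.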

File tree

2 files changed (+39, -4 lines)

test/dtypes/test_affine_quantized.py

Lines changed: 18 additions & 0 deletions
@@ -21,6 +21,7 @@
     Int4XPULayout,
     PlainLayout,
     SemiSparseLayout,
+    to_affine_quantized_intx,
     to_affine_quantized_intx_static,
 )
 from torchao.quantization import (

@@ -352,6 +353,23 @@ def test_slice(self, device, dtype):
         _ = dummy.weight.narrow(0, 0, 64)
         _ = dummy.weight.narrow(1, 0, 128)
 
+    @common_utils.parametrize("device", ["cuda"])
+    @common_utils.parametrize("dtype", [torch.bfloat16])
+    def test_matmul(self, device, dtype):
+        x = torch.randn(53, 2048)
+        w = torch.randn(53, 2048)
+        w = to_affine_quantized_intx(
+            w,
+            mapping_type=MappingType.SYMMETRIC,
+            block_size=(1, 32),
+            target_dtype=torch.int8,
+            quant_min=-8,
+            quant_max=7,
+            eps=torch.finfo(torch.float32).eps,
+        )
+        # make sure it runs
+        torch.matmul(x, w.t())
+
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantized)
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedBasic)
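For context on the shapes in the new test: torch.matmul(x, w.t()) multiplies (53, 2048) by (2048, 53), so it exercises the aten.mm path with a quantized second operand and produces a (53, 53) result. A quick plain-PyTorch sketch of the dimension contract that the asserts added in affine_quantized_tensor_ops.py (shown in the next file) enforce, with no quantization involved:

import torch

x = torch.randn(53, 2048)   # mat1: (M, K)
w = torch.randn(53, 2048)   # weight as stored: (out_features, in_features)

mat2 = w.t()                          # (K, N) after the transpose
assert x.shape[-1] == mat2.shape[0]   # the shape check added in this PR
print(torch.matmul(x, mat2).shape)    # torch.Size([53, 53])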

torchao/dtypes/affine_quantized_tensor_ops.py

Lines changed: 21 additions & 4 deletions
@@ -157,6 +157,9 @@ class QuantizedLinearNotImplementedError(NotImplementedError):
     pass
 
 
+# input_tensor: dimension is (M1, M2, ..., in_features)
+# weight_tensor: dimension is (out_features, in_features)
+# bias: dimension is (out_features,)
 @staticmethod
 def _quantized_linear_op(input_tensor, weight_tensor, bias):
     for dispatch_condition, impl in _AQT_QLINEAR_DISPATCH_TABLE.items():

@@ -335,12 +338,19 @@ def _(func, types, args, kwargs):
             f"{func} is not implemented for non floating point input"
         )
 
+    assert input_tensor.shape[-1] == weight_tensor.shape[0], (
+        f"need mat1 shape: {input_tensor.shape} final dim "
+        f"to match mat2 shape: {weight_tensor.shape} first dim"
+    )
+
     # using try/except here so that we can have a general fallback when input_tensor/weight_tensor
     # is not picked up by any of the dispatch paths in `_quantized_linear_op`, this allows us to
     # make the branches easier to understand in `_quantized_linear_op`
     try:
-        weight_tensor = weight_tensor.t()
-        return weight_tensor._quantized_linear_op(input_tensor, weight_tensor, bias)
+        transposed_weight_tensor = weight_tensor.t()
+        return weight_tensor._quantized_linear_op(
+            input_tensor, transposed_weight_tensor, bias
+        )
     except QuantizedLinearNotImplementedError as e:
         # fallback path is only called when user did not specify a specific quantized linear implementation with `_layout.quantized_linear_impl`
         if (

@@ -365,9 +375,16 @@ def _(func, types, args, kwargs):
             f"{func} is not implemented for non floating point input"
         )
 
+    assert input_tensor.shape[-1] == weight_tensor.shape[0], (
+        f"need mat1 shape: {input_tensor.shape} final dim "
+        f"to match mat2 shape: {weight_tensor.shape} first dim"
+    )
+
     try:
-        weight_tensor = weight_tensor.t()
-        return weight_tensor._quantized_linear_op(input_tensor, weight_tensor, bias)
+        transposed_weight_tensor = weight_tensor.t()
+        return weight_tensor._quantized_linear_op(
+            input_tensor, transposed_weight_tensor, bias
+        )
     except QuantizedLinearNotImplementedError as e:
         # fallback path is only called when user did not specify a specific quantized linear implementation with `_layout.quantized_linear_impl`
         if (
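The new comments on _quantized_linear_op document a standard linear-layer shape contract. A small illustration with hypothetical sizes, using plain torch.nn.functional.linear rather than the quantized op itself:

import torch

batch, seq, in_features, out_features = 2, 16, 2048, 512

input_tensor = torch.randn(batch, seq, in_features)  # (M1, M2, ..., in_features)
weight = torch.randn(out_features, in_features)      # (out_features, in_features)
bias = torch.randn(out_features)                     # (out_features,)

out = torch.nn.functional.linear(input_tensor, weight, bias)
print(out.shape)  # torch.Size([2, 16, 512]) == (M1, M2, ..., out_features)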
