
Commit 68180d0

AmdSampsa authored and jithunnair-amd committed
[ROCm] Update meta_registration for efficient attention (pytorch#146979)
Fixes a series of failing and skipped unit tests. On NVIDIA hardware the logsumexp last dimension is required to be a multiple of 32; this is not the case on ROCm. Related issue: pytorch#146848.

The unit tests in question:

```bash
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_prev_13_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_prev_14_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_prev_15_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_11_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_14_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_15_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_17_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_1_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_1_freezing
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_2_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_3_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_4_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaDynamicTests test_sdpa_rewriter_6_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_prev_13_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_prev_14_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_prev_15_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_11_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_14_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_15_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_17_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_1_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_1_freezing
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_2_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_3_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_4_cuda
inductor.test_fused_attention SDPAPatternRewriterCudaTests test_sdpa_rewriter_6_cuda
```

Pull Request resolved: pytorch#146979
Approved by: https://github.com/shunting314
1 parent a36323a commit 68180d0
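
For illustration only (not part of the diff): the shape rule the updated meta registration encodes can be sketched as a small helper, where `is_hip` is a hypothetical flag standing in for the `torch.version.hip` build check.

```python
import math

def logsumexp_last_dim(seq_len: int, compute_log_sumexp: bool, is_hip: bool) -> int:
    # Sketch of the branch added to meta__scaled_dot_product_efficient_attention:
    # CUDA pads the logsumexp last dimension up to a multiple of 32, while the
    # ROCm efficient-attention kernel reports exactly the sequence length.
    if not compute_log_sumexp:
        return 0
    return seq_len if is_hip else math.ceil(seq_len / 32) * 32

assert logsumexp_last_dim(37, True, is_hip=False) == 64  # padded to next multiple of 32
assert logsumexp_last_dim(37, True, is_hip=True) == 37   # exactly the sequence length
```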

File tree

4 files changed: +10 -29 lines changed


test/inductor/test_fused_attention.py (-13 lines)

```diff
@@ -105,7 +105,6 @@ def _check_common(
                     ):
                         self.assertEqual(arg1.grad, arg2.grad, atol=atol, rtol=rtol)
 
-    @skipIfRocm
     def _test_sdpa_rewriter_1(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -132,7 +131,6 @@ def dot_prod_attention(
                 rtol=rtol,
             )
 
-    @skipIfRocm
     @torch._inductor.config.patch("freezing", True)
     def _test_sdpa_rewriter_1_freezing(self):
         def dot_prod_attention(
@@ -264,7 +262,6 @@ def dot_prod_attention(
         _, (source_code,) = run_and_get_code(dot_prod_attention, *args)
         self.assertNotIn("aten._scaled_dot_product_efficient_attention", source_code)
 
-    @skipIfRocm
     def _test_sdpa_rewriter_2(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -279,7 +276,6 @@ def dot_prod_attention(
         self._check_common(dot_prod_attention)
         self._check_common(checkpoint_wrapper(dot_prod_attention))
 
-    @skipIfRocm  # AssertionError: expected size 4==4, stride 32==64 at dim=0
     def _test_sdpa_rewriter_3(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, training: bool
@@ -296,7 +292,6 @@ def dot_prod_attention(
             checkpoint_wrapper(dot_prod_attention), contains=False, has_dropout=True
         )
 
-    @skipIfRocm  # AssertionError: expected size 4==4, stride 32==64 at dim=0
     def _test_sdpa_rewriter_4(self):
         def dot_prod_attention(
             query: torch.Tensor,
@@ -346,7 +341,6 @@ def sfdp_pattern_5_v2(query, key, value):
         self._check_common(sfdp_pattern_5_v2, contains=False)
         self._check_common(checkpoint_wrapper(sfdp_pattern_5_v2), contains=False)
 
-    @skipIfRocm
     def _test_sdpa_rewriter_6(self):
         def sfdp_pattern_6(query, key, value, training):
             attn_mask = torch.ones(
@@ -570,7 +564,6 @@ def forward(self, query, key, value, attn_mask) -> torch.Tensor:
             model, args1=args, contains=False, atol=1e-4, has_fuse_pattern=False
         )
 
-    @skipIfRocm
     def _test_sdpa_rewriter_11(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -611,7 +604,6 @@ def dot_prod_attention(
 
         self._check_common(dot_prod_attention, contains=False, has_dropout=True)
 
-    @skipIfRocm
     def _test_sdpa_prev_13(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -628,7 +620,6 @@ def dot_prod_attention(
         self._check_common(dot_prod_attention, check_train=False)
         self._check_common(checkpoint_wrapper(dot_prod_attention), check_train=False)
 
-    @skipIfRocm
     def _test_sdpa_prev_14(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -644,7 +635,6 @@ def dot_prod_attention(
         self._check_common(dot_prod_attention, check_train=False)
         self._check_common(checkpoint_wrapper(dot_prod_attention), check_train=False)
 
-    @skipIfRocm
     def _test_sdpa_prev_15(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -694,7 +684,6 @@ def dot_prod_attention(
             rtol=1e-2,
         )
 
-    @skipIfRocm
     def _test_sdpa_rewriter_14(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -717,7 +706,6 @@ def dot_prod_attention(
 
         self._check_common(dot_prod_attention)
 
-    @skipIfRocm
     def _test_sdpa_rewriter_15(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
@@ -810,7 +798,6 @@ def dot_prod_attention(
             dot_prod_attention, args1=args, contains=False, has_dropout=True
         )
 
-    @skipIfRocm
     def _test_sdpa_rewriter_17(self):
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, training
```

test/inductor/test_torchinductor.py (-4 lines)

```diff
@@ -10590,10 +10590,6 @@ def fn(z):
     def test_scaled_dot_product_attention(self):
         if self.device == "cuda" and not PLATFORM_SUPPORTS_FLASH_ATTENTION:
             raise unittest.SkipTest("Can't run flash attention on this platform")
-        if self.device == "cuda" and TEST_WITH_ROCM:
-            raise unittest.SkipTest(
-                "Flash attention support is incomplete on this platform"
-            )
 
         def fn(q, k, v):
             return torch.nn.functional.scaled_dot_product_attention(
```

torch/_inductor/fx_passes/fuse_attention.py (+1 -9 lines)

```diff
@@ -5,7 +5,6 @@
 import math
 
 import torch
-from torch.nn.attention import sdpa_kernel, SDPBackend
 
 from ..._dynamo.utils import counters
 from ..pattern_matcher import (
@@ -20,14 +19,7 @@
 aten = torch.ops.aten
 
 
-if torch.version.hip:
-
-    def _scaled_dot_product_attention(*args, **kwargs):
-        with sdpa_kernel(backends=[SDPBackend.MATH, SDPBackend.FLASH_ATTENTION]):
-            return aten.scaled_dot_product_attention(*args, **kwargs)
-
-else:
-    _scaled_dot_product_attention = aten.scaled_dot_product_attention
+_scaled_dot_product_attention = aten.scaled_dot_product_attention
 
 
 def _sfdp_pattern_1(query, key, value, inv_scale):
```
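
With the ROCm-specific wrapper removed, the pattern-matcher replacement calls `aten.scaled_dot_product_attention` directly on every platform. For reference, the public context manager the deleted wrapper relied on is still available to callers who want to pin SDPA to particular backends; a minimal sketch (shapes here are arbitrary, not taken from the PR):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

# Arbitrary (batch, heads, seq_len, head_dim) inputs for illustration.
q, k, v = (torch.randn(2, 4, 16, 64) for _ in range(3))

# Only the listed backends may be selected while inside the context.
with sdpa_kernel([SDPBackend.MATH, SDPBackend.FLASH_ATTENTION]):
    out = F.scaled_dot_product_attention(q, k, v)
```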

torch/_meta_registrations.py (+9 -3 lines)

```diff
@@ -4050,8 +4050,7 @@ def pool3d_shape_check(
     torch._check(
         dT > 0 and dW > 0 and dH > 0,
         lambda: (
-            f"stride should be greater than zero, but got "
-            f"dT: {dT}, dH: {dH}, dW: {dW}"
+            f"stride should be greater than zero, but got dT: {dT}, dH: {dH}, dW: {dW}"
         ),
     )
     torch._check(
@@ -5330,7 +5329,14 @@ def meta__scaled_dot_product_efficient_attention(
 
     res = torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device=query.device)
 
-    logsumexp_dim = math.ceil(M / 32) * 32 if compute_log_sumexp else 0
+    if torch.version.hip and torch.cuda.is_available():
+        """Please see: https://github.com/pytorch/pytorch/issues/146848
+        longsumexp last dim should be seq length
+        """
+        logsumexp_dim = M if compute_log_sumexp else 0
+    else:
+        logsumexp_dim = math.ceil(M / 32) * 32 if compute_log_sumexp else 0
+
     logsum_exp = torch.empty(
         (B, num_heads, logsumexp_dim),
         dtype=torch.float,
```
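
A minimal sketch of how the resulting logsumexp shape could be observed through the meta kernel, assuming it is acceptable to invoke the op directly on `meta` tensors; the reported size depends on whether the build is CUDA or ROCm:

```python
import torch

B, num_heads, M, K = 2, 4, 37, 64  # M deliberately not a multiple of 32
q, k, v = (
    torch.empty(B, num_heads, M, K, device="meta", dtype=torch.float16)
    for _ in range(3)
)

# Dispatch on meta tensors goes through the meta registration, so only shapes
# are computed; no attention kernel actually runs.
out, logsumexp, *_ = torch.ops.aten._scaled_dot_product_efficient_attention(
    q, k, v, None, compute_log_sumexp=True
)
# CUDA builds report (B, num_heads, 64) here (padded to a multiple of 32);
# with this change, ROCm builds report (B, num_heads, 37) instead.
print(logsumexp.shape)
```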
