@@ -362,64 +362,13 @@ def fn2(x):
         self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 2)
         self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
 
-    @requires_gpu()
-    @config.patch({"fx_graph_cache": True})
-    @config.patch({"fx_graph_remote_cache": False})
-    def test_flex_attention_caching(self):
-        from torch.nn.attention.flex_attention import create_block_mask, flex_attention
-
-        block_mask = create_block_mask(
-            lambda b, h, q, kv: q >= kv, None, None, 2048, 2048
-        )
-
-        def score_mod(score, b, h, q, kv):
-            return score + (q - kv)
-
-        def fn(q, k, v):
-            return flex_attention(q, k, v, score_mod=score_mod, block_mask=block_mask)
-
-        def score_mod2(score, b, h, q, kv):
-            return score
-
-        def fn2(q, k, v):
-            return flex_attention(q, k, v, score_mod=score_mod2, block_mask=block_mask)
-
-        a, b, c = (torch.randn(1, 4, 512, 64).cuda() for _ in range(3))
-        compiled_fn = torch.compile(fn)
-        compiled_fn2 = torch.compile(fn2)
-
-        # A first call should miss in the cache.
-        self.assertEqual(fn(a, b, c), compiled_fn(a, b, c))
-        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
-        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
-        self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 0)
-
-        # A second call should hit. (First reset so in-memory guards
-        # don't prevent compilation).
-        for m in torch._inductor.codecache.PyCodeCache.cache.values():
-            os.remove(m.__file__)
-        self.reset()
-        self.assertEqual(fn(a, b, c), compiled_fn(a, b, c))
-        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
-        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
-        self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 1)
-
-        # A third call with different score_mod should have a cache miss
-        for m in torch._inductor.codecache.PyCodeCache.cache.values():
-            os.remove(m.__file__)
-        self.reset()
-        self.assertEqual(fn2(a, b, c), compiled_fn2(a, b, c))
-        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 2)
-        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
-        self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 1)
-
     @requires_gpu()
     @requires_triton()
     @config.patch({"fx_graph_cache": True})
     @config.patch({"fx_graph_remote_cache": False})
-    def test_triton_higher_order_op_bypass(self):
+    def test_higher_order_op_bypass(self):
         """
-        Verify that we bypass the cache when we have a triton higher order ops.
+        Verify that we bypass the cache when we have higher order ops.
         """
 
         def fn(x, y):