@@ -40,10 +40,13 @@ def _test_compile_base(
     fullgraph: bool,
     config: Float8LinearConfig,
     dtype: torch.dtype,
+    pad_inner_dim: bool,
 ):
     random.seed(0)
     torch.manual_seed(0)
     x_shape = (16, 16)
+    if pad_inner_dim:
+        x_shape = (17, 16)
     linear_dtype = torch.bfloat16

     x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
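Note on the (17, 16) shape: when pad_inner_dim is enabled, the test uses an input whose leading dim of 17 is not a multiple of 16, so the backward gemms exercise the padding path (torch._scaled_mm generally expects the contraction dim to be a multiple of 16). Below is a minimal sketch of what such padding looks like; the helper name _pad_inner_dim_to_multiple_of_16 is hypothetical and not part of this PR.

```python
import torch
import torch.nn.functional as F

def _pad_inner_dim_to_multiple_of_16(t: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper (not from this PR): pad the last (inner/K) dim on
    # the right so its size becomes the next multiple of 16.
    pad_amount = (-t.shape[-1]) % 16
    if pad_amount == 0:
        return t
    return F.pad(t, (0, pad_amount))

# Example: with x_shape == (17, 16), the grad_weight gemm contracts over the
# batch dim of 17, which is where padding up to 32 would kick in.
x = torch.randn(17, 16)
padded = _pad_inner_dim_to_multiple_of_16(x.t())  # (16, 17) -> (16, 32)
assert padded.shape == (16, 32)
```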
@@ -114,6 +117,9 @@ def _get_config(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
 @pytest.mark.parametrize("emulate", [False, True] if is_cuda_8_9 else [True])
+@pytest.mark.parametrize(
+    "pad_inner_dim", [True, False]
+)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
 def test_eager_only(
@@ -122,17 +128,20 @@ def test_eager_only(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
         scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        pad_inner_dim=pad_inner_dim,
     )
     _test_compile_base(
         "eager",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )


@@ -147,6 +156,9 @@ def test_eager_only(
 @pytest.mark.parametrize(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
+@pytest.mark.parametrize(
+    "pad_inner_dim", [True, False]
+)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
 def test_aot_eager(
@@ -155,17 +167,20 @@ def test_aot_eager(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
         scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        pad_inner_dim=pad_inner_dim,
     )
     _test_compile_base(
         "aot_eager",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )


@@ -180,6 +195,9 @@ def test_aot_eager(
 @pytest.mark.parametrize(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
+@pytest.mark.parametrize(
+    "pad_inner_dim", [False, True]
+)
 @unittest.skipIf(not torch.cuda.is_available() or not is_cuda_8_9, "CUDA with float8 support not available")
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 def test_inductor(
@@ -188,17 +206,20 @@ def test_inductor(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
         scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        pad_inner_dim=pad_inner_dim,
    )
     _test_compile_base(
         "inductor",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )


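For context on where the new flag ends up, here is a hedged sketch of how _get_config might thread pad_inner_dim into Float8LinearConfig. The PR only shows the call sites, so the import path and the CastConfig wiring below are assumptions modeled on torchao's float8 config API; the intent of pad_inner_dim is to pad the fp8 gemm operands so their inner dims are multiples of 16 before torch._scaled_mm is called.

```python
from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType

def _get_config_sketch(
    scaling_type_input: ScalingType,
    scaling_type_weight: ScalingType,
    scaling_type_grad_output: ScalingType,
    emulate: bool,
    pad_inner_dim: bool = False,
) -> Float8LinearConfig:
    # Illustrative only: static scaling would additionally require a
    # static_scale on the corresponding CastConfig.
    return Float8LinearConfig(
        cast_config_input=CastConfig(scaling_type=scaling_type_input),
        cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
        cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output),
        emulate=emulate,
        # When True, gemm operands are padded so their inner (K) dims are
        # multiples of 16 before the scaled matmul runs.
        pad_inner_dim=pad_inner_dim,
    )
```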