@@ -40,10 +40,13 @@ def _test_compile_base(
     fullgraph: bool,
     config: Float8LinearConfig,
     dtype: torch.dtype,
+    pad_inner_dim: bool,
 ):
     random.seed(0)
     torch.manual_seed(0)
     x_shape = (16, 16)
+    if pad_inner_dim:
+        x_shape = (17, 16)
     linear_dtype = torch.bfloat16
 
     x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
@@ -70,6 +73,7 @@ def _get_config(
     scaling_type_input,
     scaling_type_weight,
     scaling_type_grad_output,
     emulate,
+    pad_inner_dim,
 ):
     if scaling_type_input is ScalingType.STATIC:
@@ -99,6 +103,7 @@ def _get_config(
         cast_config_weight=cast_config_weight,
         cast_config_grad_output=cast_config_grad_output,
         emulate=emulate,
+        pad_inner_dim=pad_inner_dim,
     )
     return config
 
@@ -114,6 +119,9 @@ def _get_config(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
 @pytest.mark.parametrize("emulate", [False, True] if is_cuda_8_9 else [True])
+@pytest.mark.parametrize(
+    "pad_inner_dim", [True, False]
+)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
 def test_eager_only(
@@ -122,17 +130,19 @@ def test_eager_only(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
-        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate, pad_inner_dim,
     )
     _test_compile_base(
         "eager",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )
 
 
@@ -147,6 +157,9 @@ def test_eager_only(
 @pytest.mark.parametrize(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
+@pytest.mark.parametrize(
+    "pad_inner_dim", [True, False]
+)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
 def test_aot_eager(
@@ -155,17 +168,19 @@ def test_aot_eager(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
-        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate, pad_inner_dim,
     )
     _test_compile_base(
         "aot_eager",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )
 
 
@@ -180,6 +195,9 @@ def test_aot_eager(
 @pytest.mark.parametrize(
     "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC, ScalingType.STATIC]
 )
+@pytest.mark.parametrize(
+    "pad_inner_dim", [False, True]
+)
 @unittest.skipIf(not torch.cuda.is_available() or not is_cuda_8_9, "CUDA with float8 support not available")
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
 def test_inductor(
@@ -188,17 +206,19 @@ def test_inductor(
     scaling_type_input: ScalingType,
     scaling_type_weight: ScalingType,
     scaling_type_grad_output: ScalingType,
+    pad_inner_dim: bool,
     dtype: torch.dtype,
 ):
     torch._dynamo.reset()
     config = _get_config(
-        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate,
+        scaling_type_input, scaling_type_weight, scaling_type_grad_output, emulate, pad_inner_dim,
     )
     _test_compile_base(
         "inductor",
         fullgraph,
         config,
         dtype,
+        pad_inner_dim,
     )
 
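For context on what the new parametrization exercises: pad_inner_dim=True on Float8LinearConfig asks the float8 matmul path to pad the contraction ("inner") dimension of its matmuls up to a multiple of 16, which the CUDA float8 kernels behind torch._scaled_mm require, and the (17, 16) activation shape makes the backward-pass matmuls contract over a dimension of 17. The snippet below is only a minimal sketch of that rounding-up step, assuming the multiple-of-16 requirement; pad_to_multiple_of_16 is an illustrative helper, not torchao's implementation.

import torch

# Illustrative helper (not torchao's API): zero-pad one dimension of a tensor
# up to the next multiple of 16, mirroring what pad_inner_dim is meant to do
# for the contraction dimension of float8 matmuls.
def pad_to_multiple_of_16(t: torch.Tensor, dim: int) -> torch.Tensor:
    pad = (-t.shape[dim]) % 16
    if pad == 0:
        return t
    pad_shape = list(t.shape)
    pad_shape[dim] = pad
    return torch.cat([t, t.new_zeros(pad_shape)], dim=dim)

x = torch.randn(17, 16)                   # the shape used by the padded test path
print(pad_to_multiple_of_16(x, 0).shape)  # torch.Size([32, 16])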