@@ -64,10 +64,14 @@ def run(
 ):
     device = "cuda"
     # TODO(future PR): this is ugly
-    assert recipe in ("tensorwise", "rowwise", "mxfp8_cublas", "mxfp4_cutlass"), (
-        "unsupported"
-    )
-    use_fp4 = recipe == "mxfp4_cutlass"
+    assert recipe in (
+        "tensorwise",
+        "rowwise",
+        "mxfp8_cublas",
+        "mxfp4_cutlass",
+        "nvfp4",
+    ), "unsupported"
+    use_fp4 = recipe in ("mxfp4_cutlass", "nvfp4")

     specs = get_specs()
     bf16_peak_tops = specs["bf16_peak_tops"]
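
For readers skimming the recipe strings, a rough legend (my gloss, pieced together from the recipe names and the block sizes used later in this diff, not something the PR itself states):

    RECIPE_NOTES = {
        "tensorwise": "float8 with one scale per tensor",
        "rowwise": "float8 with one scale per row",
        "mxfp8_cublas": "MX float8, e8m0 scale per 32-element block, cuBLAS",
        "mxfp4_cutlass": "MX float4, e8m0 scale per 32-element block, CUTLASS",
        "nvfp4": "NVIDIA float4, e4m3 scale per 16-element block",
    }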
@@ -118,11 +122,20 @@ def run(
         A_hp = torch.randn(M, K, device=device)
         B_hp_t = torch.randn(N, K, device=device)

-        if use_fp4:
+        if recipe == "mxfp4_cutlass":
             _, A = to_mx(A_hp, torch.float4_e2m1fn_x2, 32)
             _, Bt = to_mx(B_hp_t, torch.float4_e2m1fn_x2, 32)
             B = Bt.contiguous().T
             peak_tops = fp4_peak_tops
+        elif recipe == "nvfp4":
+            from torchao.prototype.mx_formats.nvfp4_tensor import nvfp4_quantize
+
+            # Quantize tensors to nvfp4 format - get blockwise scales
+            A_scales, A_data = nvfp4_quantize(A_hp, block_size=16)
+            B_scales, B_data = nvfp4_quantize(B_hp_t, block_size=16)
+            A = A_data.view(torch.float4_e2m1fn_x2)
+            B = B_data.view(torch.float4_e2m1fn_x2).T
+            peak_tops = fp4_peak_tops
         else:
             # raw float8 matmul (upper bound for what we can achieve in eager mode)
             # TODO(future): add e5m2
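
For reviewers unfamiliar with the nvfp4 path, a minimal standalone sketch of the quantize-then-view step above. The return convention of nvfp4_quantize is taken from this diff; the shapes in the comments are my assumptions based on fp4 packing (two e2m1 values per byte) and block_size=16:

    import torch
    from torchao.prototype.mx_formats.nvfp4_tensor import nvfp4_quantize

    x = torch.randn(128, 64, device="cuda")
    scales, data = nvfp4_quantize(x, block_size=16)
    # two fp4 values pack into one byte, so roughly (128, 32) packed elements
    x_fp4 = data.view(torch.float4_e2m1fn_x2)
    # one e4m3 scale per 16-element block along K (layout/padding may differ)
    x_scales = scales.view(torch.float8_e4m3fn)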
@@ -140,6 +153,10 @@ def run(
         elif recipe in ("mxfp8_cublas", "mxfp4_cutlass"):
             scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
             scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
+        elif recipe == "nvfp4":
+            # Use the blockwise scales from nvfp4_quantize
+            scale_a = A_scales.view(torch.float8_e4m3fn)
+            scale_b = B_scales.view(torch.float8_e4m3fn)
         else:
             assert False, f"unknown recipe {recipe}"

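
Note the asymmetry: the mx recipes synthesize dummy unit scales, which is fine for a speed-only benchmark, while the nvfp4 branch reuses the real scales produced by nvfp4_quantize. A small sketch of the two scale encodings (the dtype names are stock PyTorch; the values are only illustrative):

    # e8m0 is exponent-only: unsigned, power-of-two scales (one per 32 elements)
    s_mx = torch.ones(16, dtype=torch.float8_e8m0fnu)
    # e4m3 has a sign and 3 mantissa bits, so non-power-of-two scales are representable
    s_nv = torch.tensor([1.5, 0.75]).to(torch.float8_e4m3fn)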
@@ -155,7 +172,17 @@ def do_matmul_mxfp4(A, B):
             nonlocal scale_b
             return mx_fp4_bf16(A, B, scale_a, scale_b)

-        do_matmul = do_matmul_mxfp4 if use_fp4 else do_matmul_fp8
+        def do_matmul_nvfp4(A, B):
+            nonlocal scale_a
+            nonlocal scale_b
+            return torch._scaled_mm(A, B, scale_a, scale_b, out_dtype=dtype)
+
+        if recipe == "mxfp4_cutlass":
+            do_matmul = do_matmul_mxfp4
+        elif recipe == "nvfp4":
+            do_matmul = do_matmul_nvfp4
+        else:
+            do_matmul = do_matmul_fp8

         time_sec, tops_sec, pct_top_peak = do_benchmarks(
             tops, peak_tops, use_gpu_kernel_time, do_matmul, A, B
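
For context on what do_benchmarks consumes: a generic sketch of timing such a matmul callable on GPU (a common pattern, not the repo's do_benchmarks helper, whose internals are not shown in this diff):

    import time

    def time_matmul(do_matmul, A, B, n_iter=100):
        for _ in range(3):          # warmup so one-time setup cost is excluded
            do_matmul(A, B)
        torch.cuda.synchronize()    # CUDA launches are async; sync before timing
        t0 = time.perf_counter()
        for _ in range(n_iter):
            do_matmul(A, B)
        torch.cuda.synchronize()
        return (time.perf_counter() - t0) / n_iter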
@@ -164,7 +191,11 @@ def do_matmul_mxfp4(A, B):
             f"time_sec {time_sec:.2E}, tops/sec {tops_sec:.2E}, pct_peak {pct_top_peak:.3f}"
         )

-        del A, B, scale_a, scale_b
+        del A, B
+        if scale_a is not None:
+            del scale_a
+        if scale_b is not None:
+            del scale_b

         results.append(
             [