[Kernel] Enable FP8 Cutlass for Ada Lovelace (vllm-project#6950)

varun-sundar-rabindranath · Varun Sundar Rabindranath · web-flow · commit 93548eb37e95 · 2024-07-31T14:40:22.000-07:00
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -38,13 +38,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   if (cuda_device_capability >= 90) {
     return CUDA_VERSION >= 12000;
   } else if (cuda_device_capability >= 89) {
-    // CUTLASS Kernels have not been tuned for Ada Lovelace systems
-    // and are slower than torch.mm. Return false unconditionally in this case.
-    return false;
-
-    // Once the CUTLASS kernels have been optimized for Lovelace systems,
-    // use the following check:
-    // return CUDA_VERSION >= 12040;
+    return CUDA_VERSION >= 12040;
   }
 #endif