|
| 1 | +#pragma once |
| 2 | + |
| 3 | +#include "scaled_mm_c2x.cuh" |
| 4 | + |
| 5 | +/** |
| 6 | + * This file defines Gemm kernel configurations for SM75 based on the Gemm |
| 7 | + * shape. |
| 8 | + */ |
| 9 | + |
| 10 | +namespace vllm { |
| 11 | + |
| 12 | +template <typename InType, typename OutType, |
| 13 | + template <typename, typename> typename Epilogue> |
| 14 | +struct sm75_config_default { |
| 15 | + // This config is used in 2 cases, |
| 16 | + // - M in (256, inf] |
| 17 | + // - M in (64, 128] |
| 18 | + // Shared memory required by this Gemm 32768 |
| 19 | + static_assert(std::is_same<InType, int8_t>()); |
| 20 | + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; |
| 21 | + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; |
| 22 | + using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>; |
| 23 | + using Cutlass2xGemm = |
| 24 | + cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType, |
| 25 | + Epilogue, TileShape, WarpShape, InstructionShape, 2>; |
| 26 | +}; |
| 27 | + |
| 28 | +template <typename InType, typename OutType, |
| 29 | + template <typename, typename> typename Epilogue> |
| 30 | +struct sm75_config_M256 { |
| 31 | + // M in (128, 256] |
| 32 | + // Shared memory required by this Gemm 65536 |
| 33 | + static_assert(std::is_same<InType, int8_t>()); |
| 34 | + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 128>; |
| 35 | + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; |
| 36 | + using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>; |
| 37 | + using Cutlass2xGemm = |
| 38 | + cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType, |
| 39 | + Epilogue, TileShape, WarpShape, InstructionShape, 2>; |
| 40 | +}; |
| 41 | + |
| 42 | +template <typename InType, typename OutType, |
| 43 | + template <typename, typename> typename Epilogue> |
| 44 | +struct sm75_config_M64 { |
| 45 | + // M in (32, 64] |
| 46 | + // Shared memory required by this Gemm 49152 |
| 47 | + static_assert(std::is_same<InType, int8_t>()); |
| 48 | + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; |
| 49 | + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; |
| 50 | + using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>; |
| 51 | + using Cutlass2xGemm = |
| 52 | + cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType, |
| 53 | + Epilogue, TileShape, WarpShape, InstructionShape, 2>; |
| 54 | +}; |
| 55 | + |
| 56 | +template <typename InType, typename OutType, |
| 57 | + template <typename, typename> typename Epilogue> |
| 58 | +struct sm75_config_M32 { |
| 59 | + // M in [1, 32] |
| 60 | + // Shared memory required by this Gemm 49152 |
| 61 | + static_assert(std::is_same<InType, int8_t>()); |
| 62 | + using TileShape = typename cutlass::gemm::GemmShape<32, 128, 64>; |
| 63 | + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; |
| 64 | + using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>; |
| 65 | + using Cutlass2xGemm = |
| 66 | + cutlass_2x_gemm<cutlass::arch::Sm75, enable_sm75_to_sm80, InType, OutType, |
| 67 | + Epilogue, TileShape, WarpShape, InstructionShape, 2>; |
| 68 | +}; |
| 69 | + |
| 70 | +template <typename InType, typename OutType, |
| 71 | + template <typename, typename> typename Epilogue, |
| 72 | + typename... EpilogueArgs> |
| 73 | +inline void cutlass_gemm_sm75_dispatch(torch::Tensor& out, |
| 74 | + torch::Tensor const& a, |
| 75 | + torch::Tensor const& b, |
| 76 | + EpilogueArgs&&... args) { |
| 77 | + static_assert(std::is_same<InType, int8_t>()); |
| 78 | + TORCH_CHECK(a.dtype() == torch::kInt8); |
| 79 | + TORCH_CHECK(b.dtype() == torch::kInt8); |
| 80 | + |
| 81 | + using Cutlass2xGemmDefault = |
| 82 | + typename sm75_config_default<InType, OutType, Epilogue>::Cutlass2xGemm; |
| 83 | + using Cutlass2xGemmM256 = |
| 84 | + typename sm75_config_M256<InType, OutType, Epilogue>::Cutlass2xGemm; |
| 85 | + using Cutlass2xGemmM128 = Cutlass2xGemmDefault; |
| 86 | + using Cutlass2xGemmM64 = |
| 87 | + typename sm75_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm; |
| 88 | + using Cutlass2xGemmM32 = |
| 89 | + typename sm75_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm; |
| 90 | + |
| 91 | + // Due to shared memory requirements, some Gemms may fail to run on some |
| 92 | + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative |
| 93 | + // in such cases. |
| 94 | + // sm75_config_default has the least shared-memory requirements. |
| 95 | + using FallbackGemm = Cutlass2xGemmDefault; |
| 96 | + |
| 97 | + uint32_t const m = a.size(0); |
| 98 | + uint32_t const mp2 = |
| 99 | + std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2 |
| 100 | + if (mp2 <= 32) { |
| 101 | + // M in [1, 32] |
| 102 | + return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>( |
| 103 | + out, a, b, std::forward<EpilogueArgs>(args)...); |
| 104 | + } else if (mp2 <= 64) { |
| 105 | + // M in (32, 64] |
| 106 | + return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>( |
| 107 | + out, a, b, std::forward<EpilogueArgs>(args)...); |
| 108 | + } else if (mp2 <= 128) { |
| 109 | + // M in (64, 128] |
| 110 | + return fallback_cutlass_gemm_caller<Cutlass2xGemmM128, FallbackGemm>( |
| 111 | + out, a, b, std::forward<EpilogueArgs>(args)...); |
| 112 | + } else if (mp2 <= 256) { |
| 113 | + // M in (128, 256] |
| 114 | + return fallback_cutlass_gemm_caller<Cutlass2xGemmM256, FallbackGemm>( |
| 115 | + out, a, b, std::forward<EpilogueArgs>(args)...); |
| 116 | + } else { |
| 117 | + // M in (256, inf) |
| 118 | + return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>( |
| 119 | + out, a, b, std::forward<EpilogueArgs>(args)...); |
| 120 | + } |
| 121 | +} |
| 122 | + |
| 123 | +} // namespace vllm |
0 commit comments