cmake_minimum_required(VERSION 3.21)
project(sarathi_extensions LANGUAGES CXX)
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/FindFlashinfer.cmake)
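#
# utils.cmake is expected to provide the helper functions used below
# (find_python_from_executable, append_cmake_prefix_path,
# override_gpu_arches, define_gpu_static_target,
# define_gpu_extension_target, ...); FindFlashinfer.cmake is expected to
# provide the Flashinfer::Flashinfer imported target linked by
# _model_executor_C.
#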
#
# Supported Python versions. These versions are searched in order; the
# first match is selected. This list should be kept in sync with setup.py.
#
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
#
# Supported/expected torch versions for CUDA/ROCm.
#
# Currently, having an incorrect PyTorch version results in a warning
# rather than an error.
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm.
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
#
# Try to find a Python installation whose executable exactly matches
# `SARATHI_PYTHON_EXECUTABLE` and whose version is one of the supported ones.
#
if (SARATHI_PYTHON_EXECUTABLE)
find_python_from_executable(${SARATHI_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
else()
message(FATAL_ERROR
"Please set SARATHI_PYTHON_EXECUTABLE to the path of the desired python version"
" before running cmake configure.")
endif()
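#
# For example (illustrative; adjust the path for your environment):
#
#   cmake -DSARATHI_PYTHON_EXECUTABLE=`which python3` ...
#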
#
# Update CMake's `CMAKE_PREFIX_PATH` with the torch installation location.
#
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
#
# Import torch cmake configuration.
# Torch also imports CUDA (and partially HIP) languages with some customizations,
# so there is no need to do this explicitly with check_language/enable_language,
# etc.
#
find_package(Torch REQUIRED)
# Add debug symbols
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
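#
# Query the pybind11 ABI metadata recorded by torch and define the matching
# PYBIND11_* macros so the extensions built here carry the same pybind11
# ABI tags as the torch build they will be loaded alongside. The configured
# interpreter is used so the values come from the same torch installation
# that cmake found above.
#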
execute_process(
  COMMAND ${SARATHI_PYTHON_EXECUTABLE} -c
          "import torch; print(torch._C._PYBIND11_COMPILER_TYPE, end='')"
  OUTPUT_VARIABLE _PYBIND11_COMPILER_TYPE
  COMMAND_ERROR_IS_FATAL ANY
)
execute_process(
  COMMAND ${SARATHI_PYTHON_EXECUTABLE} -c
          "import torch; print(torch._C._PYBIND11_STDLIB, end='')"
  OUTPUT_VARIABLE _PYBIND11_STDLIB
  COMMAND_ERROR_IS_FATAL ANY
)
execute_process(
  COMMAND ${SARATHI_PYTHON_EXECUTABLE} -c
          "import torch; print(torch._C._PYBIND11_BUILD_ABI, end='')"
  OUTPUT_VARIABLE _PYBIND11_BUILD_ABI
  COMMAND_ERROR_IS_FATAL ANY
)
message(STATUS "PYBIND11_COMPILER_TYPE:" ${_PYBIND11_COMPILER_TYPE})
message(STATUS "PYBIND11_STDLIB:" ${_PYBIND11_STDLIB})
message(STATUS "PYBIND11_BUILD_ABI:" ${_PYBIND11_BUILD_ABI})
add_compile_definitions(PYBIND11_COMPILER_TYPE="${_PYBIND11_COMPILER_TYPE}" PYBIND11_STDLIB="${_PYBIND11_STDLIB}" PYBIND11_BUILD_ABI="${_PYBIND11_BUILD_ABI}")
#
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
# `libtorch_python.so` when linking an extension. Torch's cmake
# configuration does not include this library (presumably because the cmake
# config is meant for standalone C++ binaries that link against torch).
# `libtorch_python.so` defines some of the glue code between torch and
# Python via pybind and is therefore required by the SARATHI extensions.
# So, add it manually using `append_torchlib_if_found` from torch's cmake
# setup.
#
append_torchlib_if_found(torch_python)
#
# Set up the GPU language, check the torch version, and warn if it isn't
# the expected one.
#
if (CUDA_FOUND)
set(SARATHI_GPU_LANG "CUDA")
if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
message(WARNING "PyTorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
"expected for CUDA build, saw ${Torch_VERSION} instead.")
endif()
else()
message(FATAL_ERROR "Can't find CUDA installation.")
endif()
#
# Override the GPU architectures detected by cmake/torch and filter them by
# the supported versions for the current language.
# The final set of arches is stored in `SARATHI_GPU_ARCHES`.
#
override_gpu_arches(SARATHI_GPU_ARCHES
${SARATHI_GPU_LANG}
"${${SARATHI_GPU_LANG}_SUPPORTED_ARCHS}")
#
# Query torch for additional GPU compilation flags for the given
# `SARATHI_GPU_LANG`.
# The resulting flags are stored in `SARATHI_GPU_FLAGS`.
#
get_torch_gpu_compiler_flags(SARATHI_GPU_FLAGS ${SARATHI_GPU_LANG})
#
# Set nvcc parallelism.
#
if(NVCC_THREADS AND SARATHI_GPU_LANG STREQUAL "CUDA")
list(APPEND SARATHI_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()
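# NVCC_THREADS is an optional user-supplied cache variable; e.g. configuring
# with -DNVCC_THREADS=8 lets nvcc parallelize compilation across the
# requested architectures.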
#
# Define extension targets
#
#
# _kernels_C extension
#
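# The common kernels are compiled once into a static library
# (_kernels_common) and linked into both Python extensions defined below.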
set(SARATHI_KERNEL_COMMONS_EXT_SRC
"csrc/kernels/pos_encoding_kernels.cu"
"csrc/kernels/layernorm_kernels.cu"
"csrc/kernels/activation_kernels.cu"
"csrc/kernels/moe_align_block_size_kernels.cu"
"csrc/kernels/moe_topk_softmax_kernels.cu"
)
define_gpu_static_target(
_kernels_common
DESTINATION sarathi
LANGUAGE ${SARATHI_GPU_LANG}
SOURCES ${SARATHI_KERNEL_COMMONS_EXT_SRC}
COMPILE_FLAGS ${SARATHI_GPU_FLAGS}
ARCHITECTURES ${SARATHI_GPU_ARCHES}
)
set(SARATHI_KERNELS_EXT_SRC
"csrc/kernels/pybind.cpp"
)
define_gpu_extension_target(
_kernels_C
DESTINATION sarathi
LANGUAGE ${SARATHI_GPU_LANG}
SOURCES ${SARATHI_KERNELS_EXT_SRC}
COMPILE_FLAGS ${SARATHI_GPU_FLAGS}
ARCHITECTURES ${SARATHI_GPU_ARCHES}
LIBRARIES _kernels_common
WITH_SOABI)
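#
# _model_executor_C extension
#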
set(SARATHI_MODEL_EXECUTOR_EXT_SRC
"csrc/model_executor/pybind.cpp"
"csrc/model_executor/parallel_utils/ProcessGroupWrapper.cpp"
"csrc/model_executor/parallel_utils/ParallelOps.cpp"
"csrc/model_executor/models/Llama.cpp"
"csrc/model_executor/layers/RotaryEmbedding.cpp"
"csrc/model_executor/layers/NormLayers.cpp"
"csrc/model_executor/layers/LinearLayers.cpp"
"csrc/model_executor/layers/FlashinferAttentionWrapper.cpp"
)
define_gpu_extension_target(
_model_executor_C
DESTINATION sarathi
LANGUAGE ${SARATHI_GPU_LANG}
SOURCES ${SARATHI_MODEL_EXECUTOR_EXT_SRC}
COMPILE_FLAGS ${SARATHI_GPU_FLAGS}
ARCHITECTURES ${SARATHI_GPU_ARCHES}
LIBRARIES _kernels_common Flashinfer::Flashinfer
WITH_SOABI)
#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DSARATHI_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../sarathi ..
# cmake --build . --target default
#
add_custom_target(default)
add_dependencies(default _kernels_C)
add_dependencies(default _model_executor_C)