Skip to content

Commit

Permalink
Bump minimum CUDA version to 12.0 (#1103)
Browse files Browse the repository at this point in the history
* Bump minimum CUDA version to 12.0

Signed-off-by: Tim Moon <[email protected]>

* Debug CUDA version check

Signed-off-by: Tim Moon <[email protected]>

* Debug CMake build

Signed-off-by: Tim Moon <[email protected]>

* Review suggestions from @ksivaman and @ptrendx

Remove logic for CUDA <12.0 in PyTorch and Paddle builds. Update version in docs and README.

Signed-off-by: Tim Moon <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Tim Moon <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
timmoon10 and pre-commit-ci[bot] authored Aug 14, 2024
1 parent 8ef3308 commit cc329b7
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
name: 'Core'
runs-on: ubuntu-latest
container:
image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
image: nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu22.04
options: --user root
steps:
- name: 'Dependencies'
Expand Down
6 changes: 3 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ Installation
Pre-requisites
^^^^^^^^^^^^^^^^^^^^
* Linux x86_64
* CUDA 11.8+ for Hopper and CUDA 12.1+ for Ada
* NVIDIA Driver supporting CUDA 11.8 or later
* CUDA 12.0+ for Hopper and CUDA 12.1+ for Ada
* NVIDIA Driver supporting CUDA 12.0 or later
* cuDNN 8.1 or later
* For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later.

Expand Down Expand Up @@ -182,7 +182,7 @@ From source

Compiling with FlashAttention-2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance.
Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance.

It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug <https://github.com/Dao-AILab/flash-attention/issues/358>`_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue.

Expand Down
4 changes: 2 additions & 2 deletions build_tools/build_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")

# CMake build and install commands
build_command = [_cmake_bin, "--build", build_dir]
install_command = [_cmake_bin, "--install", build_dir]
build_command = [_cmake_bin, "--build", build_dir, "--verbose"]
install_command = [_cmake_bin, "--install", build_dir, "--verbose"]

# Check whether parallel build is restricted
max_jobs = get_max_jobs_for_parallel_build()
Expand Down
18 changes: 12 additions & 6 deletions build_tools/paddle.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,18 @@ def setup_paddle_extension(
except FileNotFoundError:
print("Could not determine CUDA Toolkit version")
else:
if version >= (11, 2):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")])
if version >= (11, 0):
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if version >= (11, 8):
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
if version < (12, 0):
raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
nvcc_flags.extend(
(
"--threads",
os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)

# Construct Paddle CUDA extension
sources = [str(path) for path in sources]
Expand Down
18 changes: 12 additions & 6 deletions build_tools/pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,18 @@ def setup_pytorch_extension(
except FileNotFoundError:
print("Could not determine CUDA Toolkit version")
else:
if version >= (11, 2):
nvcc_flags.extend(["--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1")])
if version >= (11, 0):
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if version >= (11, 8):
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
if version < (12, 0):
raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
nvcc_flags.extend(
(
"--threads",
os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
"-gencode",
"arch=compute_80,code=sm_80",
"-gencode",
"arch=compute_90,code=sm_90",
)
)

# Libraries
library_dirs = []
Expand Down
4 changes: 2 additions & 2 deletions docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ Prerequisites
.. _driver link: https://www.nvidia.com/drivers

1. Linux x86_64
2. `CUDA 11.8 <https://developer.nvidia.com/cuda-downloads>`__
3. |driver link|_ supporting CUDA 11.8 or later.
2. `CUDA 12.0 <https://developer.nvidia.com/cuda-downloads>`__
3. |driver link|_ supporting CUDA 12.0 or later.
4. `cuDNN 8.1 <https://developer.nvidia.com/cudnn>`__ or later.
5. For FP8/FP16/BF16 fused attention, `CUDA 12.1 <https://developer.nvidia.com/cuda-downloads>`__ or later, |driver link|_ supporting CUDA 12.1 or later, and `cuDNN 8.9.1 <https://developer.nvidia.com/cudnn>`__ or later.

Expand Down
63 changes: 35 additions & 28 deletions transformer_engine/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,27 @@

cmake_minimum_required(VERSION 3.21)

# Language options
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

project(transformer_engine LANGUAGES CUDA CXX)

set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if (NOT BUILD_THREADS_PER_JOB)
set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")

if(DEFINED ENV{MAX_JOBS})
set(JOBS $ENV{MAX_JOBS})
elseif(DEFINED ENV{NVTE_BUILD_MAX_JOBS})
set(JOBS $ENV{NVTE_BUILD_MAX_JOBS})
else()
set(JOBS "max number of")
endif()

message(STATUS "Parallel build with ${JOBS} jobs and ${BUILD_THREADS_PER_JOB} threads per job")

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")
endif()

# Transformer Engine library
project(transformer_engine LANGUAGES CUDA CXX)

# CUDA Toolkit: locate the toolkit and enforce the minimum supported release.
# Transformer Engine dropped support for CUDA 11.x, so fail the configure
# step early with a clear diagnostic instead of a confusing build error.
find_package(CUDAToolkit REQUIRED)
if(NOT CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
  message(FATAL_ERROR "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
endif()

# Check for cuDNN frontend API
# cuDNN frontend API
set(CUDNN_FRONTEND_INCLUDE_DIR
"${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/include")
if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
Expand All @@ -47,10 +35,11 @@ if(NOT EXISTS "${CUDNN_FRONTEND_INCLUDE_DIR}")
endif()
include(${CMAKE_SOURCE_DIR}/../../3rdparty/cudnn-frontend/cmake/cuDNN.cmake)

# Python
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
include_directories(${PROJECT_SOURCE_DIR}/..)

# Configure Transformer Engine library
include_directories(${PROJECT_SOURCE_DIR}/..)
set(transformer_engine_SOURCES)
list(APPEND transformer_engine_SOURCES
pycudnn.cpp
Expand Down Expand Up @@ -89,8 +78,6 @@ add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
target_include_directories(transformer_engine PUBLIC
"${CMAKE_CURRENT_SOURCE_DIR}/include")

target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)

# Configure dependencies
target_link_libraries(transformer_engine PUBLIC
CUDA::cublas
Expand All @@ -100,7 +87,10 @@ target_include_directories(transformer_engine PRIVATE
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}")

# Make header files with C++ strings
# Hack to enable dynamic loading in cuDNN frontend
target_compile_definitions(transformer_engine PUBLIC NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING)

# Helper functions to make header files with C++ strings
function(make_string_header STRING STRING_NAME)
configure_file(util/string_header.h.in
"string_headers/${STRING_NAME}.h"
Expand All @@ -112,10 +102,11 @@ function(make_string_header_from_file file_ STRING_NAME)
"string_headers/${STRING_NAME}.h"
@ONLY)
endfunction()

# Header files with C++ strings
list(GET CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES 0 cuda_include_path)
make_string_header("${cuda_include_path}"
string_path_cuda_include)

make_string_header_from_file(transpose/rtc/cast_transpose_fusion.cu
string_code_transpose_rtc_cast_transpose_fusion_cu)
make_string_header_from_file(transpose/rtc/cast_transpose.cu
Expand All @@ -126,7 +117,6 @@ make_string_header_from_file(utils.cuh
string_code_utils_cuh)
make_string_header_from_file(util/math.h
string_code_util_math_h)

target_include_directories(transformer_engine PRIVATE
"${CMAKE_CURRENT_BINARY_DIR}/string_headers")

Expand All @@ -139,6 +129,23 @@ set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")

# Number of parallel build jobs (reported for diagnostics only; the actual
# parallelism is chosen by the caller, e.g. "cmake --build -j").
#
# NOTE: environment variables must be tested with "DEFINED ENV{VAR}" inside
# if(); a bare "ENV{VAR}" is not a valid variable reference there and never
# evaluates true, which silently ignored MAX_JOBS / NVTE_BUILD_MAX_JOBS.
# MAX_JOBS takes precedence over NVTE_BUILD_MAX_JOBS, matching build_ext.py.
if(DEFINED ENV{MAX_JOBS})
  set(BUILD_JOBS_STR "$ENV{MAX_JOBS}")
elseif(DEFINED ENV{NVTE_BUILD_MAX_JOBS})
  set(BUILD_JOBS_STR "$ENV{NVTE_BUILD_MAX_JOBS}")
else()
  set(BUILD_JOBS_STR "max")
endif()
message(STATUS "Parallel build jobs: ${BUILD_JOBS_STR}")

# Number of compiler threads per build job, forwarded to nvcc via --threads.
# Defaults to 1 when NVTE_BUILD_THREADS_PER_JOB is unset or empty ($ENV{...}
# expands to "" in that case, which if(NOT ...) treats as false).
set(BUILD_THREADS_PER_JOB $ENV{NVTE_BUILD_THREADS_PER_JOB})
if(NOT BUILD_THREADS_PER_JOB)
  set(BUILD_THREADS_PER_JOB 1)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --threads ${BUILD_THREADS_PER_JOB}")
message(STATUS "Threads per parallel build job: ${BUILD_THREADS_PER_JOB}")

# Install library
install(TARGETS transformer_engine DESTINATION .)

0 comments on commit cc329b7

Please sign in to comment.