Skip to content

Commit cb1c56c

Browse files
drisspgpytorchmergebot
authored andcommitted
Set target dependencies to always build for sm90a on rowwise scaling (pytorch#129402)
# Summary Instead of landing global builder changes; pytorch/builder#1878 This PR targets only the Rowwise file and adds the sm90a featurs. Verified locally by setting: ``` TORCH_CUDA_ARCH_LIST=9.0 ``` We can see in the build.ninja file that the proper flags are set: ``` build caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/RowwiseScaledMM.cu.o: CUDA_COMPILER__torch_cuda_unscanned_Release /home/drisspg/meta/pytorch/aten/src/ATen/native/cuda/RowwiseScaledMM.cu || cmake_object_order_depends_target_torch_cuda DEFINES = -DAT_PER_OPERATOR_HEADERS -DFLASHATTENTION_DISABLE_ALIBI -DHAVE_MALLOC_USABLE_SIZE=1 -DHAVE_MMAP=1 -DHAVE_SHM_OPEN=1 -DHAVE_SHM_UNLINK=1 -DMINIZ_DISABLE_ZIP_READER_CRC32_CHECKS -DONNXIFI_ENABLE_EXT=1 -DONNX_ML=1 -DONNX_NAMESPACE=onnx_torch -DTORCH_CUDA_BUILD_MAIN_LIB -DUSE_C10D_GLOO -DUSE_C10D_NCCL -DUSE_CUDA -DUSE_DISTRIBUTED -DUSE_EXTERNAL_MZCRC -DUSE_FLASH_ATTENTION -DUSE_MEM_EFF_ATTENTION -DUSE_NCCL -DUSE_RPC -DUSE_TENSORPIPE -D_FILE_OFFSET_BITS=64 -Dtorch_cuda_EXPORTS DEP_FILE = caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/RowwiseScaledMM.cu.o.d FLAGS = -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -D_GLIBCXX_USE_CXX11_ABI=1 -Xfatbin -compress-all -DONNX_NAMESPACE=onnx_torch -gencode arch=compute_90,code=sm_90 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -Wno-deprecated-gpu-targets --expt-extended-lambda -DCUB_WRAPPED_NAMESPACE=at_cuda_detail -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -O3 -DNDEBUG -std=c++17 -Xcompiler=-fPIC -DTORCH_USE_LIBUV -DCAFFE2_USE_GLOO -Xcompiler=-Wall,-Wextra,-Wdeprecated,-Wno-unused-parameter,-Wno-missing-field-initializers,-Wno-unknown-pragmas,-Wno-type-limits,-Wno-array-bounds,-Wno-unknown-pragmas,-Wno-strict-overflow,-Wno-strict-aliasing,-Wno-unused-function,-Wno-maybe-uninitialized -Wno-deprecated-copy -gencode arch=compute_90a,code=sm_90a INCLUDES = -I/home/drisspg/meta/pytorch/build/aten/src -I/home/drisspg/meta/pytorch/aten/src -I/home/drisspg/meta/pytorch/build -I/home/drisspg/meta/pytorch -I/home/drisspg/meta/pytorch/third_party/onnx -I/home/drisspg/meta/pytorch/build/third_party/onnx -I/home/drisspg/meta/pytorch/third_party/foxi -I/home/drisspg/meta/pytorch/build/third_party/foxi -I/home/drisspg/meta/pytorch/aten/src/THC -I/home/drisspg/meta/pytorch/aten/src/ATen/cuda -I/home/drisspg/meta/pytorch/aten/src/ATen/../../../third_party/cutlass/include -I/home/drisspg/meta/pytorch/aten/src/ATen/../../../third_party/cutlass/tools/util/include -I/home/drisspg/meta/pytorch/build/caffe2/aten/src -I/home/drisspg/meta/pytorch/aten/src/ATen/.. -I/home/drisspg/meta/pytorch/build/nccl/include -I/home/drisspg/meta/pytorch/c10/cuda/../.. -I/home/drisspg/meta/pytorch/c10/.. -I/home/drisspg/meta/pytorch/third_party/tensorpipe -I/home/drisspg/meta/pytorch/build/third_party/tensorpipe -I/home/drisspg/meta/pytorch/third_party/tensorpipe/third_party/libnop/include -I/home/drisspg/meta/pytorch/torch/csrc/api -I/home/drisspg/meta/pytorch/torch/csrc/api/include -isystem /home/drisspg/meta/pytorch/build/third_party/gloo -isystem /home/drisspg/meta/pytorch/cmake/../third_party/gloo -isystem /home/drisspg/meta/pytorch/cmake/../third_party/tensorpipe/third_party/libuv/include -isystem /home/drisspg/meta/pytorch/third_party/protobuf/src -isystem /home/drisspg/meta/pytorch/third_party/ittapi/include -isystem /home/drisspg/meta/pytorch/cmake/../third_party/eigen -isystem /usr/local/cuda-12.3/include -isystem /home/drisspg/meta/pytorch/third_party/ideep/mkl-dnn/include/oneapi/dnnl -isystem /home/drisspg/meta/pytorch/third_party/ideep/include -isystem /home/drisspg/meta/pytorch/cmake/../third_party/cudnn_frontend/include OBJECT_DIR = caffe2/CMakeFiles/torch_cuda.dir OBJECT_FILE_DIR = caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda ``` Pull Request resolved: pytorch#129402 Approved by: https://github.com/malfet
1 parent 71ebe51 commit cb1c56c

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

cmake/Codegen.cmake

+15
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,21 @@ if(INTERN_BUILD_ATEN_OPS)
6969

7070
file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../torchgen/*.py")
7171

72+
# RowwiseScaled.cu requires sm90a flags
73+
if(USE_CUDA)
74+
set(ROWWISE_SCALED_MM_FILE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu")
75+
76+
# Get existing arch flags
77+
torch_cuda_get_nvcc_gencode_flag(EXISTING_ARCH_FLAGS)
78+
79+
# Check NVCC version and existing arch flags
80+
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND
81+
EXISTING_ARCH_FLAGS MATCHES ".*compute_90.*")
82+
set_source_files_properties(${ROWWISE_SCALED_MM_FILE}
83+
PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
84+
endif()
85+
endif()
86+
7287
set(GEN_ROCM_FLAG)
7388
if(USE_ROCM)
7489
set(GEN_ROCM_FLAG --rocm)

0 commit comments

Comments
 (0)