Revert "Add wrappers for synchronous GPUDirect Storage APIs (pytorch#130633)"

pytorchmergebot · pytorchmergebot · commit e191b834623d · 2024-07-26T18:08:20.000Z
This reverts commit 709ddf7. Reverted pytorch#130633 on behalf of https://github.com/clee2000 because it is still failing internally (D60265673); see the comment on pytorch#130633.
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -413,7 +413,6 @@ cc_library(
         "@cuda//:nvrtc",
         "@cudnn",
         "@cudnn_frontend",
-        "@cuda//:cufile",
     ],
     alwayslink = True,
 )
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -251,15 +251,6 @@ cmake_dependent_option(USE_CUDNN "Use cuDNN" ON "USE_CUDA" OFF)
 cmake_dependent_option(USE_STATIC_CUDNN "Use cuDNN static libraries" OFF
                        "USE_CUDNN" OFF)
 cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF)
-# Binary builds will fail for cufile due to https://github.com/pytorch/builder/issues/1924
-# Using TH_BINARY_BUILD to check whether is binary build.
-# USE_ROCM is guarded against in Dependencies.cmake because USE_ROCM is not properly defined here
-if(DEFINED ENV{TH_BINARY_BUILD})
-  cmake_dependent_option(USE_CUFILE "Use cuFile" ON
-                         "USE_CUDA AND NOT $ENV{TH_BINARY_BUILD} AND NOT WIN32" OFF)
-else()
-  cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF)
-endif()
 option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
 option(USE_KINETO "Use Kineto profiling library" ON)
 option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
diff --git a/build_variables.bzl b/build_variables.bzl
@@ -773,7 +773,6 @@ libtorch_python_cuda_core_sources = [
     "torch/csrc/cuda/shared/cudart.cpp",
     "torch/csrc/cuda/shared/nvtx.cpp",
     "torch/csrc/cuda/utils.cpp",
-    "torch/csrc/cuda/GdsFile.cpp",
 ]
 
 libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
@@ -928,10 +928,6 @@ elseif(USE_CUDA)
   torch_compile_options(torch_cuda)  # see cmake/public/utils.cmake
   target_compile_definitions(torch_cuda PRIVATE USE_CUDA)
 
-  if(USE_CUFILE)
-    target_link_libraries(torch_cuda PRIVATE torch::cufile)
-    target_compile_definitions(torch_cuda PRIVATE USE_CUFILE)
-  endif()
   if(USE_CUSPARSELT)
       target_link_libraries(torch_cuda PRIVATE torch::cusparselt)
       target_compile_definitions(torch_cuda PRIVATE USE_CUSPARSELT)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
@@ -39,7 +39,6 @@ if(USE_CUDA)
   set(CAFFE2_USE_CUDA ${USE_CUDA})
   set(CAFFE2_USE_CUDNN ${USE_CUDNN})
   set(CAFFE2_USE_CUSPARSELT ${USE_CUSPARSELT})
-  set(CAFFE2_USE_CUFILE ${USE_CUFILE})
   set(CAFFE2_USE_NVRTC ${USE_NVRTC})
   include(${CMAKE_CURRENT_LIST_DIR}/public/cuda.cmake)
   if(CAFFE2_USE_CUDA)
@@ -61,9 +60,6 @@ if(USE_CUDA)
     else()
       caffe2_update_option(USE_CUSPARSELT OFF)
     endif()
-    if(CAFFE2_USE_CUFILE)
-      list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS torch::cufile)
-    endif()
     find_program(SCCACHE_EXECUTABLE sccache)
     if(SCCACHE_EXECUTABLE)
       # Using RSP/--options-file renders output noncacheable by sccache
@@ -83,7 +79,6 @@ if(USE_CUDA)
     set(CAFFE2_USE_CUDA OFF)
     set(CAFFE2_USE_CUDNN OFF)
     set(CAFFE2_USE_CUSPARSELT OFF)
-    set(CAFFE2_USE_CUFILE OFF)
     set(CAFFE2_USE_NVRTC OFF)
   endif()
 endif()
@@ -1040,6 +1035,7 @@ if(USE_ROCM)
       caffe2_update_option(USE_SYSTEM_NCCL ON)
     endif()
 
+
     list(APPEND HIP_CXX_FLAGS -fPIC)
     list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_AMD__=1)
     list(APPEND HIP_CXX_FLAGS -DCUDA_HAS_FP16=1)
diff --git a/cmake/Modules/FindCUDAToolkit.cmake b/cmake/Modules/FindCUDAToolkit.cmake
@@ -978,14 +978,6 @@ if(CUDAToolkit_FOUND)
     _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos)
   endif()
 
-  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.4)
-    _CUDAToolkit_find_and_add_import_lib(cuFile ALT cufile DEPS culibos)
-    _CUDAToolkit_find_and_add_import_lib(cuFile_static ALT cufile_static DEPS culibos)
-
-    _CUDAToolkit_find_and_add_import_lib(cuFile_rdma ALT cufile_rdma DEPS cuFile culibos)
-    _CUDAToolkit_find_and_add_import_lib(cuFile_rdma_static ALT cufile_rdma_static DEPS cuFile_static culibos)
-  endif()
-
   # cuFFTW depends on cuFFT
   _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft)
   _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
@@ -74,7 +74,6 @@ function(caffe2_print_configuration_summary)
     message(STATUS "    CUDA static link    : ${CAFFE2_STATIC_LINK_CUDA}")
     message(STATUS "    USE_CUDNN           : ${USE_CUDNN}")
     message(STATUS "    USE_CUSPARSELT      : ${USE_CUSPARSELT}")
-    message(STATUS "    USE_CUFILE          : ${USE_CUFILE}")
     message(STATUS "    CUDA version        : ${CUDA_VERSION}")
     message(STATUS "    USE_FLASH_ATTENTION : ${USE_FLASH_ATTENTION}")
     message(STATUS "    USE_MEM_EFF_ATTENTION : ${USE_MEM_EFF_ATTENTION}")
@@ -84,9 +83,6 @@ function(caffe2_print_configuration_summary)
     if(${USE_CUSPARSELT})
       message(STATUS "    cuSPARSELt version  : ${CUSPARSELT_VERSION}")
     endif()
-    if(${USE_CUFILE})
-      message(STATUS "    cufile library    : ${CUDA_cuFile_LIBRARY}")
-    endif()
     message(STATUS "    CUDA root directory : ${CUDA_TOOLKIT_ROOT_DIR}")
     message(STATUS "    CUDA library        : ${CUDA_cuda_driver_LIBRARY}")
     message(STATUS "    cudart library      : ${CUDA_cudart_LIBRARY}")
diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake
@@ -244,22 +244,6 @@ else()
   message(STATUS "USE_CUSPARSELT is set to 0. Compiling without cuSPARSELt support")
 endif()
 
-# cufile
-if(CAFFE2_USE_CUFILE)
-  add_library(torch::cufile INTERFACE IMPORTED)
-  if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
-      set_property(
-          TARGET torch::cufile PROPERTY INTERFACE_LINK_LIBRARIES
-          CUDA::cuFile_static)
-  else()
-      set_property(
-          TARGET torch::cufile PROPERTY INTERFACE_LINK_LIBRARIES
-          CUDA::cuFile)
-  endif()
-else()
-  message(STATUS "USE_CUFILE is set to 0. Compiling without cuFile support")
-endif()
-
 # curand
 add_library(caffe2::curand INTERFACE IMPORTED)
 if(CAFFE2_STATIC_LINK_CUDA AND NOT WIN32)
diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst
@@ -181,7 +181,6 @@ See the :doc:`documentation <cuda._sanitizer>` for information on how to use it.
 .. for tracking purposes
 .. py:module:: torch.cuda.comm
 .. py:module:: torch.cuda.error
-.. py:module:: torch.cuda.gds
 .. py:module:: torch.cuda.graphs
 .. py:module:: torch.cuda.jiterator
 .. py:module:: torch.cuda.memory
diff --git a/setup.py b/setup.py
@@ -38,9 +38,6 @@
 #   USE_CUSPARSELT=0
 #     disables the cuSPARSELt build
 #
-#   USE_CUFILE=0
-#     disables the cuFile build
-#
 #   USE_FBGEMM=0
 #     disables the FBGEMM build
 #
diff --git a/test/test_cuda.py b/test/test_cuda.py
@@ -17,8 +17,6 @@
 from itertools import product
 from random import randint
 
-import psutil
-
 import torch
 import torch.cuda
 from torch import inf, nan
@@ -64,7 +62,6 @@
     skipIfRocm,
     slowTest,
     subtest,
-    TemporaryFileName,
     TEST_CUDA,
     TEST_CUDA_GRAPH,
     TEST_NUMPY,
@@ -4025,15 +4022,6 @@ def test_device_count_not_cached_pre_init(self):
         x = torch.cuda.device_count()
         self.assertEqual(f"{x}, 1", r)
 
-    def test_gds_fails_in_ci(self):
-        if IS_WINDOWS or TEST_WITH_ROCM:
-            error_msg = "is not supported on this platform"
-        else:
-            error_msg = "cuFileHandleRegister failed"
-        with TemporaryFileName() as f:
-            with self.assertRaisesRegex(RuntimeError, error_msg):
-                file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
-
 
 @torch.testing._internal.common_utils.markDynamoStrictTest
 class TestCudaMallocAsync(TestCase):
@@ -5181,40 +5169,6 @@ def test_graph_grad_scaling(self, device, dtype, optim_info, foreach, fused):
             self.assertEqual(scaler._growth_tracker, growth_tracker)
 
 
-class TestGDS(TestCase):
-    def _get_tmp_dir_fs_type(self):
-        my_path = os.path.realpath("/tmp")
-        root_type = ""
-        for part in psutil.disk_partitions():
-            if part.mountpoint == "/":
-                root_type = part.fstype
-                continue
-            if part.mountpoint == my_path:
-                return part.fstype
-        return root_type
-
-    @unittest.skipIf(IS_WINDOWS or TEST_WITH_ROCM, "Not supported on Windows or ROCm")
-    def test_gds_read_write_tensors(self):
-        if self._get_tmp_dir_fs_type() not in ("ext4", "xfs"):
-            self.skipTest("GPUDirect Storage requires ext4/xfs for local filesystem")
-        src1 = torch.randn(1024, device="cuda")
-        src2 = torch.randn(2, 1024, device="cuda")
-        torch.cuda.gds._gds_register_buffer(src1.untyped_storage())
-        torch.cuda.gds._gds_register_buffer(src2.untyped_storage())
-        dest1 = torch.empty(1024, device="cuda")
-        dest2 = torch.empty(2, 1024, device="cuda")
-        with TemporaryFileName() as f:
-            file = torch.cuda.gds._GdsFile(f, os.O_CREAT | os.O_RDWR)
-            file.save_storage(src1.untyped_storage(), offset=0)
-            file.save_storage(src2.untyped_storage(), offset=src1.nbytes)
-            file.load_storage(dest1.untyped_storage(), offset=0)
-            file.load_storage(dest2.untyped_storage(), offset=src1.nbytes)
-        self.assertEqual(src1, dest1)
-        self.assertEqual(src2, dest2)
-        torch.cuda.gds._gds_deregister_buffer(src1.untyped_storage())
-        torch.cuda.gds._gds_deregister_buffer(src2.untyped_storage())
-
-
 instantiate_parametrized_tests(TestCuda)
 instantiate_parametrized_tests(TestCudaMallocAsync)
 instantiate_device_type_tests(TestCudaOptims, globals())
diff --git a/third_party/cuda.BUILD b/third_party/cuda.BUILD
@@ -60,12 +60,6 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-cc_library(
-    name = "cufile",
-    srcs = ["targets/x86_64-linux/lib/libcufile.so"],
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "nvrtc",
     srcs = [
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
@@ -312,10 +312,6 @@ if(USE_NUMPY)
   target_compile_definitions(torch_python PRIVATE USE_NUMPY)
 endif()
 
-if(USE_CUFILE AND NOT USE_ROCM)
-  target_compile_definitions(torch_python PRIVATE USE_CUFILE)
-endif()
-
 if(HAVE_SOVERSION)
   set_target_properties(torch_python PROPERTIES
       VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
@@ -1975,14 +1975,6 @@ def _can_use_cudnn_attention(params: _SDPAParams, debug: _bool) -> _bool: ...
 def _can_use_flash_attention(params: _SDPAParams, debug: _bool) -> _bool: ...
 def _can_use_mem_efficient_attention(params: _SDPAParams, debug: _bool) -> _bool: ...
 
-# Defined in torch/csrc/cuda/GdsFile.cpp
-def _gds_register_buffer(t: Storage) -> None: ...
-def _gds_deregister_buffer(t: Storage) -> None: ...
-def _gds_register_handle(fd: _int) -> _int: ...
-def _gds_deregister_handle(handle: _int) -> None: ...
-def _gds_load_storage(handle: _int, s: Storage, offset: _int) -> None: ...
-def _gds_save_storage(handle: _int, s: Storage, offset: _int) -> None: ...
-
 # Defined in torch/csrc/cuda/python_comm.cpp
 def _broadcast(tensor: Tensor, devices: List[_int]) -> List[Tensor]: ...
 def _broadcast_out(tensor: Tensor, out_tensors: List[Tensor]) -> List[Tensor]: ...
diff --git a/torch/__init__.py b/torch/__init__.py
@@ -306,7 +306,6 @@ def _load_global_deps() -> None:
             "cuda_runtime": "libcudart.so.*[0-9]",
             "cuda_cupti": "libcupti.so.*[0-9]",
             "cufft": "libcufft.so.*[0-9]",
-            "cufile": "libcufile.so.*[0-9]",
             "curand": "libcurand.so.*[0-9]",
             "nvjitlink": "libnvJitLink.so.*[0-9]",
             "cusparse": "libcusparse.so.*[0-9]",
diff --git a/torch/csrc/cuda/GdsFile.cpp b/torch/csrc/cuda/GdsFile.cpp
diff --git a/torch/csrc/cuda/GdsFile.h b/torch/csrc/cuda/GdsFile.h
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
diff --git a/torch/cuda/gds.py b/torch/cuda/gds.py

Original file line number	Diff line number	Diff line change
`@@ -413,7 +413,6 @@ cc_library(`
`413`	`413`	`"@cuda//:nvrtc",`
`414`	`414`	`"@cudnn",`
`415`	`415`	`"@cudnn_frontend",`
`416`		`- "@cuda//:cufile",`
`417`	`416`	`],`
`418`	`417`	`alwayslink = True,`
`419`	`418`	`)`
Original file line number	Diff line number	Diff line change
`@@ -773,7 +773,6 @@ libtorch_python_cuda_core_sources = [`
`773`	`773`	`"torch/csrc/cuda/shared/cudart.cpp",`
`774`	`774`	`"torch/csrc/cuda/shared/nvtx.cpp",`
`775`	`775`	`"torch/csrc/cuda/utils.cpp",`
`776`		`- "torch/csrc/cuda/GdsFile.cpp",`
`777`	`776`	`]`
`778`	`777`
`779`	`778`	`libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [`
Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,6 @@`
`38`	`38`	`# USE_CUSPARSELT=0`
`39`	`39`	`# disables the cuSPARSELt build`
`40`	`40`	`#`
`41`		`-# USE_CUFILE=0`
`42`		`-# disables the cuFile build`
`43`		`-#`
`44`	`41`	`# USE_FBGEMM=0`
`45`	`42`	`# disables the FBGEMM build`
`46`	`43`	`#`