Skip to content

Commit 40ecead

Browse files
authored
Remove all the submodules introduced by GPU features. (alibaba#151)
* Remove submodules for GRAPE-GPU. Signed-off-by: septicmk <[email protected]>
1 parent 45281c4 commit 40ecead

18 files changed

+77
-156
lines changed

.gitmodules

-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1 @@
11

2-
[submodule "thirdparty/cub"]
3-
path = thirdparty/cub
4-
url = https://github.com/NVIDIA/cub.git
5-
[submodule "thirdparty/moderngpu"]
6-
path = thirdparty/moderngpu
7-
url = https://github.com/moderngpu/moderngpu.git
8-
[submodule "thirdparty/thrust"]
9-
path = thirdparty/thrust
10-
url = https://github.com/NVIDIA/thrust.git

CMakeLists.txt

-16
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,6 @@ else()
105105
if (NCCL_VERSION VERSION_LESS "2.7")
106106
message(WARNING "Disable GPU support because NCCL >= 2.7 not found")
107107
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
108-
elseif ((NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/cub/.git") OR
109-
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/thrust/.git") OR
110-
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/moderngpu/.git"))
111-
message(WARNING "Disable GPU support because dependencies not found, please run 'git submodules update --init --recursive'")
112-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
113108
else ()
114109
option(WITH_CUDA "Whether to enable cuda support" ON)
115110
message(STATUS "Build with CUDA support")
@@ -252,7 +247,6 @@ else ()
252247

253248
if (WITH_CUDA)
254249
cuda_add_executable(gpu_analytical_apps examples/analytical_apps/flags.cc examples/analytical_apps/run_cuda_app.cu)
255-
target_include_directories(gpu_analytical_apps SYSTEM BEFORE PRIVATE thirdparty/cub thirdparty/thrust thirdparty/moderngpu/src)
256250
target_include_directories(gpu_analytical_apps PRIVATE examples/analytical_apps)
257251
set_target_properties(gpu_analytical_apps PROPERTIES OUTPUT_NAME run_cuda_app)
258252
target_link_libraries(gpu_analytical_apps grape-lite ${GFLAGS_LIBRARIES} ${CUDA_LIBS} ${NCCL_LIBRARIES} ${CMAKE_DL_LIBS})
@@ -295,16 +289,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
295289
)
296290

297291
if (WITH_CUDA)
298-
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cub/cub
299-
${PROJECT_SOURCE_DIR}/thirdparty/thrust/thrust
300-
${PROJECT_SOURCE_DIR}/thirdparty/moderngpu/src/moderngpu
301-
DESTINATION include
302-
FILES_MATCHING
303-
PATTERN "*.h"
304-
PATTERN "*.cuh"
305-
PATTERN "*.hpp"
306-
PATTERN "*.hxx"
307-
)
308292
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cuda_hashmap
309293
DESTINATION include/cuda_hashmap
310294
FILES_MATCHING

README.md

+2-6
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,8 @@ make gnn_sampler
6060

6161
### Building libgrape-lite with GPU support
6262

63-
libgrape-lite supports deploying graph algorithms to GPUs. To enable the support for GPUs, you first
64-
need initialize the dependencies with the following command before building.
65-
66-
```bash
67-
git submodule update --init --recursive
68-
```
63+
libgrape-lite supports deploying graph algorithms to GPUs.
64+
When CUDA is detected on the machine and NCCL >= 2.7 is available, GPU support will be enabled automatically.
6965

7066
## Running libgrape-lite applications
7167

grape/config.h

-5
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ limitations under the License.
2222

2323
#ifdef __CUDACC__
2424
#include <thrust/host_vector.h>
25-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2625
#endif
2726

2827
#include "grape/utils/default_allocator.h"
@@ -56,10 +55,6 @@ using Allocator = DefaultAllocator<T>;
5655
#define MAX_GRID_SIZE 768
5756
#define TID_1D (threadIdx.x + blockIdx.x * blockDim.x)
5857
#define TOTAL_THREADS_1D (gridDim.x * blockDim.x)
59-
60-
template <typename T>
61-
using pinned_vector =
62-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
6358
#else
6459
#define DEV_HOST
6560
#define DEV_HOST_INLINE inline

grape/cuda/fragment/host_fragment.h

+8-6
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ class HostFragment
161161
stream.cuda_stream()));
162162

163163
auto prefix_sum = compute_prefix_sum(ieoffset);
164-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
164+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
165165

166166
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
167167
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -176,7 +176,7 @@ class HostFragment
176176
stream.cuda_stream()));
177177

178178
auto prefix_sum = compute_prefix_sum(oeoffset);
179-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
179+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
180180

181181
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
182182
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -354,7 +354,7 @@ class HostFragment
354354
cudaMemcpyHostToDevice, stream.cuda_stream()));
355355

356356
auto prefix_sum = compute_prefix_sum(ieoffset);
357-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
357+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
358358

359359
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
360360
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -370,7 +370,7 @@ class HostFragment
370370
cudaMemcpyHostToDevice, stream.cuda_stream()));
371371

372372
auto prefix_sum = compute_prefix_sum(oeoffset);
373-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
373+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
374374

375375
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
376376
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -414,6 +414,7 @@ class HostFragment
414414
[] __device__(VID_T * gids, VID_T * lids, VID_T size,
415415
CUDASTL::HashMap<VID_T, VID_T> * ovg2l) {
416416
auto tid = TID_1D;
417+
gids = thrust::raw_pointer_cast(gids);
417418
auto nthreads = TOTAL_THREADS_1D;
418419

419420
for (VID_T idx = 0 + tid; idx < size; idx += nthreads) {
@@ -423,7 +424,8 @@ class HostFragment
423424
(*ovg2l)[gid] = lid;
424425
}
425426
},
426-
gids.data(), lids.data(), size, d_ovg2l_.get());
427+
thrust::raw_pointer_cast(gids.data()),
428+
thrust::raw_pointer_cast(lids.data()), size, d_ovg2l_.get());
427429
}
428430

429431
d_mirrors_of_frag_holder_.resize(fnum_);
@@ -633,7 +635,7 @@ class HostFragment
633635
thrust::device_vector<fid_t>& d_fid_list,
634636
thrust::device_vector<fid_t*>& d_fid_list_offset) {
635637
pinned_vector<size_t> prefix_sum(ivnum_ + 1, 0);
636-
ArrayView<size_t> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
638+
ArrayView<size_t> d_prefix_sum(prefix_sum);
637639

638640
for (VID_T i = 0; i < ivnum_; ++i) {
639641
prefix_sum[i + 1] =

grape/cuda/parallel/parallel_engine.h

+6-7
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@ limitations under the License.
1818

1919
#include <cuda_profiler_api.h>
2020

21+
#include <iostream>
2122
#include <unordered_set>
2223
#pragma push
2324
#pragma diag_suppress = initialization_not_reachable
2425
#include <thrust/binary_search.h>
25-
2626
#include <cub/cub.cuh>
27-
#include <moderngpu/kernel_sortedsearch.hxx>
2827
#pragma pop
2928

3029
#include "grape/config.h"
@@ -33,6 +32,7 @@ limitations under the License.
3332
#include "grape/cuda/utils/launcher.h"
3433
#include "grape/cuda/utils/shared_value.h"
3534
#include "grape/cuda/utils/sorted_search.h"
35+
#include "grape/cuda/utils/stream.h"
3636
#include "grape/cuda/utils/work_source.h"
3737

3838
// TODO(liang): we may split this to multiple headers
@@ -930,7 +930,7 @@ DEV_INLINE void LBSTRICT(const FRAG_T& dev_frag, const ArrayView<size_t>& sidx,
930930
while (block_output_processed < block_output_size &&
931931
iter_input_start < block_input_end) {
932932
size_t iter_input_size =
933-
min((size_t)(blockDim.x - 1), block_input_end - iter_input_start);
933+
min((size_t) (blockDim.x - 1), block_input_end - iter_input_start);
934934
size_t iter_input_end = iter_input_start + iter_input_size;
935935
size_t iter_output_end =
936936
iter_input_end < size ? row_offset[iter_input_end] : total_edges;
@@ -1362,10 +1362,9 @@ class ParallelEngine {
13621362
},
13631363
ArrayView<size_t>(seid_per_block));
13641364

1365-
sorted_search<mgpu::bounds_lower>(
1366-
stream, thrust::raw_pointer_cast(seid_per_block.data()), block_num,
1367-
thrust::raw_pointer_cast(prefix_sum_.data()), size,
1368-
thrust::raw_pointer_cast(sidx.data()), mgpu::less_t<size_t>());
1365+
sorted_search(stream, thrust::raw_pointer_cast(seid_per_block.data()),
1366+
block_num, thrust::raw_pointer_cast(prefix_sum_.data()), size,
1367+
thrust::raw_pointer_cast(sidx.data()));
13691368

13701369
KernelWrapper<<<block_num, block_size, calc_shmem_size(block_size),
13711370
stream.cuda_stream()>>>(

grape/cuda/utils/array_view.h

+3-7
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,8 @@ limitations under the License.
1515

1616
#ifndef GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
1717
#define GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
18-
#include <thrust/device_vector.h>
19-
#include <thrust/host_vector.h>
20-
#include <thrust/swap.h>
21-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2218

23-
#include "grape/config.h"
19+
#include "grape/cuda/utils/cuda_utils.h"
2420

2521
namespace grape {
2622
namespace cuda {
@@ -33,8 +29,7 @@ class ArrayView {
3329
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
3430
size_(vec.size()) {}
3531

36-
explicit ArrayView(const thrust::host_vector<
37-
T, thrust::cuda::experimental::pinned_allocator<T>>& vec)
32+
explicit ArrayView(const pinned_vector<T>& vec)
3833
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
3934
size_(vec.size()) {}
4035

@@ -69,6 +64,7 @@ class ArrayView {
6964
T* data_{};
7065
size_t size_{};
7166
};
67+
7268
} // namespace cuda
7369
} // namespace grape
7470
#endif // GRAPE_CUDA_UTILS_ARRAY_VIEW_H_

grape/cuda/utils/cuda_utils.h

+24-3
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,18 @@ limitations under the License.
1919
#include <nccl.h>
2020
#include <sys/resource.h>
2121
#include <sys/time.h>
22+
#include <thrust/device_vector.h>
2223
#include <thrust/execution_policy.h>
24+
#include <thrust/host_vector.h>
25+
#include <thrust/swap.h>
2326
#include <thrust/transform_reduce.h>
27+
#include <cub/cub.cuh>
2428

25-
#include "cub/cub.cuh"
26-
#include "grape/config.h"
29+
#if THRUST_VERSION > 101700
30+
#include <thrust/system/cuda/memory_resource.h>
31+
#else
32+
#include <thrust/system/cuda/experimental/pinned_allocator.h>
33+
#endif
2734

2835
#if defined(__unix__) || defined(__unix) || defined(unix) || \
2936
(defined(__APPLE__) && defined(__MACH__))
@@ -48,6 +55,8 @@ limitations under the License.
4855
#include <sys/stat.h>
4956
#include <sys/types.h>
5057

58+
#include "grape/config.h"
59+
5160
#define CHECK_CUDA(err) \
5261
do { \
5362
cudaError_t errr = (err); \
@@ -161,7 +170,7 @@ size_t get_rss(bool include_shared_memory) {
161170
if (include_shared_memory) {
162171
return (size_t) rss * (size_t) sysconf(_SC_PAGESIZE);
163172
} else {
164-
return (size_t)(rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
173+
return (size_t) (rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
165174
}
166175
#else
167176
/* Unknown OS ----------------------------------------------- */
@@ -210,6 +219,18 @@ static cudaError_t SortKeys64(void* d_temp_storage, size_t& temp_storage_bytes,
210219
#endif
211220
}
212221

222+
#if THRUST_VERSION > 101700
223+
using memory_resource =
224+
thrust::system::cuda::universal_host_pinned_memory_resource;
225+
template <typename T>
226+
using pinned_vector = thrust::host_vector<
227+
T, thrust::mr::stateless_resource_allocator<T, memory_resource>>;
228+
#else
229+
template <typename T>
230+
using pinned_vector =
231+
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
232+
#endif
233+
213234
template <typename InputIteratorT, typename OutputIteratorT>
214235
static cudaError_t PrefixSumKernel64(void* d_temp_storage,
215236
size_t& temp_storage_bytes,

grape/cuda/utils/shared_array.h

+1-5
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ limitations under the License.
1515

1616
#ifndef GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
1717
#define GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
18-
#include <thrust/device_vector.h>
19-
#include <thrust/host_vector.h>
20-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2118

2219
#include "grape/cuda/utils/cuda_utils.h"
2320
#include "grape/cuda/utils/stream.h"
@@ -30,8 +27,7 @@ class SharedArray {
3027

3128
public:
3229
using device_t = thrust::device_vector<T>;
33-
using host_t =
34-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
30+
using host_t = pinned_vector<T>;
3531

3632
SharedArray() = default;
3733

grape/cuda/utils/shared_value.h

+1-6
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@ limitations under the License.
1616
#ifndef GRAPE_CUDA_UTILS_SHARED_VALUE_H_
1717
#define GRAPE_CUDA_UTILS_SHARED_VALUE_H_
1818

19-
#include <thrust/device_vector.h>
20-
#include <thrust/host_vector.h>
21-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
22-
2319
#include "grape/cuda/utils/cuda_utils.h"
2420
#include "grape/cuda/utils/stream.h"
2521

@@ -81,8 +77,7 @@ class SharedValue {
8177

8278
private:
8379
thrust::device_vector<T> d_buffer_;
84-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>
85-
h_buffer_;
80+
pinned_vector<T> h_buffer_;
8681
};
8782
} // namespace cuda
8883
} // namespace grape

0 commit comments

Comments
 (0)