Skip to content

Commit 40ecead

Browse files
authored
Remove all the submodules introduced by GPU features. (alibaba#151)
* Remove submodules for GRAPE-GPU. Signed-off-by: septicmk <[email protected]>
1 parent 45281c4 commit 40ecead

18 files changed

+77
-156
lines changed

.gitmodules

-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1 @@
11

2-
[submodule "thirdparty/cub"]
3-
path = thirdparty/cub
4-
url = https://github.com/NVIDIA/cub.git
5-
[submodule "thirdparty/moderngpu"]
6-
path = thirdparty/moderngpu
7-
url = https://github.com/moderngpu/moderngpu.git
8-
[submodule "thirdparty/thrust"]
9-
path = thirdparty/thrust
10-
url = https://github.com/NVIDIA/thrust.git

CMakeLists.txt

-16
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,6 @@ else()
105105
if (NCCL_VERSION VERSION_LESS "2.7")
106106
message(WARNING "Disable GPU support because NCCL >= 2.7 not found")
107107
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
108-
elseif ((NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/cub/.git") OR
109-
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/thrust/.git") OR
110-
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/moderngpu/.git"))
111-
message(WARNING "Disable GPU support because dependencies not found, please run 'git submodules update --init --recursive'")
112-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
113108
else ()
114109
option(WITH_CUDA "Whether to enable cuda support" ON)
115110
message(STATUS "Build with CUDA support")
@@ -252,7 +247,6 @@ else ()
252247

253248
if (WITH_CUDA)
254249
cuda_add_executable(gpu_analytical_apps examples/analytical_apps/flags.cc examples/analytical_apps/run_cuda_app.cu)
255-
target_include_directories(gpu_analytical_apps SYSTEM BEFORE PRIVATE thirdparty/cub thirdparty/thrust thirdparty/moderngpu/src)
256250
target_include_directories(gpu_analytical_apps PRIVATE examples/analytical_apps)
257251
set_target_properties(gpu_analytical_apps PROPERTIES OUTPUT_NAME run_cuda_app)
258252
target_link_libraries(gpu_analytical_apps grape-lite ${GFLAGS_LIBRARIES} ${CUDA_LIBS} ${NCCL_LIBRARIES} ${CMAKE_DL_LIBS})
@@ -295,16 +289,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
295289
)
296290

297291
if (WITH_CUDA)
298-
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cub/cub
299-
${PROJECT_SOURCE_DIR}/thirdparty/thrust/thrust
300-
${PROJECT_SOURCE_DIR}/thirdparty/moderngpu/src/moderngpu
301-
DESTINATION include
302-
FILES_MATCHING
303-
PATTERN "*.h"
304-
PATTERN "*.cuh"
305-
PATTERN "*.hpp"
306-
PATTERN "*.hxx"
307-
)
308292
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cuda_hashmap
309293
DESTINATION include/cuda_hashmap
310294
FILES_MATCHING

README.md

+2-6
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,8 @@ make gnn_sampler
6060

6161
### Building libgrape-lite with GPU support
6262

63-
libgrape-lite supports deploying graph algorithms to GPUs. To enable the support for GPUs, you first
64-
need initialize the dependencies with the following command before building.
65-
66-
```bash
67-
git submodule update --init --recursive
68-
```
63+
libgrape-lite supports deploying graph algorithms to GPUs.
64+
When CUDA is detected on the machine and NCCL >= 2.7 is available, GPU support will be enabled automatically.
6965

7066
## Running libgrape-lite applications
7167

grape/config.h

-5
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ limitations under the License.
2222

2323
#ifdef __CUDACC__
2424
#include <thrust/host_vector.h>
25-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2625
#endif
2726

2827
#include "grape/utils/default_allocator.h"
@@ -56,10 +55,6 @@ using Allocator = DefaultAllocator<T>;
5655
#define MAX_GRID_SIZE 768
5756
#define TID_1D (threadIdx.x + blockIdx.x * blockDim.x)
5857
#define TOTAL_THREADS_1D (gridDim.x * blockDim.x)
59-
60-
template <typename T>
61-
using pinned_vector =
62-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
6358
#else
6459
#define DEV_HOST
6560
#define DEV_HOST_INLINE inline

grape/cuda/fragment/host_fragment.h

+8-6
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ class HostFragment
161161
stream.cuda_stream()));
162162

163163
auto prefix_sum = compute_prefix_sum(ieoffset);
164-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
164+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
165165

166166
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
167167
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -176,7 +176,7 @@ class HostFragment
176176
stream.cuda_stream()));
177177

178178
auto prefix_sum = compute_prefix_sum(oeoffset);
179-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
179+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
180180

181181
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
182182
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -354,7 +354,7 @@ class HostFragment
354354
cudaMemcpyHostToDevice, stream.cuda_stream()));
355355

356356
auto prefix_sum = compute_prefix_sum(ieoffset);
357-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
357+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
358358

359359
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
360360
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -370,7 +370,7 @@ class HostFragment
370370
cudaMemcpyHostToDevice, stream.cuda_stream()));
371371

372372
auto prefix_sum = compute_prefix_sum(oeoffset);
373-
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
373+
ArrayView<VID_T> d_prefix_sum(prefix_sum);
374374

375375
CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
376376
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -414,6 +414,7 @@ class HostFragment
414414
[] __device__(VID_T * gids, VID_T * lids, VID_T size,
415415
CUDASTL::HashMap<VID_T, VID_T> * ovg2l) {
416416
auto tid = TID_1D;
417+
gids = thrust::raw_pointer_cast(gids);
417418
auto nthreads = TOTAL_THREADS_1D;
418419

419420
for (VID_T idx = 0 + tid; idx < size; idx += nthreads) {
@@ -423,7 +424,8 @@ class HostFragment
423424
(*ovg2l)[gid] = lid;
424425
}
425426
},
426-
gids.data(), lids.data(), size, d_ovg2l_.get());
427+
thrust::raw_pointer_cast(gids.data()),
428+
thrust::raw_pointer_cast(lids.data()), size, d_ovg2l_.get());
427429
}
428430

429431
d_mirrors_of_frag_holder_.resize(fnum_);
@@ -633,7 +635,7 @@ class HostFragment
633635
thrust::device_vector<fid_t>& d_fid_list,
634636
thrust::device_vector<fid_t*>& d_fid_list_offset) {
635637
pinned_vector<size_t> prefix_sum(ivnum_ + 1, 0);
636-
ArrayView<size_t> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
638+
ArrayView<size_t> d_prefix_sum(prefix_sum);
637639

638640
for (VID_T i = 0; i < ivnum_; ++i) {
639641
prefix_sum[i + 1] =

grape/cuda/parallel/parallel_engine.h

+6-7
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,12 @@ limitations under the License.
1818

1919
#include <cuda_profiler_api.h>
2020

21+
#include <iostream>
2122
#include <unordered_set>
2223
#pragma push
2324
#pragma diag_suppress = initialization_not_reachable
2425
#include <thrust/binary_search.h>
25-
2626
#include <cub/cub.cuh>
27-
#include <moderngpu/kernel_sortedsearch.hxx>
2827
#pragma pop
2928

3029
#include "grape/config.h"
@@ -33,6 +32,7 @@ limitations under the License.
3332
#include "grape/cuda/utils/launcher.h"
3433
#include "grape/cuda/utils/shared_value.h"
3534
#include "grape/cuda/utils/sorted_search.h"
35+
#include "grape/cuda/utils/stream.h"
3636
#include "grape/cuda/utils/work_source.h"
3737

3838
// TODO(liang): we may split this to multiple headers
@@ -930,7 +930,7 @@ DEV_INLINE void LBSTRICT(const FRAG_T& dev_frag, const ArrayView<size_t>& sidx,
930930
while (block_output_processed < block_output_size &&
931931
iter_input_start < block_input_end) {
932932
size_t iter_input_size =
933-
min((size_t)(blockDim.x - 1), block_input_end - iter_input_start);
933+
min((size_t) (blockDim.x - 1), block_input_end - iter_input_start);
934934
size_t iter_input_end = iter_input_start + iter_input_size;
935935
size_t iter_output_end =
936936
iter_input_end < size ? row_offset[iter_input_end] : total_edges;
@@ -1362,10 +1362,9 @@ class ParallelEngine {
13621362
},
13631363
ArrayView<size_t>(seid_per_block));
13641364

1365-
sorted_search<mgpu::bounds_lower>(
1366-
stream, thrust::raw_pointer_cast(seid_per_block.data()), block_num,
1367-
thrust::raw_pointer_cast(prefix_sum_.data()), size,
1368-
thrust::raw_pointer_cast(sidx.data()), mgpu::less_t<size_t>());
1365+
sorted_search(stream, thrust::raw_pointer_cast(seid_per_block.data()),
1366+
block_num, thrust::raw_pointer_cast(prefix_sum_.data()), size,
1367+
thrust::raw_pointer_cast(sidx.data()));
13691368

13701369
KernelWrapper<<<block_num, block_size, calc_shmem_size(block_size),
13711370
stream.cuda_stream()>>>(

grape/cuda/utils/array_view.h

+3-7
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,8 @@ limitations under the License.
1515

1616
#ifndef GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
1717
#define GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
18-
#include <thrust/device_vector.h>
19-
#include <thrust/host_vector.h>
20-
#include <thrust/swap.h>
21-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2218

23-
#include "grape/config.h"
19+
#include "grape/cuda/utils/cuda_utils.h"
2420

2521
namespace grape {
2622
namespace cuda {
@@ -33,8 +29,7 @@ class ArrayView {
3329
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
3430
size_(vec.size()) {}
3531

36-
explicit ArrayView(const thrust::host_vector<
37-
T, thrust::cuda::experimental::pinned_allocator<T>>& vec)
32+
explicit ArrayView(const pinned_vector<T>& vec)
3833
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
3934
size_(vec.size()) {}
4035

@@ -69,6 +64,7 @@ class ArrayView {
6964
T* data_{};
7065
size_t size_{};
7166
};
67+
7268
} // namespace cuda
7369
} // namespace grape
7470
#endif // GRAPE_CUDA_UTILS_ARRAY_VIEW_H_

grape/cuda/utils/cuda_utils.h

+24-3
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,18 @@ limitations under the License.
1919
#include <nccl.h>
2020
#include <sys/resource.h>
2121
#include <sys/time.h>
22+
#include <thrust/device_vector.h>
2223
#include <thrust/execution_policy.h>
24+
#include <thrust/host_vector.h>
25+
#include <thrust/swap.h>
2326
#include <thrust/transform_reduce.h>
27+
#include <cub/cub.cuh>
2428

25-
#include "cub/cub.cuh"
26-
#include "grape/config.h"
29+
#if THRUST_VERSION > 101700
30+
#include <thrust/system/cuda/memory_resource.h>
31+
#else
32+
#include <thrust/system/cuda/experimental/pinned_allocator.h>
33+
#endif
2734

2835
#if defined(__unix__) || defined(__unix) || defined(unix) || \
2936
(defined(__APPLE__) && defined(__MACH__))
@@ -48,6 +55,8 @@ limitations under the License.
4855
#include <sys/stat.h>
4956
#include <sys/types.h>
5057

58+
#include "grape/config.h"
59+
5160
#define CHECK_CUDA(err) \
5261
do { \
5362
cudaError_t errr = (err); \
@@ -161,7 +170,7 @@ size_t get_rss(bool include_shared_memory) {
161170
if (include_shared_memory) {
162171
return (size_t) rss * (size_t) sysconf(_SC_PAGESIZE);
163172
} else {
164-
return (size_t)(rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
173+
return (size_t) (rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
165174
}
166175
#else
167176
/* Unknown OS ----------------------------------------------- */
@@ -210,6 +219,18 @@ static cudaError_t SortKeys64(void* d_temp_storage, size_t& temp_storage_bytes,
210219
#endif
211220
}
212221

222+
#if THRUST_VERSION > 101700
223+
using memory_resource =
224+
thrust::system::cuda::universal_host_pinned_memory_resource;
225+
template <typename T>
226+
using pinned_vector = thrust::host_vector<
227+
T, thrust::mr::stateless_resource_allocator<T, memory_resource>>;
228+
#else
229+
template <typename T>
230+
using pinned_vector =
231+
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
232+
#endif
233+
213234
template <typename InputIteratorT, typename OutputIteratorT>
214235
static cudaError_t PrefixSumKernel64(void* d_temp_storage,
215236
size_t& temp_storage_bytes,

grape/cuda/utils/shared_array.h

+1-5
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ limitations under the License.
1515

1616
#ifndef GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
1717
#define GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
18-
#include <thrust/device_vector.h>
19-
#include <thrust/host_vector.h>
20-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
2118

2219
#include "grape/cuda/utils/cuda_utils.h"
2320
#include "grape/cuda/utils/stream.h"
@@ -30,8 +27,7 @@ class SharedArray {
3027

3128
public:
3229
using device_t = thrust::device_vector<T>;
33-
using host_t =
34-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
30+
using host_t = pinned_vector<T>;
3531

3632
SharedArray() = default;
3733

grape/cuda/utils/shared_value.h

+1-6
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@ limitations under the License.
1616
#ifndef GRAPE_CUDA_UTILS_SHARED_VALUE_H_
1717
#define GRAPE_CUDA_UTILS_SHARED_VALUE_H_
1818

19-
#include <thrust/device_vector.h>
20-
#include <thrust/host_vector.h>
21-
#include <thrust/system/cuda/experimental/pinned_allocator.h>
22-
2319
#include "grape/cuda/utils/cuda_utils.h"
2420
#include "grape/cuda/utils/stream.h"
2521

@@ -81,8 +77,7 @@ class SharedValue {
8177

8278
private:
8379
thrust::device_vector<T> d_buffer_;
84-
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>
85-
h_buffer_;
80+
pinned_vector<T> h_buffer_;
8681
};
8782
} // namespace cuda
8883
} // namespace grape

0 commit comments

Comments
 (0)