Skip to content

Commit a4840a0

Browse files
authored
Improve performance of GPU-based applications. (alibaba#144)
* Improve the performance of GPU-based applications. Signed-off-by: septicmk <[email protected]>
1 parent cfa110b commit a4840a0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+3423
-869
lines changed

.clang-format

+1
Original file line number | Diff line number | Diff line change
@@ -10,4 +10,5 @@ Standard: 'Cpp11'
1010
SpaceAfterCStyleCast: true
1111
AlignAfterOpenBracket: Align
1212
SortIncludes: true
13+
ForEachMacros: [""]
1314
IncludeBlocks: Preserve

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
ldbc_driver/workspace
12
# osx
23
*.DS_Store
34

CMakeLists.txt

+3-1
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,8 @@ else()
125125
SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
126126
message(STATUS "CUDA_ARCH = ${CUDA_ARCH_LIST}")
127127
set_property(GLOBAL PROPERTY CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}")
128+
list(GET CUDA_ARCH_LIST 0 CUDA_ARCH_CODE)
129+
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CUDA_ARCH_CODE}")
128130
endif()
129131

130132
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets")
@@ -240,7 +242,7 @@ else ()
240242

241243
if (WITH_CUDA)
242244
cuda_add_executable(gpu_analytical_apps examples/analytical_apps/flags.cc examples/analytical_apps/run_cuda_app.cu)
243-
target_include_directories(gpu_analytical_apps SYSTEM PRIVATE thirdparty/cub thirdparty/thrust thirdparty/moderngpu/src)
245+
target_include_directories(gpu_analytical_apps SYSTEM BEFORE PRIVATE thirdparty/cub thirdparty/thrust thirdparty/moderngpu/src)
244246
target_include_directories(gpu_analytical_apps PRIVATE examples/analytical_apps)
245247
set_target_properties(gpu_analytical_apps PROPERTIES OUTPUT_NAME run_cuda_app)
246248
target_link_libraries(gpu_analytical_apps grape-lite ${GFLAGS_LIBRARIES} ${CUDA_LIBS} ${NCCL_LIBRARIES} ${CMAKE_DL_LIBS})

examples/analytical_apps/cdlp/cdlp_context.h

+2-1
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,8 @@ class CDLPContext : public VertexDataContext<FRAG_T, typename FRAG_T::oid_t> {
4848
#else
4949
: VertexDataContext<FRAG_T, typename FRAG_T::oid_t>(fragment, true),
5050
#endif
51-
labels(this->data()) {}
51+
labels(this->data()) {
52+
}
5253

5354
void Init(ParallelMessageManager& messages, int max_round) {
5455
auto& frag = this->fragment();

examples/analytical_apps/cuda/app_config.h

+13
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,19 @@ struct AppConfig {
2525
double wl_alloc_factor_out_remote;
2626
LoadBalancing lb;
2727
};
28+
29+
template <grape::LoadStrategy LS>
30+
struct MessageStrategyTrait {
31+
static constexpr grape::MessageStrategy message_strategy =
32+
grape::MessageStrategy::kAlongOutgoingEdgeToOuterVertex;
33+
};
34+
35+
template <>
36+
struct MessageStrategyTrait<grape::LoadStrategy::kBothOutIn> {
37+
static constexpr grape::MessageStrategy message_strategy =
38+
grape::MessageStrategy::kAlongEdgeToOuterVertex;
39+
};
40+
2841
} // namespace cuda
2942
} // namespace grape
3043
#endif // __CUDACC__

examples/analytical_apps/cuda/bfs/bfs.h

+120-59
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
/** Copyright 2022 Alibaba Group Holding Limited.
1+
/** Copyright 2023 Alibaba Group Holding Limited.
22
33
Licensed under the Apache License, Version 2.0 (the "License");
44
you may not use this file except in compliance with the License.
@@ -32,13 +32,6 @@ class BFSContext : public grape::VoidContext<FRAG_T> {
3232

3333
explicit BFSContext(const FRAG_T& frag) : grape::VoidContext<FRAG_T>(frag) {}
3434

35-
#ifdef PROFILING
36-
~BFSContext() {
37-
LOG(INFO) << "Get msg time: " << get_msg_time * 1000;
38-
LOG(INFO) << "BFS kernel time: " << traversal_kernel_time * 1000;
39-
}
40-
#endif
41-
4235
void Init(GPUMessageManager& messages, AppConfig app_config, oid_t src_id) {
4336
auto& frag = this->fragment();
4437
auto vertices = frag.Vertices();
@@ -50,7 +43,9 @@ class BFSContext : public grape::VoidContext<FRAG_T> {
5043
depth.Init(vertices, std::numeric_limits<depth_t>::max());
5144
depth.H2D();
5245
in_q.Init(iv.size());
53-
out_q_local.Init(iv.size());
46+
current_active_map.Init(iv);
47+
next_active_map.Init(iv);
48+
visited.Init(iv);
5449

5550
messages.InitBuffer((sizeof(depth_t) + sizeof(vid_t)) * ov.size(),
5651
(sizeof(depth_t) + sizeof(vid_t)) * iv.size());
@@ -68,14 +63,13 @@ class BFSContext : public grape::VoidContext<FRAG_T> {
6863
}
6964

7065
oid_t src_id{};
66+
double active_ratio;
7167
LoadBalancing lb{};
7268
depth_t curr_depth{};
7369
VertexArray<depth_t, vid_t> depth;
74-
Queue<vertex_t, vid_t> in_q, out_q_local;
75-
#ifdef PROFILING
76-
double get_msg_time{};
77-
double traversal_kernel_time{};
78-
#endif
70+
Queue<vertex_t, vid_t> in_q;
71+
DenseVertexSet<vid_t> current_active_map, next_active_map;
72+
DenseVertexSet<vid_t> visited;
7973
};
8074

8175
template <typename FRAG_T>
@@ -89,6 +83,7 @@ class BFS : public GPUAppBase<FRAG_T, BFSContext<FRAG_T>>,
8983
using edata_t = typename fragment_t::edata_t;
9084
using vertex_t = typename dev_fragment_t::vertex_t;
9185
using nbr_t = typename dev_fragment_t::nbr_t;
86+
static constexpr bool need_split_edges = true;
9287

9388
void PEval(const fragment_t& frag, context_t& ctx,
9489
message_manager_t& messages) {
@@ -101,88 +96,154 @@ class BFS : public GPUAppBase<FRAG_T, BFSContext<FRAG_T>>,
10196
messages.stream(),
10297
[=] __device__(dev_fragment_t d_frag,
10398
dev::VertexArray<depth_t, vid_t> depth,
104-
dev::Queue<vertex_t, vid_t> in_q) {
99+
dev::DenseVertexSet<vid_t> d_current_active_map,
100+
dev::DenseVertexSet<vid_t> d_visited) {
105101
auto tid = TID_1D;
106102

107103
if (tid == 0) {
108104
depth[source] = 0;
109-
in_q.Append(source);
105+
d_current_active_map.Insert(source);
106+
d_visited.Insert(source);
110107
}
111108
},
112109
frag.DeviceObject(), ctx.depth.DeviceObject(),
113-
ctx.in_q.DeviceObject());
110+
ctx.current_active_map.DeviceObject(), ctx.visited.DeviceObject());
114111
}
115112
messages.ForceContinue();
116113
}
117114

118115
void IncEval(const fragment_t& frag, context_t& ctx,
119116
message_manager_t& messages) {
120117
auto d_frag = frag.DeviceObject();
118+
auto iv = frag.InnerVertices();
119+
auto ov = frag.OuterVertices();
121120
auto d_depth = ctx.depth.DeviceObject();
122121
auto& in_q = ctx.in_q;
123122
auto d_in_q = in_q.DeviceObject();
124-
auto& out_q_local = ctx.out_q_local;
125-
auto d_out_q_local = out_q_local.DeviceObject();
123+
auto& current_active_map = ctx.current_active_map;
124+
auto d_current_active_map = current_active_map.DeviceObject();
125+
auto& visited = ctx.visited;
126+
auto d_visited = visited.DeviceObject();
127+
auto& next_active_map = ctx.next_active_map;
128+
auto d_next_active_map = next_active_map.DeviceObject();
126129
auto curr_depth = ctx.curr_depth;
127130
auto next_depth = curr_depth + 1;
128131
auto& stream = messages.stream();
129132
auto d_mm = messages.DeviceObject();
133+
bool isDirected = frag.load_strategy == grape::LoadStrategy::kBothOutIn;
134+
135+
next_active_map.Clear(stream);
136+
in_q.Clear(stream);
130137

131-
#ifdef PROFILING
132-
ctx.get_msg_time -= grape::GetCurrentTime();
133-
auto process_msg_time = grape::GetCurrentTime();
134-
#endif
135138
messages.template ParallelProcess<dev_fragment_t, grape::EmptyType>(
136139
d_frag, [=] __device__(vertex_t v) mutable {
137140
assert(d_frag.IsInnerVertex(v));
138141

139142
if (curr_depth < d_depth[v]) {
140143
d_depth[v] = curr_depth;
141-
d_in_q.AppendWarp(v);
144+
d_current_active_map.Insert(v);
145+
d_visited.Insert(v);
142146
}
143147
});
144-
auto in_size = in_q.size(stream);
145-
146-
WorkSourceArray<vertex_t> ws_in(in_q.data(), in_size);
147148

148-
#ifdef PROFILING
149-
VLOG(1) << "Frag " << frag.fid() << " In: " << in_size;
150-
process_msg_time = grape::GetCurrentTime() - process_msg_time;
151-
ctx.get_msg_time += grape::GetCurrentTime();
152-
auto traversal_kernel_time = grape::GetCurrentTime();
153-
#endif
154-
155-
ForEachOutgoingEdge(
156-
stream, d_frag, ws_in,
157-
[=] __device__(const vertex_t& u, const nbr_t& nbr) mutable {
158-
vertex_t v = nbr.get_neighbor();
159-
160-
if (next_depth < d_depth[v]) {
161-
d_depth[v] = next_depth;
162-
163-
if (d_frag.IsInnerVertex(v)) {
164-
d_out_q_local.AppendWarp(v);
165-
} else {
149+
auto ivnum = iv.size();
150+
auto active = current_active_map.Count(stream);
151+
auto visited_num = visited.Count(stream);
152+
double active_ratio = (active + 0.0) / ivnum;
153+
double visited_ratio = (visited_num + 0.0) / ivnum;
154+
bool usePush = (2.5 * active_ratio < (1 - visited_ratio)) || (active == 0);
155+
if (usePush) {
156+
// push-based search
157+
WorkSourceRange<vertex_t> ws_iv(*iv.begin(), iv.size());
158+
ForEach(stream, ws_iv, [=] __device__(vertex_t v) mutable {
159+
if (d_current_active_map.Exist(v)) {
160+
d_in_q.AppendWarp(v);
161+
}
162+
});
163+
WorkSourceArray<vertex_t> ws_in(in_q.data(), in_q.size(stream));
164+
165+
ForEachOutgoingEdge(
166+
stream, d_frag, ws_in,
167+
[=] __device__(const vertex_t& u, const nbr_t& nbr) mutable {
168+
vertex_t v = nbr.get_neighbor();
169+
170+
if (next_depth < d_depth[v]) {
171+
d_depth[v] = next_depth;
172+
if (d_frag.IsInnerVertex(v)) {
173+
d_next_active_map.Insert(v);
174+
d_visited.Insert(v);
175+
} else {
176+
d_mm.SyncStateOnOuterVertex(d_frag, v);
177+
}
178+
}
179+
},
180+
ctx.lb);
181+
} else {
182+
// pull-based search
183+
WorkSourceRange<vertex_t> ws_ov(*ov.begin(), ov.size());
184+
depth_t MAX_DEPTH = std::numeric_limits<depth_t>::max();
185+
ForEach(stream, ws_ov, [=] __device__(vertex_t v) mutable {
186+
if (d_depth[v] == MAX_DEPTH) {
187+
auto ies = d_frag.GetIncomingAdjList(v);
188+
for (auto& e : ies) {
189+
auto u = e.get_neighbor();
190+
assert(d_frag.IsInnerVertex(u));
191+
if (d_current_active_map.Exist(u)) {
192+
d_depth[v] = next_depth;
166193
d_mm.SyncStateOnOuterVertex(d_frag, v);
194+
break;
167195
}
168196
}
169-
},
170-
ctx.lb);
197+
}
198+
});
199+
200+
WorkSourceRange<vertex_t> ws_iv(*iv.begin(), iv.size());
201+
ForEach(stream, ws_iv, [=] __device__(vertex_t v) mutable {
202+
if (!d_visited.Exist(v)) {
203+
d_in_q.AppendWarp(v);
204+
}
205+
});
206+
WorkSourceArray<vertex_t> ws_in(in_q.data(), in_q.size(stream));
207+
208+
if (isDirected) {
209+
ForEach(stream, ws_in, [=] __device__(vertex_t v) mutable {
210+
auto ies = d_frag.GetIncomingInnerVertexAdjList(v);
211+
for (auto& e : ies) {
212+
auto u = e.get_neighbor();
213+
if (d_current_active_map.Exist(u)) {
214+
d_depth[v] = next_depth;
215+
d_next_active_map.Insert(v);
216+
d_visited.Insert(v);
217+
break;
218+
}
219+
}
220+
});
221+
} else {
222+
ForEach(stream, ws_in, [=] __device__(vertex_t v) mutable {
223+
auto oes = d_frag.GetOutgoingInnerVertexAdjList(v);
224+
for (auto& e : oes) {
225+
auto u = e.get_neighbor();
226+
assert(d_frag.IsInnerVertex(u));
227+
if (d_current_active_map.Exist(u)) {
228+
d_depth[v] = next_depth;
229+
d_next_active_map.Insert(v);
230+
d_visited.Insert(v);
231+
break;
232+
}
233+
}
234+
});
235+
}
236+
}
237+
238+
auto has_work = next_active_map.Count(stream);
171239
stream.Sync();
172-
auto local_out_size = out_q_local.size(stream);
173-
#ifdef PROFILING
174-
traversal_kernel_time = grape::GetCurrentTime() - traversal_kernel_time;
175-
VLOG(2) << "Frag " << frag.fid() << " Local out: " << local_out_size
176-
<< " ProcessMsg time: " << process_msg_time * 1000
177-
<< " Kernel time: " << traversal_kernel_time * 1000;
178-
ctx.traversal_kernel_time += traversal_kernel_time;
179-
#endif
180-
in_q.Clear(stream);
181-
out_q_local.Swap(in_q);
182-
ctx.curr_depth = next_depth;
183-
if (local_out_size > 0) {
240+
241+
if (has_work > 0) {
184242
messages.ForceContinue();
185243
}
244+
245+
ctx.curr_depth = next_depth;
246+
current_active_map.Swap(next_active_map);
186247
}
187248
};
188249
} // namespace cuda

0 commit comments

Comments
 (0)