Skip to content

Commit 99b9f09

Browse files
committed
Add missing CUDA generator
This is @e-schrade's work
1 parent 2145fb1 commit 99b9f09

File tree

2 files changed

+347
-1
lines changed

2 files changed

+347
-1
lines changed

LICENSE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
BSD 2-Clause License
22

3-
Copyright (c) 2016-2017, Sebastian Lamm <[email protected]> and Lorenz Hübschle-Schneider <[email protected]>
3+
Copyright (c) 2016-2017, Sebastian Lamm <[email protected]>, Lorenz
4+
Hübschle-Schneider <[email protected]>, and Emanuel Schrade <[email protected]>.
45
All rights reserved.
56

67
Redistribution and use in source and binary forms, with or without

cuda_gen.cu

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
/*******************************************************************************
2+
* cuda_gen.cu
3+
*
4+
* Copyright (C) 2016 Emanuel Schrade <[email protected]>
5+
*
6+
* All rights reserved. Published under the BSD-2 license in the LICENSE file.
7+
******************************************************************************/
8+
9+
#include <cassert>
10+
#include <limits>
11+
#include <random>
12+
#include <cuda.h>
13+
#include <curand_kernel.h>
14+
#include <curand.h>
15+
//#define PROFILING
16+
#ifdef PROFILING
17+
#include <cuda_profiler_api.h>
18+
#endif
19+
#include "util.h"
20+
#include "timer.h"
21+
#include <stocc/stocc.h>
22+
#include <stocc/randomc.h>
23+
24+
#include <thrust/copy.h>
25+
#include <thrust/find.h>
26+
#include <thrust/device_vector.h>
27+
#include <thrust/execution_policy.h>
28+
29+
#include <include/arg_parser.h>
30+
31+
#include <algorithm>
32+
#include <cmath>
33+
#include <utility>
34+
35+
#ifndef M_PI
36+
#define M_PI 3.14159265358979323814
37+
#endif
38+
39+
#ifdef USE_ALGORITHM_H
40+
#include <set>
41+
#else
42+
#define _DEFINITIONS_H_
43+
#include <sampler.h>
44+
#endif
45+
46+
//initialize curand states
47+
__global__ void initCurand(curandState *state, unsigned long long *seed)
48+
{
49+
size_t threadId = blockIdx.x * blockDim.x + threadIdx.x;
50+
curand_init(seed[threadId], 0, 0, &state[threadId]);
51+
}
52+
53+
//sample random numbers with geometric distribution
54+
template <typename T>
55+
__global__ void sample(curandState *state, T *data, const size_t count, const T num, const double oneminusp)
56+
{
57+
size_t threadId = blockIdx.x * blockDim.x + threadIdx.x;
58+
size_t idx = threadId * num;
59+
curandState mState = state[threadId];
60+
//each thread generates 'num' random numbers with geometric distribution
61+
float lpinv = 1.0/log2(oneminusp);
62+
for(unsigned int i = 0; i < num && idx < count; i+=4, idx+=4)
63+
{
64+
float4 r = make_float4(__log2f(1-curand_uniform(&mState)),__log2f(1-curand_uniform(&mState)),__log2f(1-curand_uniform(&mState)),__log2f(1-curand_uniform(&mState)));
65+
if(i < num && idx < count)
66+
data[idx] = r.x*lpinv+1;
67+
if(i+1 < num && idx+1 < count)
68+
data[idx+1] = r.y*lpinv+1;
69+
if(i+2 < num && idx+2 < count)
70+
data[idx+2] = r.z*lpinv+1;
71+
if(i+3 < num && idx+3 < count)
72+
data[idx+3] = r.w*lpinv+1;
73+
}
74+
state[threadId] = mState;
75+
}
76+
77+
//set elements to MAX to mark for removal
78+
template <typename T>
79+
__global__ void markRemove(T *data, const size_t count, T *remove, const size_t remove_count)
80+
{
81+
size_t threadId = blockIdx.x * blockDim.x + threadIdx.x;
82+
if(threadId < remove_count)
83+
data[remove[threadId]] = T(-1);
84+
}
85+
86+
//XXX Not working in all cases
87+
#if 0
88+
//replace k elements with k last ones
89+
template <typename T>
90+
__global__ void replace(T *data, const size_t count, T *remove, const size_t remove_count)
91+
{
92+
size_t threadId = blockIdx.x * blockDim.x + threadIdx.x;
93+
T idx = (T)-1;
94+
if(threadId < remove_count)
95+
idx = remove[threadId];
96+
97+
if(idx != (T)-1 && data[idx] == (T)-1 && data[count-threadId-1] != (T)-1)
98+
{
99+
data[idx] = data[count-threadId-1];
100+
remove[threadId] = (T)-1;
101+
}
102+
if(idx == count-threadId-1)
103+
remove[threadId] = (T)-1;
104+
//otherwise replace in next iteration
105+
}
106+
#endif
107+
108+
//functor for filtering elements less than a constant
109+
template <typename T>
110+
struct less_functor
111+
{
112+
T max_idx;
113+
less_functor(const T _max_idx)
114+
: max_idx(_max_idx)
115+
{ }
116+
117+
__host__ __device__
118+
bool operator()(T x)
119+
{
120+
return x < max_idx;
121+
}
122+
};
123+
124+
//initialize prng only once (seed can be set nevertheless)
125+
template <typename T>
126+
class cuda_generator
127+
{
128+
private:
129+
static cuda_generator<T> *_instance;
130+
131+
T *device_data;
132+
133+
//(GPU-)states for random number generators
134+
curandState *state;
135+
//seeds on CPU and GPU for each thread of the random number generator
136+
unsigned long long *seeds, *seeds_device;
137+
size_t N;
138+
const T num_threads = 1<<16;
139+
const T num_threads_per_group = 1<<3;
140+
const T num_groups = num_threads / num_threads_per_group;
141+
142+
cuda_generator()
143+
{
144+
N = 0;
145+
cudaMalloc((void**)&state, num_threads * sizeof(curandState));
146+
cudaMalloc((void**)&seeds_device, num_threads * sizeof(unsigned long long));
147+
seeds = new unsigned long long[num_threads];
148+
device_data = 0;
149+
}
150+
151+
~cuda_generator()
152+
{
153+
cudaFree((void**)&state);
154+
cudaFree((void**)&seeds_device);
155+
delete [] seeds;
156+
}
157+
158+
159+
public:
160+
static cuda_generator<T> *instance()
161+
{
162+
if(!_instance)
163+
_instance = new cuda_generator<T>();
164+
return _instance;
165+
}
166+
167+
template <typename It>
168+
void generate_block(It dest, size_t size, size_t k, size_t universe, double p,
169+
unsigned long long seed = 0)
170+
{
171+
using value_type = typename std::iterator_traits<It>::value_type;
172+
assert(p > 0 && p < 1);
173+
174+
if (seed == 0)
175+
{
176+
seed = std::random_device{}();
177+
}
178+
179+
//allocate memory on GPU if blocksize changed
180+
if(N != size)
181+
{
182+
if(device_data)
183+
cudaFree(device_data);
184+
cudaMalloc((void**)&device_data, sizeof(T)*size);
185+
N = size;
186+
}
187+
for(int i = 0; i < num_threads; i++)
188+
seeds[i] = seed+i;
189+
cudaMemcpy(seeds_device, seeds, num_threads*sizeof(unsigned long long), cudaMemcpyHostToDevice);
190+
//initialize prng-states
191+
initCurand<<<num_groups, num_threads_per_group>>>(state, seeds_device);
192+
//fill array with geometrically distributed random numbers
193+
T samples_per_thread = max((T)ceil(N/(double)num_threads), (T)1);
194+
T num_indices = 0;
195+
timer t;
196+
do {
197+
sample<<<num_groups, num_threads_per_group>>>(state, device_data, N, samples_per_thread, 1.0-p);
198+
//prefix-sum for calculating array-indices
199+
thrust::inclusive_scan(thrust::device, device_data, device_data+N, device_data);
200+
//count_if is faster than the other two.
201+
num_indices = thrust::count_if(thrust::device, device_data, device_data+N, less_functor<T>((T)universe));
202+
//num_indices = thrust::find_if(thrust::device, device_data, device_data+N, less_functor<T>((T)universe)) - device_data;
203+
//num_indices = thrust::copy_if(thrust::device, device_data, device_data+N, device_data, less_functor<T>((T)universe)) - device_data;
204+
} while(num_indices < k);
205+
206+
//sample num_indices - k elements to remove
207+
T *removeArray = new T[num_indices - k];
208+
#ifdef USE_ALGORITHM_H
209+
std::mt19937 gen(seed);
210+
std::uniform_int_distribution<T> dist(0, num_indices-1);
211+
std::set<T> remove;
212+
213+
while(remove.size() < num_indices - k)
214+
remove.insert(dist(gen));
215+
216+
int idx = 0;
217+
for(auto it = remove.begin(); it != remove.end(); it++)
218+
removeArray[idx++] = *it;
219+
#else
220+
size_t hole_idx = 0;
221+
const size_t basecase = 1024;
222+
size_t to_remove = num_indices - k;
223+
HashSampling<> hs((ULONG)seed, to_remove);
224+
SeqDivideSampling<> s(hs, basecase, (ULONG)seed);
225+
// end - begin - 1 because the range is inclusive
226+
s.sample(num_indices - 1, to_remove, [&](ULONG pos) {
227+
removeArray[hole_idx++] = pos;
228+
});
229+
assert(hole_idx == to_remove);
230+
#endif
231+
//copy indices to GPU
232+
T *device_removeArray;
233+
cudaMalloc((void**)&device_removeArray, sizeof(T) * (num_indices-k));
234+
cudaMemcpyAsync(device_removeArray, removeArray, sizeof(T) * (num_indices-k), cudaMemcpyHostToDevice);
235+
236+
markRemove<<<(num_indices-k)/num_threads_per_group*4, num_threads_per_group*4>>>(device_data, num_indices, device_removeArray, num_indices-k);
237+
#if 1
238+
//compaction
239+
int removed = thrust::copy_if(thrust::device, device_data, device_data+num_indices, device_data, less_functor<T>(universe)) - device_data;
240+
#else
241+
//copy elements and count collisions, restart until no collisions left
242+
remove_remaining = num_indices-k;
243+
while(remove_remaining > 0)
244+
{
245+
replace<<<remove_remaining, num_threads_per_group>>>(device_data, k+remove_remaining, device_removeArray, num_indices-k);
246+
remove_remaining = thrust::copy_if(thrust::device, device_removeArray, device_removeArray + remove_remaining, device_removeArray, less_functor<T>((T)universe))-device_removeArray;
247+
}
248+
249+
cudaFree(&device_removeArray);
250+
#endif
251+
cudaFree(&device_removeArray);
252+
delete [] removeArray;
253+
//read back data
254+
//cudaMemcpy(&dest[0], device_data, n*sizeof(T), cudaMemcpyDeviceToHost);
255+
}
256+
};
257+
template <typename T> cuda_generator<T> *cuda_generator<T>::_instance = 0;
258+
259+
struct cuda_gen {
260+
template <typename It>
261+
static void generate_block(It dest, size_t size, size_t k, size_t universe, double p,
262+
unsigned long long seed = 0)
263+
{
264+
using value_type = typename std::iterator_traits<It>::value_type;
265+
cuda_generator<value_type> *generator = cuda_generator<value_type>::instance();
266+
267+
generator->generate_block(dest, size, k, universe, p, seed);
268+
}
269+
270+
template <typename It>
271+
static void generate_block(It begin, It end, double p,
272+
unsigned int long long = 0)
273+
{
274+
generate_block(begin, end-begin, p, seed);
275+
}
276+
};
277+
278+
// copied from include/sampler.h
279+
// Formulas from "Sequential Random Sampling" by Ahrens and Dieter, 1985
280+
static std::pair<double, size_t> calc_params(size_t universe, size_t k /* samples */) {
281+
double r = sqrt(k);
282+
double a = sqrt(log(1+k/(2*M_PI)));
283+
a = a + a*a/(3.0 * r);
284+
size_t b = k + size_t(4 * a * r);
285+
double p = (k + a * r) / universe;
286+
return std::make_pair(p, b);
287+
}
288+
289+
int main(int argc, char **argv)
290+
{
291+
cudaSetDevice(3);
292+
size_t num_threads = 1;
293+
arg_parser args(argc, argv);
294+
#ifdef USE64BIT
295+
size_t universe = args.get<size_t>("n", 1ULL << 50);
296+
#else
297+
size_t universe = args.get<size_t>("n", 1<<30);
298+
#endif
299+
size_t k = args.get<size_t>("k", 1<<28); // sample size
300+
301+
size_t iterations = args.get<size_t>("i", 3*(1ULL<<30)/k);
302+
const bool verbose = args.is_set("v") || args.is_set("vv");
303+
const bool very_verbose = args.is_set("vv");
304+
const bool quiet = args.is_set("q");
305+
306+
double p; size_t ssize;
307+
std::tie(p, ssize) = calc_params(universe, k);
308+
#ifdef USE64BIT
309+
std::vector<unsigned long long> vec(ssize);
310+
#else
311+
std::vector<unsigned int> vec(ssize);
312+
#endif
313+
statistics mt_stats;
314+
315+
// warmup
316+
// k = number of indices (output), p = probability, universe = maximum index size
317+
// ssize = size of index array
318+
cuda_gen::generate_block(vec.begin(), ssize, k, universe, p);
319+
320+
std::stringstream extra_stream;
321+
extra_stream << " k=" << k << " b=" << ssize
322+
<< " p=" << p << " N=" << universe;
323+
auto extra = extra_stream.str();
324+
325+
// Measure
326+
cudaDeviceSynchronize();
327+
328+
// stats for multi-threaded version including sync / load imbalance
329+
timer t;
330+
for(int i = 0; i < iterations; i++)
331+
{
332+
cuda_gen::generate_block(vec.begin(), ssize, k, universe, p);
333+
cudaDeviceSynchronize();
334+
double time = t.get_and_reset();
335+
mt_stats.push(time);
336+
}
337+
std::cout << " mt_time=" << mt_stats.avg()
338+
<< " mt_dev=" << mt_stats.stddev()
339+
<< " numthreads=" << num_threads
340+
<< " iterations=" << iterations
341+
<< extra << std::endl;
342+
#ifdef PROFILING
343+
cudaProfilerStop();
344+
#endif
345+
}

0 commit comments

Comments
 (0)