Skip to content

Commit 1e41a72

Browse files
committed
Some initial progress.
1 parent 8c4ecc0 commit 1e41a72

File tree

4 files changed

+209
-64
lines changed

4 files changed

+209
-64
lines changed

.gitignore

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
# Ignore everything ...
2+
*
3+
# ... except directories ...
4+
!*/
5+
# ... and all files WITH extensions.
6+
!*.*
7+
18
*.swp
2-
HelloSP
3-
UnitSP
49
# Prerequisites
510
*.d
611

HelloSP.cu

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99

1010
#include "SpatialPooler.cu"
1111

12+
// checkError: wrap any CUDA runtime call and abort on failure.
// do { } while (0) makes the macro a single statement, so it is safe
// inside an unbraced if/else (the original brace-pair form was not).
#define checkError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA runtime call with its call-site location.
//   code  - result of the CUDA runtime API call
//   file  - call-site source file (supplied via __FILE__)
//   line  - call-site line number (supplied via __LINE__)
//   abort - when true (default), terminate the process with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}
21+
22+
1223
using namespace std;
1324

1425
typedef unsigned int UInt;
@@ -164,15 +175,7 @@ void printErrorMessage(cudaError_t error, int memorySize){
164175

165176
int main(int argc, const char * argv[])
166177
{
167-
const UInt SP_SIZE = 524288;
168-
const UInt IN_SIZE = 1048576;
169-
const UInt BLOCK_SIZE = 64; // Two warps
170-
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
171-
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block
172-
const UInt MAX_CONNECTED = 16;
173-
const Real IN_DENSITY = 0.5; // Density of input connections
174-
srand(time(NULL));
175-
178+
srand(time(NULL));
176179
size_t sm = BLOCK_SIZE*(2*sizeof(Real) + sizeof(UInt)) + IN_BLOCK_SIZE*sizeof(bool);
177180

178181
// construct input args
@@ -198,18 +201,29 @@ int main(int argc, const char * argv[])
198201
ar.IN_BLOCK_SIZE = IN_BLOCK_SIZE;
199202

200203
// Host memory pointers
201-
bool* cols_host = new bool[SP_SIZE];
202-
bool* in_host = new bool[IN_SIZE];
203-
UInt* potentialPools;
204-
Real* permanences;
205-
Real* boosts = new Real[SP_SIZE*MAX_CONNECTED];
206-
UInt* numPotential = new UInt[SP_SIZE];
207-
UInt* numConnected = new UInt[SP_SIZE];
204+
bool* cols_host; // = new bool[SP_SIZE];
205+
bool* in_host = &cols_host[SP_SIZE]; // = new bool[IN_SIZE];
206+
UInt* potentialPools = (UInt*) &in_host[IN_SIZE];
207+
UInt* numPotential = &potentialPools[SP_SIZE*MAX_CONNECTED]; // = new UInt[SP_SIZE];
208+
// UInt* numConnected = &numPotential[SP_SIZE]; // = new UInt[SP_SIZE];
209+
Real* permanences = (Real*) &numPotential[SP_SIZE];
210+
Real* boosts = &permanences[SP_SIZE*MAX_CONNECTED]; // = new Real[SP_SIZE*MAX_CONNECTED];
211+
212+
cudaError_t result;
213+
// TODO: Definitely need to allocate contiguous chunk here as well
214+
result = cudaHostAlloc((void**)&cols_host, SP_SIZE*sizeof(bool), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
215+
result = cudaHostAlloc((void**)&in_host, IN_SIZE*sizeof(bool), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
216+
result = cudaHostAlloc((void**)&boosts, SP_SIZE*MAX_CONNECTED*sizeof(Real), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
217+
result = cudaHostAlloc((void**)&potentialPools, SP_SIZE*MAX_CONNECTED*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
218+
result = cudaHostAlloc((void**)&permanences, SP_SIZE*MAX_CONNECTED*sizeof(Real), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
219+
result = cudaHostAlloc((void**)&numPotential, SP_SIZE*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
220+
// result = cudaHostAlloc((void**)&numConnected, SP_SIZE*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
221+
208222

209223
// Host memory allocation
210-
std::fill_n(boosts, SP_SIZE*MAX_CONNECTED, 1);
211-
std::fill_n(numPotential, SP_SIZE, 0);
212-
std::fill_n(numConnected, SP_SIZE, 0);
224+
memset(boosts, 1, SP_SIZE*MAX_CONNECTED);
225+
memset(numPotential, 0, SP_SIZE);
226+
// memset(numConnected, 0, SP_SIZE);
213227

214228
potentialPools = generatePotentialPools(SP_SIZE, IN_BLOCK_SIZE, ar.potentialPct, MAX_CONNECTED, numPotential);
215229
permanences = generatePermanences(SP_SIZE, IN_SIZE, potentialPools, ar.connectedPct, ar.synPermConnected, ar.synPermMax, MAX_CONNECTED, numPotential,
@@ -220,39 +234,41 @@ int main(int argc, const char * argv[])
220234

221235
// Global memory pointers
222236
args* ar_dev;
237+
void* data_dev;
223238

224239
// Global memory allocation
225-
cudaError_t result;
226-
result = cudaMalloc((void **) &ar_dev, sizeof(ar)); if(result) printErrorMessage(result, 0);
227-
result = cudaMalloc((void **) &ar.in_dev, IN_SIZE*sizeof(bool)); if(result) printErrorMessage(result, 0);
228-
result = cudaMalloc((void **) &ar.olaps_dev, SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0);
229-
result = cudaMalloc((void **) &ar.cols_dev, SP_SIZE*sizeof(bool)); if(result) printErrorMessage(result, 0);
230-
result = cudaMalloc((void **) &ar.numPot_dev, SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0);
231-
result = cudaMalloc((void **) &ar.pot_dev, MAX_CONNECTED*SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0); // width, height, x, y
232-
result = cudaMalloc((void **) &ar.per_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
233-
result = cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
234-
result = cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
235-
result = cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
236-
result = cudaMalloc((void **) &ar.minOdc_dev, NUM_BLOCKS*sizeof(Real)); if(result) printErrorMessage(result, 0);
240+
size_t data_size = IN_SIZE*sizeof(bool) + SP_SIZE*(sizeof(UInt) + 3*sizeof(Real)) + MAX_CONNECTED*SP_SIZE*(sizeof(UInt) + 2*sizeof(Real));
241+
checkError( cudaMalloc((void **) &ar_dev, sizeof(ar)) );
242+
checkError( cudaMalloc((void **) &data_dev, data_size) );
243+
// checkError( cudaMalloc((void **) &ar.in_dev, IN_SIZE*sizeof(bool)) );
244+
// checkError( cudaMalloc((void **) &ar.olaps_dev, SP_SIZE*sizeof(UInt)) );
245+
// checkError( cudaMalloc((void **) &ar.cols_dev, SP_SIZE*sizeof(bool)) );
246+
// checkError( cudaMalloc((void **) &ar.numPot_dev, SP_SIZE*sizeof(UInt)) );
247+
// checkError( cudaMalloc((void **) &ar.pot_dev, MAX_CONNECTED*SP_SIZE*sizeof(UInt)) );
248+
// checkError( cudaMalloc((void **) &ar.per_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
249+
// checkError( cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
250+
// checkError( cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
251+
// checkError( cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
252+
// checkError( cudaMalloc((void **) &ar.minOdc_dev, NUM_BLOCKS*sizeof(Real)) );
237253

238254
// Memcpy to device
239-
result = cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
240-
result = cudaMemcpy(ar.in_dev, in_host, IN_SIZE*sizeof(bool), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
241-
result = cudaMemcpy(ar.numPot_dev, numPotential, SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
242-
result = cudaMemcpy(ar.pot_dev, potentialPools, MAX_CONNECTED*SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
243-
result = cudaMemcpy(ar.per_dev, permanences, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
244-
result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
255+
checkError( cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice) );
256+
checkError( cudaMemcpy(data_dev, in_host, data_size, cudaMemcpyHostToDevice) );
257+
// result = cudaMemcpy(ar.in_dev, in_host, IN_SIZE*sizeof(bool), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
258+
// result = cudaMemcpy(ar.numPot_dev, numPotential, SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
259+
// result = cudaMemcpy(ar.pot_dev, potentialPools, MAX_CONNECTED*SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
260+
// result = cudaMemcpy(ar.per_dev, permanences, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
261+
// result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
245262

246263
// Kernel call
247-
compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev);
264+
compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev, data_dev);
248265

249266
// Memcpy from device
250-
result = cudaMemcpy(cols_host, ar.cols_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0);
267+
result = cudaMemcpy(cols_host, data_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0);
251268

252269
visualize_output(cols_host, SP_SIZE);
253270

254-
cudaFree(ar.in_dev); cudaFree(ar.cols_dev); cudaFree(ar.pot_dev); cudaFree(ar.per_dev); cudaFree(ar.boosts_dev);
255-
cudaFree(ar.odc_dev); cudaFree(ar.adc_dev); cudaFree(ar.numPot_dev);
271+
cudaFree(ar_dev); cudaFree(data_dev);
256272

257273
return 0;
258274
}

SpatialPooler.cu

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ using namespace std;
55
typedef unsigned int UInt;
66
typedef float Real;
77

8+
// Define global constants
9+
const UInt SP_SIZE = 131072;
10+
const UInt IN_SIZE = 262144;
11+
const UInt BLOCK_SIZE = 64; // Two warps
12+
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
13+
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block
14+
const UInt MAX_CONNECTED = 16;
15+
const Real IN_DENSITY = 0.5; // Density of input connections
16+
817
struct args
918
{
1019
// Parameters
@@ -24,18 +33,6 @@ struct args
2433
Real minPctOdc;
2534
bool learn;
2635

27-
// Global memory pointers
28-
bool* in_dev;
29-
bool* cols_dev;
30-
UInt* olaps_dev;
31-
UInt* pot_dev;
32-
Real* per_dev;
33-
Real* boosts_dev;
34-
Real* odc_dev; // odc serve to maintain same act. freq. for each col. (per block)
35-
Real* adc_dev; // adc serve to compute boost factors
36-
UInt* numPot_dev;
37-
Real* minOdc_dev; // Stores minumum overlap duty cycles per block
38-
3936
// Constants
4037
UInt SP_SIZE;
4138
UInt MAX_CONNECTED;
@@ -53,7 +50,6 @@ struct args
5350
UInt update_period;
5451
};
5552

56-
5753
// TODO: This could be done via parallel matrix multiplication.
5854
__device__
5955
void calculateOverlap(bool* in_dev, bool* in_sh, UInt* pot_dev, Real* per_dev, Real* boosts_dev, UInt* numPot_dev, UInt* olaps_sh, Real threshold, const UInt inBlockSize, const UInt MAX_CONNECTED)
@@ -278,8 +274,21 @@ void updateMinOdcReduction(Real* odc_dev, Real* odc_sh, Real* minOdc_dev, Real m
278274

279275

280276
__global__
281-
void compute(args* ar_ptr)
277+
void compute(args* ar_ptr, void* data)
282278
{
279+
// Global memory pointers
280+
bool* cols_dev = (bool*) &data;
281+
bool* in_dev = &cols_dev[SP_SIZE];
282+
UInt* pot_dev = (UInt*) &in_dev[IN_SIZE];
283+
UInt* numPot_dev = &pot_dev[SP_SIZE*MAX_CONNECTED];
284+
Real* per_dev = (Real*) &numPot_dev[SP_SIZE];
285+
Real* boosts_dev = &per_dev[SP_SIZE*MAX_CONNECTED];
286+
UInt* olaps_dev = (UInt*) &boosts_dev[SP_SIZE*MAX_CONNECTED];
287+
Real* odc_dev = (Real*) &olaps_dev[SP_SIZE]; // odc serve to maintain same act. freq. for each col. (per block)
288+
Real* adc_dev = &odc_dev[SP_SIZE]; // adc serve to compute boost factors
289+
Real* minOdc_dev = &adc_dev[SP_SIZE]; // Stores minumum overlap duty cycles per block
290+
291+
283292
if (blockIdx.x == 0 && threadIdx.x == 0)
284293
ar_ptr->iteration_num++;
285294

@@ -295,29 +304,29 @@ void compute(args* ar_ptr)
295304
Real* odc_sh = &active_sh[blockDim.x];
296305
bool* in_sh = (bool*) &odc_sh[blockDim.x];
297306

298-
calculateOverlap(ar.in_dev, in_sh, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
307+
calculateOverlap(in_dev, in_sh, pot_dev, per_dev, boosts_dev, numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
299308

300309
__syncthreads();
301310

302-
inhibitColumns(olaps_sh, ar.cols_dev, active_sh, active, ar.localAreaDensity);
311+
inhibitColumns(olaps_sh, cols_dev, active_sh, active, ar.localAreaDensity);
303312

304313
__syncthreads();
305314

306-
adaptSynapses(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
315+
adaptSynapses(in_dev, pot_dev, per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
307316

308-
updateDutyCycles(ar.odc_dev, ar.adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod);
317+
updateDutyCycles(odc_dev, adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod);
309318

310319
// active_sh will hold average activity per block for each column
311320
averageActivityReduction(active_sh);
312321

313322
__syncthreads();
314323

315-
updateBoosts(ar.adc_dev, ar.boosts_dev, avg_act, ar.boostStrength);
324+
updateBoosts(adc_dev, boosts_dev, avg_act, ar.boostStrength);
316325

317-
bumpUpColumnsWithWeakOdc(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc_dev, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);
326+
bumpUpColumnsWithWeakOdc(odc_dev, per_dev, numPot_dev, minOdc_dev, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);
318327

319328
if(ar.iteration_num % ar.update_period == 0)
320-
updateMinOdc(ar.odc_dev, odc_sh, ar.minOdc_dev, ar.minPctOdc, ar.SP_SIZE);
329+
updateMinOdc(odc_dev, odc_sh, minOdc_dev, ar.minPctOdc, ar.SP_SIZE);
321330
}
322331

323332
__global__
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#include <stdio.h>
2+
#include <assert.h>
3+
4+
// Check the result of a CUDA runtime API call.
// Wrap around any runtime call; in release builds (neither DEBUG nor
// _DEBUG defined) the check compiles away and the result is passed through.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    const char *what = cudaGetErrorString(result);
    fprintf(stderr, "CUDA Runtime Error: %s\n", what);
    assert(result == cudaSuccess);
  }
#endif
  return result;
}
18+
19+
// Time one H2D and one D2H copy of n floats, print the achieved bandwidth,
// and verify the data survived the round trip.
//   h_a  - host source buffer (n floats), copied to the device
//   h_b  - host destination buffer (n floats), overwritten by the D2H copy
//   d    - device buffer (n floats)
//   n    - element count
//   desc - label for the printed report (e.g. "Pageable", "Pinned");
//          const char*: callers pass string literals, which may not bind
//          to char* in C++11 and later
void profileCopies(float *h_a,
                   float *h_b,
                   float *d,
                   unsigned int n,
                   const char *desc)
{
  printf("\n%s transfers\n", desc);

  unsigned int bytes = n * sizeof(float);

  // events for timing
  cudaEvent_t startEvent, stopEvent;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
  checkCuda( cudaMemcpy(d, h_a, bytes, cudaMemcpyHostToDevice) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );

  float time;  // elapsed time in milliseconds
  checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
  // bytes * 1e-6 / ms == GB/s
  printf(" Host to Device bandwidth (GB/s): %f\n", bytes * 1e-6 / time);

  checkCuda( cudaEventRecord(startEvent, 0) );
  checkCuda( cudaMemcpy(h_b, d, bytes, cudaMemcpyDeviceToHost) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );

  checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
  printf(" Device to Host bandwidth (GB/s): %f\n", bytes * 1e-6 / time);

  // verify the round trip copied every element intact
  // (unsigned index: n is unsigned, avoids signed/unsigned comparison)
  for (unsigned int i = 0; i < n; ++i) {
    if (h_a[i] != h_b[i]) {
      printf("*** %s transfers failed ***\n", desc);
      break;
    }
  }

  // clean up events
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
}
63+
64+
// Benchmark H2D/D2H transfer bandwidth for three kinds of host memory:
// pageable (malloc), pinned (cudaMallocHost), and pinned write-combined
// (cudaHostAlloc with cudaHostAllocWriteCombined).
int main()
{
  unsigned int nElements = 4*1024*1024;
  const unsigned int bytes = nElements * sizeof(float);

  // host arrays
  float *h_aPageable, *h_bPageable;  // regular pageable allocations
  float *h_aPinned, *h_bPinned;      // page-locked
  float *h_aWC, *h_bWC;              // page-locked, write-combined

  // device array
  float *d_a;

  // allocate and initialize
  h_aPageable = (float*)malloc(bytes); // host pageable
  h_bPageable = (float*)malloc(bytes); // host pageable
  checkCuda( cudaMallocHost((void**)&h_aPinned, bytes) ); // host pinned
  checkCuda( cudaMallocHost((void**)&h_bPinned, bytes) ); // host pinned
  checkCuda( cudaHostAlloc((void**)&h_aWC, bytes, cudaHostAllocWriteCombined) ); // host write-combined
  checkCuda( cudaHostAlloc((void**)&h_bWC, bytes, cudaHostAllocWriteCombined) ); // host write-combined
  checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device

  // unsigned index: nElements is unsigned, avoids signed/unsigned comparison
  for (unsigned int i = 0; i < nElements; ++i) h_aPageable[i] = (float)i;
  memcpy(h_aPinned, h_aPageable, bytes);
  memcpy(h_aWC, h_aPageable, bytes);
  memset(h_bPageable, 0, bytes);
  memset(h_bPinned, 0, bytes);
  memset(h_bWC, 0, bytes);

  // output device info and transfer size
  cudaDeviceProp prop;
  checkCuda( cudaGetDeviceProperties(&prop, 0) );

  printf("\nDevice: %s\n", prop.name);
  printf("Transfer size (MB): %u\n", bytes / (1024 * 1024)); // %u: bytes is unsigned

  // perform copies and report bandwidth
  profileCopies(h_aPageable, h_bPageable, d_a, nElements, "Pageable");
  profileCopies(h_aPinned, h_bPinned, d_a, nElements, "Pinned");
  profileCopies(h_aWC, h_bWC, d_a, nElements, "Write-combined");

  printf("\n"); // fix: was printf("n")

  // cleanup
  cudaFree(d_a);
  cudaFreeHost(h_aPinned);
  cudaFreeHost(h_bPinned);
  cudaFreeHost(h_aWC); // fix: write-combined buffers were leaked
  cudaFreeHost(h_bWC);
  free(h_aPageable);
  free(h_bPageable);

  return 0;
}

0 commit comments

Comments
 (0)