9
9
10
10
#include " SpatialPooler.cu"
11
11
12
// Error-checking wrapper for CUDA runtime calls.
// Usage: checkError( cudaMalloc(...) );
// do/while(0) makes the macro behave as a single statement, so it is safe
// inside unbraced if/else branches (the original form was a bare block).
#define checkError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a CUDA runtime error to stderr and (by default) terminate.
// code  : status returned by a CUDA runtime call
// file  : call-site file name, supplied by the checkError macro
// line  : call-site line number, supplied by the checkError macro
// abort : when true (default), exit the process with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}
21
+
22
+
12
23
using namespace std ;
13
24
14
25
typedef unsigned int UInt;
@@ -164,15 +175,7 @@ void printErrorMessage(cudaError_t error, int memorySize){
164
175
165
176
int main (int argc, const char * argv[])
166
177
{
167
- const UInt SP_SIZE = 524288 ;
168
- const UInt IN_SIZE = 1048576 ;
169
- const UInt BLOCK_SIZE = 64 ; // Two warps
170
- const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
171
- const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block
172
- const UInt MAX_CONNECTED = 16 ;
173
- const Real IN_DENSITY = 0.5 ; // Density of input connections
174
- srand (time (NULL ));
175
-
178
+ srand (time (NULL ));
176
179
size_t sm = BLOCK_SIZE*(2 *sizeof (Real) + sizeof (UInt)) + IN_BLOCK_SIZE*sizeof (bool );
177
180
178
181
// construct input args
@@ -198,18 +201,29 @@ int main(int argc, const char * argv[])
198
201
ar.IN_BLOCK_SIZE = IN_BLOCK_SIZE;
199
202
200
203
// Host memory pointers
201
- bool * cols_host = new bool [SP_SIZE];
202
- bool * in_host = new bool [IN_SIZE];
203
- UInt* potentialPools;
204
- Real* permanences;
205
- Real* boosts = new Real[SP_SIZE*MAX_CONNECTED];
206
- UInt* numPotential = new UInt[SP_SIZE];
207
- UInt* numConnected = new UInt[SP_SIZE];
204
+ bool * cols_host; // = new bool[SP_SIZE];
205
+ bool * in_host = &cols_host[SP_SIZE]; // = new bool[IN_SIZE];
206
+ UInt* potentialPools = (UInt*) &in_host[IN_SIZE];
207
+ UInt* numPotential = &potentialPools[SP_SIZE*MAX_CONNECTED]; // = new UInt[SP_SIZE];
208
+ // UInt* numConnected = &numPotential[SP_SIZE]; // = new UInt[SP_SIZE];
209
+ Real* permanences = (Real*) &numPotential[SP_SIZE];
210
+ Real* boosts = &permanences[SP_SIZE*MAX_CONNECTED]; // = new Real[SP_SIZE*MAX_CONNECTED];
211
+
212
+ cudaError_t result;
213
+ // TODO: Definitely need to allocate contiguous chunk here as well
214
+ result = cudaHostAlloc ((void **)&cols_host, SP_SIZE*sizeof (bool ), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
215
+ result = cudaHostAlloc ((void **)&in_host, IN_SIZE*sizeof (bool ), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
216
+ result = cudaHostAlloc ((void **)&boosts, SP_SIZE*MAX_CONNECTED*sizeof (Real), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
217
+ result = cudaHostAlloc ((void **)&potentialPools, SP_SIZE*MAX_CONNECTED*sizeof (UInt), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
218
+ result = cudaHostAlloc ((void **)&permanences, SP_SIZE*MAX_CONNECTED*sizeof (Real), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
219
+ result = cudaHostAlloc ((void **)&numPotential, SP_SIZE*sizeof (UInt), cudaHostAllocDefault); if (result) printErrorMessage (result, 0 );
220
+ // result = cudaHostAlloc((void**)&numConnected, SP_SIZE*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
221
+
208
222
209
223
// Host memory allocation
210
- std::fill_n (boosts, SP_SIZE*MAX_CONNECTED, 1 );
211
- std::fill_n (numPotential, SP_SIZE, 0 );
212
- std::fill_n (numConnected, SP_SIZE, 0 );
224
+ memset (boosts, 1 , SP_SIZE*MAX_CONNECTED);
225
+ memset (numPotential, 0 , SP_SIZE );
226
+ // memset (numConnected, 0, SP_SIZE );
213
227
214
228
potentialPools = generatePotentialPools (SP_SIZE, IN_BLOCK_SIZE, ar.potentialPct , MAX_CONNECTED, numPotential);
215
229
permanences = generatePermanences (SP_SIZE, IN_SIZE, potentialPools, ar.connectedPct , ar.synPermConnected , ar.synPermMax , MAX_CONNECTED, numPotential,
@@ -220,39 +234,41 @@ int main(int argc, const char * argv[])
220
234
221
235
// Global memory pointers
222
236
args* ar_dev;
237
+ void * data_dev;
223
238
224
239
// Global memory allocation
225
- cudaError_t result;
226
- result = cudaMalloc ((void **) &ar_dev, sizeof (ar)); if (result) printErrorMessage (result, 0 );
227
- result = cudaMalloc ((void **) &ar.in_dev , IN_SIZE*sizeof (bool )); if (result) printErrorMessage (result, 0 );
228
- result = cudaMalloc ((void **) &ar.olaps_dev , SP_SIZE*sizeof (UInt)); if (result) printErrorMessage (result, 0 );
229
- result = cudaMalloc ((void **) &ar.cols_dev , SP_SIZE*sizeof (bool )); if (result) printErrorMessage (result, 0 );
230
- result = cudaMalloc ((void **) &ar.numPot_dev , SP_SIZE*sizeof (UInt)); if (result) printErrorMessage (result, 0 );
231
- result = cudaMalloc ((void **) &ar.pot_dev , MAX_CONNECTED*SP_SIZE*sizeof (UInt)); if (result) printErrorMessage (result, 0 ); // width, height, x, y
232
- result = cudaMalloc ((void **) &ar.per_dev , MAX_CONNECTED*SP_SIZE*sizeof (Real)); if (result) printErrorMessage (result, 0 );
233
- result = cudaMalloc ((void **) &ar.odc_dev , MAX_CONNECTED*SP_SIZE*sizeof (Real)); if (result) printErrorMessage (result, 0 );
234
- result = cudaMalloc ((void **) &ar.adc_dev , MAX_CONNECTED*SP_SIZE*sizeof (Real)); if (result) printErrorMessage (result, 0 );
235
- result = cudaMalloc ((void **) &ar.boosts_dev , MAX_CONNECTED*SP_SIZE*sizeof (Real)); if (result) printErrorMessage (result, 0 );
236
- result = cudaMalloc ((void **) &ar.minOdc_dev , NUM_BLOCKS*sizeof (Real)); if (result) printErrorMessage (result, 0 );
240
+ size_t data_size = IN_SIZE*sizeof (bool ) + SP_SIZE*(sizeof (UInt) + 3 *sizeof (Real)) + MAX_CONNECTED*SP_SIZE*(sizeof (UInt) + 2 *sizeof (Real));
241
+ checkError ( cudaMalloc ((void **) &ar_dev, sizeof (ar)) );
242
+ checkError ( cudaMalloc ((void **) &data_dev, data_size) );
243
+ // checkError( cudaMalloc((void **) &ar.in_dev, IN_SIZE*sizeof(bool)) );
244
+ // checkError( cudaMalloc((void **) &ar.olaps_dev, SP_SIZE*sizeof(UInt)) );
245
+ // checkError( cudaMalloc((void **) &ar.cols_dev, SP_SIZE*sizeof(bool)) );
246
+ // checkError( cudaMalloc((void **) &ar.numPot_dev, SP_SIZE*sizeof(UInt)) );
247
+ // checkError( cudaMalloc((void **) &ar.pot_dev, MAX_CONNECTED*SP_SIZE*sizeof(UInt)) );
248
+ // checkError( cudaMalloc((void **) &ar.per_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
249
+ // checkError( cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
250
+ // checkError( cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
251
+ // checkError( cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
252
+ // checkError( cudaMalloc((void **) &ar.minOdc_dev, NUM_BLOCKS*sizeof(Real)) );
237
253
238
254
// Memcpy to device
239
- result = cudaMemcpy (ar_dev, &ar, sizeof (ar), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
240
- result = cudaMemcpy (ar.in_dev , in_host, IN_SIZE*sizeof (bool ), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
241
- result = cudaMemcpy (ar.numPot_dev , numPotential, SP_SIZE*sizeof (UInt), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
242
- result = cudaMemcpy (ar.pot_dev , potentialPools, MAX_CONNECTED*SP_SIZE*sizeof (UInt), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
243
- result = cudaMemcpy (ar.per_dev , permanences, MAX_CONNECTED*SP_SIZE*sizeof (Real), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
244
- result = cudaMemcpy (ar.boosts_dev , boosts, MAX_CONNECTED*SP_SIZE*sizeof (Real), cudaMemcpyHostToDevice); if (result) printErrorMessage (result, 0 );
255
+ checkError ( cudaMemcpy (ar_dev, &ar, sizeof (ar), cudaMemcpyHostToDevice) );
256
+ checkError ( cudaMemcpy (data_dev, in_host, data_size, cudaMemcpyHostToDevice) );
257
+ // result = cudaMemcpy(ar.in_dev, in_host, IN_SIZE*sizeof(bool), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
258
+ // result = cudaMemcpy(ar.numPot_dev, numPotential, SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
259
+ // result = cudaMemcpy(ar.pot_dev, potentialPools, MAX_CONNECTED*SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
260
+ // result = cudaMemcpy(ar.per_dev, permanences, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
261
+ // result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
245
262
246
263
// Kernel call
247
- compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>> (ar_dev);
264
+ compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>> (ar_dev, data_dev );
248
265
249
266
// Memcpy from device
250
- result = cudaMemcpy (cols_host, ar. cols_dev , SP_SIZE*sizeof (bool ), cudaMemcpyDeviceToHost); if (result) printErrorMessage (result, 0 );
267
+ result = cudaMemcpy (cols_host, data_dev , SP_SIZE*sizeof (bool ), cudaMemcpyDeviceToHost); if (result) printErrorMessage (result, 0 );
251
268
252
269
visualize_output (cols_host, SP_SIZE);
253
270
254
- cudaFree (ar.in_dev ); cudaFree (ar.cols_dev ); cudaFree (ar.pot_dev ); cudaFree (ar.per_dev ); cudaFree (ar.boosts_dev );
255
- cudaFree (ar.odc_dev ); cudaFree (ar.adc_dev ); cudaFree (ar.numPot_dev );
271
+ cudaFree (ar_dev); cudaFree (data_dev);
256
272
257
273
return 0 ;
258
274
}
0 commit comments