Skip to content

Commit 1e41a72

Browse files
committed
Some initial progress.
1 parent 8c4ecc0 commit 1e41a72

File tree

4 files changed

+209
-64
lines changed

4 files changed

+209
-64
lines changed

.gitignore

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1+
# Ignore everything ...
2+
*
3+
# ... except directories ...
4+
!*/
5+
# ... and all files WITH extensions.
6+
!*.*
7+
18
*.swp
2-
HelloSP
3-
UnitSP
49
# Prerequisites
510
*.d
611

HelloSP.cu

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99

1010
#include "SpatialPooler.cu"
1111

12+
// checkError: wrap any CUDA runtime call and abort on failure.
// do { } while (0) makes the macro a single statement, so it is safe
// inside an unbraced if/else (the original brace-pair form was not).
#define checkError(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA runtime call with its call-site location.
//   code  - result of the CUDA runtime API call
//   file  - call-site source file (supplied via __FILE__)
//   line  - call-site line number (supplied via __LINE__)
//   abort - when true (default), terminate the process with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}
21+
22+
1223
using namespace std;
1324

1425
typedef unsigned int UInt;
@@ -164,15 +175,7 @@ void printErrorMessage(cudaError_t error, int memorySize){
164175

165176
int main(int argc, const char * argv[])
166177
{
167-
const UInt SP_SIZE = 524288;
168-
const UInt IN_SIZE = 1048576;
169-
const UInt BLOCK_SIZE = 64; // Two warps
170-
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
171-
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block
172-
const UInt MAX_CONNECTED = 16;
173-
const Real IN_DENSITY = 0.5; // Density of input connections
174-
srand(time(NULL));
175-
178+
srand(time(NULL));
176179
size_t sm = BLOCK_SIZE*(2*sizeof(Real) + sizeof(UInt)) + IN_BLOCK_SIZE*sizeof(bool);
177180

178181
// construct input args
@@ -198,18 +201,29 @@ int main(int argc, const char * argv[])
198201
ar.IN_BLOCK_SIZE = IN_BLOCK_SIZE;
199202

200203
// Host memory pointers
201-
bool* cols_host = new bool[SP_SIZE];
202-
bool* in_host = new bool[IN_SIZE];
203-
UInt* potentialPools;
204-
Real* permanences;
205-
Real* boosts = new Real[SP_SIZE*MAX_CONNECTED];
206-
UInt* numPotential = new UInt[SP_SIZE];
207-
UInt* numConnected = new UInt[SP_SIZE];
204+
bool* cols_host; // = new bool[SP_SIZE];
205+
bool* in_host = &cols_host[SP_SIZE]; // = new bool[IN_SIZE];
206+
UInt* potentialPools = (UInt*) &in_host[IN_SIZE];
207+
UInt* numPotential = &potentialPools[SP_SIZE*MAX_CONNECTED]; // = new UInt[SP_SIZE];
208+
// UInt* numConnected = &numPotential[SP_SIZE]; // = new UInt[SP_SIZE];
209+
Real* permanences = (Real*) &numPotential[SP_SIZE];
210+
Real* boosts = &permanences[SP_SIZE*MAX_CONNECTED]; // = new Real[SP_SIZE*MAX_CONNECTED];
211+
212+
cudaError_t result;
213+
// TODO: Definitely need to allocate contiguous chunk here as well
214+
result = cudaHostAlloc((void**)&cols_host, SP_SIZE*sizeof(bool), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
215+
result = cudaHostAlloc((void**)&in_host, IN_SIZE*sizeof(bool), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
216+
result = cudaHostAlloc((void**)&boosts, SP_SIZE*MAX_CONNECTED*sizeof(Real), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
217+
result = cudaHostAlloc((void**)&potentialPools, SP_SIZE*MAX_CONNECTED*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
218+
result = cudaHostAlloc((void**)&permanences, SP_SIZE*MAX_CONNECTED*sizeof(Real), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
219+
result = cudaHostAlloc((void**)&numPotential, SP_SIZE*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
220+
// result = cudaHostAlloc((void**)&numConnected, SP_SIZE*sizeof(UInt), cudaHostAllocDefault); if(result) printErrorMessage(result, 0);
221+
208222

209223
// Host memory allocation
210-
std::fill_n(boosts, SP_SIZE*MAX_CONNECTED, 1);
211-
std::fill_n(numPotential, SP_SIZE, 0);
212-
std::fill_n(numConnected, SP_SIZE, 0);
224+
memset(boosts, 1, SP_SIZE*MAX_CONNECTED);
225+
memset(numPotential, 0, SP_SIZE);
226+
// memset(numConnected, 0, SP_SIZE);
213227

214228
potentialPools = generatePotentialPools(SP_SIZE, IN_BLOCK_SIZE, ar.potentialPct, MAX_CONNECTED, numPotential);
215229
permanences = generatePermanences(SP_SIZE, IN_SIZE, potentialPools, ar.connectedPct, ar.synPermConnected, ar.synPermMax, MAX_CONNECTED, numPotential,
@@ -220,39 +234,41 @@ int main(int argc, const char * argv[])
220234

221235
// Global memory pointers
222236
args* ar_dev;
237+
void* data_dev;
223238

224239
// Global memory allocation
225-
cudaError_t result;
226-
result = cudaMalloc((void **) &ar_dev, sizeof(ar)); if(result) printErrorMessage(result, 0);
227-
result = cudaMalloc((void **) &ar.in_dev, IN_SIZE*sizeof(bool)); if(result) printErrorMessage(result, 0);
228-
result = cudaMalloc((void **) &ar.olaps_dev, SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0);
229-
result = cudaMalloc((void **) &ar.cols_dev, SP_SIZE*sizeof(bool)); if(result) printErrorMessage(result, 0);
230-
result = cudaMalloc((void **) &ar.numPot_dev, SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0);
231-
result = cudaMalloc((void **) &ar.pot_dev, MAX_CONNECTED*SP_SIZE*sizeof(UInt)); if(result) printErrorMessage(result, 0); // width, height, x, y
232-
result = cudaMalloc((void **) &ar.per_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
233-
result = cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
234-
result = cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
235-
result = cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)); if(result) printErrorMessage(result, 0);
236-
result = cudaMalloc((void **) &ar.minOdc_dev, NUM_BLOCKS*sizeof(Real)); if(result) printErrorMessage(result, 0);
240+
size_t data_size = IN_SIZE*sizeof(bool) + SP_SIZE*(sizeof(UInt) + 3*sizeof(Real)) + MAX_CONNECTED*SP_SIZE*(sizeof(UInt) + 2*sizeof(Real));
241+
checkError( cudaMalloc((void **) &ar_dev, sizeof(ar)) );
242+
checkError( cudaMalloc((void **) &data_dev, data_size) );
243+
// checkError( cudaMalloc((void **) &ar.in_dev, IN_SIZE*sizeof(bool)) );
244+
// checkError( cudaMalloc((void **) &ar.olaps_dev, SP_SIZE*sizeof(UInt)) );
245+
// checkError( cudaMalloc((void **) &ar.cols_dev, SP_SIZE*sizeof(bool)) );
246+
// checkError( cudaMalloc((void **) &ar.numPot_dev, SP_SIZE*sizeof(UInt)) );
247+
// checkError( cudaMalloc((void **) &ar.pot_dev, MAX_CONNECTED*SP_SIZE*sizeof(UInt)) );
248+
// checkError( cudaMalloc((void **) &ar.per_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
249+
// checkError( cudaMalloc((void **) &ar.odc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
250+
// checkError( cudaMalloc((void **) &ar.adc_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
251+
// checkError( cudaMalloc((void **) &ar.boosts_dev, MAX_CONNECTED*SP_SIZE*sizeof(Real)) );
252+
// checkError( cudaMalloc((void **) &ar.minOdc_dev, NUM_BLOCKS*sizeof(Real)) );
237253

238254
// Memcpy to device
239-
result = cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
240-
result = cudaMemcpy(ar.in_dev, in_host, IN_SIZE*sizeof(bool), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
241-
result = cudaMemcpy(ar.numPot_dev, numPotential, SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
242-
result = cudaMemcpy(ar.pot_dev, potentialPools, MAX_CONNECTED*SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
243-
result = cudaMemcpy(ar.per_dev, permanences, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
244-
result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
255+
checkError( cudaMemcpy(ar_dev, &ar, sizeof(ar), cudaMemcpyHostToDevice) );
256+
checkError( cudaMemcpy(data_dev, in_host, data_size, cudaMemcpyHostToDevice) );
257+
// result = cudaMemcpy(ar.in_dev, in_host, IN_SIZE*sizeof(bool), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
258+
// result = cudaMemcpy(ar.numPot_dev, numPotential, SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
259+
// result = cudaMemcpy(ar.pot_dev, potentialPools, MAX_CONNECTED*SP_SIZE*sizeof(UInt), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
260+
// result = cudaMemcpy(ar.per_dev, permanences, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
261+
// result = cudaMemcpy(ar.boosts_dev, boosts, MAX_CONNECTED*SP_SIZE*sizeof(Real), cudaMemcpyHostToDevice); if(result) printErrorMessage(result, 0);
245262

246263
// Kernel call
247-
compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev);
264+
compute<<<NUM_BLOCKS, BLOCK_SIZE, sm>>>(ar_dev, data_dev);
248265

249266
// Memcpy from device
250-
result = cudaMemcpy(cols_host, ar.cols_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0);
267+
result = cudaMemcpy(cols_host, data_dev, SP_SIZE*sizeof(bool), cudaMemcpyDeviceToHost); if(result) printErrorMessage(result, 0);
251268

252269
visualize_output(cols_host, SP_SIZE);
253270

254-
cudaFree(ar.in_dev); cudaFree(ar.cols_dev); cudaFree(ar.pot_dev); cudaFree(ar.per_dev); cudaFree(ar.boosts_dev);
255-
cudaFree(ar.odc_dev); cudaFree(ar.adc_dev); cudaFree(ar.numPot_dev);
271+
cudaFree(ar_dev); cudaFree(data_dev);
256272

257273
return 0;
258274
}

SpatialPooler.cu

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ using namespace std;
55
typedef unsigned int UInt;
66
typedef float Real;
77

8+
// Define global constants
9+
const UInt SP_SIZE = 131072;
10+
const UInt IN_SIZE = 262144;
11+
const UInt BLOCK_SIZE = 64; // Two warps
12+
const UInt NUM_BLOCKS = SP_SIZE/BLOCK_SIZE;
13+
const UInt IN_BLOCK_SIZE = IN_SIZE/NUM_BLOCKS; // Size of chunk of input processed by a single cuda block
14+
const UInt MAX_CONNECTED = 16;
15+
const Real IN_DENSITY = 0.5; // Density of input connections
16+
817
struct args
918
{
1019
// Parameters
@@ -24,18 +33,6 @@ struct args
2433
Real minPctOdc;
2534
bool learn;
2635

27-
// Global memory pointers
28-
bool* in_dev;
29-
bool* cols_dev;
30-
UInt* olaps_dev;
31-
UInt* pot_dev;
32-
Real* per_dev;
33-
Real* boosts_dev;
34-
Real* odc_dev; // odc serve to maintain same act. freq. for each col. (per block)
35-
Real* adc_dev; // adc serve to compute boost factors
36-
UInt* numPot_dev;
37-
Real* minOdc_dev; // Stores minumum overlap duty cycles per block
38-
3936
// Constants
4037
UInt SP_SIZE;
4138
UInt MAX_CONNECTED;
@@ -53,7 +50,6 @@ struct args
5350
UInt update_period;
5451
};
5552

56-
5753
// TODO: This could be done via parallel matrix multiplication.
5854
__device__
5955
void calculateOverlap(bool* in_dev, bool* in_sh, UInt* pot_dev, Real* per_dev, Real* boosts_dev, UInt* numPot_dev, UInt* olaps_sh, Real threshold, const UInt inBlockSize, const UInt MAX_CONNECTED)
@@ -278,8 +274,21 @@ void updateMinOdcReduction(Real* odc_dev, Real* odc_sh, Real* minOdc_dev, Real m
278274

279275

280276
__global__
281-
void compute(args* ar_ptr)
277+
void compute(args* ar_ptr, void* data)
282278
{
279+
// Global memory pointers
280+
bool* cols_dev = (bool*) &data;
281+
bool* in_dev = &cols_dev[SP_SIZE];
282+
UInt* pot_dev = (UInt*) &in_dev[IN_SIZE];
283+
UInt* numPot_dev = &pot_dev[SP_SIZE*MAX_CONNECTED];
284+
Real* per_dev = (Real*) &numPot_dev[SP_SIZE];
285+
Real* boosts_dev = &per_dev[SP_SIZE*MAX_CONNECTED];
286+
UInt* olaps_dev = (UInt*) &boosts_dev[SP_SIZE*MAX_CONNECTED];
287+
Real* odc_dev = (Real*) &olaps_dev[SP_SIZE]; // odc serve to maintain same act. freq. for each col. (per block)
288+
Real* adc_dev = &odc_dev[SP_SIZE]; // adc serve to compute boost factors
289+
Real* minOdc_dev = &adc_dev[SP_SIZE]; // Stores minumum overlap duty cycles per block
290+
291+
283292
if (blockIdx.x == 0 && threadIdx.x == 0)
284293
ar_ptr->iteration_num++;
285294

@@ -295,29 +304,29 @@ void compute(args* ar_ptr)
295304
Real* odc_sh = &active_sh[blockDim.x];
296305
bool* in_sh = (bool*) &odc_sh[blockDim.x];
297306

298-
calculateOverlap(ar.in_dev, in_sh, ar.pot_dev, ar.per_dev, ar.boosts_dev, ar.numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
307+
calculateOverlap(in_dev, in_sh, pot_dev, per_dev, boosts_dev, numPot_dev, olaps_sh, ar.synPermConnected, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
299308

300309
__syncthreads();
301310

302-
inhibitColumns(olaps_sh, ar.cols_dev, active_sh, active, ar.localAreaDensity);
311+
inhibitColumns(olaps_sh, cols_dev, active_sh, active, ar.localAreaDensity);
303312

304313
__syncthreads();
305314

306-
adaptSynapses(ar.in_dev, ar.pot_dev, ar.per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
315+
adaptSynapses(in_dev, pot_dev, per_dev, ar.synPermActiveInc, ar.synPermInactiveDec, active, ar.IN_BLOCK_SIZE, ar.MAX_CONNECTED);
307316

308-
updateDutyCycles(ar.odc_dev, ar.adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod);
317+
updateDutyCycles(odc_dev, adc_dev, olaps_sh, active, ar.iteration_num, ar.dutyCyclePeriod);
309318

310319
// active_sh will hold average activity per block for each column
311320
averageActivityReduction(active_sh);
312321

313322
__syncthreads();
314323

315-
updateBoosts(ar.adc_dev, ar.boosts_dev, avg_act, ar.boostStrength);
324+
updateBoosts(adc_dev, boosts_dev, avg_act, ar.boostStrength);
316325

317-
bumpUpColumnsWithWeakOdc(ar.odc_dev, ar.per_dev, ar.numPot_dev, ar.minOdc_dev, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);
326+
bumpUpColumnsWithWeakOdc(odc_dev, per_dev, numPot_dev, minOdc_dev, ar.synPermBelowStimulusInc, ar.MAX_CONNECTED);
318327

319328
if(ar.iteration_num % ar.update_period == 0)
320-
updateMinOdc(ar.odc_dev, odc_sh, ar.minOdc_dev, ar.minPctOdc, ar.SP_SIZE);
329+
updateMinOdc(odc_dev, odc_sh, minOdc_dev, ar.minPctOdc, ar.SP_SIZE);
321330
}
322331

323332
__global__
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#include <stdio.h>
2+
#include <assert.h>
3+
4+
// Check the result of a CUDA runtime API call.
// Wrap around any runtime call; in release builds (neither DEBUG nor
// _DEBUG defined) the check compiles away and the result is passed through.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    const char *what = cudaGetErrorString(result);
    fprintf(stderr, "CUDA Runtime Error: %s\n", what);
    assert(result == cudaSuccess);
  }
#endif
  return result;
}
18+
19+
// Time one H2D and one D2H copy of n floats, print the achieved bandwidth,
// and verify the data survived the round trip.
//   h_a  - host source buffer (n floats), copied to the device
//   h_b  - host destination buffer (n floats), overwritten by the D2H copy
//   d    - device buffer (n floats)
//   n    - element count
//   desc - label for the printed report (e.g. "Pageable", "Pinned");
//          const char*: callers pass string literals, which may not bind
//          to char* in C++11 and later
void profileCopies(float *h_a,
                   float *h_b,
                   float *d,
                   unsigned int n,
                   const char *desc)
{
  printf("\n%s transfers\n", desc);

  unsigned int bytes = n * sizeof(float);

  // events for timing
  cudaEvent_t startEvent, stopEvent;

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  checkCuda( cudaEventRecord(startEvent, 0) );
  checkCuda( cudaMemcpy(d, h_a, bytes, cudaMemcpyHostToDevice) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );

  float time;  // elapsed time in milliseconds
  checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
  // bytes * 1e-6 / ms == GB/s
  printf(" Host to Device bandwidth (GB/s): %f\n", bytes * 1e-6 / time);

  checkCuda( cudaEventRecord(startEvent, 0) );
  checkCuda( cudaMemcpy(h_b, d, bytes, cudaMemcpyDeviceToHost) );
  checkCuda( cudaEventRecord(stopEvent, 0) );
  checkCuda( cudaEventSynchronize(stopEvent) );

  checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
  printf(" Device to Host bandwidth (GB/s): %f\n", bytes * 1e-6 / time);

  // verify the round trip copied every element intact
  // (unsigned index: n is unsigned, avoids signed/unsigned comparison)
  for (unsigned int i = 0; i < n; ++i) {
    if (h_a[i] != h_b[i]) {
      printf("*** %s transfers failed ***\n", desc);
      break;
    }
  }

  // clean up events
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
}
63+
64+
// Benchmark H2D/D2H transfer bandwidth for three kinds of host memory:
// pageable (malloc), pinned (cudaMallocHost), and pinned write-combined
// (cudaHostAlloc with cudaHostAllocWriteCombined).
int main()
{
  unsigned int nElements = 4*1024*1024;
  const unsigned int bytes = nElements * sizeof(float);

  // host arrays
  float *h_aPageable, *h_bPageable;  // regular pageable allocations
  float *h_aPinned, *h_bPinned;      // page-locked
  float *h_aWC, *h_bWC;              // page-locked, write-combined

  // device array
  float *d_a;

  // allocate and initialize
  h_aPageable = (float*)malloc(bytes); // host pageable
  h_bPageable = (float*)malloc(bytes); // host pageable
  checkCuda( cudaMallocHost((void**)&h_aPinned, bytes) ); // host pinned
  checkCuda( cudaMallocHost((void**)&h_bPinned, bytes) ); // host pinned
  checkCuda( cudaHostAlloc((void**)&h_aWC, bytes, cudaHostAllocWriteCombined) ); // host write-combined
  checkCuda( cudaHostAlloc((void**)&h_bWC, bytes, cudaHostAllocWriteCombined) ); // host write-combined
  checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device

  // unsigned index: nElements is unsigned, avoids signed/unsigned comparison
  for (unsigned int i = 0; i < nElements; ++i) h_aPageable[i] = (float)i;
  memcpy(h_aPinned, h_aPageable, bytes);
  memcpy(h_aWC, h_aPageable, bytes);
  memset(h_bPageable, 0, bytes);
  memset(h_bPinned, 0, bytes);
  memset(h_bWC, 0, bytes);

  // output device info and transfer size
  cudaDeviceProp prop;
  checkCuda( cudaGetDeviceProperties(&prop, 0) );

  printf("\nDevice: %s\n", prop.name);
  printf("Transfer size (MB): %u\n", bytes / (1024 * 1024)); // %u: bytes is unsigned

  // perform copies and report bandwidth
  profileCopies(h_aPageable, h_bPageable, d_a, nElements, "Pageable");
  profileCopies(h_aPinned, h_bPinned, d_a, nElements, "Pinned");
  profileCopies(h_aWC, h_bWC, d_a, nElements, "Write-combined");

  printf("\n"); // fix: was printf("n")

  // cleanup
  cudaFree(d_a);
  cudaFreeHost(h_aPinned);
  cudaFreeHost(h_bPinned);
  cudaFreeHost(h_aWC); // fix: write-combined buffers were leaked
  cudaFreeHost(h_bWC);
  free(h_aPageable);
  free(h_bPageable);

  return 0;
}

0 commit comments

Comments
 (0)