
Commit

Refactor
Rename a member of GPU_Layer: size_hst -> l_size
Delete dead DEBUG sections.
Delete kernel functions that will no longer be maintained.
Remove unused global variables.
Merge the GPUOpen_BD header into GPUOpen_BD.cu.
The AoS approach does not benefit SIMD execution in the Momt kernels (illustrated in the sketch below).
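
The following sketch is illustrative only and is not code from this commit: the CellAoS struct, the kernel names, and the arithmetic are placeholders. It shows the layout trade-off behind the last point: with an array-of-structs, the threads of a warp load strided struct fields, whereas the struct-of-arrays layout kept by GPU_Layer (separate Zout_hst, MNdat_hst, H_hst, ... arrays) lets a warp read consecutive floats from each array, which is what warp-level SIMD execution rewards.

// Hypothetical AoS layout: one struct per grid cell.
struct CellAoS { float z, m, n, h; };

__global__ void aos_kernel(const CellAoS *cells, float *out, int ncell)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ncell) return;
    // Each thread pulls in a 16-byte struct, so a warp's accesses to any one
    // field are strided and touch far more cache lines than necessary.
    out[i] = cells[i].z * cells[i].h + cells[i].m - cells[i].n;
}

// SoA layout, as in GPU_Layer: one flat array per field.
__global__ void soa_kernel(const float *z, const float *m, const float *n,
                           const float *h, float *out, int ncell)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ncell) return;
    // Adjacent threads read adjacent floats from each array: fully coalesced.
    out[i] = z[i] * h[i] + m[i] - n[i];
}
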
AndybnACT committed Jan 31, 2022
1 parent b13ae31 commit a8be928
Showing 8 changed files with 97 additions and 332 deletions.
18 changes: 1 addition & 17 deletions GPUHeader.h
@@ -10,8 +10,6 @@
#define CHKR 720
#define CHKC 304
#define CHKSI 10
-#define ID_hst(row,col) col*size_hst[0] + row
-//#define ID2E_hst(row,col,dim) size_hst[2]*dim + ID(row,col)
extern float* tmpout;
#endif
#endif
@@ -58,7 +56,7 @@ struct GPU_Layer {
float *R24_hst, *R35_hst, *H_hst;
float *R_MASS_hst;
float *Zmax_hst;
-uint32_t size_hst[4];
+uint32_t l_size[4];

dim3 DimGridMomt_MN;
dim3 DimGridMomt;
@@ -77,21 +75,7 @@ static inline struct GPU_Layer* ldlayer(int lid){
}

#ifndef CUDA_GLOB_VAR
-extern float *Zout_hst, *MNout_hst;
-extern float *MNdat_hst, *Zdat_hst;
-extern float *R24_hst, *R35_hst, *H_hst;
-//float *R1_hst, *R6_hst, *R11_hst;
-extern float *R_MASS_hst;
-
-// extern __device__ float *R35_dev;
-// extern __device__ float *R24_dev, *H_dev;
-// extern __device__ float *Z_dat_dev, *MN_dat_dev;
-extern __device__ float *MN_out_dev, *Z_out_dev;
extern __constant__ __device__ uint32_t all_size_dev[MAX_LAYERS][4];
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> ZtexRef;
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> MtexRef;
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> NtexRef;
-extern float *Zmax_hst;
extern uint32_t all_size[MAX_LAYERS][4]; // mirror of all_size_dev
extern cudaDeviceProp dev_prop;
#endif
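
Taken together, the GPUHeader.h changes keep all per-layer sizes in GPU_Layer::l_size and reach them through ldlayer(). The sketch below is a hypothetical host-side helper, not part of the commit; the index meanings are inferred from how l_size is used elsewhere in this diff and should be treated as an assumption.

// Hypothetical helper in a .cu file; inferred l_size meanings:
//   l_size[0] = rows, l_size[1] = columns,
//   l_size[2] = rows * columns, l_size[3] = rows * columns * sizeof(float)
#include <cstdio>
#include "GPUHeader.h"
#include "GPUConfig.h"

static void print_cell(const float *Z_host, int lid, int row, int col)
{
    struct GPU_Layer *L = ldlayer(lid);
    // Column-major indexing, matching the removed ID_hst(row,col) macro:
    // element (row, col) lives at col * rows + row.
    if (row < (int)L->l_size[0] && col < (int)L->l_size[1])
        printf("Z(%d,%d) = %e\n", row, col, Z_host[col * L->l_size[0] + row]);
}
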
18 changes: 3 additions & 15 deletions GPUMass_s.cu
@@ -10,9 +10,6 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
cudaError_t err;
clock_t st, fi;
struct GPU_Layer *L = ldlayer(*lid);

-// cudaCHK( cudaMemcpy(H_hst, H_f, size_hst[3], cudaMemcpyHostToDevice) );
-//cudaCHK( cudaMemcpy(Zdat_hst, Z_f, size_hst[3], cudaMemcpyHostToDevice) );

st = clock();
mass_kernel <<< L->DimGridMass, DimBlockMass >>> (*L);
@@ -21,19 +18,10 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
cudaERROR(err);
fi = clock();

-#ifdef DEBUG
-printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
-// printf("printing information for debugging\n" );
-// cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-// for (size_t i = 0; i < size_hst[2]; i++) {
-// if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
-// printf("Z[%d] Z_cu:%e Z_f:%e %e\n", i, tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
-// }
-// }
-#else
-// cudaCHK( cudaMemcpy(Z_f_complete, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-#endif
+#ifdef DEBUG
+printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+#endif /* DEBUG */
}

__global__ void mass_kernel(struct GPU_Layer L){
137 changes: 15 additions & 122 deletions GPUMoment_s.cu
@@ -13,25 +13,16 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, int *lid) {
cudaError_t err;
struct GPU_Layer *L = ldlayer(*lid);

-#ifdef DEBUG
-printf("Z_cu vs Z_f\n");
-cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - Z_f[i]) > ERROR) {
-printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%size_hst[0], i/size_hst[0], tmpout[i], Z_f[i], tmpout[i] - Z_f[i]);
-}
-}
-#endif
-
-// // kernel launch
-// st = clock();
-// momts_kernel <<< DimGridMomt, DimBlockMomt >>> (Zout_hst, MNdat_hst, R24_hst, R35_hst, H_hst);
-// cudaDeviceSynchronize();
-// err = cudaGetLastError();
-// cudaERROR(err);
-// fi = clock();
+#ifdef DEBUG
+printf("Z_cu vs Z_f\n");
+cudaCHK( cudaMemcpy(tmpout, Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
+for (size_t i = 0; i < l_size[2]; i++) {
+if (abs(tmpout[i] - Z_f[i]) > ERROR) {
+printf("err\n");
+}
+}
+#endif /* DEBUG */

st = clock();
momt_kernelM <<< L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[0] >>> (*L);
@@ -41,24 +32,10 @@
cudaERROR(err);
fi = clock();

-#ifdef DEBUG
-printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
-printf("printing debug information\n" );
-cudaCHK( cudaMemcpy(tmpout, MNout_hst, 2*size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - M_f[i]) > ERROR) {
-printf("M[%d,%d] M_cu:%e M_f:%e %e\n", i%size_hst[0], i/size_hst[0], tmpout[i], M_f[i], tmpout[i] - M_f[i]);
-}
-}
-for (size_t i = size_hst[2], j=0; i < 2*size_hst[2]; i++, j++) {
-if (abs(tmpout[i] - N_f[j]) > ERROR) {
-printf("N[%d,%d] N_cu:%e N_f:%e %e\n", (i-size_hst[2])%size_hst[0], (i-size_hst[2])/size_hst[0], tmpout[i], N_f[j], tmpout[i] - N_f[j]);
-}
-}
-#else
-// cudaCHK( cudaMemcpy(M_f, MNout_hst, size_hst[3], cudaMemcpyDeviceToHost) );// FUTURE: delete
-// cudaCHK( cudaMemcpy(N_f, MNout_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );// FUTURE: delete
-#endif
+#ifdef DEBUG
+printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+#endif /* DEBUG */

}

@@ -145,8 +122,8 @@ __global__ void momt_kernelN(struct GPU_Layer L) {

const float* __restrict__ Z = L.Zout_hst;
const float* __restrict__ MN = L.MNdat_hst;
-const float* __restrict__ R4 = L.R24_hst + L.size_hst[2];
-const float* __restrict__ R5 = L.R35_hst + L.size_hst[1];
+const float* __restrict__ R4 = L.R24_hst + L.l_size[2];
+const float* __restrict__ R5 = L.R35_hst + L.l_size[1];
const float* __restrict__ H = L.H_hst;
float* __restrict__ MN_out_dev = L.MNout_hst;
const uint32_t __restrict__ *size_dev = all_size_dev[L.lid];
@@ -222,87 +199,3 @@ __global__ void momt_kernelN(struct GPU_Layer L) {


}

/*
__global__ void momts_kernel(const float* __restrict__ Z, const float* __restrict__ MN,
const float* __restrict__ R24, const float* __restrict__ R35,
const float* __restrict__ H) {
__shared__ float Z_dat[BLOCKX][BLOCKY+1]; // BLOCKY+1 for preventing bank conflicts
__shared__ float MN_dat[BLOCKX][BLOCKY+1][2];
__shared__ float H_dat[BLOCKX][BLOCKY+1];
int row = blockIdx.x*(EXECX) + threadIdx.x;
int col = blockIdx.y*(EXECY) + threadIdx.y;
int id = ID(row, col);
//int idr2 = id+size_dev[2];
int idmn = id+threadIdx.z*size_dev[2];
float r1, r2, tot=0.0f, x=0.0f;
if (row < size_dev[0] && col < size_dev[1]){
//printf("%d %d %d\n",row, col, id);
H_dat[threadIdx.x][threadIdx.y] = H[id]; // -->texture mem might be better
Z_dat[threadIdx.x][threadIdx.y] = Z[id];
MN_dat[threadIdx.x][threadIdx.y][threadIdx.z] = MN[idmn];
__syncthreads();
if (threadIdx.z == 0) { // for M
if (threadIdx.y != 0 || col == 0) {// boundary condition in threadblocks (if 1st thread is not 1st col)
if (threadIdx.x < EXECX){ // boundary condition in threadblocks
if (row < size_dev[0] - 1){ // IS:IE
int ip1 = threadIdx.x+1; // i plus 1
int jm1 = threadIdx.y-1; // j minus 1 (only used when col != 0)
if (H_dat[threadIdx.x][threadIdx.y] > GX && H_dat[ip1][threadIdx.y] > GX) { // preconditions //NOTE redundant checks
r1 = R24[id];
r2 = R35[col];
tot = MN_dat[threadIdx.x][threadIdx.y][1] + MN_dat[ip1][threadIdx.y][1];
x = -r1*(Z_dat[ip1][threadIdx.y] - Z_dat[threadIdx.x][threadIdx.y]);
if (col != 0) {
tot += MN_dat[threadIdx.x][jm1][1] + MN_dat[ip1][jm1][1];
}else{ // first col
tot += tot;
}
tot *= r2;
x += MN_dat[threadIdx.x][threadIdx.y][0];
tot += x;
if ( tot > EPS || -tot > EPS) MN_out_dev[id] = tot;
else MN_out_dev[id] = 0;
// if (row == 446 && col == 788) {
// printf("[%d,%d,%d]==================================>momt\n",threadIdx.x,threadIdx.y,threadIdx.z );
// printf("%e\t%e\t%e\t%e\t%e\t%e\n",r1,r2, MN_dat[threadIdx.x][threadIdx.y][1], MN_dat[ip1][threadIdx.y][1], MN_dat[threadIdx.x][jm1][1], MN_dat[ip1][jm1][1] );
// printf("%e\t%e\n",Z_dat[ip1][threadIdx.y], Z_dat[threadIdx.x][threadIdx.y] );
// }
}
}
}
}
}else{ // for N
if (threadIdx.x != 0 || row == 0) {// boundary condition in threadblocks
if (threadIdx.y < EXECY){ //boundary condotion in threadblocks
if (col < size_dev[1] - 1) { //JS:JE
int jp1 = threadIdx.y+1;
int im1 = threadIdx.x-1;
if (H_dat[threadIdx.x][threadIdx.y] > GX && H_dat[threadIdx.x][jp1] > GX) { // preconditions
r1 = R24[idmn];
r2 = R35[col+size_dev[1]];
tot = MN_dat[threadIdx.x][threadIdx.y][0] + MN_dat[threadIdx.x][jp1][0];
x = -r1*(Z_dat[threadIdx.x][jp1] - Z_dat[threadIdx.x][threadIdx.y]);
if (row == 0) { // first row
tot += tot;
}else{
tot += MN_dat[im1][threadIdx.y][0] + MN_dat[im1][jp1][0];
}
tot *= r2;
x += MN_dat[threadIdx.x][threadIdx.y][1];
tot = x - tot;
if ( tot > EPS || -tot > EPS) MN_out_dev[idmn] = tot;
else MN_out_dev[idmn] = 0;
}
}
}
}
}
}
}
*/
25 changes: 16 additions & 9 deletions GPUOpen_BD.cu
@@ -1,7 +1,14 @@
#include "GPUHeader.h"
#include "GPUConfig.h"

#include "GPUOpen_BD.h"
typedef enum bd_side{
LEFT,
RIGHT,
TOP,
BOTTOM,
} bdside;

__global__ void openbd_kernel(struct GPU_Layer, bdside);

extern "C" void openbd_launch_(float *Z_f_complete) {
/* Only for the outermost layer, assume its layerid = 0 */
@@ -17,15 +24,15 @@ extern "C" void openbd_launch_(float *Z_f_complete) {
cudaERROR(err);


-#ifdef DEBUG
-printf("printing information for debugging\n" );
-cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
-printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%size_hst[0], i/size_hst[0] , tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
-}
-}
-#endif
+#ifdef DEBUG
+printf("printing information for debugging\n" );
+cudaCHK( cudaMemcpy(tmpout, Zout_hst, l_size[3], cudaMemcpyDeviceToHost) );
+for (size_t i = 0; i < l_size[2]; i++) {
+if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
+printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%l_size[0], i/l_size[0] , tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
+}
+}
+#endif /* DEBUG */

}

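
With GPUOpen_BD.h merged away, the boundary kernel is now declared directly in GPUOpen_BD.cu and takes a bdside argument. The hunk does not show how openbd_launch_ drives it, so the per-side loop and the launch configuration below are assumptions: a minimal sketch of one plausible pattern, assumed to live in GPUOpen_BD.cu so that bdside and openbd_kernel are in scope.

// Hypothetical sketch only; grid/block sizes are placeholders.
static void openbd_launch_all_sides(struct GPU_Layer *L)
{
    const bdside sides[4] = { LEFT, RIGHT, TOP, BOTTOM };
    dim3 grid(64), block(256);                // assumed launch configuration
    for (int s = 0; s < 4; ++s) {
        openbd_kernel <<< grid, block >>> (*L, sides[s]);
    }
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    cudaERROR(err);
}
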
16 changes: 0 additions & 16 deletions GPUOpen_BD.h

This file was deleted.

14 changes: 7 additions & 7 deletions GPUOutput.cu
@@ -14,7 +14,7 @@ extern "C" void maxamp_launch_(const int *lid){
struct GPU_Layer *L = ldlayer(*lid);

st = clock();
-maximum_recorder_kernel <<< L->GridMaxAmp, MAXAMP_BLOCK >>> (L->Zmax_hst, L->Zout_hst, L->size_hst[2]);
+maximum_recorder_kernel <<< L->GridMaxAmp, MAXAMP_BLOCK >>> (L->Zmax_hst, L->Zout_hst, L->l_size[2]);
cudaDeviceSynchronize();
err = cudaGetLastError();
cudaERROR(err);
@@ -40,21 +40,21 @@ __global__ void maximum_recorder_kernel(float *max_array, const float* __restric

extern "C" void cuda_getz_(float *Z_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_getmn_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
-cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->size_hst[2], L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_getzmax_(float *Zmax_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

// void cuda_getmnmax_(float *Mmax_f, float *Nmax_f) {
-// cudaCHK( cudaMemcpy(Mmax_f, MNmax_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-// cudaCHK( cudaMemcpy(Nmax_f, MNmax_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );
+// cudaCHK( cudaMemcpy(Mmax_f, MNmax_hst, l_size[3], cudaMemcpyDeviceToHost) );
+// cudaCHK( cudaMemcpy(Nmax_f, MNmax_hst+l_size[2], l_size[3], cudaMemcpyDeviceToHost) );
// }