support multigrid

GPU-COMCOT supports multigrid using linear spherical solver now. This commit contains a bunch of changes, including boundaries solvers, 2D-interpolators, and some debugging code. It does not look pretty to include so many changes into one commit. But anyway I think it is more important to release these code out because I have not touched it for a very long time. Signed-off-by: Tao Chiu <[email protected]>
AndybnACT · Jan 22, 2023 · f25fdf5 · f25fdf5
1 parent 6ca41a7
commit f25fdf5
Show file tree

Hide file tree

Showing 13 changed files with 726 additions and 97 deletions.
diff --git a/GPUConfig.h b/GPUConfig.h
@@ -23,6 +23,10 @@ extern dim3 DimGridMomt_MN;
 extern dim3 DimBlockMass;
 extern dim3 DimGridMass;
 
+#define BLOCKX_JNZ 64
+#define DEBUG_DEPTH 32
+#define LOOPDEPTH DEBUG_DEPTH
+
 #define BLOCKX_OPENBD 64
 extern dim3 DimBlockOpenBD;
 extern dim3 DimGridOpenBD_LR;

diff --git a/GPUHeader.h b/GPUHeader.h
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include "debug_option.h"
 #include "cuda.h"
 #include <time.h>
 
@@ -44,6 +45,7 @@
     #define CUDA_KERNEL
     #define ID(row,col) (col)*size_dev[0] + row
     #define ID2E(row,col,dim) size_dev[2]*(dim) + ID(row,col)
+    #define ID_MATRIX(row,col,row_size) ((col)*(row_size) + (row)) 
 #endif
 
 #define MAX_LAYERS 20   
@@ -59,10 +61,14 @@ struct GPU_Layer {
     float *R24_hst, *R35_hst, *H_hst;
     float *R_MASS_hst;
     float *Zmax_hst;
+    float *yflux;
+    float *xflux;
     uint32_t l_size[4];
     float grx;
     float gry;
 
+    dim3 DimGridJnz;
+    dim3 DimBlockJnz;
     dim3 DimGridMomt_MN;
     dim3 DimGridMomt;
     dim3 DimGridMass;
@@ -95,3 +101,48 @@ extern "C" void cuda_update_layer_(int *lid);
     extern uint32_t all_size[MAX_LAYERS][4]; // mirror of all_size_dev
     extern cudaDeviceProp dev_prop;
 #endif
+
+#ifdef RELDIFF
+static inline bool assert_diff(float base, float eval) {
+    if (base == 0.0f) {
+        if (eval != 0.0f)
+            return true;
+        else
+            return false;
+    }
+
+    if (fabs((eval - base) / base) >= DIFFRATE)
+        return true;
+    else
+        return false;
+}
+#else
+static inline bool assert_diff(float base, float eval) {
+    if (fabs(eval - base) >= TOLERANCE)
+        return true;
+    else
+        return false;
+}
+#endif /* RELDIFF */
+
+#define CMP_VAR(cmpp, varp, row, col, str,...) { \
+    for (size_t j = 0; j < col; j++) {                          \
+        for (size_t i = 0; i < row; i++) {                      \
+            size_t id = j*(row) + i;                            \
+            double diff = fabs((cmpp)[id] - (varp)[id]);        \
+            if (assert_diff((cmpp)[id], (varp)[id])) {          \
+                printf(str, __VA_ARGS__);                       \
+                printf("(i,j)=(%zu,%zu), %f %f, diff=%lf  ",    \
+                    i, j, (cmpp)[id], (varp)[id], diff);        \
+                printf("%x, %x\n", *(unsigned int*)((cmpp) + id),\
+                                *(unsigned int*)((varp) + id)); \
+            }                                                   \
+        }                                                       \
+    }                                                           \
+}
+
+static inline bool ispow2(unsigned int val) {
+    // returns true if val is power of 2
+    return !(val & (val - 1));
+} 
+
diff --git a/GPUMass_s.cu b/GPUMass_s.cu
@@ -1,5 +1,6 @@
 #include "GPUHeader.h"
 #include "GPUConfig.h"
+#include "debug_option.h"
 
 __global__ void mass_kernel(struct GPU_Layer);
 
@@ -17,10 +18,22 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
     cudaERROR(err);
     fi = clock();
 
-#ifdef DEBUG
-    printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
-
-#endif /* DEBUG */
+#ifdef DEBUG_CORE
+    float *z_cmp;
+    printf("TIME SPENT ON GPU (MASS) %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+    z_cmp = (float*) malloc(L->l_size[3]);
+    cudaCHK( cudaMemcpy(z_cmp, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
+    for (size_t id = 0; id < L->l_size[2]; id++) {
+        if (assert_diff(z_cmp[id], Z_f_complete[id])) {
+            printf("(mass, layerid=%d) i = %d, %f, %f, diff=%f\n",
+                *lid,
+                id, 
+                z_cmp[id], Z_f_complete[id], 
+                fabs(z_cmp[id] - Z_f_complete[id]));
+        }
+    }
+    free(z_cmp);
+#endif /* DEBUG_CORE */
 }
 
 __global__ void mass_kernel(struct GPU_Layer L){
@@ -40,13 +53,23 @@ __global__ void mass_kernel(struct GPU_Layer L){
     //designed for architectures whose warpsize=32
     uint32_t row = blockIdx.x*31*(blockDim.x>>5) + 31*(threadIdx.x>>5) + threadIdx.x%32;
     uint32_t col = blockIdx.y*(size_dev[1]/gridDim.y);
-    uint32_t col_end = (blockIdx.y == gridDim.y-1)? size_dev[1]-1:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
+    uint32_t col_end, row_end;
     float h,z;
     float m, m_suf;
     float n, n_prev;
     float ztmp;
     float r1, r11;
     float r6, r6_prev;
+
+    if (L.lid == 1) {
+        col_end = (blockIdx.y == gridDim.y-1)? 
+            size_dev[1] - 1:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
+        row_end = size_dev[0]-1;
+    } else {
+        col_end = (blockIdx.y == gridDim.y-1)? 
+            size_dev[1]:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
+        row_end = size_dev[0];
+    }
 
     n_prev = MN[ID2E(row,col,1)];
     r6_prev = R_MASS[col*4+1];
@@ -66,7 +89,7 @@ __global__ void mass_kernel(struct GPU_Layer L){
         z =  Z[ID(row,i)];
         n = MN[ID2E(row,i,1)];
         m_suf = __shfl_up_sync(0xFFFFFFFF,m,1);
-        if (threadIdx.x%32 != 0 && row < size_dev[0]-1) {
+        if (threadIdx.x%32 != 0 && row < row_end) {
             ztmp = z - r1*(m-m_suf) - r11*(n*r6-n_prev*r6_prev);
             if (ztmp + h <= EPS) ztmp = -h;
             if (h <= GX || (ztmp < EPS && -ztmp < EPS) ) ztmp = 0.0;

diff --git a/GPUMoment_s.cu b/GPUMoment_s.cu
@@ -1,5 +1,6 @@
 #include "GPUHeader.h"
 #include "GPUConfig.h"
+#include "debug_option.h"
 
 __global__ void momts_kernel(const float* __restrict__,const float* __restrict__,
                     const float* __restrict__,const float* __restrict__,const float* __restrict__);
@@ -12,17 +13,6 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, const int *lid)
     cudaError_t err;
     struct GPU_Layer *L = ldlayer(*lid);
 
-#ifdef DEBUG
-    printf("Z_cu vs Z_f\n");
-    cudaCHK( cudaMemcpy(tmpout, Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
-    for (size_t i = 0; i < l_size[2]; i++) {
-        if (abs(tmpout[i] - Z_f[i]) > ERROR) {
-            printf("err\n");
-        }
-    }
-
-#endif /* DEBUG */
-
     st = clock();
     momt_kernelM <<<  L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[0] >>> (*L);
     momt_kernelN <<<  L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[1] >>> (*L);
@@ -31,15 +21,42 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, const int *lid)
     cudaERROR(err);
     fi = clock();
 
-#ifdef DEBUG
-    printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+#ifdef DEBUG_CORE
+    float *mn_cmp;
+    printf("TIME SPENT ON GPU (MOMT->%d) %f\n",(float)(fi-st)/CLOCKS_PER_SEC, L->lid);
 
-#endif /* DEBUG */
+    mn_cmp = (float*) malloc(2*L->l_size[3]);
+    cudaCHK( cudaMemcpy(
+        mn_cmp, L->MNout_hst, 2*L->l_size[3], cudaMemcpyDeviceToHost) );
+
+    CMP_VAR(mn_cmp, M_f, L->l_size[0], L->l_size[1], "(momt-m, layer-id=%d)", L->lid);
+    CMP_VAR(mn_cmp+L->l_size[2], N_f, L->l_size[0], L->l_size[1], "(momt-n, layer-id=%d)", L->lid);
+
+    // for (size_t id = 0; id < L->l_size[2]; id++) {
+    //     if (fabs(mn_cmp[id] - M_f[id]) >= TOLERANCE) {
+    //         printf("(momt-m, layerid=%d) i = %d, %f, %f, diff=%f\n",
+    //             *lid,
+    //             id, 
+    //             mn_cmp[id], M_f[id], 
+    //             fabs(mn_cmp[id] - M_f[id]));
+    //     }
+    // }
+    // for (size_t id = 0; id < L->l_size[2]; id++) {
+    //     if (fabs(mn_cmp[id + L->l_size[2]] - N_f[id]) >= TOLERANCE) {
+    //         printf("(momt-n, layerid=%d) i = %d, %f, %f, diff=%f\n",
+    //             *lid,
+    //             id, 
+    //             mn_cmp[id + L->l_size[2]], N_f[id], 
+    //             fabs(mn_cmp[id + L->l_size[2]] - N_f[id]));
+    //     }
+    // }
+    free(mn_cmp);
+#endif /* DEBUG_CORE */
 
 }
 
-__global__ void momt_kernelM(struct GPU_Layer L) {
-
+__global__ void momt_kernelM(const struct GPU_Layer L) 
+{    
      const float* __restrict__ Z = L.Zout_hst;
      const float* __restrict__ MN = L.MNdat_hst;
      const float* __restrict__ R2 = L.R24_hst;
@@ -71,19 +88,25 @@ __global__ void momt_kernelM(struct GPU_Layer L) {
      float xm;
      //=====================================================
 
+     // we can simply return since there is no `im1` 
+     if (row == 0 && L.lid != 1)
+         return;
+
      if (row < size_dev[0]) { //lower bound of threads
          n_prev = MN[ID2E(row,col,1)];
          n_ip1jm1_prev = __shfl_down_sync(0xFFFFFFFF, n_prev,1);
-         if (col != 0) col+= 1; // not start at left boundary
+         if (col != 0 || L.lid != 1) col+= 1; // not start at left boundary (consider jm1 at j == 0)
          for (uint32_t j = col; j < col_end; j++) { //grid striding
-             if (threadIdx.x%32 == 31) {
-                 r3 = R3[j];
-             }
-             __syncwarp();
-             r3    = __shfl_sync(0xFFFFFFFF,r3,31);
+             // if (threadIdx.x%32 == 31) {
+             //     r3 = R3[j];
+             // }
+             // __syncwarp();
+             // r3    = __shfl_sync(0xFFFFFFFF,r3,31);
+             r3    = R3[j];
              n     = MN[ID2E(row,j,1)];
              h     = H[ID(row,j)];
              z     = Z[ID(row,j)];
+            // __syncwarp();
 
              n_ip1 = __shfl_down_sync(0xFFFFFFFF, n,1);
              z_ip1 = __shfl_down_sync(0xFFFFFFFF, z,1);
@@ -102,7 +125,7 @@ __global__ void momt_kernelM(struct GPU_Layer L) {
                  }else{
                      MN_out_dev[ID(row,j)] = 0.0f;
                  }
-                 // if (row == 447 && j == 782) {
+                 // if (row == 1495 && L.lid == 2 && j == 50) {
                  //     printf("[%d,%d]===============================[row,j][%d,%d]\n",threadIdx.x,blockIdx.x,row,j );
                  //     printf("%e\t%e\t%e\t%e\t%e\t%e\n",r2,r3,n,n_ip1,n_prev,n_ip1jm1_prev);
                  //     printf("%e\t%e\t%e\n",m,z_ip1,z);
@@ -117,8 +140,8 @@ __global__ void momt_kernelM(struct GPU_Layer L) {
 
 
 
-__global__ void momt_kernelN(struct GPU_Layer L) {
-
+__global__ void momt_kernelN(const struct GPU_Layer L)
+{                                 
     const float* __restrict__ Z = L.Zout_hst;
     const float* __restrict__ MN = L.MNdat_hst;
     const float* __restrict__ R4 = L.R24_hst + L.l_size[2];
@@ -147,8 +170,13 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
     float tot_m;
     float xn;
     //=====================================================
+    bool write = true;
 
-
+    if (L.lid != 1 && row == 0)
+        write = false;
+
+    if (L.lid != 1 && col == 0)
+        col = 1;
 
     if (row < size_dev[0]) { //lower bound of threads;
         m_jp1_prev = MN[ID(row,col_end)];
@@ -158,14 +186,19 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
         h_jp1_prev = H[ID(row,col_end)];
         // !! bug fixed: note that comparison of unsigned expression >= 0 is always true
         for (int j = (int)col_end-1; j >= (int)col; j--) { //grid striding
-            if (threadIdx.x%32 == 0) {
-                r5 = R5[j];
-            }
-            __syncwarp();
-            r5    = __shfl_sync(0xFFFFFFFF,r5,0);
+            // if (threadIdx.x%32 == 0) {
+            //     r5 = R5[j];
+            // }
+            // __syncwarp();
+            // r5    = __shfl_sync(0xFFFFFFFF,r5,0);
+            r5    = R5[j];
             m     = MN[ID(row,j)];
             h     =  H[ID(row,j)];
             m_im1 = __shfl_up_sync(0xFFFFFFFF,m,1);
+
+            // if (L.lid != 1 && (row == 0 || j == 0)) {
+            //     write = false;
+            // }
 
             if (threadIdx.x%32 != 0 || row == 0) { //upper bound of lanes
                 r4    = R4[ID(row,j)];
@@ -183,9 +216,11 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
                     // }
 
                     if (xn < EPS && xn > -EPS) xn = 0.0f;
-                    MN_out_dev[ID2E(row,j,1)] = xn;
+                    if (write)
+                        MN_out_dev[ID2E(row,j,1)] = xn;
                 }else{
-                    MN_out_dev[ID2E(row,j,1)] = 0.0f;
+                    if (write)
+                        MN_out_dev[ID2E(row,j,1)] = 0.0f;
                 }
                 m_jp1_prev    = m;
                 m_im1jp1_prev = m_im1;

diff --git a/GPUOutput.cu b/GPUOutput.cu
@@ -57,23 +57,45 @@ extern "C" void cuda_getz1_(float *Z_f, int *lid) {
     cudaCHK( cudaMemcpy(Z_f, L->Zdat_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
 }
 
+extern "C" void cuda_setz1_(float *Z_f, int *lid) {
+    struct GPU_Layer *L = ldlayer(*lid);
+    cudaCHK( cudaMemcpy(L->Zdat_hst, Z_f, L->l_size[3], cudaMemcpyHostToDevice) );
+}
+
 extern "C" void cuda_getz_(float *Z_f, int *lid) {
     struct GPU_Layer *L = ldlayer(*lid);
     cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
 }
 
+extern "C" void cuda_setz_(float *Z_f, int *lid) {
+    struct GPU_Layer *L = ldlayer(*lid);
+    cudaCHK( cudaMemcpy(L->Zout_hst, Z_f, L->l_size[3], cudaMemcpyHostToDevice) );
+}
+
 extern "C" void cuda_getmn1_(float *M_f, float *N_f, int *lid) {
     struct GPU_Layer *L = ldlayer(*lid);
     cudaCHK( cudaMemcpy(M_f, L->MNdat_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
     cudaCHK( cudaMemcpy(N_f, L->MNdat_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
 }
 
+extern "C" void cuda_setmn1_(float *M_f, float *N_f, int *lid) {
+    struct GPU_Layer *L = ldlayer(*lid);
+    cudaCHK( cudaMemcpy(L->MNdat_hst, M_f, L->l_size[3], cudaMemcpyHostToDevice) );
+    cudaCHK( cudaMemcpy(L->MNdat_hst+L->l_size[2], N_f, L->l_size[3], cudaMemcpyHostToDevice) );
+}
+
 extern "C" void cuda_getmn_(float *M_f, float *N_f, int *lid) {
     struct GPU_Layer *L = ldlayer(*lid);
     cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
     cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
 }
 
+extern "C" void cuda_setmn_(float *M_f, float *N_f, int *lid) {
+    struct GPU_Layer *L = ldlayer(*lid);
+    cudaCHK( cudaMemcpy(L->MNout_hst, M_f, L->l_size[3], cudaMemcpyHostToDevice) );
+    cudaCHK( cudaMemcpy(L->MNout_hst+L->l_size[2], N_f, L->l_size[3], cudaMemcpyHostToDevice) );
+}
+
 extern "C" void cuda_getzmax_(float *Zmax_f, int *lid) {
     struct GPU_Layer *L = ldlayer(*lid);
     cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->l_size[3], cudaMemcpyDeviceToHost) );