Skip to content

Commit

Permalink
support multigrid
Browse files Browse the repository at this point in the history
GPU-COMCOT supports multigrid using linear spherical solver now. This
commit contains a bunch of changes, including boundaries solvers,
2D-interpolators, and some debugging code. It does not look pretty to
include so many changes into one commit. But anyway I think it is more
important to release these code out because I have not touched it for a
very long time.

Signed-off-by: Tao Chiu <[email protected]>
  • Loading branch information
AndybnACT committed Jan 22, 2023
1 parent 6ca41a7 commit f25fdf5
Show file tree
Hide file tree
Showing 13 changed files with 726 additions and 97 deletions.
4 changes: 4 additions & 0 deletions GPUConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ extern dim3 DimGridMomt_MN;
extern dim3 DimBlockMass;
extern dim3 DimGridMass;

#define BLOCKX_JNZ 64
#define DEBUG_DEPTH 32
#define LOOPDEPTH DEBUG_DEPTH

#define BLOCKX_OPENBD 64
extern dim3 DimBlockOpenBD;
extern dim3 DimGridOpenBD_LR;
Expand Down
51 changes: 51 additions & 0 deletions GPUHeader.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <stdio.h>
#include "debug_option.h"
#include "cuda.h"
#include <time.h>

Expand Down Expand Up @@ -44,6 +45,7 @@
#define CUDA_KERNEL
#define ID(row,col) (col)*size_dev[0] + row
#define ID2E(row,col,dim) size_dev[2]*(dim) + ID(row,col)
#define ID_MATRIX(row,col,row_size) ((col)*(row_size) + (row))
#endif

#define MAX_LAYERS 20
Expand All @@ -59,10 +61,14 @@ struct GPU_Layer {
float *R24_hst, *R35_hst, *H_hst;
float *R_MASS_hst;
float *Zmax_hst;
float *yflux;
float *xflux;
uint32_t l_size[4];
float grx;
float gry;

dim3 DimGridJnz;
dim3 DimBlockJnz;
dim3 DimGridMomt_MN;
dim3 DimGridMomt;
dim3 DimGridMass;
Expand Down Expand Up @@ -95,3 +101,48 @@ extern "C" void cuda_update_layer_(int *lid);
extern uint32_t all_size[MAX_LAYERS][4]; // mirror of all_size_dev
extern cudaDeviceProp dev_prop;
#endif

#ifdef RELDIFF
static inline bool assert_diff(float base, float eval) {
if (base == 0.0f) {
if (eval != 0.0f)
return true;
else
return false;
}

if (fabs((eval - base) / base) >= DIFFRATE)
return true;
else
return false;
}
#else
static inline bool assert_diff(float base, float eval) {
if (fabs(eval - base) >= TOLERANCE)
return true;
else
return false;
}
#endif /* RELDIFF */

#define CMP_VAR(cmpp, varp, row, col, str,...) { \
for (size_t j = 0; j < col; j++) { \
for (size_t i = 0; i < row; i++) { \
size_t id = j*(row) + i; \
double diff = fabs((cmpp)[id] - (varp)[id]); \
if (assert_diff((cmpp)[id], (varp)[id])) { \
printf(str, __VA_ARGS__); \
printf("(i,j)=(%zu,%zu), %f %f, diff=%lf ", \
i, j, (cmpp)[id], (varp)[id], diff); \
printf("%x, %x\n", *(unsigned int*)((cmpp) + id),\
*(unsigned int*)((varp) + id)); \
} \
} \
} \
}

static inline bool ispow2(unsigned int val) {
// returns true if val is power of 2
return !(val & (val - 1));
}

35 changes: 29 additions & 6 deletions GPUMass_s.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "GPUHeader.h"
#include "GPUConfig.h"
#include "debug_option.h"

__global__ void mass_kernel(struct GPU_Layer);

Expand All @@ -17,10 +18,22 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
cudaERROR(err);
fi = clock();

#ifdef DEBUG
printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);

#endif /* DEBUG */
#ifdef DEBUG_CORE
float *z_cmp;
printf("TIME SPENT ON GPU (MASS) %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
z_cmp = (float*) malloc(L->l_size[3]);
cudaCHK( cudaMemcpy(z_cmp, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
for (size_t id = 0; id < L->l_size[2]; id++) {
if (assert_diff(z_cmp[id], Z_f_complete[id])) {
printf("(mass, layerid=%d) i = %d, %f, %f, diff=%f\n",
*lid,
id,
z_cmp[id], Z_f_complete[id],
fabs(z_cmp[id] - Z_f_complete[id]));
}
}
free(z_cmp);
#endif /* DEBUG_CORE */
}

__global__ void mass_kernel(struct GPU_Layer L){
Expand All @@ -40,13 +53,23 @@ __global__ void mass_kernel(struct GPU_Layer L){
//designed for architectures whose warpsize=32
uint32_t row = blockIdx.x*31*(blockDim.x>>5) + 31*(threadIdx.x>>5) + threadIdx.x%32;
uint32_t col = blockIdx.y*(size_dev[1]/gridDim.y);
uint32_t col_end = (blockIdx.y == gridDim.y-1)? size_dev[1]-1:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
uint32_t col_end, row_end;
float h,z;
float m, m_suf;
float n, n_prev;
float ztmp;
float r1, r11;
float r6, r6_prev;

if (L.lid == 1) {
col_end = (blockIdx.y == gridDim.y-1)?
size_dev[1] - 1:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
row_end = size_dev[0]-1;
} else {
col_end = (blockIdx.y == gridDim.y-1)?
size_dev[1]:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
row_end = size_dev[0];
}

n_prev = MN[ID2E(row,col,1)];
r6_prev = R_MASS[col*4+1];
Expand All @@ -66,7 +89,7 @@ __global__ void mass_kernel(struct GPU_Layer L){
z = Z[ID(row,i)];
n = MN[ID2E(row,i,1)];
m_suf = __shfl_up_sync(0xFFFFFFFF,m,1);
if (threadIdx.x%32 != 0 && row < size_dev[0]-1) {
if (threadIdx.x%32 != 0 && row < row_end) {
ztmp = z - r1*(m-m_suf) - r11*(n*r6-n_prev*r6_prev);
if (ztmp + h <= EPS) ztmp = -h;
if (h <= GX || (ztmp < EPS && -ztmp < EPS) ) ztmp = 0.0;
Expand Down
101 changes: 68 additions & 33 deletions GPUMoment_s.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "GPUHeader.h"
#include "GPUConfig.h"
#include "debug_option.h"

__global__ void momts_kernel(const float* __restrict__,const float* __restrict__,
const float* __restrict__,const float* __restrict__,const float* __restrict__);
Expand All @@ -12,17 +13,6 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, const int *lid)
cudaError_t err;
struct GPU_Layer *L = ldlayer(*lid);

#ifdef DEBUG
printf("Z_cu vs Z_f\n");
cudaCHK( cudaMemcpy(tmpout, Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
for (size_t i = 0; i < l_size[2]; i++) {
if (abs(tmpout[i] - Z_f[i]) > ERROR) {
printf("err\n");
}
}

#endif /* DEBUG */

st = clock();
momt_kernelM <<< L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[0] >>> (*L);
momt_kernelN <<< L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[1] >>> (*L);
Expand All @@ -31,15 +21,42 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, const int *lid)
cudaERROR(err);
fi = clock();

#ifdef DEBUG
printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
#ifdef DEBUG_CORE
float *mn_cmp;
printf("TIME SPENT ON GPU (MOMT->%d) %f\n",(float)(fi-st)/CLOCKS_PER_SEC, L->lid);

#endif /* DEBUG */
mn_cmp = (float*) malloc(2*L->l_size[3]);
cudaCHK( cudaMemcpy(
mn_cmp, L->MNout_hst, 2*L->l_size[3], cudaMemcpyDeviceToHost) );

CMP_VAR(mn_cmp, M_f, L->l_size[0], L->l_size[1], "(momt-m, layer-id=%d)", L->lid);
CMP_VAR(mn_cmp+L->l_size[2], N_f, L->l_size[0], L->l_size[1], "(momt-n, layer-id=%d)", L->lid);

// for (size_t id = 0; id < L->l_size[2]; id++) {
// if (fabs(mn_cmp[id] - M_f[id]) >= TOLERANCE) {
// printf("(momt-m, layerid=%d) i = %d, %f, %f, diff=%f\n",
// *lid,
// id,
// mn_cmp[id], M_f[id],
// fabs(mn_cmp[id] - M_f[id]));
// }
// }
// for (size_t id = 0; id < L->l_size[2]; id++) {
// if (fabs(mn_cmp[id + L->l_size[2]] - N_f[id]) >= TOLERANCE) {
// printf("(momt-n, layerid=%d) i = %d, %f, %f, diff=%f\n",
// *lid,
// id,
// mn_cmp[id + L->l_size[2]], N_f[id],
// fabs(mn_cmp[id + L->l_size[2]] - N_f[id]));
// }
// }
free(mn_cmp);
#endif /* DEBUG_CORE */

}

__global__ void momt_kernelM(struct GPU_Layer L) {

__global__ void momt_kernelM(const struct GPU_Layer L)
{
const float* __restrict__ Z = L.Zout_hst;
const float* __restrict__ MN = L.MNdat_hst;
const float* __restrict__ R2 = L.R24_hst;
Expand Down Expand Up @@ -71,19 +88,25 @@ __global__ void momt_kernelM(struct GPU_Layer L) {
float xm;
//=====================================================

// we can simply return since there is no `im1`
if (row == 0 && L.lid != 1)
return;

if (row < size_dev[0]) { //lower bound of threads
n_prev = MN[ID2E(row,col,1)];
n_ip1jm1_prev = __shfl_down_sync(0xFFFFFFFF, n_prev,1);
if (col != 0) col+= 1; // not start at left boundary
if (col != 0 || L.lid != 1) col+= 1; // not start at left boundary (consider jm1 at j == 0)
for (uint32_t j = col; j < col_end; j++) { //grid striding
if (threadIdx.x%32 == 31) {
r3 = R3[j];
}
__syncwarp();
r3 = __shfl_sync(0xFFFFFFFF,r3,31);
// if (threadIdx.x%32 == 31) {
// r3 = R3[j];
// }
// __syncwarp();
// r3 = __shfl_sync(0xFFFFFFFF,r3,31);
r3 = R3[j];
n = MN[ID2E(row,j,1)];
h = H[ID(row,j)];
z = Z[ID(row,j)];
// __syncwarp();

n_ip1 = __shfl_down_sync(0xFFFFFFFF, n,1);
z_ip1 = __shfl_down_sync(0xFFFFFFFF, z,1);
Expand All @@ -102,7 +125,7 @@ __global__ void momt_kernelM(struct GPU_Layer L) {
}else{
MN_out_dev[ID(row,j)] = 0.0f;
}
// if (row == 447 && j == 782) {
// if (row == 1495 && L.lid == 2 && j == 50) {
// printf("[%d,%d]===============================[row,j][%d,%d]\n",threadIdx.x,blockIdx.x,row,j );
// printf("%e\t%e\t%e\t%e\t%e\t%e\n",r2,r3,n,n_ip1,n_prev,n_ip1jm1_prev);
// printf("%e\t%e\t%e\n",m,z_ip1,z);
Expand All @@ -117,8 +140,8 @@ __global__ void momt_kernelM(struct GPU_Layer L) {



__global__ void momt_kernelN(struct GPU_Layer L) {

__global__ void momt_kernelN(const struct GPU_Layer L)
{
const float* __restrict__ Z = L.Zout_hst;
const float* __restrict__ MN = L.MNdat_hst;
const float* __restrict__ R4 = L.R24_hst + L.l_size[2];
Expand Down Expand Up @@ -147,8 +170,13 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
float tot_m;
float xn;
//=====================================================
bool write = true;


if (L.lid != 1 && row == 0)
write = false;

if (L.lid != 1 && col == 0)
col = 1;

if (row < size_dev[0]) { //lower bound of threads;
m_jp1_prev = MN[ID(row,col_end)];
Expand All @@ -158,14 +186,19 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
h_jp1_prev = H[ID(row,col_end)];
// !! bug fixed: note that comparison of unsigned expression >= 0 is always true
for (int j = (int)col_end-1; j >= (int)col; j--) { //grid striding
if (threadIdx.x%32 == 0) {
r5 = R5[j];
}
__syncwarp();
r5 = __shfl_sync(0xFFFFFFFF,r5,0);
// if (threadIdx.x%32 == 0) {
// r5 = R5[j];
// }
// __syncwarp();
// r5 = __shfl_sync(0xFFFFFFFF,r5,0);
r5 = R5[j];
m = MN[ID(row,j)];
h = H[ID(row,j)];
m_im1 = __shfl_up_sync(0xFFFFFFFF,m,1);

// if (L.lid != 1 && (row == 0 || j == 0)) {
// write = false;
// }

if (threadIdx.x%32 != 0 || row == 0) { //upper bound of lanes
r4 = R4[ID(row,j)];
Expand All @@ -183,9 +216,11 @@ __global__ void momt_kernelN(struct GPU_Layer L) {
// }

if (xn < EPS && xn > -EPS) xn = 0.0f;
MN_out_dev[ID2E(row,j,1)] = xn;
if (write)
MN_out_dev[ID2E(row,j,1)] = xn;
}else{
MN_out_dev[ID2E(row,j,1)] = 0.0f;
if (write)
MN_out_dev[ID2E(row,j,1)] = 0.0f;
}
m_jp1_prev = m;
m_im1jp1_prev = m_im1;
Expand Down
22 changes: 22 additions & 0 deletions GPUOutput.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,45 @@ extern "C" void cuda_getz1_(float *Z_f, int *lid) {
cudaCHK( cudaMemcpy(Z_f, L->Zdat_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_setz1_(float *Z_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(L->Zdat_hst, Z_f, L->l_size[3], cudaMemcpyHostToDevice) );
}

extern "C" void cuda_getz_(float *Z_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_setz_(float *Z_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(L->Zout_hst, Z_f, L->l_size[3], cudaMemcpyHostToDevice) );
}

extern "C" void cuda_getmn1_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(M_f, L->MNdat_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
cudaCHK( cudaMemcpy(N_f, L->MNdat_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_setmn1_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(L->MNdat_hst, M_f, L->l_size[3], cudaMemcpyHostToDevice) );
cudaCHK( cudaMemcpy(L->MNdat_hst+L->l_size[2], N_f, L->l_size[3], cudaMemcpyHostToDevice) );
}

extern "C" void cuda_getmn_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_setmn_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(L->MNout_hst, M_f, L->l_size[3], cudaMemcpyHostToDevice) );
cudaCHK( cudaMemcpy(L->MNout_hst+L->l_size[2], N_f, L->l_size[3], cudaMemcpyHostToDevice) );
}

extern "C" void cuda_getzmax_(float *Zmax_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
Expand Down
Loading

0 comments on commit f25fdf5

Please sign in to comment.