
Commit

Refactor
Rename a member of GPU_Layer: size_hst -> l_size
Delete dead DEBUG sections.
Delete kernel functions that will no longer be maintained.
Remove unused global variables.
Merge the GPUOpen_BD header into GPUOpen_BD.cu.
The AoS approach does not benefit SIMD execution in the Momt kernels (illustrated in the sketch below).
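
The following sketch is illustrative only and is not code from this commit: the CellAoS struct, the kernel names, and the arithmetic are placeholders. It shows the layout trade-off behind the last point: with an array-of-structs, the threads of a warp load strided struct fields, whereas the struct-of-arrays layout kept by GPU_Layer (separate Zout_hst, MNdat_hst, H_hst, ... arrays) lets a warp read consecutive floats from each array, which is what warp-level SIMD execution rewards.

// Hypothetical AoS layout: one struct per grid cell.
struct CellAoS { float z, m, n, h; };

__global__ void aos_kernel(const CellAoS *cells, float *out, int ncell)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ncell) return;
    // Each thread pulls in a 16-byte struct, so a warp's accesses to any one
    // field are strided and touch far more cache lines than necessary.
    out[i] = cells[i].z * cells[i].h + cells[i].m - cells[i].n;
}

// SoA layout, as in GPU_Layer: one flat array per field.
__global__ void soa_kernel(const float *z, const float *m, const float *n,
                           const float *h, float *out, int ncell)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ncell) return;
    // Adjacent threads read adjacent floats from each array: fully coalesced.
    out[i] = z[i] * h[i] + m[i] - n[i];
}
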
AndybnACT committed Jan 31, 2022
1 parent b13ae31 commit a8be928
Showing 8 changed files with 97 additions and 332 deletions.
18 changes: 1 addition & 17 deletions GPUHeader.h
@@ -10,8 +10,6 @@
#define CHKR 720
#define CHKC 304
#define CHKSI 10
-#define ID_hst(row,col) col*size_hst[0] + row
-//#define ID2E_hst(row,col,dim) size_hst[2]*dim + ID(row,col)
extern float* tmpout;
#endif
#endif
@@ -58,7 +56,7 @@ struct GPU_Layer {
float *R24_hst, *R35_hst, *H_hst;
float *R_MASS_hst;
float *Zmax_hst;
-uint32_t size_hst[4];
+uint32_t l_size[4];

dim3 DimGridMomt_MN;
dim3 DimGridMomt;
@@ -77,21 +75,7 @@ static inline struct GPU_Layer* ldlayer(int lid){
}

#ifndef CUDA_GLOB_VAR
-extern float *Zout_hst, *MNout_hst;
-extern float *MNdat_hst, *Zdat_hst;
-extern float *R24_hst, *R35_hst, *H_hst;
-//float *R1_hst, *R6_hst, *R11_hst;
-extern float *R_MASS_hst;
-
-// extern __device__ float *R35_dev;
-// extern __device__ float *R24_dev, *H_dev;
-// extern __device__ float *Z_dat_dev, *MN_dat_dev;
-extern __device__ float *MN_out_dev, *Z_out_dev;
extern __constant__ __device__ uint32_t all_size_dev[MAX_LAYERS][4];
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> ZtexRef;
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> MtexRef;
-// extern texture <float, cudaTextureType2D, cudaReadModeElementType> NtexRef;
-extern float *Zmax_hst;
extern uint32_t all_size[MAX_LAYERS][4]; // mirror of all_size_dev
extern cudaDeviceProp dev_prop;
#endif
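
Taken together, the GPUHeader.h changes keep all per-layer sizes in GPU_Layer::l_size and reach them through ldlayer(). The sketch below is a hypothetical host-side helper, not part of the commit; the index meanings are inferred from how l_size is used elsewhere in this diff and should be treated as an assumption.

// Hypothetical helper in a .cu file; inferred l_size meanings:
//   l_size[0] = rows, l_size[1] = columns,
//   l_size[2] = rows * columns, l_size[3] = rows * columns * sizeof(float)
#include <cstdio>
#include "GPUHeader.h"
#include "GPUConfig.h"

static void print_cell(const float *Z_host, int lid, int row, int col)
{
    struct GPU_Layer *L = ldlayer(lid);
    // Column-major indexing, matching the removed ID_hst(row,col) macro:
    // element (row, col) lives at col * rows + row.
    if (row < (int)L->l_size[0] && col < (int)L->l_size[1])
        printf("Z(%d,%d) = %e\n", row, col, Z_host[col * L->l_size[0] + row]);
}
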
18 changes: 3 additions & 15 deletions GPUMass_s.cu
@@ -10,9 +10,6 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
cudaError_t err;
clock_t st, fi;
struct GPU_Layer *L = ldlayer(*lid);

-// cudaCHK( cudaMemcpy(H_hst, H_f, size_hst[3], cudaMemcpyHostToDevice) );
-//cudaCHK( cudaMemcpy(Zdat_hst, Z_f, size_hst[3], cudaMemcpyHostToDevice) );

st = clock();
mass_kernel <<< L->DimGridMass, DimBlockMass >>> (*L);
@@ -21,19 +18,10 @@ extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float
cudaERROR(err);
fi = clock();

-#ifdef DEBUG
-printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
-// printf("printing information for debugging\n" );
-// cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-// for (size_t i = 0; i < size_hst[2]; i++) {
-// if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
-// printf("Z[%d] Z_cu:%e Z_f:%e %e\n", i, tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
-// }
-// }
-#else
-// cudaCHK( cudaMemcpy(Z_f_complete, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-#endif
+#ifdef DEBUG
+printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+#endif /* DEBUG */
}

__global__ void mass_kernel(struct GPU_Layer L){
137 changes: 15 additions & 122 deletions GPUMoment_s.cu
@@ -13,25 +13,16 @@ extern "C" void momt_launch_(float *M_f, float *N_f, float *Z_f, int *lid) {
cudaError_t err;
struct GPU_Layer *L = ldlayer(*lid);

-#ifdef DEBUG
-printf("Z_cu vs Z_f\n");
-cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - Z_f[i]) > ERROR) {
-printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%size_hst[0], i/size_hst[0], tmpout[i], Z_f[i], tmpout[i] - Z_f[i]);
-}
-}
-#endif
-
-// // kernel launch
-// st = clock();
-// momts_kernel <<< DimGridMomt, DimBlockMomt >>> (Zout_hst, MNdat_hst, R24_hst, R35_hst, H_hst);
-// cudaDeviceSynchronize();
-// err = cudaGetLastError();
-// cudaERROR(err);
-// fi = clock();
+#ifdef DEBUG
+printf("Z_cu vs Z_f\n");
+cudaCHK( cudaMemcpy(tmpout, Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
+for (size_t i = 0; i < l_size[2]; i++) {
+if (abs(tmpout[i] - Z_f[i]) > ERROR) {
+printf("err\n");
+}
+}
+#endif /* DEBUG */

st = clock();
momt_kernelM <<< L->DimGridMomt_MN, DimBlockMomt_MN, 0, EXECstream[0] >>> (*L);
@@ -41,24 +32,10 @@
cudaERROR(err);
fi = clock();

-#ifdef DEBUG
-printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
-printf("printing debug information\n" );
-cudaCHK( cudaMemcpy(tmpout, MNout_hst, 2*size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - M_f[i]) > ERROR) {
-printf("M[%d,%d] M_cu:%e M_f:%e %e\n", i%size_hst[0], i/size_hst[0], tmpout[i], M_f[i], tmpout[i] - M_f[i]);
-}
-}
-for (size_t i = size_hst[2], j=0; i < 2*size_hst[2]; i++, j++) {
-if (abs(tmpout[i] - N_f[j]) > ERROR) {
-printf("N[%d,%d] N_cu:%e N_f:%e %e\n", (i-size_hst[2])%size_hst[0], (i-size_hst[2])/size_hst[0], tmpout[i], N_f[j], tmpout[i] - N_f[j]);
-}
-}
-#else
-// cudaCHK( cudaMemcpy(M_f, MNout_hst, size_hst[3], cudaMemcpyDeviceToHost) );// FUTURE: delete
-// cudaCHK( cudaMemcpy(N_f, MNout_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );// FUTURE: delete
-#endif
+#ifdef DEBUG
+printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
+#endif /* DEBUG */

}

@@ -145,8 +122,8 @@ __global__ void momt_kernelN(struct GPU_Layer L) {

const float* __restrict__ Z = L.Zout_hst;
const float* __restrict__ MN = L.MNdat_hst;
-const float* __restrict__ R4 = L.R24_hst + L.size_hst[2];
-const float* __restrict__ R5 = L.R35_hst + L.size_hst[1];
+const float* __restrict__ R4 = L.R24_hst + L.l_size[2];
+const float* __restrict__ R5 = L.R35_hst + L.l_size[1];
const float* __restrict__ H = L.H_hst;
float* __restrict__ MN_out_dev = L.MNout_hst;
const uint32_t __restrict__ *size_dev = all_size_dev[L.lid];
@@ -222,87 +199,3 @@ __global__ void momt_kernelN(struct GPU_Layer L) {


}

/*
__global__ void momts_kernel(const float* __restrict__ Z, const float* __restrict__ MN,
const float* __restrict__ R24, const float* __restrict__ R35,
const float* __restrict__ H) {
__shared__ float Z_dat[BLOCKX][BLOCKY+1]; // BLOCKY+1 for preventing bank conflicts
__shared__ float MN_dat[BLOCKX][BLOCKY+1][2];
__shared__ float H_dat[BLOCKX][BLOCKY+1];
int row = blockIdx.x*(EXECX) + threadIdx.x;
int col = blockIdx.y*(EXECY) + threadIdx.y;
int id = ID(row, col);
//int idr2 = id+size_dev[2];
int idmn = id+threadIdx.z*size_dev[2];
float r1, r2, tot=0.0f, x=0.0f;
if (row < size_dev[0] && col < size_dev[1]){
//printf("%d %d %d\n",row, col, id);
H_dat[threadIdx.x][threadIdx.y] = H[id]; // -->texture mem might be better
Z_dat[threadIdx.x][threadIdx.y] = Z[id];
MN_dat[threadIdx.x][threadIdx.y][threadIdx.z] = MN[idmn];
__syncthreads();
if (threadIdx.z == 0) { // for M
if (threadIdx.y != 0 || col == 0) {// boundary condition in threadblocks (if 1st thread is not 1st col)
if (threadIdx.x < EXECX){ // boundary condition in threadblocks
if (row < size_dev[0] - 1){ // IS:IE
int ip1 = threadIdx.x+1; // i plus 1
int jm1 = threadIdx.y-1; // j minus 1 (only used when col != 0)
if (H_dat[threadIdx.x][threadIdx.y] > GX && H_dat[ip1][threadIdx.y] > GX) { // preconditions //NOTE redundant checks
r1 = R24[id];
r2 = R35[col];
tot = MN_dat[threadIdx.x][threadIdx.y][1] + MN_dat[ip1][threadIdx.y][1];
x = -r1*(Z_dat[ip1][threadIdx.y] - Z_dat[threadIdx.x][threadIdx.y]);
if (col != 0) {
tot += MN_dat[threadIdx.x][jm1][1] + MN_dat[ip1][jm1][1];
}else{ // first col
tot += tot;
}
tot *= r2;
x += MN_dat[threadIdx.x][threadIdx.y][0];
tot += x;
if ( tot > EPS || -tot > EPS) MN_out_dev[id] = tot;
else MN_out_dev[id] = 0;
// if (row == 446 && col == 788) {
// printf("[%d,%d,%d]==================================>momt\n",threadIdx.x,threadIdx.y,threadIdx.z );
// printf("%e\t%e\t%e\t%e\t%e\t%e\n",r1,r2, MN_dat[threadIdx.x][threadIdx.y][1], MN_dat[ip1][threadIdx.y][1], MN_dat[threadIdx.x][jm1][1], MN_dat[ip1][jm1][1] );
// printf("%e\t%e\n",Z_dat[ip1][threadIdx.y], Z_dat[threadIdx.x][threadIdx.y] );
// }
}
}
}
}
}else{ // for N
if (threadIdx.x != 0 || row == 0) {// boundary condition in threadblocks
if (threadIdx.y < EXECY){ //boundary condotion in threadblocks
if (col < size_dev[1] - 1) { //JS:JE
int jp1 = threadIdx.y+1;
int im1 = threadIdx.x-1;
if (H_dat[threadIdx.x][threadIdx.y] > GX && H_dat[threadIdx.x][jp1] > GX) { // preconditions
r1 = R24[idmn];
r2 = R35[col+size_dev[1]];
tot = MN_dat[threadIdx.x][threadIdx.y][0] + MN_dat[threadIdx.x][jp1][0];
x = -r1*(Z_dat[threadIdx.x][jp1] - Z_dat[threadIdx.x][threadIdx.y]);
if (row == 0) { // first row
tot += tot;
}else{
tot += MN_dat[im1][threadIdx.y][0] + MN_dat[im1][jp1][0];
}
tot *= r2;
x += MN_dat[threadIdx.x][threadIdx.y][1];
tot = x - tot;
if ( tot > EPS || -tot > EPS) MN_out_dev[idmn] = tot;
else MN_out_dev[idmn] = 0;
}
}
}
}
}
}
}
*/
25 changes: 16 additions & 9 deletions GPUOpen_BD.cu
@@ -1,7 +1,14 @@
#include "GPUHeader.h"
#include "GPUConfig.h"

#include "GPUOpen_BD.h"
typedef enum bd_side{
LEFT,
RIGHT,
TOP,
BOTTOM,
} bdside;

__global__ void openbd_kernel(struct GPU_Layer, bdside);

extern "C" void openbd_launch_(float *Z_f_complete) {
/* Only for the outermost layer, assume its layerid = 0 */
@@ -17,15 +24,15 @@ extern "C" void openbd_launch_(float *Z_f_complete) {
cudaERROR(err);


-#ifdef DEBUG
-printf("printing information for debugging\n" );
-cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-for (size_t i = 0; i < size_hst[2]; i++) {
-if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
-printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%size_hst[0], i/size_hst[0] , tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
-}
-}
-#endif
+#ifdef DEBUG
+printf("printing information for debugging\n" );
+cudaCHK( cudaMemcpy(tmpout, Zout_hst, l_size[3], cudaMemcpyDeviceToHost) );
+for (size_t i = 0; i < l_size[2]; i++) {
+if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
+printf("Z[%d,%d] Z_cu:%e Z_f:%e %e\n", i%l_size[0], i/l_size[0] , tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
+}
+}
+#endif /* DEBUG */

}

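
With GPUOpen_BD.h merged away, the boundary kernel is now declared directly in GPUOpen_BD.cu and takes a bdside argument. The hunk does not show how openbd_launch_ drives it, so the per-side loop and the launch configuration below are assumptions: a minimal sketch of one plausible pattern, assumed to live in GPUOpen_BD.cu so that bdside and openbd_kernel are in scope.

// Hypothetical sketch only; grid/block sizes are placeholders.
static void openbd_launch_all_sides(struct GPU_Layer *L)
{
    const bdside sides[4] = { LEFT, RIGHT, TOP, BOTTOM };
    dim3 grid(64), block(256);                // assumed launch configuration
    for (int s = 0; s < 4; ++s) {
        openbd_kernel <<< grid, block >>> (*L, sides[s]);
    }
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    cudaERROR(err);
}
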
16 changes: 0 additions & 16 deletions GPUOpen_BD.h

This file was deleted.

14 changes: 7 additions & 7 deletions GPUOutput.cu
@@ -14,7 +14,7 @@ extern "C" void maxamp_launch_(const int *lid){
struct GPU_Layer *L = ldlayer(*lid);

st = clock();
-maximum_recorder_kernel <<< L->GridMaxAmp, MAXAMP_BLOCK >>> (L->Zmax_hst, L->Zout_hst, L->size_hst[2]);
+maximum_recorder_kernel <<< L->GridMaxAmp, MAXAMP_BLOCK >>> (L->Zmax_hst, L->Zout_hst, L->l_size[2]);
cudaDeviceSynchronize();
err = cudaGetLastError();
cudaERROR(err);
@@ -40,21 +40,21 @@ __global__ void maximum_recorder_kernel(float *max_array, const float* __restric

extern "C" void cuda_getz_(float *Z_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(Z_f, L->Zout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_getmn_(float *M_f, float *N_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
-cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->size_hst[2], L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(M_f, L->MNout_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(N_f, L->MNout_hst+L->l_size[2], L->l_size[3], cudaMemcpyDeviceToHost) );
}

extern "C" void cuda_getzmax_(float *Zmax_f, int *lid) {
struct GPU_Layer *L = ldlayer(*lid);
-cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->size_hst[3], cudaMemcpyDeviceToHost) );
+cudaCHK( cudaMemcpy(Zmax_f, L->Zmax_hst, L->l_size[3], cudaMemcpyDeviceToHost) );
}

// void cuda_getmnmax_(float *Mmax_f, float *Nmax_f) {
-// cudaCHK( cudaMemcpy(Mmax_f, MNmax_hst, size_hst[3], cudaMemcpyDeviceToHost) );
-// cudaCHK( cudaMemcpy(Nmax_f, MNmax_hst+size_hst[2], size_hst[3], cudaMemcpyDeviceToHost) );
+// cudaCHK( cudaMemcpy(Mmax_f, MNmax_hst, l_size[3], cudaMemcpyDeviceToHost) );
+// cudaCHK( cudaMemcpy(Nmax_f, MNmax_hst+l_size[2], l_size[3], cudaMemcpyDeviceToHost) );
// }