Skip to content

Commit

Permalink
v1.0
Browse files Browse the repository at this point in the history
The program is currently available for computing earthquake tsunami on 1st layer with linear solver.
  • Loading branch information
AndybnACT authored Mar 15, 2018
1 parent c094f5b commit 87dae48
Show file tree
Hide file tree
Showing 27 changed files with 15,477 additions and 0 deletions.
42 changes: 42 additions & 0 deletions GPUConfig.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef SM_CONFG
#define SM_CONFG
#define LOAD_PER_SM 2
#define NUMSTREAM 4
extern cudaStream_t EXECstream[NUMSTREAM];
#endif

#ifndef MOMT_KERNEL_CONFG
#define MOMT_KERNEL_CONFG
#define BLOCKX 16 // ==> along column axis
#define EXECX 15 // BLOCKX-1
#define BLOCKY 16 // ==> along row axis
#define EXECY 15// BLOCKY-1

extern dim3 DimBlockMomt;
extern dim3 DimGridMomt;

#define BLOCKX_MOMT 64
extern dim3 DimBlockMomt_MN;
extern dim3 DimGridMomt_MN;
#endif

#ifndef MASS_KERNEL_CONFG
#define MASS_KERNEL_CONFG
#define BLOCKX_MASS 64
extern dim3 DimBlockMass;
extern dim3 DimGridMass;
#endif

#ifndef OPENBD_KERNEL_CONFG
#define OPENBD_KERNEL_CONFG
#define BLOCKX_OPENBD 64
extern dim3 DimBlockOpenBD;
extern dim3 DimGridOpenBD_LR;
extern dim3 DimGridOpenBD_TB;
#endif

#ifndef MAXAMP_KERNEL_CONFG
#define MAXAMP_KERNEL_CONFG
#define MAXAMP_BLOCK 512
extern size_t GridMaxAmp;
#endif
70 changes: 70 additions & 0 deletions GPUHeader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include <stdio.h>
#include "cuda.h"
#include <time.h>

//#define DEBUG
#ifdef DEBUG
#ifndef DEBUG_FUNC
#define DEBUG_FUNC
#define ERROR 1.0e-4
#define CHKR 720
#define CHKC 304
#define CHKSI 10
#define ID_hst(row,col) col*size_hst[0] + row
//#define ID2E_hst(row,col,dim) size_hst[2]*dim + ID(row,col)
extern float* tmpout;
#endif
#endif

#ifndef CONSTS
#define CONSTS
#define GX 1.0e-5
#define EPS 1.0e-10
#define TWLVTH 0.0833333333333333333
#define GRAV 9.807
#endif


#ifndef CUDA_CHK
#define CUDA_CHK
#define cudaCHK(FUN) ({\
if ((FUN) != cudaSuccess) {\
printf("%s in %s at line %d\n",cudaGetErrorString(cudaGetLastError()), __FILE__,__LINE__);\
exit(EXIT_FAILURE);\
}\
})
#define cudaERROR(err) ({\
if (err != cudaSuccess) {\
printf("error code: %d\n", err);\
printf("%s in %s at line %d\n",cudaGetErrorString(err), __FILE__,__LINE__);\
exit(EXIT_FAILURE);\
}\
})
#endif

#ifndef CUDA_KERNEL
#define CUDA_KERNEL
#define ID(row,col) (col)*size_dev[0] + row
#define ID2E(row,col,dim) size_dev[2]*(dim) + ID(row,col)
#endif


#ifndef CUDA_GLOB_VAR
extern float *Zout_hst, *MNout_hst;
extern float *MNdat_hst, *Zdat_hst;
extern float *R24_hst, *R35_hst, *H_hst;
//float *R1_hst, *R6_hst, *R11_hst;
extern float *R_MASS_hst;

// extern __device__ float *R35_dev;
// extern __device__ float *R24_dev, *H_dev;
// extern __device__ float *Z_dat_dev, *MN_dat_dev;
extern __device__ float *MN_out_dev, *Z_out_dev;
extern __constant__ __device__ uint32_t size_dev[4];
// extern texture <float, cudaTextureType2D, cudaReadModeElementType> ZtexRef;
// extern texture <float, cudaTextureType2D, cudaReadModeElementType> MtexRef;
// extern texture <float, cudaTextureType2D, cudaReadModeElementType> NtexRef;
extern float *Zmax_hst;
extern uint32_t size_hst[4];
extern cudaDeviceProp dev_prop;
#endif
85 changes: 85 additions & 0 deletions GPUMass_s.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include "GPUHeader.h"
#include "GPUConfig.h"

extern "C" void mass_launch_(const float*, float*, const float*);
__global__ void mass_kernel(const float* __restrict__,const float* __restrict__,
const float* __restrict__,const float* __restrict__);



extern "C" void mass_launch_(const float* Z_f, float* Z_f_complete, const float *H_f){
cudaError_t err;
clock_t st, fi;

// cudaCHK( cudaMemcpy(H_hst, H_f, size_hst[3], cudaMemcpyHostToDevice) );
//cudaCHK( cudaMemcpy(Zdat_hst, Z_f, size_hst[3], cudaMemcpyHostToDevice) );

st = clock();
mass_kernel <<< DimGridMass, DimBlockMass >>> (Zdat_hst, MNdat_hst, R_MASS_hst, H_hst);// FUTURE MULTIPLE KERNELS
cudaDeviceSynchronize();
err = cudaGetLastError();
cudaERROR(err);
fi = clock();


#ifdef DEBUG
printf("TIME SPENT ON GPU %f\n",(float)(fi-st)/CLOCKS_PER_SEC);
// printf("printing information for debugging\n" );
// cudaCHK( cudaMemcpy(tmpout, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
// for (size_t i = 0; i < size_hst[2]; i++) {
// if (abs(tmpout[i] - Z_f_complete[i]) > ERROR) {
// printf("Z[%d] Z_cu:%e Z_f:%e %e\n", i, tmpout[i], Z_f_complete[i], tmpout[i] - Z_f_complete[i]);
// }
// }
#else
// cudaCHK( cudaMemcpy(Z_f_complete, Zout_hst, size_hst[3], cudaMemcpyDeviceToHost) );
#endif
}

__global__ void mass_kernel(const float* __restrict__ Z, const float* __restrict__ MN,
const float* __restrict__ R_MASS, const float* __restrict__ H){
/*+-->+-->+---->|
+-->+-->+---->|
+-->+-->+---->|
+-->+-->+---->|
*/
//designed for architectures whose warpsize=32
uint32_t row = blockIdx.x*31*(blockDim.x>>5) + 31*(threadIdx.x>>5) + threadIdx.x%32;
uint32_t col = blockIdx.y*(size_dev[1]/gridDim.y);
uint32_t col_end = (blockIdx.y == gridDim.y-1)? size_dev[1]-1:(blockIdx.y+1)*(size_dev[1]/gridDim.y)+1;
float h,z;
float m, m_suf;
float n, n_prev;
float ztmp;
float r1, r11;
float r6, r6_prev;

n_prev = MN[ID2E(row,col,1)];
r6_prev = R_MASS[col*4+1];

for (uint32_t i = col+1; i < col_end; i++) {
if (threadIdx.x%32 == 0) {
r1 = R_MASS[i*4];
r6 = R_MASS[i*4+1];
r11 = R_MASS[i*4+2];
}
__syncwarp();
r1 = __shfl_sync(0xFFFFFFFF,r1,0);
r6 = __shfl_sync(0xFFFFFFFF,r6,0);
r11 = __shfl_sync(0xFFFFFFFF,r11,0);
m = MN[ID(row,i)];
h = H[ID(row,i)];
z = Z[ID(row,i)];
n = MN[ID2E(row,i,1)];
m_suf = __shfl_up_sync(0xFFFFFFFF,m,1);
if (threadIdx.x%32 != 0 && row < size_dev[0]-1) {
ztmp = z - r1*(m-m_suf) - r11*(n*r6-n_prev*r6_prev);
if (ztmp + h <= EPS) ztmp = -h;
if (h <= GX || (ztmp < EPS && -ztmp < EPS) ) ztmp = 0.0;
Z_out_dev[ID(row,i)] = ztmp;

r6_prev = r6;
n_prev = n;
}
}
}
Loading

0 comments on commit 87dae48

Please sign in to comment.