Skip to content

Improved performance #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: gpu_experiment
Choose a base branch
from
17 changes: 14 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,20 @@ if(ENABLE_GPU_DEVICE)

# https://en.wikipedia.org/w/index.php?title=CUDA&section=5#GPUs_supported
# https://raw.githubusercontent.com/PointCloudLibrary/pcl/master/cmake/pcl_find_cuda.cmake
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0")
set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80 86)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.1")
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} --list-gpu-code RESULT_VARIABLE EXIT_CODE OUTPUT_VARIABLE OUTPUT_VAL)
if(EXIT_CODE EQUAL 0)
#Remove sm_
string(REPLACE "sm_" "" OUTPUT_VAL ${OUTPUT_VAL})
#Convert to list
string(REPLACE "\n" ";" CMAKE_CUDA_ARCHITECTURES ${OUTPUT_VAL})
#Remove last empty entry
list(REMOVE_AT CMAKE_CUDA_ARCHITECTURES -1)
else()
message(FATAL_ERROR "Failed to run NVCC to get list of GPU codes: ${EXIT_CODE}")
endif()
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "11.0")
set(CMAKE_CUDA_ARCHITECTURES 35 37 50 52 53 60 61 62 70 72 75 80)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "10.0")
set(CMAKE_CUDA_ARCHITECTURES 30 32 35 37 50 52 53 60 61 62 70 72 75)
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "9.0")
Expand Down Expand Up @@ -107,4 +119,3 @@ else()
message(STATUS "GPU mode: ${BoldRed}OFF${ColourReset}")
endif()
message(STATUS "----------------------")

16 changes: 13 additions & 3 deletions src/gpu/arch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ enum {
ARCH_VOLTA,
ARCH_TURING,
ARCH_AMPERE,
ARCH_ADA,
ARCH_UNKNOWN
};

Expand All @@ -27,6 +28,7 @@ static const char *uarch_str[] = {
/*[ARCH_VOLTA] = */ "Volta",
/*[ARCH_TURING] = */ "Turing",
/*[ARCH_AMPERE] = */ "Ampere",
/*[ARCH_ADA] = */ "Ada",
};

struct benchmark_gpu {
Expand Down Expand Up @@ -143,8 +145,12 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case 80:
case 86:
case 87:
gpu->uarch = ARCH_AMPERE;
break;
case 89:
gpu->uarch = ARCH_ADA;
break;
default:
printf("GPU: %s\n", gpu->name);
printErr("Invalid uarch: %d.%d\n", deviceProp.major, deviceProp.minor);
Expand All @@ -162,6 +168,7 @@ struct gpu* get_gpu_info(int gpu_idx) {
break;
case ARCH_TURING:
case ARCH_AMPERE: // UNTESTED
case ARCH_ADA: // UNTESTED
gpu->latency = 4;
break;
default:
Expand All @@ -185,21 +192,23 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
bench->nbk = (nbk == INVALID_CFG) ? (gpu->latency * gpu->sm_count) : nbk;
bench->tpb = (tpb == INVALID_CFG) ? _ConvertSMVer2Cores(gpu->cc_major, gpu->cc_minor) : tpb;
}
bench->n = bench->nbk * bench->tpb;
bench->n = 16 * bench->nbk * bench->tpb;
bench->gflops = (double)(BENCHMARK_GPU_ITERS * 2 * (long)bench->n)/(long)1000000000;

cudaError_t err = cudaSuccess;
float *h_A;
float *h_B;
int size = bench->n * sizeof(float);

cudaSetDevice(0);

if ((err = cudaMallocHost((void **)&h_A, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

if ((err = cudaMallocHost((void **)&h_B, size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
printErr("XXX %s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
}

Expand All @@ -208,6 +217,7 @@ struct benchmark_gpu* init_benchmark_gpu(struct gpu* gpu, int nbk, int tpb) {
h_B[i] = rand()/(float)RAND_MAX;
}


if ((err = cudaMalloc((void **) &(bench->d_A), size)) != cudaSuccess) {
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
return NULL;
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "../getarg.hpp"

#define BENCHMARK_GPU_ITERS 400000000
#define BENCHMARK_GPU_ITERS 40000000

struct benchmark_gpu;

Expand Down
35 changes: 29 additions & 6 deletions src/gpu/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,38 @@
#include "kernel.hpp"
#include <stdio.h>
#include <stdint.h>
#define N 16
#define gid threadIdx.x + blockIdx.x * blockDim.x
#define off gid*N


__global__
void compute_kernel(float *vec_a, float *vec_b, float *vec_c, int n) {
float a = vec_a[0];
float b = vec_b[0];
float c = 0.0;
__shared__ float myblockA[N];
__shared__ float myblockB[N];
__shared__ float myblockC[N];

#pragma unroll
for(int i = 0; i < N; i++){
myblockA[i] = vec_a[off+i];
myblockB[i] = vec_b[off+i];
myblockC[i] = vec_a[off+i];
}

__syncthreads();

#pragma unroll 2000
#pragma unroll 32
for(long i=0; i < BENCHMARK_GPU_ITERS; i++) {
c = (c * a) + b;
#pragma unroll
for(int j = 0; j < N; j++){
myblockC[j] = (myblockC[j] * myblockA[j]) + myblockB[j];
}
}

#pragma unroll
for(int i = 0; i < N; i++){
vec_c[off+i] = myblockC[i];
}

vec_c[0] = c;
}