add_array.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "utils.cuh"
#include <iostream>
__global__ void AddKernel(int *result, const int *arr1, const int *arr2, const int *arr3) {
    /**
     * Add three arrays element-wise; each thread handles one element.
     **/
    int tid = threadIdx.x;
    result[tid] = arr1[tid] + arr2[tid] + arr3[tid];
}
void HostAdd(int *result, const int arr1[], const int arr2[], const int arr3[], int arraysize) {
    for (int i = 0; i < arraysize; i++) {
        result[i] = arr1[i] + arr2[i] + arr3[i];
    }
}
int main() {
    float milliseconds = 0;
    clock_t host_start, host_end;
    cudaEvent_t kernel_start, kernel_end;
    cudaEventCreate(&kernel_start);
    cudaEventCreate(&kernel_end);

    const int arraysize = 5;
    const int a[arraysize] = { 0, 1, 2, 3, 4 };
    const int b[arraysize] = { 4, 3, 2, 1, 0 };
    const int c[arraysize] = { 1, 1, 1, 1, 1 };
    // Transfer data from host to device
    int *device_a, *device_b, *device_c;
    GPUErrorCheck(cudaMalloc((void**)&device_a, sizeof(int) * arraysize));
    GPUErrorCheck(cudaMalloc((void**)&device_b, sizeof(int) * arraysize));
    GPUErrorCheck(cudaMalloc((void**)&device_c, sizeof(int) * arraysize));
    GPUErrorCheck(cudaMemcpy(device_a, a, sizeof(int) * arraysize, cudaMemcpyHostToDevice));
    GPUErrorCheck(cudaMemcpy(device_b, b, sizeof(int) * arraysize, cudaMemcpyHostToDevice));
    GPUErrorCheck(cudaMemcpy(device_c, c, sizeof(int) * arraysize, cudaMemcpyHostToDevice));

    int *device_result;
    GPUErrorCheck(cudaMalloc((void**)&device_result, sizeof(int) * arraysize));
    // Set block and grid size: one thread per element. The kernel indexes
    // with threadIdx.x only, so the whole array must fit in a single block.
    int nx = arraysize;
    dim3 block(arraysize);
    dim3 grid(nx / block.x);
printf("\nAdding three arrays...\n\n");
PrintInput(a, arraysize);
printf(" + ");
PrintInput(b, arraysize);
printf(" + ");
PrintInput(c, arraysize);
    // Device execution: launch the Add kernel, bracketed by CUDA events
    // so the elapsed time is measured on the GPU timeline
    GPUErrorCheck(cudaEventRecord(kernel_start, 0));
    AddKernel<<<grid, block>>>(device_result, device_a, device_b, device_c);
    GPUErrorCheck(cudaEventRecord(kernel_end, 0));
    GPUErrorCheck(cudaEventSynchronize(kernel_end));
    // Transfer the result from device to host
    int *host_result = (int*)malloc(sizeof(int) * arraysize);
    GPUErrorCheck(cudaMemcpy(host_result, device_result, sizeof(int) * arraysize, cudaMemcpyDeviceToHost));

    printf("\n\nGPU Result: ");
    PrintOutput(host_result, arraysize);

    GPUErrorCheck(cudaEventElapsedTime(&milliseconds, kernel_start, kernel_end));

    // Release device and host resources
    GPUErrorCheck(cudaFree(device_result));
    GPUErrorCheck(cudaFree(device_c));
    GPUErrorCheck(cudaFree(device_b));
    GPUErrorCheck(cudaFree(device_a));
    free(host_result);
    GPUErrorCheck(cudaEventDestroy(kernel_start));
    GPUErrorCheck(cudaEventDestroy(kernel_end));
    GPUErrorCheck(cudaDeviceReset());
    // CPU execution for comparison
    int *cpu_result = (int*)malloc(sizeof(int) * arraysize);
    host_start = clock();
    HostAdd(cpu_result, a, b, c, arraysize);
    host_end = clock();

    printf("\nCPU Result: ");
    PrintOutput(cpu_result, arraysize);
    free(cpu_result);

    printf("\n\nGPU Execution Time: %f milliseconds", milliseconds);
    printf("\nCPU Execution Time: %f milliseconds\n",
           1000.0 * (double)(host_end - host_start) / CLOCKS_PER_SEC);
    return 0;
}
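
The helpers GPUErrorCheck, PrintInput, and PrintOutput come from utils.cuh, which is not shown in this file. A minimal sketch of what such a header could look like, assuming only the call sites above (the real utils.cuh in this repository may differ), is:

// utils.cuh -- hypothetical sketch, not the repository's actual header
#pragma once
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>

// Wrap a CUDA runtime call; abort with file/line information on failure.
#define GPUErrorCheck(call)                                                  \
    do {                                                                     \
        cudaError_t gpu_err_ = (call);                                       \
        if (gpu_err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                    \
                    cudaGetErrorString(gpu_err_), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Print an array as "{ a0, a1, ... }".
inline void PrintInput(const int arr[], int n) {
    printf("{ ");
    for (int i = 0; i < n; i++) {
        printf("%d%s", arr[i], i + 1 < n ? ", " : " ");
    }
    printf("}");
}

// Results are printed in the same style as the inputs.
inline void PrintOutput(const int arr[], int n) {
    PrintInput(arr, n);
}

With a header along these lines placed next to add_array.cu, the example should build with something like nvcc add_array.cu -o add_array and print the three input arrays, the GPU and CPU sums, and the two execution times.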