
Commit 2301bcb
convolution added
1 parent d3552c9
22 files changed: +1163 -60 lines
CUDA_CNN/CNN (710 KB, binary file not shown)

CUDA_CNN/ConvNet/ConvNet.cu

+54
@@ -0,0 +1,54 @@
#include "ConvNet.h"

ConvNet::ConvNet(int m, int n, int o) {
    filter_size = m;
    num_of_filters = n;
    output_size = o;

    // Host-side buffers for the random initial parameters
    // (n filters, m weights each), drawn uniformly from (-0.5, 0.5].
    std::vector<float> init_bias(n);
    std::vector<float> init_weight(n * m);

    for (int i = 0; i < n; ++i) {
        init_bias[i] = 0.5f - float(rand()) / float(RAND_MAX);
        for (int j = 0; j < m; ++j) {
            init_weight[i * m + j] = 0.5f - float(rand()) / float(RAND_MAX);
        }
    }

    cudaMalloc(&output, sizeof(float) * output_size);
    cudaMalloc(&middle, sizeof(float) * output_size);
    cudaMalloc(&bias, sizeof(float) * num_of_filters);
    cudaMalloc(&weight, sizeof(float) * filter_size * num_of_filters);
    cudaMalloc(&d_output, sizeof(float) * output_size);
    cudaMalloc(&d_middle, sizeof(float) * output_size);
    cudaMalloc(&d_weight, sizeof(float) * filter_size * num_of_filters);

    cudaMemcpy(bias, init_bias.data(), sizeof(float) * n, cudaMemcpyHostToDevice);
    cudaMemcpy(weight, init_weight.data(), sizeof(float) * m * n, cudaMemcpyHostToDevice);
}

ConvNet::~ConvNet() {
    cudaFree(output);
    cudaFree(middle);

    cudaFree(bias);
    cudaFree(weight);

    cudaFree(d_output);
    cudaFree(d_middle);
    cudaFree(d_weight);
}

// Zero the forward buffers before processing a new sample.
void ConvNet::reinit() {
    cudaMemset(output, 0, sizeof(float) * output_size);
    cudaMemset(middle, 0, sizeof(float) * output_size);
}

// Zero the gradient buffers before a new backward pass.
void ConvNet::reinit_backprop() {
    cudaMemset(d_output, 0, sizeof(float) * output_size);
    cudaMemset(d_middle, 0, sizeof(float) * output_size);
    cudaMemset(d_weight, 0, sizeof(float) * filter_size * num_of_filters);
}
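For context, a minimal sketch of how these three layer objects might be constructed, with shapes inferred from the kernel signatures in convlayer.cu below. The variable names and the driver code itself are assumptions; the actual main lives in another file of this commit.

// Sketch only: layer shapes inferred from the convlayer.cu kernels;
// the names l_input / l_middle / l_final are hypothetical.
ConvNet l_input(5 * 5, 6, 6 * 24 * 24);   // six 5x5 filters: 28x28 -> 6x24x24
ConvNet l_middle(4 * 4, 1, 6 * 6 * 6);    // one shared 4x4 filter, stride 4: -> 6x6x6
ConvNet l_final(6 * 6 * 6, 10, 10);       // 10-way fully connected readout

l_input.reinit();            // zero forward buffers before each sample
l_input.reinit_backprop();   // zero gradient buffers before each backward pass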

CUDA_CNN/ConvNet/ConvNet.h

+34
@@ -0,0 +1,34 @@
#ifndef CONV_NET_H
#define CONV_NET_H

#include <cstdlib>
#include <vector>
#include <memory>
#include <cublas_v2.h>
#include <cuda.h>

// One layer of the network: device buffers for activations, parameters,
// and gradients (the d_* fields are gradients, not device copies).
struct ConvNet {
    int filter_size;
    int num_of_filters;
    int output_size;

    float *output;    // post-activation output
    float *middle;    // pre-activation accumulator

    float *bias;
    float *weight;

    float *d_output;  // gradient w.r.t. output
    float *d_middle;  // gradient w.r.t. middle
    float *d_weight;  // gradient w.r.t. weight

    ConvNet(int m, int n, int o);

    ~ConvNet();

    void reinit();
    void reinit_backprop();
};

#endif // CONV_NET_H

CUDA_CNN/Layer_functions/convlayer.cu

+232
@@ -0,0 +1,232 @@
#include "convlayer.h"

// Generic SGD step: output += dt * grad, one thread per element.
__global__ void calc_gradient(float *output, float *grad, int N)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos < N) {
        output[pos] += dt * grad[pos];
    }
}

// Layer 1 forward pass: six 5x5 filters over the 28x28 input, giving 6x24x24.
__global__ void apply_convolve_1(float input[28][28], float middle[6][24][24], float weight[6][5][5], float *bias)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 5 * 5 * 6 * 24 * 24;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 5;    // filter row
        int i2 = (pos /= 5) % 5;    // filter column
        int i3 = (pos /= 5) % 6;    // filter index
        int i4 = (pos /= 6) % 24;   // output row
        int i5 = (pos /= 24) % 24;  // output column

        atomicAdd(&middle[i3][i4][i5], weight[i3][i1][i2] * input[i4 + i1][i5 + i2]);
        if (i1 == 0 && i2 == 0) {
            // Other threads are still accumulating into this cell, so the
            // bias must be added atomically as well.
            atomicAdd(&middle[i3][i4][i5], bias[i3]);
        }
    }
}
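Every kernel in this file flattens its whole loop nest into a single thread index and recovers the coordinates with a chain of in-place divisions; since pos /= 1 leaves pos unchanged, the first coordinate is just pos % 5. A host-side illustration of the same decode (illustrative only, not part of the commit):

// Decode a flat index into (i1..i5) exactly as apply_convolve_1 does.
int pos = 4242;               // any index below 5 * 5 * 6 * 24 * 24
int i1 = (pos /= 1) % 5;      // filter row:    pos % 5
int i2 = (pos /= 5) % 5;      // filter column: (pos / 5) % 5
int i3 = (pos /= 5) % 6;      // filter index:  (pos / 25) % 6
int i4 = (pos /= 6) % 24;     // output row:    (pos / 150) % 24
int i5 = (pos /= 24) % 24;    // output column: (pos / 3600) % 24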
// Layer 2 forward pass: one shared 4x4 filter applied with stride 4
// to each of the six 24x24 maps, giving 6x6x6.
__global__ void apply_strided_convolve_2(float input[6][24][24], float middle[6][6][6], float weight[1][4][4], float *bias)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 4 * 4 * 6 * 6 * 6;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 4;   // filter row
        int i2 = (pos /= 4) % 4;   // filter column
        int i3 = (pos /= 4) % 6;   // map index
        int i4 = (pos /= 6) % 6;   // output row
        int i5 = (pos /= 6) % 6;   // output column

        atomicAdd(&middle[i3][i4][i5], weight[0][i1][i2] * input[i3][i4 * 4 + i1][i5 * 4 + i2]);
        if (i1 == 0 && i2 == 0) {
            atomicAdd(&middle[i3][i4][i5], bias[0]);  // atomic: other threads still accumulate here
        }
    }
}

// Layer 3 forward pass: a fully connected 10-way readout of the 6x6x6 volume.
__global__ void final_convolve_3(float input[6][6][6], float middle[10], float weight[10][6][6][6], float *bias)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 10 * 6 * 6 * 6;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 10;
        int i2 = (pos /= 10) % 6;
        int i3 = (pos /= 6) % 6;
        int i4 = (pos /= 6) % 6;

        atomicAdd(&middle[i1], weight[i1][i2][i3][i4] * input[i2][i3][i4]);
        if (i2 == 0 && i3 == 0 && i4 == 0) {
            atomicAdd(&middle[i1], bias[i1]);  // atomic: other threads still accumulate here
        }
    }
}

// Elementwise logistic sigmoid: output = 1 / (1 + e^-middle).
__global__ void apply_sigmoid(float *middle, float *output, int output_size)
{
    int pos = blockDim.x * blockIdx.x + threadIdx.x;
    if (pos < output_size) {
        output[pos] = 1.0f / (1.0f + expf(-middle[pos]));
    }
}
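For reference, a minimal sketch of launching the forward pass for one image already on the device. The buffer names, the casts from the flat ConvNet pointers to the kernels' array-parameter types, and the 64-thread blocks are all assumptions; the real driver code is elsewhere in this commit.

// Sketch only: forward pass of layer 1 plus its activation.
// Assumes dev_input points to a device 28x28 float image and
// l_input is a ConvNet(5 * 5, 6, 6 * 24 * 24) as sketched earlier.
const int T = 64;
int n1 = 5 * 5 * 6 * 24 * 24;
apply_convolve_1<<<(n1 + T - 1) / T, T>>>(
    (float (*)[28])dev_input,
    (float (*)[24][24])l_input.middle,
    (float (*)[5][5])l_input.weight,
    l_input.bias);
apply_sigmoid<<<(6 * 24 * 24 + T - 1) / T, T>>>(l_input.middle, l_input.output, 6 * 24 * 24);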
// Layer 3 backward pass: weight gradient, d_w = delta * activation.
__global__ void backpass_final_3(float d_weight[10][6][6][6], float middle[10], float output[6][6][6])
{
    int pos = blockDim.x * blockIdx.x + threadIdx.x;
    int total_operations = 10 * 6 * 6 * 6;
    if (pos < total_operations) {
        int i1 = (pos /= 1) % 10;
        int i2 = (pos /= 10) % 6;
        int i3 = (pos /= 6) % 6;
        int i4 = (pos /= 6) % 6;

        d_weight[i1][i2][i3][i4] = middle[i1] * output[i2][i3][i4];
    }
}

// Layer 3 bias update, applied directly with learning rate dt.
__global__ void backpass_final_bias_3(float bias[10], float middle[10])
{
    int pos = blockDim.x * blockIdx.x + threadIdx.x;
    int total_operations = 10;
    if (pos < total_operations) {
        bias[pos] += dt * middle[pos];
    }
}

// Layer 2 backward pass: propagate the 10-way delta back to the 6x6x6 output.
__global__ void backpass_strided_convolve_2(float output[6][6][6], float weight[10][6][6][6], float middle[10])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 10 * 6 * 6 * 6;
    if (pos < total_operations) {
        int i1 = (pos /= 1) % 10;
        int i2 = (pos /= 10) % 6;
        int i3 = (pos /= 6) % 6;
        int i4 = (pos /= 6) % 6;

        atomicAdd(&output[i2][i3][i4], weight[i1][i2][i3][i4] * middle[i1]);
    }
}

// Layer 2: scale the incoming delta by the sigmoid derivative s * (1 - s).
__global__ void backpass_strided_convolve_middle_2(float d_middle[6][6][6], float output[6][6][6], float middle[6][6][6])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 6 * 6 * 6;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 6;
        int i2 = (pos /= 6) % 6;
        int i3 = (pos /= 6) % 6;

        float sigm = 1.0f / (1.0f + expf(-middle[i1][i2][i3]));

        d_middle[i1][i2][i3] = output[i1][i2][i3] * sigm * (1.0f - sigm);
    }
}

// Layer 2 weight gradient: correlate the deltas with the layer-1 activations.
__global__ void backpass_strided_convolve_weight_2(float weight[1][4][4], float middle[6][6][6], float output[6][24][24])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 1 * 4 * 4 * 6 * 6 * 6;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 1;
        int i2 = (pos /= 1) % 4;
        int i3 = (pos /= 4) % 4;
        int i4 = (pos /= 4) % 6;
        int i5 = (pos /= 6) % 6;
        int i6 = (pos /= 6) % 6;

        atomicAdd(&weight[i1][i2][i3], middle[i4][i5][i6] * output[i4][i5 * 4 + i2][i6 * 4 + i3]);
    }
}

// Layer 2 bias update, averaged over the 6x6x6 positions.
__global__ void backpass_strided_convolve_bias_2(float bias[1], float middle[6][6][6])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 6 * 6 * 6;
    float d = 6.0f * 6.0f * 6.0f;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 6;
        int i2 = (pos /= 6) % 6;
        int i3 = (pos /= 6) % 6;

        atomicAdd(&bias[0], dt * middle[i1][i2][i3] / d);
    }
}

// Layer 1 backward pass: propagate layer-2 deltas back through the strided filter.
__global__ void backpass_convolve_1(float output[6][24][24], float weight[1][4][4], float middle[6][6][6])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 1 * 4 * 4 * 6 * 6 * 6;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 1;
        int i2 = (pos /= 1) % 4;
        int i3 = (pos /= 4) % 4;
        int i4 = (pos /= 4) % 6;
        int i5 = (pos /= 6) % 6;
        int i6 = (pos /= 6) % 6;

        atomicAdd(&output[i4][i5 * 4 + i2][i6 * 4 + i3], weight[i1][i2][i3] * middle[i4][i5][i6]);
    }
}

// Layer 1: scale the incoming delta by the sigmoid derivative.
__global__ void backpass_convolve_middle_1(float d_middle[6][24][24], float output[6][24][24], float middle[6][24][24])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 6 * 24 * 24;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 6;
        int i2 = (pos /= 6) % 24;
        int i3 = (pos /= 24) % 24;

        float o = 1.0f / (1.0f + expf(-middle[i1][i2][i3]));

        d_middle[i1][i2][i3] = output[i1][i2][i3] * o * (1.0f - o);
    }
}

// Layer 1 weight gradient, averaged over the 24x24 output positions.
__global__ void backpas_convolve_weight_1(float weight[6][5][5], float middle[6][24][24], float output[28][28])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 6 * 5 * 5 * 24 * 24;
    float d = 24.0f * 24.0f;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 6;
        int i2 = (pos /= 6) % 5;
        int i3 = (pos /= 5) % 5;
        int i4 = (pos /= 5) % 24;
        int i5 = (pos /= 24) % 24;

        atomicAdd(&weight[i1][i2][i3], middle[i1][i4][i5] * output[i4 + i2][i5 + i3] / d);
    }
}

// Layer 1 bias update, averaged over the 24x24 output positions.
__global__ void backpass_convolve_bias_1(float bias[6], float middle[6][24][24])
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int total_operations = 6 * 24 * 24;
    float d = 24.0f * 24.0f;

    if (pos < total_operations) {
        int i1 = (pos /= 1) % 6;
        int i2 = (pos /= 6) % 24;
        int i3 = (pos /= 24) % 24;

        atomicAdd(&bias[i1], dt * middle[i1][i2][i3] / d);
    }
}

// Output-layer error: one-hot target minus prediction.
__global__ void calcError(float *err, float *output, unsigned int Y, int N)
{
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos < N) {
        err[pos] = (Y == pos ? 1.0f : 0.0f) - output[pos];
    }
}
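A hedged sketch of the start of the backward sequence, mirroring the forward order in reverse. The variable names, the choice of d_output as the delta buffer, and the launch shapes are all assumptions; the actual ordering is decided by the driver code elsewhere in this commit.

// Sketch only: error and final-layer backward step for one sample.
// Assumes label holds the target class and l_middle / l_final are
// the ConvNet objects sketched earlier.
calcError<<<1, 10>>>(l_final.d_output, l_final.output, label, 10);
backpass_final_3<<<(10 * 6 * 6 * 6 + 63) / 64, 64>>>(
    (float (*)[6][6][6])l_final.d_weight,
    l_final.d_output,
    (float (*)[6][6])l_middle.output);
backpass_final_bias_3<<<1, 10>>>(l_final.bias, l_final.d_output);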

CUDA_CNN/Layer_functions/convlayer.h

+32
@@ -0,0 +1,32 @@
#ifndef LAYER_H
#define LAYER_H

#include <cstdlib>
#include <vector>
#include <memory>
#include <cublas_v2.h>
#include <cuda.h>

const static float dt = 1.0E-01f;  // learning rate
const static float threshold = 1.0E-02f;

__global__ void calc_gradient(float *output, float *grad, const int N);
__global__ void apply_sigmoid(float *middle, float *output, int output_size);
__global__ void calcError(float *err, float *output, unsigned int Y, const int N);

__global__ void apply_convolve_1(float input[28][28], float middle[6][24][24], float weight[6][5][5], float *bias);
__global__ void backpass_convolve_1(float output[6][24][24], float weight[1][4][4], float middle[6][6][6]);
__global__ void backpass_convolve_middle_1(float d_middle[6][24][24], float output[6][24][24], float middle[6][24][24]);
__global__ void backpas_convolve_weight_1(float weight[6][5][5], float middle[6][24][24], float output[28][28]);
__global__ void backpass_convolve_bias_1(float bias[6], float middle[6][24][24]);

__global__ void apply_strided_convolve_2(float input[6][24][24], float middle[6][6][6], float weight[1][4][4], float *bias);
__global__ void backpass_strided_convolve_2(float output[6][6][6], float weight[10][6][6][6], float middle[10]);
__global__ void backpass_strided_convolve_middle_2(float d_middle[6][6][6], float output[6][6][6], float middle[6][6][6]);
__global__ void backpass_strided_convolve_weight_2(float weight[1][4][4], float middle[6][6][6], float output[6][24][24]);
__global__ void backpass_strided_convolve_bias_2(float bias[1], float middle[6][6][6]);

__global__ void final_convolve_3(float input[6][6][6], float middle[10], float weight[10][6][6][6], float * bias);
__global__ void backpass_final_3(float d_weight[10][6][6][6], float middle[10], float output[6][6][6]);
__global__ void backpass_final_bias_3(float bias[10], float middle[10]);

#endif // LAYER_H
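calc_gradient is the generic update (w += dt * g) that applies accumulated gradients once a backward pass finishes. A minimal sketch of applying it to the first layer's weights, with the variable name and element count assumed from the earlier layer sketch:

// Sketch only: apply the accumulated gradient to layer-1's 6 * 5 * 5 weights.
int n_w = 6 * 5 * 5;  // filter_size * num_of_filters for layer 1
calc_gradient<<<(n_w + 63) / 64, 64>>>(l_input.weight, l_input.d_weight, n_w);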

CUDA_CNN/data/t10k-images.idx3-ubyte (7.48 MB, binary file not shown)

CUDA_CNN/data/t10k-labels.idx1-ubyte (9.77 KB, binary file not shown)

CUDA_CNN/data/train-images.idx3-ubyte (44.9 MB, binary file not shown)

CUDA_CNN/data/train-labels.idx1-ubyte (58.6 KB, binary file not shown)
