// upsample_layer.cu
#include <stdio.h>
#include <vector>

#include "upsample_layer.h"
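// The launch helpers used below (CUDA_KERNEL_LOOP, CAFFE_GET_BLOCKS,
// CAFFE_CUDA_NUM_THREADS, CUDA_POST_KERNEL_CHECK) are assumed to be
// provided by upsample_layer.h, following Caffe's device_alternate.hpp.
// A minimal fallback sketch in case that header does not define them
// (Caffe itself uses 512 or 1024 threads depending on architecture):
#ifndef CUDA_KERNEL_LOOP
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)
const int CAFFE_CUDA_NUM_THREADS = 512;
inline int CAFFE_GET_BLOCKS(const int N) {
  return (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
}
#define CUDA_POST_KERNEL_CHECK                                         \
  do {                                                                 \
    cudaError_t err = cudaPeekAtLastError();                           \
    if (err != cudaSuccess)                                            \
      printf("CUDA kernel failed: %s\n", cudaGetErrorString(err));    \
  } while (0)
#endif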
// Nearest-neighbor upsampling: each output element (b, c, out_h, out_w)
// copies input element (b, c, out_h / stride, out_w / stride), scaled by
// `scale`. `forward` is unused in this forward-only port; the darknet
// original uses it to select the backward path:
//   atomicAdd(x + in_index, scale * out[out_index]);
template <typename Dtype>
__global__ void upsample_kernel(size_t N, const Dtype *x, int w, int h, int c,
    int batch, int stride, int forward, float scale, Dtype *out)
{
  // Grid-stride loop. Decompose the flat index into a local copy (idx)
  // so the loop counter itself is never modified; the original code
  // divided `i` down inside the body, which broke the stride increment.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (N);
       i += blockDim.x * gridDim.x)
  {
    int idx = i;
    int out_index = idx;
    int out_w = idx % (w * stride);
    idx = idx / (w * stride);
    int out_h = idx % (h * stride);
    idx = idx / (h * stride);
    int out_c = idx % c;
    idx = idx / c;
    int b = idx % batch;
    int in_w = out_w / stride;
    int in_h = out_h / stride;
    int in_c = out_c;
    int in_index = b * w * h * c + in_c * w * h + in_h * w + in_w;
    // Assign rather than accumulate: `out` is not zero-initialized here,
    // so `out[out_index] += scale * x[in_index]` was a nasty pitfall.
    out[out_index] = scale * x[in_index];
  }
}
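// Host-side reference for the same nearest-neighbor upsample (a minimal
// sketch, not part of the original layer; useful for unit-testing the
// kernel's index arithmetic on small inputs).
template <typename Dtype>
void upsample_cpu_reference(size_t N, const Dtype *x, int w, int h, int c,
    int batch, int stride, float scale, Dtype *out) {
  for (size_t i = 0; i < N; ++i) {
    size_t idx = i;
    int out_w = idx % (w * stride); idx /= (w * stride);
    int out_h = idx % (h * stride); idx /= (h * stride);
    int out_c = idx % c;            idx /= c;
    int b = idx % batch;
    int in_index = b * w * h * c + out_c * w * h
                 + (out_h / stride) * w + (out_w / stride);
    out[i] = scale * x[in_index];
  }
}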
// Leaky ReLU: pass positive values through unchanged, scale negative
// values by negative_slope (a slope of 0 gives a standard ReLU).
__global__ void ReLUForward(const int n, const float* in, float* out,
    float negative_slope) {
  CUDA_KERNEL_LOOP(index, n) {
    out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
  }
}
// Launches the leaky-ReLU kernel elementwise over a b x c x h x w tensor.
void ReluForward_gpu(const float* bottom, float* top,
    int b, int c, int w, int h, float negative_slope) {
  const int nthreads = b * c * w * h;
  // NOLINT_NEXT_LINE(whitespace/operators)
  ReLUForward<<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
      nthreads, bottom, top, negative_slope);
  CUDA_POST_KERNEL_CHECK;
}
// Launches the upsample kernel: input is b x c x h x w (NCHW), output is
// b x c x (h * stride_) x (w * stride_).
void Forward_gpu(
    float* input, int b, int c, int w, int h, int stride_, float* output) {
  const int nthreads = b * c * w * h * stride_ * stride_;
  // Pass w and h in the kernel's declared order; the original call swapped
  // them, which only happens to work when the feature map is square (w == h).
  upsample_kernel<float><<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS>>>(
      nthreads, input, w, h, c, b, stride_, /*forward=*/1, /*scale=*/1.f, output);
  CUDA_POST_KERNEL_CHECK;
}
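// Example driver (a minimal sketch, not part of the original layer):
// upsample a 1 x 1 x 2 x 2 map by stride 2 and print the 4 x 4 result.
// Buffer names and sizes here are illustrative assumptions. Expected
// output is each input value replicated into a 2 x 2 block.
int main() {
  const int b = 1, c = 1, w = 2, h = 2, stride = 2;
  const int in_count = b * c * w * h;
  const int out_count = in_count * stride * stride;
  float host_in[in_count] = {1.f, 2.f, 3.f, 4.f};
  float host_out[out_count];

  float *dev_in = nullptr, *dev_out = nullptr;
  cudaMalloc(&dev_in, in_count * sizeof(float));
  cudaMalloc(&dev_out, out_count * sizeof(float));
  cudaMemcpy(dev_in, host_in, in_count * sizeof(float),
             cudaMemcpyHostToDevice);

  Forward_gpu(dev_in, b, c, w, h, stride, dev_out);

  cudaMemcpy(host_out, dev_out, out_count * sizeof(float),
             cudaMemcpyDeviceToHost);
  for (int y = 0; y < h * stride; ++y) {
    for (int x = 0; x < w * stride; ++x)
      printf("%.0f ", host_out[y * (w * stride) + x]);
    printf("\n");
  }
  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}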