4
4
5
5
#include " Processing.h"
6
6
#include " CUDA_check.h"
7
+ #include " Parameters.h"
7
8
8
9
#define CUDA_CHECK_RETURN (value ) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
10
+ #define BLOCK_SIZE 32
9
11
12
+ #if !defined CONSTANT_MEMORY && !defined SHARED_MEMORY
10
13
__global__ void
11
14
kernel (unsigned char *input, unsigned long long int *krn, unsigned char *output, int height, int width, int channels,
12
15
int size, double weight) {
@@ -66,8 +69,8 @@ Image *process(Image *img, Kernel *krn) {
66
69
sizeof (unsigned long long int ) * krn->size * krn->size ,
67
70
cudaMemcpyHostToDevice));
68
71
69
- dim3 blockDim (32 , 32 );
70
- dim3 gridDim (ceil (((float ) img->width ) / blockDim . x ), ceil (((float ) img->height ) / blockDim . y ));
72
+ dim3 blockDim (BLOCK_SIZE, BLOCK_SIZE );
73
+ dim3 gridDim (ceil (((float ) img->width ) / BLOCK_SIZE ), ceil (((float ) img->height ) / BLOCK_SIZE ));
71
74
72
75
kernel<<<gridDim , blockDim >>> (d_input, d_krn, d_output, img->height , img->width , img->channels , krn->size ,
73
76
krn->weight );
@@ -82,4 +85,82 @@ Image *process(Image *img, Kernel *krn) {
82
85
cudaFree (d_output);
83
86
84
87
return res;
85
- }
88
+ }
89
+ #endif
90
+
91
+ #if defined CONSTANT_MEMORY && !defined SHARED_MEMORY
92
// Convolution coefficients held in device constant memory. Capacity is fixed at
// 25 * 25 entries; process() fills it with cudaMemcpyToSymbol and kernelConstant
// reads it as KERNEL[ky * size + kx].
+ __constant__ unsigned long long int KERNEL[25 * 25 ];
93
+
94
/**
 * Convolves an interleaved-channel image with the coefficients stored in the
 * constant-memory array KERNEL. Each thread computes one output pixel (every
 * channel of it).
 *
 * Launch layout: 2D grid of 2D blocks, x mapped to image columns and y to image
 * rows. Threads that fall outside the image return without touching memory, so
 * the grid may overshoot the image size.
 *
 * @param input    source pixels, addressed as (row * width + col) * channels + channel
 * @param output   destination pixels, same addressing as input
 * @param height   number of image rows
 * @param width    number of image columns
 * @param channels number of interleaved channels per pixel
 * @param size     side of the square kernel; KERNEL is read as KERNEL[ky * size + kx]
 * @param weight   factor applied to each accumulated sum before it is stored
 */
__global__ void
kernelConstant(unsigned char *input, unsigned char *output, int height, int width, int channels,
               int size, double weight) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Grid tail: the launch rounds up to whole blocks, so some threads have no pixel.
    if (row >= height || col >= width) {
        return;
    }

    const int half = size / 2;
    // Flat offsets are computed in size_t so large images cannot overflow int arithmetic.
    const size_t outBase = ((size_t) row * width + col) * channels;

    for (int ch = 0; ch < channels; ch++) {
        unsigned long long int acc = 0;

        for (int ky = 0; ky < size; ky++) {
            // Edge handling "extend": coordinates outside the image are clamped to the
            // nearest valid row/column. The row part does not depend on kx, so it is
            // computed once per kernel row.
            const int srcRow = min(max(row + ky - half, 0), height - 1);
            const size_t rowBase = (size_t) srcRow * width;

            for (int kx = 0; kx < size; kx++) {
                const int srcCol = min(max(col + kx - half, 0), width - 1);

                acc += (unsigned long long int) input[(rowBase + srcCol) * channels + ch] *
                       KERNEL[ky * size + kx];
            }
        }

        // long double is not supported in device code (nvcc demotes it to double),
        // so the scaling is done explicitly in double.
        acc = (unsigned long long int) ((double) acc * weight);
        // NOTE(review): the narrowing cast keeps only the low 8 bits; this assumes
        // weight scales the sum back into 0..255 — confirm against the Kernel producer.
        output[outBase + ch] = (unsigned char) acc;
    }
}
133
+
134
/**
 * Applies the convolution kernel `krn` to `img` on the GPU, using constant memory
 * for the kernel coefficients, and returns the filtered image.
 *
 * Steps: allocate device buffers for input and output, upload the pixels and the
 * coefficients (into the KERNEL symbol), launch kernelConstant with
 * BLOCK_SIZE x BLOCK_SIZE thread blocks, wait for completion, download the result
 * and release the device buffers.
 *
 * @param img source image; its width, height, channels and data fields are read
 * @param krn convolution kernel; its size, weight and coefficients fields are read
 * @return the image created by Image_new_empty with the same dimensions as img,
 *         whose data holds the filtered pixels
 *
 * NOTE(review): KERNEL has a fixed capacity; a krn with more coefficients than that
 * is expected to be rejected by cudaMemcpyToSymbol — confirm.
 */
Image *process(Image *img, Kernel *krn) {
    Image *res = Image_new_empty(img->width, img->height, img->channels);

    const size_t imageBytes = sizeof(unsigned char) * img->width * img->height * img->channels;
    if (imageBytes == 0) {
        // Nothing to filter; a zero-sized grid would be an invalid launch configuration.
        return res;
    }

    unsigned char *d_input;
    unsigned char *d_output;

    CUDA_CHECK_RETURN(cudaMalloc((void **) &d_input, imageBytes));
    CUDA_CHECK_RETURN(cudaMalloc((void **) &d_output, imageBytes));

    CUDA_CHECK_RETURN(cudaMemcpy((void *) d_input, (void *) img->data, imageBytes,
                                 cudaMemcpyHostToDevice));
    CUDA_CHECK_RETURN(cudaMemcpyToSymbol(KERNEL, (void *) krn->coefficients,
                                         sizeof(unsigned long long int) * krn->size * krn->size));

    // Integer ceil-division: one thread per pixel, rounded up to whole blocks.
    // (Going through float would lose exactness for very large dimensions.)
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 blocks((img->width + BLOCK_SIZE - 1) / BLOCK_SIZE,
                (img->height + BLOCK_SIZE - 1) / BLOCK_SIZE);

    kernelConstant<<<blocks, threads>>>(d_input, d_output, img->height, img->width, img->channels,
                                        krn->size, krn->weight);
    // A launch does not return a status itself: query it, then surface any
    // execution error at the synchronization point.
    CUDA_CHECK_RETURN(cudaGetLastError());
    CUDA_CHECK_RETURN(cudaDeviceSynchronize());

    CUDA_CHECK_RETURN(cudaMemcpy((void *) res->data, (void *) d_output, imageBytes,
                                 cudaMemcpyDeviceToHost));

    CUDA_CHECK_RETURN(cudaFree(d_input));
    CUDA_CHECK_RETURN(cudaFree(d_output));

    return res;
}
166
+ #endif
0 commit comments