8
8
#include " stb_image_write.h"
9
9
#include < time.h>
10
10
11
- #define INPUT_IMAGE " Images/img_9.png "
11
+ #define NUMBER_OF_IMAGES 10
12
12
13
13
typedef struct Pixel
14
14
{
@@ -25,146 +25,198 @@ int main(int argc, char **argv)
25
25
clock_t timer_start, timer_end;
26
26
timer_start = clock ();
27
27
28
- int deviceId ;
29
- int numberOfSMs ;
28
+ size_t threadsPerBlock = 128 ;
29
+ size_t numberOfBlocks = 32 ;
30
30
31
- cudaGetDevice (&deviceId);
32
- cudaDeviceGetAttribute (&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
31
+ cudaStream_t stream[10 ];
33
32
34
- size_t threadsPerBlock = 256 ;
35
- size_t numberOfBlocks = 32 * numberOfSMs;
33
+ for (int i = 0 ; i < 10 ; i++)
34
+ {
35
+ cudaStreamCreate (&stream[i]);
36
+ }
36
37
37
- cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7, stream8, stream9 ;
38
+ printf ( " Building filepaths \r\n " ) ;
38
39
39
- cudaStreamCreate (&stream0);
40
- cudaStreamCreate (&stream1);
41
- cudaStreamCreate (&stream2);
42
- cudaStreamCreate (&stream3);
43
- cudaStreamCreate (&stream4);
44
- cudaStreamCreate (&stream5);
45
- cudaStreamCreate (&stream6);
46
- cudaStreamCreate (&stream7);
47
- cudaStreamCreate (&stream8);
48
- cudaStreamCreate (&stream9);
40
+ const char *inputFileName[10 ] =
41
+ {
42
+ " Images/img_0.png" ,
43
+ " Images/img_1.png" ,
44
+ " Images/img_2.png" ,
45
+ " Images/img_3.png" ,
46
+ " Images/img_4.png" ,
47
+ " Images/img_5.png" ,
48
+ " Images/img_6.png" ,
49
+ " Images/img_7.png" ,
50
+ " Images/img_8.png" ,
51
+ " Images/img_9.png" ,
52
+ };
53
+
54
+ // Build output filename
55
+ const char *fileNameOutConvolution[10 ] =
56
+ {
57
+ " Output_Images/Convolution/OutputConvolution0.png" ,
58
+ " Output_Images/Convolution/OutputConvolution1.png" ,
59
+ " Output_Images/Convolution/OutputConvolution2.png" ,
60
+ " Output_Images/Convolution/OutputConvolution3.png" ,
61
+ " Output_Images/Convolution/OutputConvolution4.png" ,
62
+ " Output_Images/Convolution/OutputConvolution5.png" ,
63
+ " Output_Images/Convolution/OutputConvolution6.png" ,
64
+ " Output_Images/Convolution/OutputConvolution7.png" ,
65
+ " Output_Images/Convolution/OutputConvolution8.png" ,
66
+ " Output_Images/Convolution/OutputConvolution9.png" ,
67
+ };
68
+
69
+ const char *fileNameOutMinPooling[10 ] =
70
+ {
71
+ " Output_Images/Pooling/OutputMinPooling0.png" ,
72
+ " Output_Images/Pooling/OutputMinPooling1.png" ,
73
+ " Output_Images/Pooling/OutputMinPooling2.png" ,
74
+ " Output_Images/Pooling/OutputMinPooling3.png" ,
75
+ " Output_Images/Pooling/OutputMinPooling4.png" ,
76
+ " Output_Images/Pooling/OutputMinPooling5.png" ,
77
+ " Output_Images/Pooling/OutputMinPooling6.png" ,
78
+ " Output_Images/Pooling/OutputMinPooling7.png" ,
79
+ " Output_Images/Pooling/OutputMinPooling8.png" ,
80
+ " Output_Images/Pooling/OutputMinPooling9.png" ,
81
+ };
82
+
83
+ const char *fileNameOutMaxPooling[10 ] =
84
+ {
85
+ " Output_Images/Pooling/OutputMaxPooling0.png" ,
86
+ " Output_Images/Pooling/OutputMaxPooling1.png" ,
87
+ " Output_Images/Pooling/OutputMaxPooling2.png" ,
88
+ " Output_Images/Pooling/OutputMaxPooling3.png" ,
89
+ " Output_Images/Pooling/OutputMaxPooling4.png" ,
90
+ " Output_Images/Pooling/OutputMaxPooling5.png" ,
91
+ " Output_Images/Pooling/OutputMaxPooling6.png" ,
92
+ " Output_Images/Pooling/OutputMaxPooling7.png" ,
93
+ " Output_Images/Pooling/OutputMaxPooling8.png" ,
94
+ " Output_Images/Pooling/OutputMaxPooling9.png" ,
95
+ };
96
+
97
+ int width[10 ], height[10 ], componentCount[10 ], size[10 ];
98
+
99
+ unsigned char *originalImageHost[10 ];
100
+ unsigned char *imageDataConvolutionHost[10 ]; // Host copy of the convolved image (written to disk)
101
+ unsigned char *imageDataMinPoolingHost[10 ]; // Host copy of the min pooled image (written to disk)
102
+ unsigned char *imageDataMaxPoolingHost[10 ]; // Host copy of the max pooled image (written to disk)
103
+ unsigned char *originalImage[10 ]; // Saves the original image on device
104
+ unsigned char *imageDataGrayscale[10 ]; // Saves the grayscale image on device
105
+ unsigned char *imageDataConvolution[10 ]; // Saves the convolved image
106
+ unsigned char *imageDataMinPooling[10 ]; // Saves the min pooled image
107
+ unsigned char *imageDataMaxPooling[10 ]; // Saves the max pooled image
49
108
50
- // Open image
51
- printf (" Loading png file\r\n " );
109
+ printf (" Done\r\n " );
52
110
53
- int width, height, componentCount ;
111
+ printf ( " Loading png files \r\n " ) ;
54
112
55
- unsigned char *originalImageCPU = stbi_load (INPUT_IMAGE, &width, &height, &componentCount, 4 ); // Saves original image
56
- unsigned char *originalImage;
57
- unsigned char *imageDataGrayscale; // Saves grayscale image
58
- unsigned char *imageDataConvolution; // Saves output image
59
- unsigned char *imageDataMinPooling; // Saves Min pooling image
60
- unsigned char *imageDataMaxPooling; // Saves Max pooling image
113
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
114
+ {
115
+ originalImageHost[i] = stbi_load (inputFileName[i], &width[i], &height[i], &componentCount[i], 4 );
61
116
62
- int size = width * height * 4 ;
117
+ size[i] = height[i] * width[i] * 4 ;
63
118
64
- cudaMallocManaged ((unsigned char **)&originalImage, size);
65
- cudaMallocManaged ((unsigned char **)&imageDataGrayscale, size);
66
- cudaMallocManaged ((unsigned char **)&imageDataConvolution, size);
67
- cudaMallocManaged ((unsigned char **)&imageDataMinPooling, size);
68
- cudaMallocManaged ((unsigned char **)&imageDataMaxPooling, size);
119
+ // Saves output image
120
+ imageDataConvolutionHost[i] = (unsigned char *)malloc (size[i]);
69
121
70
- cudaMemPrefetchAsync (originalImage, size, deviceId);
71
- cudaMemPrefetchAsync (imageDataGrayscale, size, deviceId);
72
- cudaMemPrefetchAsync (imageDataConvolution, size, deviceId);
73
- cudaMemPrefetchAsync (imageDataMinPooling, size, deviceId);
74
- cudaMemPrefetchAsync (imageDataMaxPooling, size, deviceId);
122
+ // Saves Min pooling image
123
+ imageDataMinPoolingHost[i] = (unsigned char *)malloc (size[i]);
75
124
76
- cudaMemcpy (originalImage, originalImageCPU, size, cudaMemcpyHostToDevice);
125
+ // Saves Max pooling image
126
+ imageDataMaxPoolingHost[i] = (unsigned char *)malloc (size[i]);
77
127
78
- // Build output filename
79
- const char *fileNameOutConvolution = " Output_Images/Convolution/OutputConvolution.png" ;
80
- const char *fileNameOutMinPooling = " Output_Images/Pooling/OutputMinPooling.png" ;
81
- const char *fileNameOutMaxPooling = " Output_Images/Pooling/OutputMaxPooling.png" ;
128
+ cudaMalloc (&originalImage[i], size[i]);
129
+ cudaMalloc (&imageDataGrayscale[i], size[i]);
130
+ cudaMalloc (&imageDataConvolution[i], size[i]);
131
+ cudaMalloc (&imageDataMinPooling[i], size[i]);
132
+ cudaMalloc (&imageDataMaxPooling[i], size[i]);
82
133
83
- if (!originalImage)
84
- {
85
- printf (" Failed to open Image\r\n " );
86
- stbi_image_free (originalImageCPU);
87
- cudaFree (originalImage);
88
- cudaFree (imageDataGrayscale);
89
- cudaFree (imageDataConvolution);
90
- cudaFree (imageDataMinPooling);
91
- cudaFree (imageDataMaxPooling);
92
-
93
- return -1 ;
134
+ cudaMemcpy (originalImage[i], originalImageHost[i], size[i], cudaMemcpyHostToDevice);
94
135
}
95
136
96
137
printf (" Done\r\n " );
97
138
98
- // Validate image sizes
99
- if (width % 32 || height % 32 )
139
+ // Process grayscale
140
+ printf (" Processing images grayscale\r\n " );
141
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
100
142
{
101
- // NOTE: Leaked memory of "imageDataGrayscale"
102
- printf (" Width and/or Height is not dividable by 32!\r\n " );
103
- stbi_image_free (originalImageCPU);
104
- cudaFree (originalImage);
105
- cudaFree (imageDataGrayscale);
106
- cudaFree (imageDataConvolution);
107
- cudaFree (imageDataMinPooling);
108
- cudaFree (imageDataMaxPooling);
109
-
110
- return -1 ;
143
+ ConvertImageToGrayCpu<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>> (originalImage[i], imageDataGrayscale[i], width[i], height[i]);
111
144
}
112
-
113
- // Process image on cpu
114
- printf (" Processing image grayscale\r\n " );
115
- ConvertImageToGrayCpu<<<numberOfBlocks, threadsPerBlock>>> (originalImage, imageDataGrayscale, width, height);
116
145
cudaDeviceSynchronize ();
117
146
printf (" Done\r\n " );
118
147
119
- // Process image on cpu
148
+ // Process convolution
120
149
printf (" Processing image convolution\r\n " );
121
- convolveImage<<<numberOfBlocks, threadsPerBlock>>> (imageDataGrayscale, imageDataConvolution, width, height);
150
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
151
+ {
152
+ convolveImage<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>> (imageDataGrayscale[i], imageDataConvolution[i], width[i], height[i]);
153
+ }
122
154
cudaDeviceSynchronize ();
123
155
printf (" Done\r\n " );
124
156
125
- // Write image back to disk
126
- printf (" Writing convolved png to disk\r\n " );
127
- stbi_write_png (fileNameOutConvolution, width - 2 , height - 2 , 4 , imageDataConvolution, 4 * width);
157
+ // Process min pooling
158
+ printf (" Processing images minimum pooling\r\n " );
159
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
160
+ {
161
+ minPooling<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>> (originalImage[i], imageDataMinPooling[i], width[i], height[i]);
162
+ }
163
+ cudaDeviceSynchronize ();
128
164
printf (" Done\r\n " );
129
165
130
- printf (" Processing image minimum pooling\r\n " );
131
- minPooling<<<numberOfBlocks, threadsPerBlock>>> (originalImage, imageDataMinPooling, width, height);
132
- cudaDeviceSynchronize ();
166
+ // Process max pooling
167
+ printf (" Processing image maximum pooling\r\n " );
168
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
169
+ {
170
+ maxPooling<<<numberOfBlocks, threadsPerBlock>>> (originalImage[i], imageDataMaxPooling[i], width[i], height[i]);
171
+ cudaDeviceSynchronize ();
172
+ }
133
173
printf (" Done\r\n " );
174
+
175
+ // Writing Convolved images
134
176
135
177
// Write image back to disk
136
- printf (" Writing min pooling png to disk\r\n " );
137
- stbi_write_png (fileNameOutMinPooling, width / 2 , height / 2 , 4 , imageDataMinPooling, 4 * (width / 2 ));
178
+ printf (" Writing convolved png to disk\r\n " );
179
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
180
+ {
181
+ cudaMemcpy (imageDataConvolutionHost[i], imageDataConvolution[i], size[i], cudaMemcpyDeviceToHost);
182
+ stbi_write_png (fileNameOutConvolution[i], width[i] - 2 , height[i] - 2 , 4 , imageDataConvolutionHost[i], 4 * width[i]);
183
+ }
138
184
printf (" Done\r\n " );
139
-
140
- printf (" Processing image maximum pooling\r\n " );
141
- maxPooling<<<numberOfBlocks, threadsPerBlock>>> (originalImage, imageDataMaxPooling, width, height);
142
- cudaDeviceSynchronize ();
185
+
186
+ // Writing min pooled images
187
+ printf (" Writing min pooling png to disk\r\n " );
188
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
189
+ {
190
+ cudaMemcpy (imageDataMinPoolingHost[i], imageDataMinPooling[i], size[i], cudaMemcpyDeviceToHost);
191
+ stbi_write_png (fileNameOutMinPooling[i], width[i] / 2 , height[i] / 2 , 4 , imageDataMinPoolingHost[i], 4 * (width[i] / 2 ));
192
+ }
143
193
printf (" Done\r\n " );
144
194
145
- // Write image back to disk
195
+ // Writing max pooled images
146
196
printf (" Writing max pooling png to disk\r\n " );
147
- stbi_write_png (fileNameOutMaxPooling, width / 2 , height / 2 , 4 , imageDataMaxPooling, 4 * (width / 2 ));
197
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
198
+ {
199
+ cudaMemcpy (imageDataMaxPoolingHost[i], imageDataMaxPooling[i], size[i], cudaMemcpyDeviceToHost);
200
+ stbi_write_png (fileNameOutMaxPooling[i], width[i] / 2 , height[i] / 2 , 4 , imageDataMaxPoolingHost[i], 4 * (width[i] / 2 ));
201
+ }
148
202
printf (" Done\r\n " );
149
203
150
- stbi_image_free (originalImageCPU);
151
-
152
- cudaFree (originalImage);
153
- cudaFree (imageDataGrayscale);
154
- cudaFree (imageDataConvolution);
155
- cudaFree (imageDataMinPooling);
156
- cudaFree (imageDataMaxPooling);
157
-
158
- cudaStreamDestroy (stream0);
159
- cudaStreamDestroy (stream1);
160
- cudaStreamDestroy (stream2);
161
- cudaStreamDestroy (stream3);
162
- cudaStreamDestroy (stream4);
163
- cudaStreamDestroy (stream5);
164
- cudaStreamDestroy (stream6);
165
- cudaStreamDestroy (stream7);
166
- cudaStreamDestroy (stream8);
167
- cudaStreamDestroy (stream9);
204
+ // Free memory and destroy streams
205
+ for (int i = 0 ; i < NUMBER_OF_IMAGES; i++)
206
+ {
207
+ stbi_image_free (originalImageHost[i]);
208
+
209
+ free (imageDataConvolutionHost[i]);
210
+ free (imageDataMinPoolingHost[i]);
211
+ free (imageDataMaxPoolingHost[i]);
212
+
213
+ cudaFree (originalImage[i]);
214
+ cudaFree (imageDataConvolution[i]);
215
+ cudaFree (imageDataMinPooling[i]);
216
+ cudaFree (imageDataMaxPooling[i]);
217
+
218
+ cudaStreamDestroy (stream[i]);
219
+ }
168
220
169
221
timer_end = clock (); // end the timer
170
222
double time_spent = (double )(timer_end - timer_start) / CLOCKS_PER_SEC;
@@ -277,11 +329,13 @@ __global__ void minPooling(unsigned char *originalImage, unsigned char *minPooli
277
329
__global__ void maxPooling (unsigned char *originalImage, unsigned char *maxPoolingImage, int width, int height)
278
330
{
279
331
int counter = 0 ;
332
+ int idx = (threadIdx .x + blockIdx .x * blockDim .x ) * 4 ;
333
+ int gridStride = blockDim .x * gridDim .x ;
280
334
281
335
// Iterate over the image in 2x2 blocks
282
336
for (int y = 0 ; y < height; y += 2 )
283
337
{
284
- for (int x = 0 ; x < width; x += 2 )
338
+ for (int x = idx ; x < width; x += 2 )
285
339
{
286
340
// For each channel, find the maximum value in the 2x2 block
287
341
for (int c = 0 ; c < 4 ; c++)
0 commit comments