Skip to content
This repository was archived by the owner on Jan 11, 2023. It is now read-only.

Commit ea837f3

Browse files
Sem Kirkels authored and committed
Completed streams
1 parent 2aadfc7 commit ea837f3

File tree

4 files changed: +191 -108 lines changed

.gitignore

+32-3
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,35 @@ img7_convolution.png
6464
img8_convolution.png
6565
img9_convolution.png
6666

67-
OutputConvolution.png
68-
OutputMaxPooling.png
69-
OutputMinPooling.png
67+
OutputConvolution0.png
68+
OutputConvolution1.png
69+
OutputConvolution2.png
70+
OutputConvolution3.png
71+
OutputConvolution4.png
72+
OutputConvolution5.png
73+
OutputConvolution6.png
74+
OutputConvolution7.png
75+
OutputConvolution8.png
76+
OutputConvolution9.png
77+
78+
OutputMaxPooling0.png
79+
OutputMaxPooling1.png
80+
OutputMaxPooling2.png
81+
OutputMaxPooling3.png
82+
OutputMaxPooling4.png
83+
OutputMaxPooling5.png
84+
OutputMaxPooling6.png
85+
OutputMaxPooling7.png
86+
OutputMaxPooling8.png
87+
OutputMaxPooling9.png
88+
89+
OutputMinPooling0.png
90+
OutputMinPooling1.png
91+
OutputMinPooling2.png
92+
OutputMinPooling3.png
93+
OutputMinPooling4.png
94+
OutputMinPooling5.png
95+
OutputMinPooling6.png
96+
OutputMinPooling7.png
97+
OutputMinPooling8.png
98+
OutputMinPooling9.png

CUDA/Images/img_5.png

-877 KB
Binary file not shown.

CUDA/imgfilter_cuda.cu

+159-105
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
#include "stb_image_write.h"
99
#include <time.h>
1010

11-
#define INPUT_IMAGE "Images/img_9.png"
11+
#define NUMBER_OF_IMAGES 10
1212

1313
typedef struct Pixel
1414
{
@@ -25,146 +25,198 @@ int main(int argc, char **argv)
2525
clock_t timer_start, timer_end;
2626
timer_start = clock();
2727

28-
int deviceId;
29-
int numberOfSMs;
28+
size_t threadsPerBlock = 128;
29+
size_t numberOfBlocks = 32;
3030

31-
cudaGetDevice(&deviceId);
32-
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
31+
cudaStream_t stream[10];
3332

34-
size_t threadsPerBlock = 256;
35-
size_t numberOfBlocks = 32 * numberOfSMs;
33+
for(int i = 0; i < 10; i++)
34+
{
35+
cudaStreamCreate(&stream[i]);
36+
}
3637

37-
cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7, stream8, stream9;
38+
printf("Building filepaths\r\n");
3839

39-
cudaStreamCreate(&stream0);
40-
cudaStreamCreate(&stream1);
41-
cudaStreamCreate(&stream2);
42-
cudaStreamCreate(&stream3);
43-
cudaStreamCreate(&stream4);
44-
cudaStreamCreate(&stream5);
45-
cudaStreamCreate(&stream6);
46-
cudaStreamCreate(&stream7);
47-
cudaStreamCreate(&stream8);
48-
cudaStreamCreate(&stream9);
40+
const char *inputFileName[10] =
41+
{
42+
"Images/img_0.png",
43+
"Images/img_1.png",
44+
"Images/img_2.png",
45+
"Images/img_3.png",
46+
"Images/img_4.png",
47+
"Images/img_5.png",
48+
"Images/img_6.png",
49+
"Images/img_7.png",
50+
"Images/img_8.png",
51+
"Images/img_9.png",
52+
};
53+
54+
// Build output filename
55+
const char *fileNameOutConvolution[10] =
56+
{
57+
"Output_Images/Convolution/OutputConvolution0.png",
58+
"Output_Images/Convolution/OutputConvolution1.png",
59+
"Output_Images/Convolution/OutputConvolution2.png",
60+
"Output_Images/Convolution/OutputConvolution3.png",
61+
"Output_Images/Convolution/OutputConvolution4.png",
62+
"Output_Images/Convolution/OutputConvolution5.png",
63+
"Output_Images/Convolution/OutputConvolution6.png",
64+
"Output_Images/Convolution/OutputConvolution7.png",
65+
"Output_Images/Convolution/OutputConvolution8.png",
66+
"Output_Images/Convolution/OutputConvolution9.png",
67+
};
68+
69+
const char *fileNameOutMinPooling[10] =
70+
{
71+
"Output_Images/Pooling/OutputMinPooling0.png",
72+
"Output_Images/Pooling/OutputMinPooling1.png",
73+
"Output_Images/Pooling/OutputMinPooling2.png",
74+
"Output_Images/Pooling/OutputMinPooling3.png",
75+
"Output_Images/Pooling/OutputMinPooling4.png",
76+
"Output_Images/Pooling/OutputMinPooling5.png",
77+
"Output_Images/Pooling/OutputMinPooling6.png",
78+
"Output_Images/Pooling/OutputMinPooling7.png",
79+
"Output_Images/Pooling/OutputMinPooling8.png",
80+
"Output_Images/Pooling/OutputMinPooling9.png",
81+
};
82+
83+
const char *fileNameOutMaxPooling[10] =
84+
{
85+
"Output_Images/Pooling/OutputMaxPooling0.png",
86+
"Output_Images/Pooling/OutputMaxPooling1.png",
87+
"Output_Images/Pooling/OutputMaxPooling2.png",
88+
"Output_Images/Pooling/OutputMaxPooling3.png",
89+
"Output_Images/Pooling/OutputMaxPooling4.png",
90+
"Output_Images/Pooling/OutputMaxPooling5.png",
91+
"Output_Images/Pooling/OutputMaxPooling6.png",
92+
"Output_Images/Pooling/OutputMaxPooling7.png",
93+
"Output_Images/Pooling/OutputMaxPooling8.png",
94+
"Output_Images/Pooling/OutputMaxPooling9.png",
95+
};
96+
97+
int width[10], height[10], componentCount[10], size[10];
98+
99+
unsigned char *originalImageHost[10];
100+
unsigned char *imageDataConvolutionHost[10]; // Saves output image
101+
unsigned char *imageDataMinPoolingHost[10]; // Saves Min pooling image
102+
unsigned char *imageDataMaxPoolingHost[10]; // Saves Max pooling image
103+
unsigned char *originalImage[10]; // Saves the original image on host
104+
unsigned char *imageDataGrayscale[10]; // Saves the grayscale image on device
105+
unsigned char *imageDataConvolution[10]; // Saves the convolved image
106+
unsigned char *imageDataMinPooling[10]; // Saves the min pooled image
107+
unsigned char *imageDataMaxPooling[10]; // Saves the max pooled image
49108

50-
// Open image
51-
printf("Loading png file\r\n");
109+
printf("Done\r\n");
52110

53-
int width, height, componentCount;
111+
printf("Loading png files\r\n");
54112

55-
unsigned char *originalImageCPU = stbi_load(INPUT_IMAGE, &width, &height, &componentCount, 4); // Saves original image
56-
unsigned char *originalImage;
57-
unsigned char *imageDataGrayscale; // Saves grayscale image
58-
unsigned char *imageDataConvolution; // Saves output image
59-
unsigned char *imageDataMinPooling; // Saves Min pooling image
60-
unsigned char *imageDataMaxPooling; // Saves Max pooling image
113+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
114+
{
115+
originalImageHost[i] = stbi_load(inputFileName[i], &width[i], &height[i], &componentCount[i], 4);
61116

62-
int size = width * height * 4;
117+
size[i] = height[i] * width[i] * 4;
63118

64-
cudaMallocManaged((unsigned char **)&originalImage, size);
65-
cudaMallocManaged((unsigned char **)&imageDataGrayscale, size);
66-
cudaMallocManaged((unsigned char **)&imageDataConvolution, size);
67-
cudaMallocManaged((unsigned char **)&imageDataMinPooling, size);
68-
cudaMallocManaged((unsigned char **)&imageDataMaxPooling, size);
119+
// Saves output image
120+
imageDataConvolutionHost[i] = (unsigned char *)malloc(size[i]);
69121

70-
cudaMemPrefetchAsync(originalImage, size, deviceId);
71-
cudaMemPrefetchAsync(imageDataGrayscale, size, deviceId);
72-
cudaMemPrefetchAsync(imageDataConvolution, size, deviceId);
73-
cudaMemPrefetchAsync(imageDataMinPooling, size, deviceId);
74-
cudaMemPrefetchAsync(imageDataMaxPooling, size, deviceId);
122+
// Saves Min pooling image
123+
imageDataMinPoolingHost[i] = (unsigned char *)malloc(size[i]);
75124

76-
cudaMemcpy(originalImage, originalImageCPU, size, cudaMemcpyHostToDevice);
125+
// Saves Max pooling image
126+
imageDataMaxPoolingHost[i] = (unsigned char *)malloc(size[i]);
77127

78-
// Build output filename
79-
const char *fileNameOutConvolution = "Output_Images/Convolution/OutputConvolution.png";
80-
const char *fileNameOutMinPooling = "Output_Images/Pooling/OutputMinPooling.png";
81-
const char *fileNameOutMaxPooling = "Output_Images/Pooling/OutputMaxPooling.png";
128+
cudaMalloc(&originalImage[i], size[i]);
129+
cudaMalloc(&imageDataGrayscale[i], size[i]);
130+
cudaMalloc(&imageDataConvolution[i], size[i]);
131+
cudaMalloc(&imageDataMinPooling[i], size[i]);
132+
cudaMalloc(&imageDataMaxPooling[i], size[i]);
82133

83-
if (!originalImage)
84-
{
85-
printf("Failed to open Image\r\n");
86-
stbi_image_free(originalImageCPU);
87-
cudaFree(originalImage);
88-
cudaFree(imageDataGrayscale);
89-
cudaFree(imageDataConvolution);
90-
cudaFree(imageDataMinPooling);
91-
cudaFree(imageDataMaxPooling);
92-
93-
return -1;
134+
cudaMemcpy(originalImage[i], originalImageHost[i], size[i], cudaMemcpyHostToDevice);
94135
}
95136

96137
printf("Done\r\n");
97138

98-
// Validate image sizes
99-
if (width % 32 || height % 32)
139+
// Process grayscale
140+
printf("Processing images grayscale\r\n");
141+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
100142
{
101-
// NOTE: Leaked memory of "imageDataGrayscale"
102-
printf("Width and/or Height is not dividable by 32!\r\n");
103-
stbi_image_free(originalImageCPU);
104-
cudaFree(originalImage);
105-
cudaFree(imageDataGrayscale);
106-
cudaFree(imageDataConvolution);
107-
cudaFree(imageDataMinPooling);
108-
cudaFree(imageDataMaxPooling);
109-
110-
return -1;
143+
ConvertImageToGrayCpu<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>>(originalImage[i], imageDataGrayscale[i], width[i], height[i]);
111144
}
112-
113-
// Process image on cpu
114-
printf("Processing image grayscale\r\n");
115-
ConvertImageToGrayCpu<<<numberOfBlocks, threadsPerBlock>>>(originalImage, imageDataGrayscale, width, height);
116145
cudaDeviceSynchronize();
117146
printf("Done\r\n");
118147

119-
// Process image on cpu
148+
// Process convolution
120149
printf("Processing image convolution\r\n");
121-
convolveImage<<<numberOfBlocks, threadsPerBlock>>>(imageDataGrayscale, imageDataConvolution, width, height);
150+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
151+
{
152+
convolveImage<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>>(imageDataGrayscale[i], imageDataConvolution[i], width[i], height[i]);
153+
}
122154
cudaDeviceSynchronize();
123155
printf("Done\r\n");
124156

125-
// Write image back to disk
126-
printf("Writing convolved png to disk\r\n");
127-
stbi_write_png(fileNameOutConvolution, width - 2, height - 2, 4, imageDataConvolution, 4 * width);
157+
// Process min pooling
158+
printf("Processing images minimum pooling\r\n");
159+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
160+
{
161+
minPooling<<<numberOfBlocks, threadsPerBlock, i, stream[i]>>>(originalImage[i], imageDataMinPooling[i], width[i], height[i]);
162+
}
163+
cudaDeviceSynchronize();
128164
printf("Done\r\n");
129165

130-
printf("Processing image minimum pooling\r\n");
131-
minPooling<<<numberOfBlocks, threadsPerBlock>>>(originalImage, imageDataMinPooling, width, height);
132-
cudaDeviceSynchronize();
166+
// Process max pooling
167+
printf("Processing image maximum pooling\r\n");
168+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
169+
{
170+
maxPooling<<<numberOfBlocks, threadsPerBlock>>>(originalImage[i], imageDataMaxPooling[i], width[i], height[i]);
171+
cudaDeviceSynchronize();
172+
}
133173
printf("Done\r\n");
174+
175+
// Writing Convolved images
134176

135177
// Write image back to disk
136-
printf("Writing min pooling png to disk\r\n");
137-
stbi_write_png(fileNameOutMinPooling, width / 2, height / 2, 4, imageDataMinPooling, 4 * (width / 2));
178+
printf("Writing convolved png to disk\r\n");
179+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
180+
{
181+
cudaMemcpy(imageDataConvolutionHost[i], imageDataConvolution[i], size[i], cudaMemcpyDeviceToHost);
182+
stbi_write_png(fileNameOutConvolution[i], width[i] - 2, height[i] - 2, 4, imageDataConvolutionHost[i], 4 * width[i]);
183+
}
138184
printf("Done\r\n");
139-
140-
printf("Processing image maximum pooling\r\n");
141-
maxPooling<<<numberOfBlocks, threadsPerBlock>>>(originalImage, imageDataMaxPooling, width, height);
142-
cudaDeviceSynchronize();
185+
186+
// Writing min pooled images
187+
printf("Writing min pooling png to disk\r\n");
188+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
189+
{
190+
cudaMemcpy(imageDataMinPoolingHost[i], imageDataMinPooling[i], size[i], cudaMemcpyDeviceToHost);
191+
stbi_write_png(fileNameOutMinPooling[i], width[i] / 2, height[i] / 2, 4, imageDataMinPoolingHost[i], 4 * (width[i] / 2));
192+
}
143193
printf("Done\r\n");
144194

145-
// Write image back to disk
195+
// Writing max pooled images
146196
printf("Writing max pooling png to disk\r\n");
147-
stbi_write_png(fileNameOutMaxPooling, width / 2, height / 2, 4, imageDataMaxPooling, 4 * (width / 2));
197+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
198+
{
199+
cudaMemcpy(imageDataMaxPoolingHost[i], imageDataMaxPooling[i], size[i], cudaMemcpyDeviceToHost);
200+
stbi_write_png(fileNameOutMaxPooling[i], width[i] / 2, height[i] / 2, 4, imageDataMaxPoolingHost[i], 4 * (width[i] / 2));
201+
}
148202
printf("Done\r\n");
149203

150-
stbi_image_free(originalImageCPU);
151-
152-
cudaFree(originalImage);
153-
cudaFree(imageDataGrayscale);
154-
cudaFree(imageDataConvolution);
155-
cudaFree(imageDataMinPooling);
156-
cudaFree(imageDataMaxPooling);
157-
158-
cudaStreamDestroy(stream0);
159-
cudaStreamDestroy(stream1);
160-
cudaStreamDestroy(stream2);
161-
cudaStreamDestroy(stream3);
162-
cudaStreamDestroy(stream4);
163-
cudaStreamDestroy(stream5);
164-
cudaStreamDestroy(stream6);
165-
cudaStreamDestroy(stream7);
166-
cudaStreamDestroy(stream8);
167-
cudaStreamDestroy(stream9);
204+
// Free memory and destroy streams
205+
for(int i = 0; i < NUMBER_OF_IMAGES; i++)
206+
{
207+
stbi_image_free(originalImageHost[i]);
208+
209+
free(imageDataConvolutionHost[i]);
210+
free(imageDataMinPoolingHost[i]);
211+
free(imageDataMaxPoolingHost[i]);
212+
213+
cudaFree(originalImage[i]);
214+
cudaFree(imageDataConvolution[i]);
215+
cudaFree(imageDataMinPooling[i]);
216+
cudaFree(imageDataMaxPooling[i]);
217+
218+
cudaStreamDestroy(stream[i]);
219+
}
168220

169221
timer_end = clock(); // end the timer
170222
double time_spent = (double)(timer_end - timer_start) / CLOCKS_PER_SEC;
@@ -277,11 +329,13 @@ __global__ void minPooling(unsigned char *originalImage, unsigned char *minPooli
277329
__global__ void maxPooling(unsigned char *originalImage, unsigned char *maxPoolingImage, int width, int height)
278330
{
279331
int counter = 0;
332+
int idx = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
333+
int gridStride = blockDim.x * gridDim.x;
280334

281335
// Iterate over the image in 2x2 blocks
282336
for (int y = 0; y < height; y += 2)
283337
{
284-
for (int x = 0; x < width; x += 2)
338+
for (int x = idx; x < width; x += 2)
285339
{
286340
// For each channel, find the maximum value in the 2x2 block
287341
for (int c = 0; c < 4; c++)

Test_Results/Results_Markdown/Results.md

Whitespace-only changes.

0 commit comments

Comments
 (0)