Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shared Memory Bitonic Sort #1

Merged
merged 9 commits into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions bitonic_sort.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef BITONIC_SORT_CUH
#define BITONIC_SORT_CUH

#include <climits>
#include <cuda_runtime.h>

__device__ int swap(int x, int mask, int dir);
__global__ void warpBitonicSort(int *arr, int size);
__global__ void smemBitonicSort(int *arr, int size);
void launchBitonicSort(int *arr, int size);

#endif // BITONIC_SORT_CUH
105 changes: 52 additions & 53 deletions main.cpp
Original file line number Diff line number Diff line change
@@ -1,75 +1,74 @@
#include "bitonic_sort.cuh"
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <cuda_runtime.h>
#include "warp_bitonic_sort.cuh"

// Function to check if the array is sorted
bool isSorted(int* arr, int size) {
for (int i = 1; i < size; i++) {
if (arr[i] < arr[i-1]) return false;
}
return true;
bool isSorted(int *arr, int size) {
for (int i = 1; i < size; i++) {
if (arr[i] < arr[i - 1])
return false;
}
return true;
}

int main() {
const int SIZE = 4096; // Must be a multiple of 32 for this example
const int BLOCK_SIZE = 256;
const int SIZE = 1024; // Must be a multiple of 32 for this example

// Allocate and initialize host array
int* h_arr = new int[SIZE];
for (int i = 0; i < SIZE; i++) {
h_arr[i] = rand() % 1000; // Random integers between 0 and 999
}
// Allocate and initialize host array
int *h_arr = new int[SIZE];
for (int i = 0; i < SIZE; i++) {
h_arr[i] = rand() % 1000; // Random integers between 0 and 999
}

// Allocate device array
int* d_arr;
cudaMalloc(&d_arr, SIZE * sizeof(int));
// Allocate device array
int *d_arr;
cudaMalloc(&d_arr, SIZE * sizeof(int));

// Copy host array to device
cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);
// Copy host array to device
cudaMemcpy(d_arr, h_arr, SIZE * sizeof(int), cudaMemcpyHostToDevice);

// Create CUDA events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Create CUDA events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Record the start event
cudaEventRecord(start, nullptr);
// Record the start event
cudaEventRecord(start, nullptr);

// Launch kernel
launchWarpBitonicSort(d_arr, SIZE);
// Launch kernel
launchBitonicSort(d_arr, SIZE);

// Record the stop event
cudaEventRecord(stop, nullptr);
cudaEventSynchronize(stop);
// Record the stop event
cudaEventRecord(stop, nullptr);
cudaEventSynchronize(stop);

// Calculate elapsed time
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
// Calculate elapsed time
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);

// Copy result back to host
cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);
// Copy result back to host
cudaMemcpy(h_arr, d_arr, SIZE * sizeof(int), cudaMemcpyDeviceToHost);

// Check if sorted
bool sorted = isSorted(h_arr, SIZE);
printf("Array is %s\n", sorted ? "sorted" : "not sorted");
// Check if sorted
bool sorted = isSorted(h_arr, SIZE);
printf("Array is %s\n", sorted ? "sorted" : "not sorted");

// Print first few elements to verify
printf("First 32 elements: ");
for (int i = 0; i < 32; i++) {
printf("%d ", h_arr[i]);
}
printf("\n");
// Print first few elements to verify
printf("First 32 elements: ");
for (int i = 0; i < 32; i++) {
printf("%d ", h_arr[i]);
}
printf("\n");

// Print timing information
printf("Kernel execution time: %f milliseconds\n", milliseconds);
// Print timing information
printf("Kernel execution time: %f milliseconds\n", milliseconds);

// Clean up
delete[] h_arr;
cudaFree(d_arr);
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Clean up
delete[] h_arr;
cudaFree(d_arr);
cudaEventDestroy(start);
cudaEventDestroy(stop);

return 0;
return 0;
}
16 changes: 11 additions & 5 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
CXX = g++
NVCC = nvcc
CXXFLAGS = -std=c++11 -O2
NVCCFLAGS = -O2
NVCCFLAGS = -O2 -G -g
CUDA_PATH = /opt/cuda
INCLUDES = -I$(CUDA_PATH)/include
LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart

all: warp_bitonic_sort cpu_bitonic_sort
all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort

warp_bitonic_sort: main.o warp_bitonic_sort.o
$(CXX) $^ -o $@ $(LDFLAGS)

smem_bitonic_sort: main.o smem_bitonic_sort.o
$(CXX) $^ -o $@ $(LDFLAGS)

cpu_bitonic_sort: cpu_bitonic_sort.cpp
$(CXX) $^ -o $@

main.o: main.cpp warp_bitonic_sort.cuh
main.o: main.cpp bitonic_sort.cuh
$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@

warp_bitonic_sort.o: warp_bitonic_sort.cu warp_bitonic_sort.cuh
warp_bitonic_sort.o: warp_bitonic_sort.cu bitonic_sort.cuh
$(NVCC) $(NVCCFLAGS) -c $< -o $@

smem_bitonic_sort.o: smem_bitonic_sort.cu bitonic_sort.cuh
$(NVCC) $(NVCCFLAGS) -c $< -o $@

clean:
rm -f *.o warp_bitonic_sort cpu_bitonic_sort
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort
100 changes: 100 additions & 0 deletions smem_bitonic_sort.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/**
* SMEM Bitoic Sort
*
* This uses shared memory to sort arrays. This uses warp shffle operator to
* compare and swap
*
* Author: Andrew Boessen
*/

#include "bitonic_sort.cuh"

/**
* Swap
*
* This is used for swapping elements in bitonic sorting
*
* @param x caller line id's value
* @param mask source lane id = caller line id ^ mask
* @param dir direction to swap
*
* @return min or max of source and caller
*/
__device__ int swap(int x, int mask, int dir) {
// get correspondin element to x in butterfly diagram
int y = __shfl_xor_sync(0xffffffff, x, mask);
// return smaller or larger value based on direction of swap
return x < y == dir ? y : x;
}

/**
* SMEM Bitonic Sort
*
* This function performs a bitonic sort on integers whithin a thread blocks of
* 1024 threads. This stores itermediate products in shared memory for better
* efficiency.
*
* The function uses the butterfly network pattern of bitonic sort, leveraging
* CUDA's warp-level primitives for efficient sorting within a warp (32
* threads). The swaps are tiled into warps of 32 threads. This is able to do
* swaps without allocating extra memory for temporary variable.
*
* @param arr Pointer to the array of integers to be sorted
* @param size Total number of elements in the array
*
* @note This function assumes that the number of threads per block is at least
* equal to the warp size. Elements beyond the array size are padded with
* INT_MAX.
*
* @see swap() for the element comparison and swapping logic
*/
__global__ void smemBitonicSort(int *arr, int size) {
// shared memory for block of 1024 threads
extern __shared__ int smem[];

// local thread id in block
int thread_id = threadIdx.x;

// seed shared memory array with value from global array
// pad overflow threads with INT_MAX
smem[thread_id] = thread_id < size ? arr[thread_id] : INT_MAX;
__syncthreads();

// make bitonic sequence and sort
for (int i = 0; (1 << i) <= size; i++) {
for (int j = 0; j <= i; j++) {
// distance between caller and source lanes
int offset = 1 << (i - j - 1);
// direction to swap caller and source lanes
int dir;
// only alternate direction when forming bitonic sequence
if (1 << i == blockDim.x) {
dir = (thread_id >> (i - j)) & 1;
} else {
dir = (thread_id >> (i + 1)) & 1 ^ (thread_id >> (i - j)) & 1;
}
if (1 << i <= warpSize) {
smem[thread_id] = swap(smem[thread_id], offset, dir);
} else {
__syncthreads();
int partner_val = smem[thread_id ^ offset];
int val = smem[thread_id];
// compare and swap elements
smem[thread_id] = val < partner_val == dir ? val : partner_val;
smem[thread_id ^ offset] = val < partner_val == dir ? partner_val : val;
}
}
}
__syncthreads();

// update value in array with sorted value
if (thread_id < size) {
arr[thread_id] = smem[thread_id];
}
}

void launchBitonicSort(int *arr, int size) {
const int BLOCK_SIZE = 1024;
smemBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE * sizeof(int)>>>(
arr, size);
}
57 changes: 30 additions & 27 deletions warp_bitonic_sort.cu
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
/**
* Warp Bitoic Sort
*
* This uses warp shuffle to sort integers in a warp with bitonic sort
*
* Author: Andrew Boessen
*/
* Warp Bitoic Sort
*
* This uses warp shuffle to sort integers in a warp with bitonic sort
*
* Author: Andrew Boessen
*/

#include "warp_bitonic_sort.cuh"
#include "bitonic_sort.cuh"

/**
* Swap
*
* This is used for swapping elements in bitonic sorting
*
* @param x caller line id's value
* @param mask source lane id = caller line id ^ mask
* @param dir direction to swap
*
* @return min or max of source and caller
*/
* Swap
*
* This is used for swapping elements in bitonic sorting
*
* @param x caller line id's value
* @param mask source lane id = caller line id ^ mask
* @param dir direction to swap
*
* @return min or max of source and caller
*/
__device__ int swap(int x, int mask, int dir) {
// get correspondin element to x in butterfly diagram
int y = __shfl_xor_sync(0xffffffff, x, mask);
Expand All @@ -29,17 +29,20 @@ __device__ int swap(int x, int mask, int dir) {
/**
* Warp Bitonic Sort
*
* This function performs a bitonic sort on integers within a warp using warp shuffle operations.
* It sorts a portion of the input array corresponding to the calling thread's warp.
* This function performs a bitonic sort on integers within a warp using warp
* shuffle operations. It sorts a portion of the input array corresponding to
* the calling thread's warp.
*
* The function uses the butterfly network pattern of bitonic sort, leveraging CUDA's warp-level
* primitives for efficient sorting within a warp (32 threads).
* The function uses the butterfly network pattern of bitonic sort, leveraging
* CUDA's warp-level primitives for efficient sorting within a warp (32
* threads).
*
* @param arr Pointer to the array of integers to be sorted
* @param size Total number of elements in the array
*
* @note This function assumes that the number of threads per block is at least equal to the warp size.
* Elements beyond the array size are padded with INT_MAX.
* @note This function assumes that the number of threads per block is at least
* equal to the warp size. Elements beyond the array size are padded with
* INT_MAX.
*
* @see swap() for the element comparison and swapping logic
*/
Expand All @@ -54,7 +57,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
for (int i = 0; (1 << i) <= warpSize; i++) {
for (int j = 0; j <= i; j++) {
// distance between caller and source lanes
int mask = 1 << (i-j);
int mask = 1 << (i - j);
// direction to swap caller and source lanes
int dir;
// only alternate direction when forming bitonic sequence
Expand All @@ -73,7 +76,7 @@ __global__ void warpBitonicSort(int *arr, int size) {
}
}

void launchWarpBitonicSort(int *arr, int size) {
const int BLOCK_SIZE = 256;
warpBitonicSort<<<size/BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
void launchBitonicSort(int *arr, int size) {
const int BLOCK_SIZE = 256;
warpBitonicSort<<<size / BLOCK_SIZE, BLOCK_SIZE>>>(arr, size);
}
11 changes: 0 additions & 11 deletions warp_bitonic_sort.cuh

This file was deleted.