Skip to content

Commit b12fe4d

Browse files
authored
Merge pull request #179 from sterrettm2/qsort-openmp
Adds OpenMP to qsort, should also improve test speed a bit
2 parents 9fd995b + e01e79f commit b12fe4d

9 files changed

+110
-22
lines changed

lib/meson.build

+2-2
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ if cpp.has_argument('-march=icelake-client')
3434
'x86simdsort-icl.cpp',
3535
),
3636
include_directories : [src],
37-
cpp_args : ['-march=icelake-client'],
37+
cpp_args : ['-march=icelake-client', openmpflags],
3838
gnu_symbol_visibility : 'inlineshidden',
3939
)
4040
endif
@@ -45,7 +45,7 @@ if cancompilefp16
4545
'x86simdsort-spr.cpp',
4646
),
4747
include_directories : [src],
48-
cpp_args : ['-march=sapphirerapids'],
48+
cpp_args : ['-march=sapphirerapids', openmpflags],
4949
gnu_symbol_visibility : 'inlineshidden',
5050
)
5151
endif

src/avx512-16bit-qsort.hpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -556,6 +556,7 @@ avx512_qsort_fp16(uint16_t *arr,
556556
{
557557
using vtype = zmm_vector<float16>;
558558

559+
// TODO multithreading support here
559560
if (arrsize > 1) {
560561
arrsize_t nan_count = 0;
561562
if (UNLIKELY(hasnan)) {
@@ -564,11 +565,11 @@ avx512_qsort_fp16(uint16_t *arr,
564565
}
565566
if (descending) {
566567
qsort_<vtype, Comparator<vtype, true>, uint16_t>(
567-
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
568+
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
568569
}
569570
else {
570571
qsort_<vtype, Comparator<vtype, false>, uint16_t>(
571-
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
572+
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
572573
}
573574
replace_inf_with_nan(arr, arrsize, nan_count, descending);
574575
}

src/avx512fp16-16bit-qsort.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ struct zmm_vector<_Float16> {
2222
using opmask_t = __mmask32;
2323
static const uint8_t numlanes = 32;
2424
static constexpr int network_sort_threshold = 128;
25-
static constexpr int partition_unroll_factor = 0;
25+
static constexpr int partition_unroll_factor = 8;
2626
static constexpr simd_type vec_type = simd_type::AVX512;
2727

2828
using swizzle_ops = avx512_16bit_swizzle_ops;

src/xss-common-includes.h

+5
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@
8282
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \
8383
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
8484

85+
#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
86+
#define XSS_COMPILE_OPENMP
87+
#include <omp.h>
88+
#endif
89+
8590
template <class... T>
8691
constexpr bool always_false = false;
8792

src/xss-common-keyvaluesort.hpp

-5
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
#include "xss-common-qsort.h"
1212
#include "xss-network-keyvaluesort.hpp"
1313

14-
#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
15-
#define XSS_COMPILE_OPENMP
16-
#include <omp.h>
17-
#endif
18-
1914
/*
2015
* Sort all the NaNs to the end of the array and return the index of the last elem
2116
* in the array which is not a nan

src/xss-common-qsort.h

+71-5
Original file line numberDiff line numberDiff line change
@@ -521,8 +521,11 @@ template <typename vtype, int maxN>
521521
void sort_n(typename vtype::type_t *arr, int N);
522522

523523
template <typename vtype, typename comparator, typename type_t>
524-
static void
525-
qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
524+
static void qsort_(type_t *arr,
525+
arrsize_t left,
526+
arrsize_t right,
527+
arrsize_t max_iters,
528+
arrsize_t task_threshold)
526529
{
527530
/*
528531
* Resort to std::sort if quicksort isn't making any progress
@@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
559562
type_t leftmostValue = comparator::leftmost(smallest, biggest);
560563
type_t rightmostValue = comparator::rightmost(smallest, biggest);
561564

565+
#ifdef XSS_COMPILE_OPENMP
566+
if (pivot != leftmostValue) {
567+
bool parallel_left = (pivot_index - left) > task_threshold;
568+
if (parallel_left) {
569+
#pragma omp task
570+
qsort_<vtype, comparator>(
571+
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
572+
}
573+
else {
574+
qsort_<vtype, comparator>(
575+
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
576+
}
577+
}
578+
if (pivot != rightmostValue) {
579+
bool parallel_right = (right - pivot_index) > task_threshold;
580+
581+
if (parallel_right) {
582+
#pragma omp task
583+
qsort_<vtype, comparator>(
584+
arr, pivot_index, right, max_iters - 1, task_threshold);
585+
}
586+
else {
587+
qsort_<vtype, comparator>(
588+
arr, pivot_index, right, max_iters - 1, task_threshold);
589+
}
590+
}
591+
#else
592+
UNUSED(task_threshold);
593+
562594
if (pivot != leftmostValue)
563-
qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1);
595+
qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1, 0);
564596
if (pivot != rightmostValue)
565-
qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1);
597+
qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1, 0);
598+
#endif
566599
}
567600

568601
template <typename vtype, typename comparator, typename type_t>
@@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
627660
}
628661

629662
UNUSED(hasnan);
663+
664+
#ifdef XSS_COMPILE_OPENMP
665+
666+
bool use_parallel = arrsize > 100000;
667+
668+
if (use_parallel) {
669+
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
670+
constexpr int thread_limit = 8;
671+
int thread_count = std::min(thread_limit, omp_get_max_threads());
672+
arrsize_t task_threshold
673+
= std::max((arrsize_t)100000, arrsize / 100);
674+
675+
// We use omp parallel and then omp single to set up the threads that will run the omp task calls in qsort_
676+
// The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
677+
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
678+
#pragma omp parallel num_threads(thread_count)
679+
#pragma omp single
680+
qsort_<vtype, comparator, T>(arr,
681+
0,
682+
arrsize - 1,
683+
2 * (arrsize_t)log2(arrsize),
684+
task_threshold);
685+
}
686+
else {
687+
qsort_<vtype, comparator, T>(arr,
688+
0,
689+
arrsize - 1,
690+
2 * (arrsize_t)log2(arrsize),
691+
std::numeric_limits<arrsize_t>::max());
692+
}
693+
#pragma omp taskwait
694+
#else
630695
qsort_<vtype, comparator, T>(
631-
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
696+
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
697+
#endif
632698

633699
replace_inf_with_nan(arr, arrsize, nan_count, descending);
634700
}

tests/meson.build

+7
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
libtests = []
22

3+
if get_option('use_openmp')
4+
openmpflags = ['-DXSS_USE_OPENMP=true']
5+
endif
6+
37
libtests += static_library('tests_qsort',
48
files('test-qsort.cpp', ),
59
dependencies: gtest_dep,
610
include_directories : [src, lib, utils],
11+
cpp_args : [openmpflags],
712
)
813

914
libtests += static_library('tests_kvsort',
1015
files('test-keyvalue.cpp', ),
1116
dependencies: gtest_dep,
1217
include_directories : [src, lib, utils],
18+
cpp_args : [openmpflags],
1319
)
1420

1521
libtests += static_library('tests_objsort',
1622
files('test-objqsort.cpp', ),
1723
dependencies: gtest_dep,
1824
include_directories : [src, lib, utils],
25+
cpp_args : [openmpflags],
1926
)

tests/test-keyvalue.cpp

+10-5
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,13 @@ class simdkvsort : public ::testing::Test {
1515
simdkvsort()
1616
{
1717
std::iota(arrsize.begin(), arrsize.end(), 1);
18-
arrsize.push_back(10'000);
19-
arrsize.push_back(100'000);
20-
arrsize.push_back(1'000'000);
18+
std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
19+
#ifdef XSS_USE_OPENMP
20+
// These extended tests are only needed for the OpenMP logic
21+
arrsize_long.push_back(10'000);
22+
arrsize_long.push_back(100'000);
23+
arrsize_long.push_back(1'000'000);
24+
#endif
2125

2226
arrtype = {"random",
2327
"constant",
@@ -32,6 +36,7 @@ class simdkvsort : public ::testing::Test {
3236
}
3337
std::vector<std::string> arrtype;
3438
std::vector<size_t> arrsize = std::vector<size_t>(1024);
39+
std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
3540
};
3641

3742
TYPED_TEST_SUITE_P(simdkvsort);
@@ -168,7 +173,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_ascending)
168173
using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
169174
for (auto type : this->arrtype) {
170175
bool hasnan = is_nan_test(type);
171-
for (auto size : this->arrsize) {
176+
for (auto size : this->arrsize_long) {
172177
std::vector<T1> key = get_array<T1>(type, size);
173178
std::vector<T2> val = get_array<T2>(type, size);
174179
std::vector<T1> key_bckp = key;
@@ -199,7 +204,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_descending)
199204
using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
200205
for (auto type : this->arrtype) {
201206
bool hasnan = is_nan_test(type);
202-
for (auto size : this->arrsize) {
207+
for (auto size : this->arrsize_long) {
203208
std::vector<T1> key = get_array<T1>(type, size);
204209
std::vector<T2> val = get_array<T2>(type, size);
205210
std::vector<T1> key_bckp = key;

tests/test-qsort.cpp

+11-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ class simdsort : public ::testing::Test {
1111
simdsort()
1212
{
1313
std::iota(arrsize.begin(), arrsize.end(), 1);
14+
std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
15+
#ifdef XSS_USE_OPENMP
16+
// These extended tests are only needed for the OpenMP logic
17+
arrsize_long.push_back(10'000);
18+
arrsize_long.push_back(100'000);
19+
arrsize_long.push_back(1'000'000);
20+
#endif
21+
1422
arrtype = {"random",
1523
"constant",
1624
"sorted",
@@ -24,6 +32,7 @@ class simdsort : public ::testing::Test {
2432
}
2533
std::vector<std::string> arrtype;
2634
std::vector<size_t> arrsize = std::vector<size_t>(1024);
35+
std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
2736
};
2837

2938
TYPED_TEST_SUITE_P(simdsort);
@@ -32,7 +41,7 @@ TYPED_TEST_P(simdsort, test_qsort_ascending)
3241
{
3342
for (auto type : this->arrtype) {
3443
bool hasnan = is_nan_test(type);
35-
for (auto size : this->arrsize) {
44+
for (auto size : this->arrsize_long) {
3645
std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);
3746

3847
// Ascending order
@@ -54,7 +63,7 @@ TYPED_TEST_P(simdsort, test_qsort_descending)
5463
{
5564
for (auto type : this->arrtype) {
5665
bool hasnan = is_nan_test(type);
57-
for (auto size : this->arrsize) {
66+
for (auto size : this->arrsize_long) {
5867
std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);
5968

6069
// Descending order

0 commit comments

Comments
 (0)