@@ -521,8 +521,11 @@ template <typename vtype, int maxN>
521
521
void sort_n (typename vtype::type_t *arr, int N);
522
522
523
523
template <typename vtype, typename comparator, typename type_t >
524
- static void
525
- qsort_ (type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
524
+ static void qsort_ (type_t *arr,
525
+ arrsize_t left,
526
+ arrsize_t right,
527
+ arrsize_t max_iters,
528
+ arrsize_t task_threshold)
526
529
{
527
530
/*
528
531
* Resort to std::sort if quicksort isnt making any progress
@@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
559
562
type_t leftmostValue = comparator::leftmost (smallest, biggest);
560
563
type_t rightmostValue = comparator::rightmost (smallest, biggest);
561
564
565
+ #ifdef XSS_COMPILE_OPENMP
566
+ if (pivot != leftmostValue) {
567
+ bool parallel_left = (pivot_index - left) > task_threshold;
568
+ if (parallel_left) {
569
+ #pragma omp task
570
+ qsort_<vtype, comparator>(
571
+ arr, left, pivot_index - 1 , max_iters - 1 , task_threshold);
572
+ }
573
+ else {
574
+ qsort_<vtype, comparator>(
575
+ arr, left, pivot_index - 1 , max_iters - 1 , task_threshold);
576
+ }
577
+ }
578
+ if (pivot != rightmostValue) {
579
+ bool parallel_right = (right - pivot_index) > task_threshold;
580
+
581
+ if (parallel_right) {
582
+ #pragma omp task
583
+ qsort_<vtype, comparator>(
584
+ arr, pivot_index, right, max_iters - 1 , task_threshold);
585
+ }
586
+ else {
587
+ qsort_<vtype, comparator>(
588
+ arr, pivot_index, right, max_iters - 1 , task_threshold);
589
+ }
590
+ }
591
+ #else
592
+ UNUSED (task_threshold);
593
+
562
594
if (pivot != leftmostValue)
563
- qsort_<vtype, comparator>(arr, left, pivot_index - 1 , max_iters - 1 );
595
+ qsort_<vtype, comparator>(arr, left, pivot_index - 1 , max_iters - 1 , 0 );
564
596
if (pivot != rightmostValue)
565
- qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1 );
597
+ qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1 , 0 );
598
+ #endif
566
599
}
567
600
568
601
template <typename vtype, typename comparator, typename type_t >
@@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
627
660
}
628
661
629
662
UNUSED (hasnan);
663
+
664
+ #ifdef XSS_COMPILE_OPENMP
665
+
666
+ bool use_parallel = arrsize > 100000 ;
667
+
668
+ if (use_parallel) {
669
+ // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
670
+ constexpr int thread_limit = 8 ;
671
+ int thread_count = std::min (thread_limit, omp_get_max_threads ());
672
+ arrsize_t task_threshold
673
+ = std::max ((arrsize_t )100000 , arrsize / 100 );
674
+
675
+ // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
676
+ // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
677
+ // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
678
+ #pragma omp parallel num_threads(thread_count)
679
+ #pragma omp single
680
+ qsort_<vtype, comparator, T>(arr,
681
+ 0 ,
682
+ arrsize - 1 ,
683
+ 2 * (arrsize_t )log2 (arrsize),
684
+ task_threshold);
685
+ }
686
+ else {
687
+ qsort_<vtype, comparator, T>(arr,
688
+ 0 ,
689
+ arrsize - 1 ,
690
+ 2 * (arrsize_t )log2 (arrsize),
691
+ std::numeric_limits<arrsize_t >::max ());
692
+ }
693
+ #pragma omp taskwait
694
+ #else
630
695
qsort_<vtype, comparator, T>(
631
- arr, 0 , arrsize - 1 , 2 * (arrsize_t )log2 (arrsize));
696
+ arr, 0 , arrsize - 1 , 2 * (arrsize_t )log2 (arrsize), 0 );
697
+ #endif
632
698
633
699
replace_inf_with_nan (arr, arrsize, nan_count, descending);
634
700
}
0 commit comments