@@ -41,6 +41,8 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
41
41
42
42
cfg_if ! {
43
43
if #[ cfg( any( feature = "optimize_for_size" , target_pointer_width = "16" ) ) ] {
44
+ // Unlike driftsort, mergesort only requires len / 2 elements of scratch space,
45
+ // not len - len / 2.
44
46
let alloc_len = len / 2 ;
45
47
46
48
cfg_if! {
@@ -91,16 +93,26 @@ fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], i
91
93
// By allocating n elements of memory we can ensure the entire input can
92
94
// be sorted using stable quicksort, which allows better performance on
93
95
// random and low-cardinality distributions. However, we still want to
94
- // reduce our memory usage to n / 2 for large inputs. We do this by scaling
95
- // our allocation as max(n / 2, min(n, 8MB)), ensuring we scale like n for
96
- // small inputs and n / 2 for large inputs, without a sudden drop off. We
97
- // also need to ensure our alloc >= MIN_SMALL_SORT_SCRATCH_LEN , as the
96
+ // reduce our memory usage to n - n / 2 for large inputs. We do this by scaling
97
+ // our allocation as max(n - n / 2, min(n, 8MB)), ensuring we scale like n for
98
+ // small inputs and n - n / 2 for large inputs, without a sudden drop off. We
99
+ // also need to ensure our alloc >= SMALL_SORT_GENERAL_SCRATCH_LEN , as the
98
100
// small-sort always needs this much memory.
101
+ //
102
+ // driftsort will produce unsorted runs of up to min_good_run_len elements, which
103
+ // is at most len - len / 2.
104
+ // Unsorted runs need to be processed by quicksort, which requires as much
105
+ // scratch space as the run length, therefore the scratch space must be at
106
+ // least len - len / 2.
107
+ // If min_good_run_len is ever modified, this code must be updated to allocate
108
+ // the correct scratch size for it.
99
109
const MAX_FULL_ALLOC_BYTES : usize = 8_000_000 ; // 8MB
100
110
let max_full_alloc = MAX_FULL_ALLOC_BYTES / mem:: size_of :: < T > ( ) ;
101
111
let len = v. len ( ) ;
102
- let alloc_len =
103
- cmp:: max ( cmp:: max ( len / 2 , cmp:: min ( len, max_full_alloc) ) , SMALL_SORT_GENERAL_SCRATCH_LEN ) ;
112
+ let alloc_len = cmp:: max (
113
+ cmp:: max ( len - len / 2 , cmp:: min ( len, max_full_alloc) ) ,
114
+ SMALL_SORT_GENERAL_SCRATCH_LEN ,
115
+ ) ;
104
116
105
117
// For small inputs 4KiB of stack storage suffices, which allows us to avoid
106
118
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
0 commit comments