
Commit dbc0ed2

Committed Nov 20, 2022
Unify stable and unstable sort implementations in same core module
This moves the stable sort implementation to the `core::slice::sort` module. By virtue of being in core it can't access `Vec`. The two `Vec`s used by merge sort, `buf` and `runs`, are modelled as custom types that implement the very limited required `Vec` interface with the help of provided allocation and free functions. This is done to allow future re-use of functions and logic between the stable and unstable sort implementations, such as `insert_head`.
1 parent 736c675 commit dbc0ed2
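For orientation before reading the diff, here is a condensed, self-contained sketch of the new calling convention, distilled from the `stable_sort` glue in the library/alloc/src/slice.rs changes below. It is an illustration rather than code from the commit: it assumes a nightly toolchain that already contains this change, since `core::slice::sort` is only exposed under the internal `slice_internals` feature, and the `main` function is only a demo.

#![feature(slice_internals)] // internal-only feature; illustration, not a public API

use core::slice::sort;
use std::alloc::{self, Layout};

// The caller owns all heap handling and hands it to core as plain closures.
fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
where
    F: FnMut(&T, &T) -> bool,
{
    if core::mem::size_of::<T>() == 0 {
        // Sorting has no meaningful behavior on zero-sized types.
        return;
    }

    // Scratch buffer used while merging; `merge_sort` asks for at most `v.len() / 2` elements.
    let elem_alloc_fn = |len: usize| -> *mut T {
        unsafe { alloc::alloc(Layout::array::<T>(len).unwrap()) as *mut T }
    };
    let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| unsafe {
        alloc::dealloc(buf_ptr as *mut u8, Layout::array::<T>(len).unwrap());
    };

    // Storage for the stack of pending runs; `sort::TimSortRun` is the type added by this commit.
    let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
        unsafe { alloc::alloc(Layout::array::<sort::TimSortRun>(len).unwrap()) as *mut sort::TimSortRun }
    };
    let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| unsafe {
        alloc::dealloc(buf_ptr as *mut u8, Layout::array::<sort::TimSortRun>(len).unwrap());
    };

    sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
}

fn main() {
    let mut data = vec![5, 3, 1, 4, 2];
    stable_sort(&mut data, |a, b| a < b);
    assert_eq!(data, [1, 2, 3, 4, 5]);
}

The point of this shape is that `sort::merge_sort` itself never allocates; every allocation and free goes through closures supplied by the caller, which is what allows the same TimSort logic to live in `core` and be shared with the unstable sort.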

3 files changed: 540 additions, 310 deletions


library/alloc/src/slice.rs

Lines changed: 39 additions & 309 deletions
@@ -19,10 +19,12 @@ use core::cmp::Ordering::{self, Less};
 use core::mem::{self, SizedTypeProperties};
 #[cfg(not(no_global_oom_handling))]
 use core::ptr;
+#[cfg(not(no_global_oom_handling))]
+use core::slice::sort;

 use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
-use crate::alloc::Global;
+use crate::alloc::{self, Global};
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::ToOwned;
 use crate::boxed::Box;
@@ -203,7 +205,7 @@ impl<T> [T] {
     where
         T: Ord,
     {
-        merge_sort(self, T::lt);
+        stable_sort(self, T::lt);
     }

     /// Sorts the slice with a comparator function.
@@ -259,7 +261,7 @@ impl<T> [T] {
     where
         F: FnMut(&T, &T) -> Ordering,
    {
-        merge_sort(self, |a, b| compare(a, b) == Less);
+        stable_sort(self, |a, b| compare(a, b) == Less);
     }

     /// Sorts the slice with a key extraction function.
@@ -302,7 +304,7 @@ impl<T> [T] {
         F: FnMut(&T) -> K,
         K: Ord,
     {
-        merge_sort(self, |a, b| f(a).lt(&f(b)));
+        stable_sort(self, |a, b| f(a).lt(&f(b)));
     }

     /// Sorts the slice with a key extraction function.
@@ -809,324 +811,52 @@ impl<T: Clone> ToOwned for [T] {
 // Sorting
 ////////////////////////////////////////////////////////////////////////////////

-/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
-///
-/// This is the integral subroutine of insertion sort.
-#[cfg(not(no_global_oom_handling))]
-fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    if v.len() >= 2 && is_less(&v[1], &v[0]) {
-        unsafe {
-            // There are three ways to implement insertion here:
-            //
-            // 1. Swap adjacent elements until the first one gets to its final destination.
-            //    However, this way we copy data around more than is necessary. If elements are big
-            //    structures (costly to copy), this method will be slow.
-            //
-            // 2. Iterate until the right place for the first element is found. Then shift the
-            //    elements succeeding it to make room for it and finally place it into the
-            //    remaining hole. This is a good method.
-            //
-            // 3. Copy the first element into a temporary variable. Iterate until the right place
-            //    for it is found. As we go along, copy every traversed element into the slot
-            //    preceding it. Finally, copy data from the temporary variable into the remaining
-            //    hole. This method is very good. Benchmarks demonstrated slightly better
-            //    performance than with the 2nd method.
-            //
-            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
-            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
-
-            // Intermediate state of the insertion process is always tracked by `hole`, which
-            // serves two purposes:
-            // 1. Protects integrity of `v` from panics in `is_less`.
-            // 2. Fills the remaining hole in `v` in the end.
-            //
-            // Panic safety:
-            //
-            // If `is_less` panics at any point during the process, `hole` will get dropped and
-            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
-            // initially held exactly once.
-            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
-            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
-
-            for i in 2..v.len() {
-                if !is_less(&v[i], &*tmp) {
-                    break;
-                }
-                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
-                hole.dest = &mut v[i];
-            }
-            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
-        }
-    }
-
-    // When dropped, copies from `src` into `dest`.
-    struct InsertionHole<T> {
-        src: *const T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for InsertionHole<T> {
-        fn drop(&mut self) {
-            unsafe {
-                ptr::copy_nonoverlapping(self.src, self.dest, 1);
-            }
-        }
-    }
-}
-
-/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
-/// stores the result into `v[..]`.
-///
-/// # Safety
-///
-/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
-/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-#[cfg(not(no_global_oom_handling))]
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    let len = v.len();
-    let v = v.as_mut_ptr();
-    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
-
-    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
-    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
-    // copying the lesser (or greater) one into `v`.
-    //
-    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
-    // consumed first, then we must copy whatever is left of the shorter run into the remaining
-    // hole in `v`.
-    //
-    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
-    // 1. Protects integrity of `v` from panics in `is_less`.
-    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
-    //
-    // Panic safety:
-    //
-    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
-    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
-    // object it initially held exactly once.
-    let mut hole;
-
-    if mid <= len - mid {
-        // The left run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v, buf, mid);
-            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
-        }
-
-        // Initially, these pointers point to the beginnings of their arrays.
-        let left = &mut hole.start;
-        let mut right = v_mid;
-        let out = &mut hole.dest;
-
-        while *left < hole.end && right < v_end {
-            // Consume the lesser side.
-            // If equal, prefer the left run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right, &**left) {
-                    get_and_increment(&mut right)
-                } else {
-                    get_and_increment(left)
-                };
-                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
-            }
-        }
-    } else {
-        // The right run is shorter.
-        unsafe {
-            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
-            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
-        }
-
-        // Initially, these pointers point past the ends of their arrays.
-        let left = &mut hole.dest;
-        let right = &mut hole.end;
-        let mut out = v_end;
-
-        while v < *left && buf < *right {
-            // Consume the greater side.
-            // If equal, prefer the right run to maintain stability.
-            unsafe {
-                let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
-                    decrement_and_get(left)
-                } else {
-                    decrement_and_get(right)
-                };
-                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
-            }
-        }
-    }
-    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
-    // it will now be copied into the hole in `v`.
-
-    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
-        let old = *ptr;
-        *ptr = unsafe { ptr.add(1) };
-        old
-    }
-
-    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
-        *ptr = unsafe { ptr.sub(1) };
-        *ptr
-    }
-
-    // When dropped, copies the range `start..end` into `dest..`.
-    struct MergeHole<T> {
-        start: *mut T,
-        end: *mut T,
-        dest: *mut T,
-    }
-
-    impl<T> Drop for MergeHole<T> {
-        fn drop(&mut self) {
-            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
-            unsafe {
-                let len = self.end.sub_ptr(self.start);
-                ptr::copy_nonoverlapping(self.start, self.dest, len);
-            }
-        }
-    }
-}
-
-/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
-/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
-///
-/// The algorithm identifies strictly descending and non-descending subsequences, which are called
-/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
-/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
-/// satisfied:
-///
-/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
-/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
-///
-/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+#[inline]
 #[cfg(not(no_global_oom_handling))]
-fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+fn stable_sort<T, F>(v: &mut [T], mut is_less: F)
 where
     F: FnMut(&T, &T) -> bool,
 {
-    // Slices of up to this length get sorted using insertion sort.
-    const MAX_INSERTION: usize = 20;
-    // Very short runs are extended using insertion sort to span at least this many elements.
-    const MIN_RUN: usize = 10;
-
-    // Sorting has no meaningful behavior on zero-sized types.
     if T::IS_ZST {
+        // Sorting has no meaningful behavior on zero-sized types. Do nothing.
         return;
     }

-    let len = v.len();
-
-    // Short arrays get sorted in-place via insertion sort to avoid allocations.
-    if len <= MAX_INSERTION {
-        if len >= 2 {
-            for i in (0..len - 1).rev() {
-                insert_head(&mut v[i..], &mut is_less);
-            }
-        }
-        return;
-    }
-
-    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
-    // shallow copies of the contents of `v` without risking the dtors running on copies if
-    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
-    // which will always have length at most `len / 2`.
-    let mut buf = Vec::with_capacity(len / 2);
+    let elem_alloc_fn = |len: usize| -> *mut T {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). Alloc in general will only be used as 'shadow-region' to store temporary swap
+        // elements.
+        unsafe { alloc::alloc(alloc::Layout::array::<T>(len).unwrap_unchecked()) as *mut T }
+    };

-    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
-    // strange decision, but consider the fact that merges more often go in the opposite direction
-    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
-    // backwards. To conclude, identifying runs by traversing backwards improves performance.
-    let mut runs = vec![];
-    let mut end = len;
-    while end > 0 {
-        // Find the next natural run, and reverse it if it's strictly descending.
-        let mut start = end - 1;
-        if start > 0 {
-            start -= 1;
-            unsafe {
-                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
-                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
-                        start -= 1;
-                    }
-                    v[start..end].reverse();
-                } else {
-                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
-                    {
-                        start -= 1;
-                    }
-                }
-            }
-        }
-
-        // Insert some more elements into the run if it's too short. Insertion sort is faster than
-        // merge sort on short sequences, so this significantly improves performance.
-        while start > 0 && end - start < MIN_RUN {
-            start -= 1;
-            insert_head(&mut v[start..end], &mut is_less);
+    let elem_dealloc_fn = |buf_ptr: *mut T, len: usize| {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with len >
+        // v.len(). The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(buf_ptr as *mut u8, alloc::Layout::array::<T>(len).unwrap_unchecked());
         }
+    };

-        // Push this run onto the stack.
-        runs.push(Run { start, len: end - start });
-        end = start;
-
-        // Merge some pairs of adjacent runs to satisfy the invariants.
-        while let Some(r) = collapse(&runs) {
-            let left = runs[r + 1];
-            let right = runs[r];
-            unsafe {
-                merge(
-                    &mut v[left.start..right.start + right.len],
-                    left.len,
-                    buf.as_mut_ptr(),
-                    &mut is_less,
-                );
-            }
-            runs[r] = Run { start: left.start, len: left.len + right.len };
-            runs.remove(r + 1);
+    let run_alloc_fn = |len: usize| -> *mut sort::TimSortRun {
+        // SAFETY: Creating the layout is safe as long as merge_sort never calls this with an
+        // obscene length or 0.
+        unsafe {
+            alloc::alloc(alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked())
+                as *mut sort::TimSortRun
         }
-    }
-
-    // Finally, exactly one run must remain in the stack.
-    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+    };

-    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
-    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
-    // algorithm should continue building a new run instead, `None` is returned.
-    //
-    // TimSort is infamous for its buggy implementations, as described here:
-    // http://envisage-project.eu/timsort-specification-and-verification/
-    //
-    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
-    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
-    // hold for *all* runs in the stack.
-    //
-    // This function correctly checks invariants for the top four runs. Additionally, if the top
-    // run starts at index 0, it will always demand a merge operation until the stack is fully
-    // collapsed, in order to complete the sort.
-    #[inline]
-    fn collapse(runs: &[Run]) -> Option<usize> {
-        let n = runs.len();
-        if n >= 2
-            && (runs[n - 1].start == 0
-                || runs[n - 2].len <= runs[n - 1].len
-                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
-                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
-        {
-            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
-        } else {
-            None
+    let run_dealloc_fn = |buf_ptr: *mut sort::TimSortRun, len: usize| {
+        // SAFETY: The caller must ensure that buf_ptr was created by elem_alloc_fn with the same
+        // len.
+        unsafe {
+            alloc::dealloc(
+                buf_ptr as *mut u8,
+                alloc::Layout::array::<sort::TimSortRun>(len).unwrap_unchecked(),
+            );
         }
-    }
+    };

-    #[derive(Clone, Copy)]
-    struct Run {
-        start: usize,
-        len: usize,
-    }
+    sort::merge_sort(v, &mut is_less, elem_alloc_fn, elem_dealloc_fn, run_alloc_fn, run_dealloc_fn);
 }

library/core/src/slice/mod.rs

Lines changed: 7 additions & 1 deletion
@@ -28,13 +28,19 @@ use crate::slice;
 /// Pure rust memchr implementation, taken from rust-memchr
 pub mod memchr;

+#[unstable(
+    feature = "slice_internals",
+    issue = "none",
+    reason = "exposed from core to be reused in std;"
+)]
+pub mod sort;
+
 mod ascii;
 mod cmp;
 mod index;
 mod iter;
 mod raw;
 mod rotate;
-mod sort;
 mod specialize;

 #[stable(feature = "rust1", since = "1.0.0")]

library/core/src/slice/sort.rs

Lines changed: 494 additions & 0 deletions
@@ -5,6 +5,11 @@
 //!
 //! Unstable sorting is compatible with libcore because it doesn't allocate memory, unlike our
 //! stable sorting implementation.
+//!
+//! In addition it also contains the core logic of the stable sort used by `slice::sort` based on
+//! TimSort.
+
+#![allow(unused)] // FIXME debug

 use crate::cmp;
 use crate::mem::{self, MaybeUninit, SizedTypeProperties};
@@ -883,6 +888,7 @@ fn partition_at_index_loop<'a, T, F>(
     }
 }

+/// Reorder the slice such that the element at `index` is at its final sorted position.
 pub fn partition_at_index<T, F>(
     v: &mut [T],
     index: usize,
@@ -927,3 +933,491 @@ where
     let pivot = &mut pivot[0];
     (left, pivot, right)
 }
+
+/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
+///
+/// This is the integral subroutine of insertion sort.
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    if v.len() >= 2 && is_less(&v[1], &v[0]) {
+        unsafe {
+            // There are three ways to implement insertion here:
+            //
+            // 1. Swap adjacent elements until the first one gets to its final destination.
+            //    However, this way we copy data around more than is necessary. If elements are big
+            //    structures (costly to copy), this method will be slow.
+            //
+            // 2. Iterate until the right place for the first element is found. Then shift the
+            //    elements succeeding it to make room for it and finally place it into the
+            //    remaining hole. This is a good method.
+            //
+            // 3. Copy the first element into a temporary variable. Iterate until the right place
+            //    for it is found. As we go along, copy every traversed element into the slot
+            //    preceding it. Finally, copy data from the temporary variable into the remaining
+            //    hole. This method is very good. Benchmarks demonstrated slightly better
+            //    performance than with the 2nd method.
+            //
+            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
+            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
+
+            // Intermediate state of the insertion process is always tracked by `hole`, which
+            // serves two purposes:
+            // 1. Protects integrity of `v` from panics in `is_less`.
+            // 2. Fills the remaining hole in `v` in the end.
+            //
+            // Panic safety:
+            //
+            // If `is_less` panics at any point during the process, `hole` will get dropped and
+            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
+            // initially held exactly once.
+            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
+            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
+
+            for i in 2..v.len() {
+                if !is_less(&v[i], &*tmp) {
+                    break;
+                }
+                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
+                hole.dest = &mut v[i];
+            }
+            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
+        }
+    }
+
+    // When dropped, copies from `src` into `dest`.
+    struct InsertionHole<T> {
+        src: *const T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for InsertionHole<T> {
+        fn drop(&mut self) {
+            unsafe {
+                ptr::copy_nonoverlapping(self.src, self.dest, 1);
+            }
+        }
+    }
+}
+
+/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
+/// stores the result into `v[..]`.
+///
+/// # Safety
+///
+/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
+/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+    let v = v.as_mut_ptr();
+    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
+
+    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
+    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
+    // copying the lesser (or greater) one into `v`.
+    //
+    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
+    // consumed first, then we must copy whatever is left of the shorter run into the remaining
+    // hole in `v`.
+    //
+    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
+    // 1. Protects integrity of `v` from panics in `is_less`.
+    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
+    //
+    // Panic safety:
+    //
+    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
+    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
+    // object it initially held exactly once.
+    let mut hole;
+
+    if mid <= len - mid {
+        // The left run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v, buf, mid);
+            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
+        }
+
+        // Initially, these pointers point to the beginnings of their arrays.
+        let left = &mut hole.start;
+        let mut right = v_mid;
+        let out = &mut hole.dest;
+
+        while *left < hole.end && right < v_end {
+            // Consume the lesser side.
+            // If equal, prefer the left run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right, &**left) {
+                    get_and_increment(&mut right)
+                } else {
+                    get_and_increment(left)
+                };
+                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
+            }
+        }
+    } else {
+        // The right run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
+            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
+        }
+
+        // Initially, these pointers point past the ends of their arrays.
+        let left = &mut hole.dest;
+        let right = &mut hole.end;
+        let mut out = v_end;
+
+        while v < *left && buf < *right {
+            // Consume the greater side.
+            // If equal, prefer the right run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right.sub(1), &*left.sub(1)) {
+                    decrement_and_get(left)
+                } else {
+                    decrement_and_get(right)
+                };
+                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
+            }
+        }
+    }
+    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
+    // it will now be copied into the hole in `v`.
+
+    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
+        let old = *ptr;
+        *ptr = unsafe { ptr.add(1) };
+        old
+    }
+
+    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
+        *ptr = unsafe { ptr.sub(1) };
+        *ptr
+    }
+
+    // When dropped, copies the range `start..end` into `dest..`.
+    struct MergeHole<T> {
+        start: *mut T,
+        end: *mut T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for MergeHole<T> {
+        fn drop(&mut self) {
+            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
+            unsafe {
+                let len = self.end.sub_ptr(self.start);
+                ptr::copy_nonoverlapping(self.start, self.dest, len);
+            }
+        }
+    }
+}
+
+/// This merge sort borrows some (but not all) ideas from TimSort, which used to be described in
+/// detail [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). However Python
+/// has switched to a Powersort based implementation.
+///
+/// The algorithm identifies strictly descending and non-descending subsequences, which are called
+/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
+/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
+/// satisfied:
+///
+/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
+/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
+///
+/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+pub fn merge_sort<T, CmpF, ElemAllocF, ElemDeallocF, RunAllocF, RunDeallocF>(
+    v: &mut [T],
+    is_less: &mut CmpF,
+    elem_alloc_fn: ElemAllocF,
+    elem_dealloc_fn: ElemDeallocF,
+    run_alloc_fn: RunAllocF,
+    run_dealloc_fn: RunDeallocF,
+) where
+    CmpF: FnMut(&T, &T) -> bool,
+    ElemAllocF: Fn(usize) -> *mut T,
+    ElemDeallocF: Fn(*mut T, usize),
+    RunAllocF: Fn(usize) -> *mut TimSortRun,
+    RunDeallocF: Fn(*mut TimSortRun, usize),
+{
+    // Slices of up to this length get sorted using insertion sort.
+    const MAX_INSERTION: usize = 20;
+    // Very short runs are extended using insertion sort to span at least this many elements.
+    const MIN_RUN: usize = 10;
+
+    // The caller should have already checked that.
+    debug_assert!(!T::IS_ZST);
+
+    let len = v.len();
+
+    // Short arrays get sorted in-place via insertion sort to avoid allocations.
+    if len <= MAX_INSERTION {
+        if len >= 2 {
+            for i in (0..len - 1).rev() {
+                insert_head(&mut v[i..], is_less);
+            }
+        }
+        return;
+    }
+
+    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
+    // shallow copies of the contents of `v` without risking the dtors running on copies if
+    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+    // which will always have length at most `len / 2`.
+    let mut buf = BufGuard::new(len / 2, elem_alloc_fn, elem_dealloc_fn);
+    let buf_ptr = buf.buf_ptr;
+
+    let mut runs = RunVec::new(run_alloc_fn, run_dealloc_fn);
+
+    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
+    // strange decision, but consider the fact that merges more often go in the opposite direction
+    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
+    // backwards. To conclude, identifying runs by traversing backwards improves performance.
+    let mut end = len;
+    while end > 0 {
+        // Find the next natural run, and reverse it if it's strictly descending.
+        let mut start = end - 1;
+        if start > 0 {
+            start -= 1;
+            unsafe {
+                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
+                    v[start..end].reverse();
+                } else {
+                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
+                    {
+                        start -= 1;
+                    }
+                }
+            }
+        }
+
+        // Insert some more elements into the run if it's too short. Insertion sort is faster than
+        // merge sort on short sequences, so this significantly improves performance.
+        while start > 0 && end - start < MIN_RUN {
+            start -= 1;
+            insert_head(&mut v[start..end], is_less);
+        }
+
+        // Push this run onto the stack.
+        runs.push(TimSortRun { start, len: end - start });
+        end = start;
+
+        // Merge some pairs of adjacent runs to satisfy the invariants.
+        while let Some(r) = collapse(runs.as_slice()) {
+            let left = runs[r + 1];
+            let right = runs[r];
+            unsafe {
+                merge(&mut v[left.start..right.start + right.len], left.len, buf_ptr, is_less);
+            }
+            runs[r] = TimSortRun { start: left.start, len: left.len + right.len };
+            runs.remove(r + 1);
+        }
+    }
+
+    // Finally, exactly one run must remain in the stack.
+    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+
+    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
+    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
+    // algorithm should continue building a new run instead, `None` is returned.
+    //
+    // TimSort is infamous for its buggy implementations, as described here:
+    // http://envisage-project.eu/timsort-specification-and-verification/
+    //
+    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
+    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
+    // hold for *all* runs in the stack.
+    //
+    // This function correctly checks invariants for the top four runs. Additionally, if the top
+    // run starts at index 0, it will always demand a merge operation until the stack is fully
+    // collapsed, in order to complete the sort.
+    #[inline]
+    fn collapse(runs: &[TimSortRun]) -> Option<usize> {
+        let n = runs.len();
+        if n >= 2
+            && (runs[n - 1].start == 0
+                || runs[n - 2].len <= runs[n - 1].len
+                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
+                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
+        {
+            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
+        } else {
+            None
+        }
+    }
+
+    // Extremely basic versions of Vec.
+    // Their use is super limited and by having the code here, it allows reuse between the sort
+    // implementations.
+    struct BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        buf_ptr: *mut T,
+        capacity: usize,
+        elem_alloc_fn: ElemAllocF,
+        elem_dealloc_fn: ElemDeallocF,
+    }
+
+    impl<T, ElemAllocF, ElemDeallocF> BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        fn new(len: usize, elem_alloc_fn: ElemAllocF, elem_dealloc_fn: ElemDeallocF) -> Self {
+            Self { buf_ptr: elem_alloc_fn(len), capacity: len, elem_alloc_fn, elem_dealloc_fn }
+        }
+    }
+
+    impl<T, ElemAllocF, ElemDeallocF> Drop for BufGuard<T, ElemAllocF, ElemDeallocF>
+    where
+        ElemAllocF: Fn(usize) -> *mut T,
+        ElemDeallocF: Fn(*mut T, usize),
+    {
+        fn drop(&mut self) {
+            (self.elem_dealloc_fn)(self.buf_ptr, self.capacity);
+        }
+    }
+
+    struct RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        buf_ptr: *mut TimSortRun,
+        capacity: usize,
+        len: usize,
+        run_alloc_fn: RunAllocF,
+        run_dealloc_fn: RunDeallocF,
+    }
+
+    impl<RunAllocF, RunDeallocF> RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn new(run_alloc_fn: RunAllocF, run_dealloc_fn: RunDeallocF) -> Self {
+            // Most slices can be sorted with at most 16 runs in-flight.
+            const START_RUN_CAPACITY: usize = 16;
+
+            Self {
+                buf_ptr: run_alloc_fn(START_RUN_CAPACITY),
+                capacity: START_RUN_CAPACITY,
+                len: 0,
+                run_alloc_fn,
+                run_dealloc_fn,
+            }
+        }
+
+        fn push(&mut self, val: TimSortRun) {
+            if self.len == self.capacity {
+                let old_capacity = self.capacity;
+                let old_buf_ptr = self.buf_ptr;
+
+                self.capacity = self.capacity * 2;
+                self.buf_ptr = (self.run_alloc_fn)(self.capacity);
+
+                // SAFETY: buf_ptr new and old were correctly allocated and old_buf_ptr has
+                // old_capacity valid elements.
+                unsafe {
+                    ptr::copy_nonoverlapping(old_buf_ptr, self.buf_ptr, old_capacity);
+                }
+
+                (self.run_dealloc_fn)(old_buf_ptr, old_capacity);
+            }
+
+            // SAFETY: The invariant was just checked.
+            unsafe {
+                self.buf_ptr.add(self.len).write(val);
+            }
+            self.len += 1;
+        }
+
+        fn remove(&mut self, index: usize) {
+            if index >= self.len {
+                panic!("Index out of bounds");
+            }
+
+            // SAFETY: buf_ptr needs to be valid and len invariant upheld.
+            unsafe {
+                // the place we are taking from.
+                let ptr = self.buf_ptr.add(index);
+
+                // Shift everything down to fill in that spot.
+                ptr::copy(ptr.add(1), ptr, self.len - index - 1);
+            }
+            self.len -= 1;
+        }
+
+        fn as_slice(&self) -> &[TimSortRun] {
+            // SAFETY: Safe as long as buf_ptr is valid and len invariant was upheld.
+            unsafe { &*ptr::slice_from_raw_parts(self.buf_ptr, self.len) }
+        }
+
+        fn len(&self) -> usize {
+            self.len
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> core::ops::Index<usize> for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        type Output = TimSortRun;
+
+        fn index(&self, index: usize) -> &Self::Output {
+            if index < self.len {
+                // SAFETY: buf_ptr and len invariant must be upheld.
+                unsafe {
+                    return &*(self.buf_ptr.add(index));
+                }
+            }
+
+            panic!("Index out of bounds");
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> core::ops::IndexMut<usize> for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+            if index < self.len {
+                // SAFETY: buf_ptr and len invariant must be upheld.
+                unsafe {
+                    return &mut *(self.buf_ptr.add(index));
+                }
+            }
+
+            panic!("Index out of bounds");
+        }
+    }
+
+    impl<RunAllocF, RunDeallocF> Drop for RunVec<RunAllocF, RunDeallocF>
+    where
+        RunAllocF: Fn(usize) -> *mut TimSortRun,
+        RunDeallocF: Fn(*mut TimSortRun, usize),
+    {
+        fn drop(&mut self) {
+            // As long as TimSortRun is Copy we don't need to drop them individually but just the
+            // whole allocation.
+            (self.run_dealloc_fn)(self.buf_ptr, self.capacity);
+        }
+    }
+}
+
+/// Internal type used by merge_sort.
+#[derive(Clone, Copy, Debug)]
+pub struct TimSortRun {
+    len: usize,
+    start: usize,
+}
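Reader's note, not part of the commit: the merge routine above deliberately prefers the left run on equal elements so that the sort stays stable, and that guarantee is observable through `slice::sort`, `sort_by`, and `sort_by_key`, which now all lower to `sort::merge_sort`. A small illustrative check:

fn main() {
    // Two entries share the key 1; a stable sort must keep "1a" ahead of "1b".
    let mut v = vec![(2, "2a"), (1, "1a"), (1, "1b"), (0, "0a")];
    // `sort_by_key` is one of the stable sorting entry points rewired by this commit.
    v.sort_by_key(|&(k, _)| k);
    assert_eq!(v, [(0, "0a"), (1, "1a"), (1, "1b"), (2, "2a")]);
}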
