Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 0fe8f34

Browse files
committed Apr 10, 2025 ·
Auto merge of #137412 - scottmcm:redo-swap, r=cuviper
Ensure `swap_nonoverlapping` is really always untyped This replaces #134954, which was arguably overcomplicated. ## Fixes #134713 Actually using the type passed to `ptr::swap_nonoverlapping` for anything other than its size + align turns out to not work, so this goes back to always erasing the types down to just bytes. (Except in `const`, which keeps doing the same thing as before to preserve `@RalfJung's` fix from #134689) ## Fixes #134946 I'd previously moved the swapping to use auto-vectorization *on bytes*, but someone pointed out on Discord that the tail loop handling from that left a whole bunch of byte-by-byte swapping around. This goes back to manual tail handling to avoid that, then still triggers auto-vectorization on pointer-width values. (So you'll see `<4 x i64>` on `x86-64-v3` for example.)
2 parents 2205455 + 63dcac8 commit 0fe8f34

File tree

8 files changed

+303
-96
lines changed

8 files changed

+303
-96
lines changed
 

‎library/core/src/ptr/mod.rs

Lines changed: 88 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ use crate::cmp::Ordering;
398398
use crate::intrinsics::const_eval_select;
399399
use crate::marker::FnPtr;
400400
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
401+
use crate::num::NonZero;
401402
use crate::{fmt, hash, intrinsics, ub_checks};
402403

403404
mod alignment;
@@ -1094,51 +1095,25 @@ pub const unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
10941095
// are pointers inside `T` we will copy them in one go rather than trying to copy a part
10951096
// of a pointer (which would not work).
10961097
// SAFETY: Same preconditions as this function
1097-
unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
1098+
unsafe { swap_nonoverlapping_const(x, y, count) }
10981099
} else {
1099-
macro_rules! attempt_swap_as_chunks {
1100-
($ChunkTy:ty) => {
1101-
if align_of::<T>() >= align_of::<$ChunkTy>()
1102-
&& size_of::<T>() % size_of::<$ChunkTy>() == 0
1103-
{
1104-
let x: *mut $ChunkTy = x.cast();
1105-
let y: *mut $ChunkTy = y.cast();
1106-
let count = count * (size_of::<T>() / size_of::<$ChunkTy>());
1107-
// SAFETY: these are the same bytes that the caller promised were
1108-
// ok, just typed as `MaybeUninit<ChunkTy>`s instead of as `T`s.
1109-
// The `if` condition above ensures that we're not violating
1110-
// alignment requirements, and that the division is exact so
1111-
// that we don't lose any bytes off the end.
1112-
return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) };
1113-
}
1114-
};
1100+
// Going through a slice here helps codegen know the size fits in `isize`
1101+
let slice = slice_from_raw_parts_mut(x, count);
1102+
// SAFETY: This is all readable from the pointer, meaning it's one
1103+
// allocated object, and thus cannot be more than isize::MAX bytes.
1104+
let bytes = unsafe { mem::size_of_val_raw::<[T]>(slice) };
1105+
if let Some(bytes) = NonZero::new(bytes) {
1106+
// SAFETY: These are the same ranges, just expressed in a different
1107+
// type, so they're still non-overlapping.
1108+
unsafe { swap_nonoverlapping_bytes(x.cast(), y.cast(), bytes) };
11151109
}
1116-
1117-
// Split up the slice into small power-of-two-sized chunks that LLVM is able
1118-
// to vectorize (unless it's a special type with more-than-pointer alignment,
1119-
// because we don't want to pessimize things like slices of SIMD vectors.)
1120-
if align_of::<T>() <= size_of::<usize>()
1121-
&& (!size_of::<T>().is_power_of_two()
1122-
|| size_of::<T>() > size_of::<usize>() * 2)
1123-
{
1124-
attempt_swap_as_chunks!(usize);
1125-
attempt_swap_as_chunks!(u8);
1126-
}
1127-
1128-
// SAFETY: Same preconditions as this function
1129-
unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }
11301110
}
11311111
)
11321112
}
11331113

11341114
/// Same behavior and safety conditions as [`swap_nonoverlapping`]
1135-
///
1136-
/// LLVM can vectorize this (at least it can for the power-of-two-sized types
1137-
/// `swap_nonoverlapping` tries to use) so no need to manually SIMD it.
11381115
#[inline]
1139-
const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, count: usize) {
1140-
let x = x.cast::<MaybeUninit<T>>();
1141-
let y = y.cast::<MaybeUninit<T>>();
1116+
const unsafe fn swap_nonoverlapping_const<T>(x: *mut T, y: *mut T, count: usize) {
11421117
let mut i = 0;
11431118
while i < count {
11441119
// SAFETY: By precondition, `i` is in-bounds because it's below `n`
@@ -1147,26 +1122,91 @@ const unsafe fn swap_nonoverlapping_simple_untyped<T>(x: *mut T, y: *mut T, coun
11471122
// and it's distinct from `x` since the ranges are non-overlapping
11481123
let y = unsafe { y.add(i) };
11491124

1150-
// If we end up here, it's because we're using a simple type -- like
1151-
// a small power-of-two-sized thing -- or a special type with particularly
1152-
// large alignment, particularly SIMD types.
1153-
// Thus, we're fine just reading-and-writing it, as either it's small
1154-
// and that works well anyway or it's special and the type's author
1155-
// presumably wanted things to be done in the larger chunk.
1156-
11571125
// SAFETY: we're only ever given pointers that are valid to read/write,
11581126
// including being aligned, and nothing here panics so it's drop-safe.
11591127
unsafe {
1160-
let a: MaybeUninit<T> = read(x);
1161-
let b: MaybeUninit<T> = read(y);
1162-
write(x, b);
1163-
write(y, a);
1128+
// Note that it's critical that these use `copy_nonoverlapping`,
1129+
// rather than `read`/`write`, to avoid #134713 if T has padding.
1130+
let mut temp = MaybeUninit::<T>::uninit();
1131+
copy_nonoverlapping(x, temp.as_mut_ptr(), 1);
1132+
copy_nonoverlapping(y, x, 1);
1133+
copy_nonoverlapping(temp.as_ptr(), y, 1);
11641134
}
11651135

11661136
i += 1;
11671137
}
11681138
}
11691139

1140+
// Don't let MIR inline this, because we really want it to keep its noalias metadata
1141+
#[rustc_no_mir_inline]
1142+
#[inline]
1143+
fn swap_chunk<const N: usize>(x: &mut MaybeUninit<[u8; N]>, y: &mut MaybeUninit<[u8; N]>) {
1144+
let a = *x;
1145+
let b = *y;
1146+
*x = b;
1147+
*y = a;
1148+
}
1149+
1150+
#[inline]
1151+
unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
1152+
// Same as `swap_nonoverlapping::<[u8; N]>`.
1153+
unsafe fn swap_nonoverlapping_chunks<const N: usize>(
1154+
x: *mut MaybeUninit<[u8; N]>,
1155+
y: *mut MaybeUninit<[u8; N]>,
1156+
chunks: NonZero<usize>,
1157+
) {
1158+
let chunks = chunks.get();
1159+
for i in 0..chunks {
1160+
// SAFETY: i is in [0, chunks) so the adds and dereferences are in-bounds.
1161+
unsafe { swap_chunk(&mut *x.add(i), &mut *y.add(i)) };
1162+
}
1163+
}
1164+
1165+
// Same as `swap_nonoverlapping_bytes`, but accepts at most 1+2+4=7 bytes
1166+
#[inline]
1167+
unsafe fn swap_nonoverlapping_short(x: *mut u8, y: *mut u8, bytes: NonZero<usize>) {
1168+
// Tail handling for auto-vectorized code sometimes has element-at-a-time behaviour,
1169+
// see <https://github.com/rust-lang/rust/issues/134946>.
1170+
// By swapping as different sizes, rather than as a loop over bytes,
1171+
// we make sure not to end up with, say, seven byte-at-a-time copies.
1172+
1173+
let bytes = bytes.get();
1174+
let mut i = 0;
1175+
macro_rules! swap_prefix {
1176+
($($n:literal)+) => {$(
1177+
if (bytes & $n) != 0 {
1178+
// SAFETY: `i` can only have the same bits set as those in bytes,
1179+
// so these `add`s are in-bounds of `bytes`. But the bit for
1180+
// `$n` hasn't been set yet, so the `$n` bytes that `swap_chunk`
1181+
// will read and write are within the usable range.
1182+
unsafe { swap_chunk::<$n>(&mut*x.add(i).cast(), &mut*y.add(i).cast()) };
1183+
i |= $n;
1184+
}
1185+
)+};
1186+
}
1187+
swap_prefix!(4 2 1);
1188+
debug_assert_eq!(i, bytes);
1189+
}
1190+
1191+
const CHUNK_SIZE: usize = size_of::<*const ()>();
1192+
let bytes = bytes.get();
1193+
1194+
let chunks = bytes / CHUNK_SIZE;
1195+
let tail = bytes % CHUNK_SIZE;
1196+
if let Some(chunks) = NonZero::new(chunks) {
1197+
// SAFETY: this is bytes/CHUNK_SIZE*CHUNK_SIZE bytes, which is <= bytes,
1198+
// so it's within the range of our non-overlapping bytes.
1199+
unsafe { swap_nonoverlapping_chunks::<CHUNK_SIZE>(x.cast(), y.cast(), chunks) };
1200+
}
1201+
if let Some(tail) = NonZero::new(tail) {
1202+
const { assert!(CHUNK_SIZE <= 8) };
1203+
let delta = chunks * CHUNK_SIZE;
1204+
// SAFETY: the tail length is below CHUNK_SIZE because of the remainder,
1205+
// and CHUNK_SIZE is at most 8 by the const assert, so tail <= 7
1206+
unsafe { swap_nonoverlapping_short(x.add(delta), y.add(delta), tail) };
1207+
}
1208+
}
1209+
11701210
/// Moves `src` into the pointed `dst`, returning the previous `dst` value.
11711211
///
11721212
/// Neither value is dropped.

‎library/coretests/tests/ptr.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -984,3 +984,39 @@ fn test_ptr_metadata_in_const() {
984984
assert_eq!(SLICE_META, 3);
985985
assert_eq!(DYN_META.size_of(), 42);
986986
}
987+
988+
// See <https://github.com/rust-lang/rust/issues/134713>
989+
const fn ptr_swap_nonoverlapping_is_untyped_inner() {
990+
#[repr(C)]
991+
struct HasPadding(usize, u8);
992+
993+
let buf1: [usize; 2] = [1000, 2000];
994+
let buf2: [usize; 2] = [3000, 4000];
995+
996+
// HasPadding and [usize; 2] have the same size and alignment,
997+
// so swap_nonoverlapping should treat them the same
998+
assert!(size_of::<HasPadding>() == size_of::<[usize; 2]>());
999+
assert!(align_of::<HasPadding>() == align_of::<[usize; 2]>());
1000+
1001+
let mut b1 = buf1;
1002+
let mut b2 = buf2;
1003+
// Safety: b1 and b2 are distinct local variables,
1004+
// with the same size and alignment as HasPadding.
1005+
unsafe {
1006+
std::ptr::swap_nonoverlapping(
1007+
b1.as_mut_ptr().cast::<HasPadding>(),
1008+
b2.as_mut_ptr().cast::<HasPadding>(),
1009+
1,
1010+
);
1011+
}
1012+
assert!(b1[0] == buf2[0]);
1013+
assert!(b1[1] == buf2[1]);
1014+
assert!(b2[0] == buf1[0]);
1015+
assert!(b2[1] == buf1[1]);
1016+
}
1017+
1018+
#[test]
1019+
fn test_ptr_swap_nonoverlapping_is_untyped() {
1020+
ptr_swap_nonoverlapping_is_untyped_inner();
1021+
const { ptr_swap_nonoverlapping_is_untyped_inner() };
1022+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
use std::mem::{size_of, align_of};
2+
3+
// See <https://github.com/rust-lang/rust/issues/134713>
4+
5+
#[repr(C)]
6+
struct Foo(usize, u8);
7+
8+
fn main() {
9+
let buf1: [usize; 2] = [1000, 2000];
10+
let buf2: [usize; 2] = [3000, 4000];
11+
12+
// Foo and [usize; 2] have the same size and alignment,
13+
// so swap_nonoverlapping should treat them the same
14+
assert_eq!(size_of::<Foo>(), size_of::<[usize; 2]>());
15+
assert_eq!(align_of::<Foo>(), align_of::<[usize; 2]>());
16+
17+
let mut b1 = buf1;
18+
let mut b2 = buf2;
19+
// Safety: b1 and b2 are distinct local variables,
20+
// with the same size and alignment as Foo.
21+
unsafe {
22+
std::ptr::swap_nonoverlapping(
23+
b1.as_mut_ptr().cast::<Foo>(),
24+
b2.as_mut_ptr().cast::<Foo>(),
25+
1,
26+
);
27+
}
28+
assert_eq!(b1, buf2);
29+
assert_eq!(b2, buf1);
30+
}

‎tests/assembly/x86_64-typed-swap.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,31 @@ pub fn swap_simd(x: &mut __m128, y: &mut __m128) {
5151
// CHECK-NEXT: retq
5252
swap(x, y)
5353
}
54+
55+
// CHECK-LABEL: swap_string:
56+
#[no_mangle]
57+
pub fn swap_string(x: &mut String, y: &mut String) {
58+
// CHECK-NOT: mov
59+
// CHECK-COUNT-4: movups
60+
// CHECK-NOT: mov
61+
// CHECK-COUNT-4: movq
62+
// CHECK-NOT: mov
63+
swap(x, y)
64+
}
65+
66+
// CHECK-LABEL: swap_44_bytes:
67+
#[no_mangle]
68+
pub fn swap_44_bytes(x: &mut [u8; 44], y: &mut [u8; 44]) {
69+
// Ensure we do better than a long run of byte copies,
70+
// see <https://github.com/rust-lang/rust/issues/134946>
71+
72+
// CHECK-NOT: movb
73+
// CHECK-COUNT-8: movups{{.+}}xmm
74+
// CHECK-NOT: movb
75+
// CHECK-COUNT-4: movq
76+
// CHECK-NOT: movb
77+
// CHECK-COUNT-4: movl
78+
// CHECK-NOT: movb
79+
// CHECK: retq
80+
swap(x, y)
81+
}

‎tests/codegen/simd/swap-simd-types.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ pub fn swap_single_m256(x: &mut __m256, y: &mut __m256) {
2323
#[no_mangle]
2424
pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
2525
// CHECK-NOT: alloca
26-
// CHECK: load <8 x float>{{.+}}align 32
27-
// CHECK: store <8 x float>{{.+}}align 32
26+
// CHECK-COUNT-2: load <4 x i64>{{.+}}align 32
27+
// CHECK-COUNT-2: store <4 x i64>{{.+}}align 32
2828
if x.len() == y.len() {
2929
x.swap_with_slice(y);
3030
}
@@ -34,7 +34,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) {
3434
#[no_mangle]
3535
pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) {
3636
// CHECK-NOT: alloca
37-
// CHECK: load <32 x i8>{{.+}}align 1
38-
// CHECK: store <32 x i8>{{.+}}align 1
37+
// CHECK-COUNT-2: load <4 x i64>{{.+}}align 1
38+
// CHECK-COUNT-2: store <4 x i64>{{.+}}align 1
3939
swap(x, y)
4040
}

‎tests/codegen/swap-large-types.rs

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,16 @@ type KeccakBuffer = [[u64; 5]; 5];
1212
// to stack for large types, which is completely unnecessary as the lack of
1313
// overlap means we can just do whatever fits in registers at a time.
1414

15+
// The tests here (after the first one showing that the problem still exists)
16+
// are less about testing *exactly* what the codegen is, and more about testing
17+
// 1) That things are swapped directly from one argument to the other,
18+
// never going through stack along the way, and
19+
// 2) That we're doing the swapping for big things using large vector types,
20+
// rather than `i64` or `<8 x i8>` (or, even worse, `i8`) at a time.
21+
//
22+
// (There are separate tests for intrinsics::typed_swap_nonoverlapping that
23+
// check that it, as an intrinsic, are emitting exactly what it should.)
24+
1525
// CHECK-LABEL: @swap_basic
1626
#[no_mangle]
1727
pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
@@ -26,65 +36,81 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
2636
}
2737
}
2838

29-
// This test verifies that the library does something smarter, and thus
30-
// doesn't need any scratch space on the stack.
31-
3239
// CHECK-LABEL: @swap_std
3340
#[no_mangle]
3441
pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) {
3542
// CHECK-NOT: alloca
36-
// CHECK: load <{{[0-9]+}} x i64>
37-
// CHECK: store <{{[0-9]+}} x i64>
43+
// CHECK: load <{{2|4}} x i64>
44+
// CHECK: store <{{2|4}} x i64>
3845
swap(x, y)
3946
}
4047

41-
// Verify that types with usize alignment are swapped via vectored usizes,
42-
// not falling back to byte-level code.
43-
4448
// CHECK-LABEL: @swap_slice
4549
#[no_mangle]
4650
pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) {
4751
// CHECK-NOT: alloca
48-
// CHECK: load <{{[0-9]+}} x i64>
49-
// CHECK: store <{{[0-9]+}} x i64>
52+
// CHECK: load <{{2|4}} x i64>
53+
// CHECK: store <{{2|4}} x i64>
5054
if x.len() == y.len() {
5155
x.swap_with_slice(y);
5256
}
5357
}
5458

55-
// But for a large align-1 type, vectorized byte copying is what we want.
56-
5759
type OneKilobyteBuffer = [u8; 1024];
5860

5961
// CHECK-LABEL: @swap_1kb_slices
6062
#[no_mangle]
6163
pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) {
6264
// CHECK-NOT: alloca
63-
// CHECK: load <{{[0-9]+}} x i8>
64-
// CHECK: store <{{[0-9]+}} x i8>
65+
66+
// CHECK-NOT: load i32
67+
// CHECK-NOT: store i32
68+
// CHECK-NOT: load i16
69+
// CHECK-NOT: store i16
70+
// CHECK-NOT: load i8
71+
// CHECK-NOT: store i8
72+
73+
// CHECK: load <{{2|4}} x i64>{{.+}}align 1,
74+
// CHECK: store <{{2|4}} x i64>{{.+}}align 1,
75+
76+
// CHECK-NOT: load i32
77+
// CHECK-NOT: store i32
78+
// CHECK-NOT: load i16
79+
// CHECK-NOT: store i16
80+
// CHECK-NOT: load i8
81+
// CHECK-NOT: store i8
82+
6583
if x.len() == y.len() {
6684
x.swap_with_slice(y);
6785
}
6886
}
6987

70-
// This verifies that the 2×read + 2×write optimizes to just 3 memcpys
71-
// for an unusual type like this. It's not clear whether we should do anything
72-
// smarter in Rust for these, so for now it's fine to leave these up to the backend.
73-
// That's not as bad as it might seem, as for example, LLVM will lower the
74-
// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature.
75-
// Eventually we'll be able to pass `align_of::<T>` to a const generic and
76-
// thus pick a smarter chunk size ourselves without huge code duplication.
77-
7888
#[repr(align(64))]
7989
pub struct BigButHighlyAligned([u8; 64 * 3]);
8090

8191
// CHECK-LABEL: @swap_big_aligned
8292
#[no_mangle]
8393
pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) {
8494
// CHECK-NOT: call void @llvm.memcpy
85-
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
86-
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
87-
// CHECK: call void @llvm.memcpy.{{.+}}(ptr noundef nonnull align 64 dereferenceable(192)
95+
// CHECK-NOT: load i32
96+
// CHECK-NOT: store i32
97+
// CHECK-NOT: load i16
98+
// CHECK-NOT: store i16
99+
// CHECK-NOT: load i8
100+
// CHECK-NOT: store i8
101+
102+
// CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 64,
103+
// CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 64,
104+
105+
// CHECK-COUNT-2: load <{{2|4}} x i64>{{.+}}align 32,
106+
// CHECK-COUNT-2: store <{{2|4}} x i64>{{.+}}align 32,
107+
108+
// CHECK-NOT: load i32
109+
// CHECK-NOT: store i32
110+
// CHECK-NOT: load i16
111+
// CHECK-NOT: store i16
112+
// CHECK-NOT: load i8
113+
// CHECK-NOT: store i8
88114
// CHECK-NOT: call void @llvm.memcpy
89115
swap(x, y)
90116
}

‎tests/codegen/swap-small-types.rs

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
//@ compile-flags: -Copt-level=3 -Z merge-functions=disabled
22
//@ only-x86_64
3+
//@ min-llvm-version: 20
4+
//@ ignore-std-debug-assertions (`ptr::swap_nonoverlapping` has one which blocks some optimizations)
35

46
#![crate_type = "lib"]
57

@@ -27,13 +29,19 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) {
2729
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
2830
// CHECK-NOT: alloca
2931

30-
// Whether `i8` is the best for this is unclear, but
31-
// might as well record what's actually happening right now.
32-
33-
// CHECK: load i8
34-
// CHECK: load i8
35-
// CHECK: store i8
36-
// CHECK: store i8
32+
// Swapping `i48` might be cleaner in LLVM-IR here, but `i32`+`i16` isn't bad,
33+
// and is closer to the assembly it generates anyway.
34+
35+
// CHECK-NOT: load{{ }}
36+
// CHECK: load i32{{.+}}align 2
37+
// CHECK-NEXT: load i32{{.+}}align 2
38+
// CHECK-NEXT: store i32{{.+}}align 2
39+
// CHECK-NEXT: store i32{{.+}}align 2
40+
// CHECK: load i16{{.+}}align 2
41+
// CHECK-NEXT: load i16{{.+}}align 2
42+
// CHECK-NEXT: store i16{{.+}}align 2
43+
// CHECK-NEXT: store i16{{.+}}align 2
44+
// CHECK-NOT: store{{ }}
3745
swap(x, y)
3846
}
3947

@@ -76,30 +84,49 @@ pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) {
7684
swap(x, y)
7785
}
7886

79-
// LLVM doesn't vectorize a loop over 3-byte elements,
80-
// so we chunk it down to bytes and loop over those instead.
8187
type RGB24 = [u8; 3];
8288

8389
// CHECK-LABEL: @swap_rgb24_slices
8490
#[no_mangle]
8591
pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
8692
// CHECK-NOT: alloca
87-
// CHECK: load <{{[0-9]+}} x i8>
88-
// CHECK: store <{{[0-9]+}} x i8>
93+
94+
// CHECK: mul nuw nsw i64 %{{x|y}}.1, 3
95+
96+
// CHECK: load <{{[0-9]+}} x i64>
97+
// CHECK: store <{{[0-9]+}} x i64>
98+
99+
// CHECK-COUNT-2: load i32
100+
// CHECK-COUNT-2: store i32
101+
// CHECK-COUNT-2: load i16
102+
// CHECK-COUNT-2: store i16
103+
// CHECK-COUNT-2: load i8
104+
// CHECK-COUNT-2: store i8
89105
if x.len() == y.len() {
90106
x.swap_with_slice(y);
91107
}
92108
}
93109

94-
// This one has a power-of-two size, so we iterate over it directly
95110
type RGBA32 = [u8; 4];
96111

97112
// CHECK-LABEL: @swap_rgba32_slices
98113
#[no_mangle]
99114
pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
100115
// CHECK-NOT: alloca
101-
// CHECK: load <{{[0-9]+}} x i32>
102-
// CHECK: store <{{[0-9]+}} x i32>
116+
117+
// Because the size in bytes in a multiple of 4, we can skip the smallest sizes.
118+
119+
// CHECK: load <{{[0-9]+}} x i64>
120+
// CHECK: store <{{[0-9]+}} x i64>
121+
122+
// CHECK-COUNT-2: load i32
123+
// CHECK-COUNT-2: store i32
124+
125+
// CHECK-NOT: load i16
126+
// CHECK-NOT: store i16
127+
// CHECK-NOT: load i8
128+
// CHECK-NOT: store i8
129+
103130
if x.len() == y.len() {
104131
x.swap_with_slice(y);
105132
}
@@ -113,8 +140,8 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
113140
#[no_mangle]
114141
pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
115142
// CHECK-NOT: alloca
116-
// CHECK: load <{{[0-9]+}} x i64>
117-
// CHECK: store <{{[0-9]+}} x i64>
143+
// CHECK: load <{{[0-9]+}} x i64>{{.+}}, align 8,
144+
// CHECK: store <{{[0-9]+}} x i64>{{.+}}, align 8,
118145
if x.len() == y.len() {
119146
x.swap_with_slice(y);
120147
}
@@ -130,6 +157,26 @@ pub struct Packed {
130157
#[no_mangle]
131158
pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) {
132159
// CHECK-NOT: alloca
160+
161+
// CHECK-NOT: load
162+
// CHECK-NOT: store
163+
164+
// CHECK: %[[A:.+]] = load i64, ptr %x, align 1,
165+
// CHECK-NEXT: %[[B:.+]] = load i64, ptr %y, align 1,
166+
// CHECK-NEXT: store i64 %[[B]], ptr %x, align 1,
167+
// CHECK-NEXT: store i64 %[[A]], ptr %y, align 1,
168+
169+
// CHECK-NOT: load
170+
// CHECK-NOT: store
171+
172+
// CHECK: %[[C:.+]] = load i8, ptr %[[X8:.+]], align 1,
173+
// CHECK-NEXT: %[[D:.+]] = load i8, ptr %[[Y8:.+]], align 1,
174+
// CHECK-NEXT: store i8 %[[D]], ptr %[[X8]], align 1,
175+
// CHECK-NEXT: store i8 %[[C]], ptr %[[Y8]], align 1,
176+
177+
// CHECK-NOT: load
178+
// CHECK-NOT: store
179+
133180
// CHECK: ret void
134181
swap(x, y)
135182
}

‎tests/ui/consts/missing_span_in_backtrace.stderr

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ note: inside `swap_nonoverlapping::<MaybeUninit<u8>>`
1212
--> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
1313
note: inside `swap_nonoverlapping::compiletime::<MaybeUninit<u8>>`
1414
--> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
15-
note: inside `std::ptr::swap_nonoverlapping_simple_untyped::<MaybeUninit<u8>>`
16-
--> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
17-
note: inside `std::ptr::read::<MaybeUninit<MaybeUninit<u8>>>`
15+
note: inside `std::ptr::swap_nonoverlapping_const::<MaybeUninit<u8>>`
1816
--> $SRC_DIR/core/src/ptr/mod.rs:LL:COL
17+
note: inside `copy_nonoverlapping::<MaybeUninit<u8>>`
18+
--> $SRC_DIR/core/src/intrinsics/mod.rs:LL:COL
1919
= help: this code performed an operation that depends on the underlying bytes representing a pointer
2020
= help: the absolute address of a pointer is not known at compile-time, so such operations are not supported
2121
= note: this error originates in the macro `$crate::intrinsics::const_eval_select` which comes from the expansion of the macro `const_eval_select` (in Nightly builds, run with -Z macro-backtrace for more info)

0 commit comments

Comments
 (0)
Please sign in to comment.