Skip to content

Commit 2627981

Browse files
committed
Avoid memcpy in codegen for more types, notably Vec
PR 111999 set up the framework to be able to do this; this PR expands it to more types than just arrays. Most interestingly, this allows it to work with `Vec<T>` and `String`, so swapping those no longer ends up going through the stack like it does today (<https://rust.godbolt.org/z/cKG7o8aaW>). And since this is done in codegen, it's not special for `swap`, and thus will hopefully allow types like this to better optimize in lots of places, with easier SRoA.
1 parent 505f03b commit 2627981

File tree

10 files changed

+248
-36
lines changed

10 files changed

+248
-36
lines changed

compiler/rustc_codegen_llvm/src/type_of.rs

+32-4
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
407407
// arrays but don't count as aggregate types
408408
if let FieldsShape::Array { count, .. } = self.layout.fields()
409409
&& let element = self.field(cx, 0)
410-
&& element.ty.is_integral()
410+
&& element.ty.is_primitive()
411411
{
412412
// `cx.type_ix(bits)` is tempting here, but while that works great
413413
// for things that *stay* as memory-to-memory copies, it also ends
@@ -418,8 +418,36 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
418418
return Some(cx.type_vector(ety, *count));
419419
}
420420

421-
// FIXME: The above only handled integer arrays; surely more things
422-
// would also be possible. Be careful about provenance, though!
423-
None
421+
// Ensure the type isn't too complex nor otherwise ineligible
422+
is_scalar_copy_reasonable(4, self.ty, cx)?;
423+
424+
// Otherwise we can load/store it via a long-enough integer type
425+
Some(cx.type_ix(self.layout.size().bits()))
426+
}
427+
}
428+
429+
fn is_scalar_copy_reasonable<'a, 'tcx>(
430+
max_fields: u32,
431+
t: Ty<'tcx>,
432+
cx: &CodegenCx<'a, 'tcx>,
433+
) -> Option<u32> {
434+
if t.is_any_ptr() || t.is_primitive() {
435+
return max_fields.checked_sub(1);
436+
}
437+
438+
match t.kind() {
439+
ty::Tuple(field_tys) => field_tys
440+
.into_iter()
441+
.try_fold(max_fields, |mf, tt| is_scalar_copy_reasonable(mf, tt, cx)),
442+
// Unions are magic and can carry anything, regardless of their field
443+
// types, so force them to always go through `memcpy`.
444+
ty::Adt(adt_def, _) if adt_def.is_union() => None,
445+
// If there could be multiple variants, just use `memcpy` for now.
446+
ty::Adt(adt_def, _) if adt_def.variants().len() != 1 => None,
447+
ty::Adt(adt_def, substs) => adt_def.all_fields().try_fold(max_fields, |mf, field_def| {
448+
let field_ty = field_def.ty(cx.tcx, substs);
449+
is_scalar_copy_reasonable(mf, field_ty, cx)
450+
}),
451+
_ => None,
424452
}
425453
}

tests/assembly/swap-strings.rs

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
// assembly-output: emit-asm
2+
// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel
3+
// only-x86_64
4+
// ignore-sgx
5+
// ignore-debug
6+
7+
// Ensure that the swap uses SIMD registers and does not go to stack.
8+
9+
// CHECK-LABEL: swap_strings_xmm:
10+
#[no_mangle]
11+
pub fn swap_strings_xmm(a: &mut String, b: &mut String) {
12+
// CHECK-DAG: movups [[A1:xmm.+]], xmmword ptr [[AX:.+]]
13+
// CHECK-DAG: mov [[A2:r.+]], qword ptr [[AQ:.+]]
14+
// CHECK-DAG: movups [[B1:xmm.+]], xmmword ptr [[BX:.+]]
15+
// CHECK-DAG: mov [[B2:r.+]], qword ptr [[BQ:.+]]
16+
// CHECK-NOT: mov
17+
// CHECK-DAG: movups xmmword ptr [[AX]], [[B1]]
18+
// CHECK-DAG: mov qword ptr [[AQ]], [[B2]]
19+
// CHECK-DAG: movups xmmword ptr [[BX]], [[A1]]
20+
// CHECK-DAG: mov qword ptr [[BQ]], [[A2]]
21+
// CHECK: ret
22+
std::mem::swap(a, b);
23+
}

tests/codegen/issues/issue-15953.rs

+8-5
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,32 @@
11
// Test that llvm generates `memcpy` for moving a value
22
// inside a function and moving an argument.
33

4+
#[derive(Default, Debug)]
5+
struct RatherLargeType(usize, isize, usize, isize, usize, isize);
6+
47
struct Foo {
5-
x: Vec<i32>,
8+
x: RatherLargeType,
69
}
710

811
#[inline(never)]
912
#[no_mangle]
1013
// CHECK: memcpy
11-
fn interior(x: Vec<i32>) -> Vec<i32> {
14+
fn interior(x: RatherLargeType) -> RatherLargeType {
1215
let Foo { x } = Foo { x: x };
1316
x
1417
}
1518

1619
#[inline(never)]
1720
#[no_mangle]
1821
// CHECK: memcpy
19-
fn exterior(x: Vec<i32>) -> Vec<i32> {
22+
fn exterior(x: RatherLargeType) -> RatherLargeType {
2023
x
2124
}
2225

2326
fn main() {
24-
let x = interior(Vec::new());
27+
let x = interior(RatherLargeType::default());
2528
println!("{:?}", x);
2629

27-
let x = exterior(Vec::new());
30+
let x = exterior(RatherLargeType::default());
2831
println!("{:?}", x);
2932
}

tests/codegen/issues/issue-86106.rs

+17-8
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
// compile-flags: -C opt-level=3 -Z merge-functions=disabled
33

44
// The below two functions ensure that both `String::new()` and `"".to_string()`
5-
// produce the identical code.
5+
// generate their values directly, rather than creating a constant and copying
6+
// that constant (which takes more instructions because of PIC).
67

78
#![crate_type = "lib"]
89

910
// CHECK-LABEL: define {{(dso_local )?}}void @string_new
1011
#[no_mangle]
1112
pub fn string_new() -> String {
12-
// CHECK: store ptr inttoptr
13+
// CHECK: store {{i16|i32|i64}} 1, ptr %_0,
1314
// CHECK-NEXT: getelementptr
1415
// CHECK-NEXT: call void @llvm.memset
1516
// CHECK-NEXT: ret void
@@ -19,10 +20,8 @@ pub fn string_new() -> String {
1920
// CHECK-LABEL: define {{(dso_local )?}}void @empty_to_string
2021
#[no_mangle]
2122
pub fn empty_to_string() -> String {
22-
// CHECK: store ptr inttoptr
23-
// CHECK-NEXT: getelementptr
24-
// CHECK-NEXT: call void @llvm.memset
25-
// CHECK-NEXT: ret void
23+
// CHECK: store {{i48|i96|i192}} 1, ptr %_0, align {{2|4|8}}
24+
// CHECK-NEXT: ret
2625
"".to_string()
2726
}
2827

@@ -32,7 +31,7 @@ pub fn empty_to_string() -> String {
3231
// CHECK-LABEL: @empty_vec
3332
#[no_mangle]
3433
pub fn empty_vec() -> Vec<u8> {
35-
// CHECK: store ptr inttoptr
34+
// CHECK: store ptr inttoptr ({{i16|i32|i64}} 1 to ptr), ptr %_0,
3635
// CHECK-NEXT: getelementptr
3736
// CHECK-NEXT: call void @llvm.memset
3837
// CHECK-NEXT: ret void
@@ -42,9 +41,19 @@ pub fn empty_vec() -> Vec<u8> {
4241
// CHECK-LABEL: @empty_vec_clone
4342
#[no_mangle]
4443
pub fn empty_vec_clone() -> Vec<u8> {
45-
// CHECK: store ptr inttoptr
44+
// CHECK: store {{i16|i32|i64}} 1, ptr %_0,
4645
// CHECK-NEXT: getelementptr
4746
// CHECK-NEXT: call void @llvm.memset
4847
// CHECK-NEXT: ret void
4948
vec![].clone()
5049
}
50+
51+
// CHECK-LABEL: @empty_vec_from_array
52+
#[no_mangle]
53+
pub fn empty_vec_from_array() -> Vec<u8> {
54+
// CHECK: store ptr inttoptr ({{i16|i32|i64}} 1 to ptr), ptr %_0,
55+
// CHECK-NEXT: getelementptr
56+
// CHECK-NEXT: call void @llvm.memset
57+
// CHECK-NEXT: ret void
58+
[].into()
59+
}

tests/codegen/loads.rs

+12-2
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,22 @@ pub fn small_array_alignment(x: [i8; 4]) -> [i8; 4] {
136136
x
137137
}
138138

139-
// CHECK-LABEL: small_struct_alignment
139+
// CHECK-LABEL: i32 @small_struct_alignment(i32 %0)
140140
// The struct is loaded as i32, but its alignment is lower, go with 1 byte to avoid target
141141
// dependent alignment
142142
#[no_mangle]
143143
pub fn small_struct_alignment(x: Bytes) -> Bytes {
144-
// CHECK: [[VAR:%[0-9]+]] = load i32, ptr %{{.*}}, align 1
144+
// CHECK: [[RETP:%.+]] = alloca %Bytes, align 1
145+
// CHECK: [[ALIGNED:%.+]] = alloca i32, align 4
146+
// CHECK: %x = alloca %Bytes, align 1
147+
148+
// CHECK: store i32 %0, ptr [[ALIGNED]], align 4
149+
// CHECK: call void @llvm.memcpy{{.+}}(ptr align 1 %x, ptr align 4 %1, i64 4, i1 false)
150+
151+
// CHECK: [[TEMP:%[0-9]+]] = load i32, ptr %x, align 1
152+
// CHECK: store i32 [[TEMP]], ptr [[RETP]], align 1
153+
154+
// CHECK: [[VAR:%[0-9]+]] = load i32, ptr [[RETP]], align 1
145155
// CHECK: ret i32 [[VAR]]
146156
x
147157
}

tests/codegen/mem-replace-simple-type.rs

+13-2
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,19 @@ pub fn replace_ref_str<'a>(r: &mut &'a str, v: &'a str) -> &'a str {
3737
pub fn replace_short_array(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
3838
// CHECK-NOT: alloca
3939
// CHECK: %[[R:.+]] = load <3 x i32>, ptr %r, align 4
40-
// CHECK: store <3 x i32> %[[R]], ptr %result
40+
// CHECK: store <3 x i32> %[[R]], ptr %result, align 4
4141
// CHECK: %[[V:.+]] = load <3 x i32>, ptr %v, align 4
42-
// CHECK: store <3 x i32> %[[V]], ptr %r
42+
// CHECK: store <3 x i32> %[[V]], ptr %r, align 4
43+
std::mem::replace(r, v)
44+
}
45+
46+
#[no_mangle]
47+
// CHECK-LABEL: @replace_string(
48+
pub fn replace_string(r: &mut String, v: String) -> String {
49+
// CHECK-NOT: alloca
50+
// CHECK: %[[R:.+]] = load i192, ptr %r, align 8
51+
// CHECK: store i192 %[[R]], ptr %result, align 8
52+
// CHECK: %[[V:.+]] = load i192, ptr %v, align 8
53+
// CHECK: store i192 %[[V]], ptr %r, align 8
4354
std::mem::replace(r, v)
4455
}

tests/codegen/packed.rs

+20-4
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,22 @@ pub struct Packed2Pair(u8, u32);
119119
// CHECK-LABEL: @pkd1_pair
120120
#[no_mangle]
121121
pub fn pkd1_pair(pair1: &mut Packed1Pair, pair2: &mut Packed1Pair) {
122-
// CHECK: call void @llvm.memcpy.{{.*}}(ptr align 1 %{{.*}}, ptr align 1 %{{.*}}, i{{[0-9]+}} 5, i1 false)
122+
// CHECK: [[ALLOCA:%.+]] = alloca %Packed1Pair, align 1
123+
// CHECK: [[TEMP1:%.+]] = load i40, ptr %pair1, align 1
124+
// CHECK: store i40 [[TEMP1]], ptr [[ALLOCA]], align 1
125+
// CHECK: [[TEMP2:%.+]] = load i40, ptr [[ALLOCA]], align 1
126+
// CHECK: store i40 [[TEMP2]], ptr %pair2, align 1
123127
*pair2 = *pair1;
124128
}
125129

126130
// CHECK-LABEL: @pkd2_pair
127131
#[no_mangle]
128132
pub fn pkd2_pair(pair1: &mut Packed2Pair, pair2: &mut Packed2Pair) {
129-
// CHECK: call void @llvm.memcpy.{{.*}}(ptr align 2 %{{.*}}, ptr align 2 %{{.*}}, i{{[0-9]+}} 6, i1 false)
133+
// CHECK: [[ALLOCA:%.+]] = alloca %Packed2Pair, align 2
134+
// CHECK: [[TEMP1:%.+]] = load i48, ptr %pair1, align 2
135+
// CHECK: store i48 [[TEMP1]], ptr [[ALLOCA]], align 2
136+
// CHECK: [[TEMP2:%.+]] = load i48, ptr [[ALLOCA]], align 2
137+
// CHECK: store i48 [[TEMP2]], ptr %pair2, align 2
130138
*pair2 = *pair1;
131139
}
132140

@@ -141,13 +149,21 @@ pub struct Packed2NestedPair((u32, u32));
141149
// CHECK-LABEL: @pkd1_nested_pair
142150
#[no_mangle]
143151
pub fn pkd1_nested_pair(pair1: &mut Packed1NestedPair, pair2: &mut Packed1NestedPair) {
144-
// CHECK: call void @llvm.memcpy.{{.*}}(ptr align 1 %{{.*}}, ptr align 1 %{{.*}}, i{{[0-9]+}} 8, i1 false)
152+
// CHECK: [[ALLOCA:%.+]] = alloca %Packed1NestedPair, align 1
153+
// CHECK: [[TEMP1:%.+]] = load i64, ptr %pair1, align 1
154+
// CHECK: store i64 [[TEMP1]], ptr [[ALLOCA]], align 1
155+
// CHECK: [[TEMP2:%.+]] = load i64, ptr [[ALLOCA]], align 1
156+
// CHECK: store i64 [[TEMP2]], ptr %pair2, align 1
145157
*pair2 = *pair1;
146158
}
147159

148160
// CHECK-LABEL: @pkd2_nested_pair
149161
#[no_mangle]
150162
pub fn pkd2_nested_pair(pair1: &mut Packed2NestedPair, pair2: &mut Packed2NestedPair) {
151-
// CHECK: call void @llvm.memcpy.{{.*}}(ptr align 2 %{{.*}}, ptr align 2 %{{.*}}, i{{[0-9]+}} 8, i1 false)
163+
// CHECK: [[ALLOCA:%.+]] = alloca %Packed2NestedPair, align 2
164+
// CHECK: [[TEMP1:%.+]] = load i64, ptr %pair1, align 2
165+
// CHECK: store i64 [[TEMP1]], ptr [[ALLOCA]], align 2
166+
// CHECK: [[TEMP2:%.+]] = load i64, ptr [[ALLOCA]], align 2
167+
// CHECK: store i64 [[TEMP2]], ptr %pair2, align 2
152168
*pair2 = *pair1;
153169
}

tests/codegen/simd-intrinsic/simd-intrinsic-transmute-array.rs

+4-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ pub fn vector_align() -> usize {
3535
// CHECK-LABEL: @build_array_s
3636
#[no_mangle]
3737
pub fn build_array_s(x: [f32; 4]) -> S<4> {
38-
// CHECK: call void @llvm.memcpy.{{.+}}({{.*}} align [[VECTOR_ALIGN]] {{.*}} align [[ARRAY_ALIGN]] {{.*}}, [[USIZE]] 16, i1 false)
38+
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
39+
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
3940
S::<4>(x)
4041
}
4142

@@ -50,7 +51,8 @@ pub fn build_array_transmute_s(x: [f32; 4]) -> S<4> {
5051
// CHECK-LABEL: @build_array_t
5152
#[no_mangle]
5253
pub fn build_array_t(x: [f32; 4]) -> T {
53-
// CHECK: call void @llvm.memcpy.{{.+}}({{.*}} align [[VECTOR_ALIGN]] {{.*}} align [[ARRAY_ALIGN]] {{.*}}, [[USIZE]] 16, i1 false)
54+
// CHECK: %[[VAL:.+]] = load <4 x float>, ptr %x, align [[ARRAY_ALIGN]]
55+
// CHECK: store <4 x float> %[[VAL:.+]], ptr %_0, align [[VECTOR_ALIGN]]
5456
T(x)
5557
}
5658

tests/codegen/swap-small-types.rs

+61-9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,21 @@
66

77
use std::mem::swap;
88

9+
// CHECK-LABEL: @swap_fat_ptrs
10+
#[no_mangle]
11+
pub fn swap_fat_ptrs<'a>(x: &mut &'a str, y: &mut &'a str) {
12+
// CHECK-NOT: alloca
13+
// CHECK: %[[X0:.+]] = load ptr, ptr %x, align 8
14+
// CHECK: %[[X1:.+]] = load i64, ptr %[[PX1:.+]], align 8
15+
// CHECK: %[[Y0:.+]] = load ptr, ptr %y, align 8
16+
// CHECK: %[[Y1:.+]] = load i64, ptr %[[PY1:.+]], align 8
17+
// CHECK: store ptr %[[Y0]], ptr %x, align 8
18+
// CHECK: store i64 %[[Y1]], ptr %[[PX1]], align 8
19+
// CHECK: store ptr %[[X0]], ptr %y, align 8
20+
// CHECK: store i64 %[[X1]], ptr %[[PY1]], align 8
21+
swap(x, y)
22+
}
23+
924
type RGB48 = [u16; 3];
1025

1126
// CHECK-LABEL: @swap_rgb48_manually(
@@ -40,9 +55,9 @@ type RGB24 = [u8; 3];
4055
// CHECK-LABEL: @swap_rgb24_slices
4156
#[no_mangle]
4257
pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
43-
// CHECK-NOT: alloca
44-
// CHECK: load <{{[0-9]+}} x i8>
45-
// CHECK: store <{{[0-9]+}} x i8>
58+
// CHECK-NOT: alloca
59+
// CHECK: load <{{[0-9]+}} x i8>
60+
// CHECK: store <{{[0-9]+}} x i8>
4661
if x.len() == y.len() {
4762
x.swap_with_slice(y);
4863
}
@@ -51,12 +66,23 @@ pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) {
5166
// This one has a power-of-two size, so we iterate over it directly
5267
type RGBA32 = [u8; 4];
5368

69+
// CHECK-LABEL: @swap_rgba32
70+
#[no_mangle]
71+
pub fn swap_rgba32(x: &mut RGBA32, y: &mut RGBA32) {
72+
// CHECK-NOT: alloca
73+
// CHECK: load <4 x i8>
74+
// CHECK: load <4 x i8>
75+
// CHECK: store <4 x i8>
76+
// CHECK: store <4 x i8>
77+
swap(x, y)
78+
}
79+
5480
// CHECK-LABEL: @swap_rgba32_slices
5581
#[no_mangle]
5682
pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) {
57-
// CHECK-NOT: alloca
58-
// CHECK: load <{{[0-9]+}} x i32>
59-
// CHECK: store <{{[0-9]+}} x i32>
83+
// CHECK-NOT: alloca
84+
// CHECK: load <{{[0-9]+}} x i32>
85+
// CHECK: store <{{[0-9]+}} x i32>
6086
if x.len() == y.len() {
6187
x.swap_with_slice(y);
6288
}
@@ -69,10 +95,36 @@ const _: () = assert!(!std::mem::size_of::<String>().is_power_of_two());
6995
// CHECK-LABEL: @swap_string_slices
7096
#[no_mangle]
7197
pub fn swap_string_slices(x: &mut [String], y: &mut [String]) {
72-
// CHECK-NOT: alloca
73-
// CHECK: load <{{[0-9]+}} x i64>
74-
// CHECK: store <{{[0-9]+}} x i64>
98+
// CHECK-NOT: alloca
99+
// CHECK: load <{{[0-9]+}} x i64>
100+
// CHECK: store <{{[0-9]+}} x i64>
75101
if x.len() == y.len() {
76102
x.swap_with_slice(y);
77103
}
78104
}
105+
106+
// It's wasteful to do three `memcpy`s when a `String` is just three fields.
107+
108+
// CHECK-LABEL: @swap_strings
109+
#[no_mangle]
110+
pub fn swap_strings(x: &mut String, y: &mut String) {
111+
// CHECK-NOT: alloca
112+
// CHECK: load i192
113+
// CHECK: load i192
114+
// CHECK: store i192
115+
// CHECK: store i192
116+
swap(x, y)
117+
}
118+
119+
// CHECK-LABEL: @swap_tuple_with_padding
120+
#[no_mangle]
121+
pub fn swap_tuple_with_padding(x: &mut (u8, u32, u8), y: &mut (u8, u32, u8)) {
122+
// CHECK-NOT: alloca
123+
// CHECK: load i64
124+
// CHECK-NOT: noundef
125+
// CHECK: load i64
126+
// CHECK-NOT: noundef
127+
// CHECK: store i64
128+
// CHECK: store i64
129+
swap(x, y)
130+
}

0 commit comments

Comments
 (0)