
Remove my scalar_copy_backend_type optimization attempt #123185


Merged 3 commits on Apr 10, 2024

3 changes: 0 additions & 3 deletions compiler/rustc_codegen_llvm/src/type_.rs
@@ -281,9 +281,6 @@ impl<'ll, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'ll, 'tcx> {
fn reg_backend_type(&self, ty: &Reg) -> &'ll Type {
ty.llvm_type(self)
}
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
layout.scalar_copy_llvm_type(self)
}
}

impl<'ll, 'tcx> TypeMembershipMethods<'tcx> for CodegenCx<'ll, 'tcx> {
42 changes: 0 additions & 42 deletions compiler/rustc_codegen_llvm/src/type_of.rs
@@ -5,7 +5,6 @@ use rustc_middle::bug;
use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
use rustc_middle::ty::print::{with_no_trimmed_paths, with_no_visible_paths};
use rustc_middle::ty::{self, Ty, TypeVisitableExt};
use rustc_target::abi::HasDataLayout;
use rustc_target::abi::{Abi, Align, FieldsShape};
use rustc_target::abi::{Int, Pointer, F128, F16, F32, F64};
use rustc_target::abi::{Scalar, Size, Variants};
@@ -166,7 +165,6 @@ pub trait LayoutLlvmExt<'tcx> {
index: usize,
immediate: bool,
) -> &'a Type;
fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type>;
}

impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
@@ -308,44 +306,4 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {

self.scalar_llvm_type_at(cx, scalar)
}

fn scalar_copy_llvm_type<'a>(&self, cx: &CodegenCx<'a, 'tcx>) -> Option<&'a Type> {
debug_assert!(self.is_sized());

// FIXME: this is a fairly arbitrary choice, but 128 bits on WASM
// (matching the 128-bit SIMD types proposal) and 256 bits on x64
// (like AVX2 registers) seems at least like a tolerable starting point.
let threshold = cx.data_layout().pointer_size * 4;
if self.layout.size() > threshold {
return None;
}

// Vectors, even for non-power-of-two sizes, have the same layout as
// arrays but don't count as aggregate types
// While LLVM theoretically supports non-power-of-two sizes, and they
// often work fine, sometimes x86-isel deals with them horribly
// (see #115212) so for now only use power-of-two ones.
if let FieldsShape::Array { count, .. } = self.layout.fields()
&& count.is_power_of_two()
&& let element = self.field(cx, 0)
&& element.ty.is_integral()
{
// `cx.type_ix(bits)` is tempting here, but while that works great
// for things that *stay* as memory-to-memory copies, it also ends
// up suppressing vectorization as it introduces shifts when it
// extracts all the individual values.

let ety = element.llvm_type(cx);
if *count == 1 {
// Emitting `<1 x T>` would be silly; just use the scalar.
return Some(ety);
} else {
return Some(cx.type_vector(ety, *count));
}
}

// FIXME: The above only handled integer arrays; surely more things
// would also be possible. Be careful about provenance, though!
None
}
}
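
For concreteness, the size cutoff the removed code computed works out as follows (a sketch of the arithmetic only, not compiler code; the pointer sizes are the usual ones for these targets):

// Removed heuristic: only use a typed load+store when the copy fits in
// four pointers' worth of bytes.
//   x86_64: 8-byte pointers -> 8 * 4 = 32 bytes = 256 bits (AVX2-register sized)
//   wasm32: 4-byte pointers -> 4 * 4 = 16 bytes = 128 bits (matching the 128-bit SIMD proposal)
fn copy_threshold_bytes(pointer_size_bytes: u64) -> u64 {
    pointer_size_bytes * 4
}

fn main() {
    assert_eq!(copy_threshold_bytes(8), 32); // x86_64
    assert_eq!(copy_threshold_bytes(4), 16); // wasm32
}
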
38 changes: 3 additions & 35 deletions compiler/rustc_codegen_ssa/src/base.rs
@@ -12,7 +12,7 @@ use crate::mir;
use crate::mir::operand::OperandValue;
use crate::mir::place::PlaceRef;
use crate::traits::*;
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, MemFlags, ModuleCodegen, ModuleKind};
use crate::{CachedModuleCodegen, CompiledModule, CrateInfo, ModuleCodegen, ModuleKind};

use rustc_ast::expand::allocator::{global_fn_name, AllocatorKind, ALLOCATOR_METHODS};
use rustc_attr as attr;
@@ -37,7 +37,7 @@ use rustc_session::config::{self, CrateType, EntryFnType, OptLevel, OutputType};
use rustc_session::Session;
use rustc_span::symbol::sym;
use rustc_span::Symbol;
use rustc_target::abi::{Align, FIRST_VARIANT};
use rustc_target::abi::FIRST_VARIANT;

use std::cmp;
use std::collections::BTreeSet;
@@ -282,15 +282,7 @@ pub fn coerce_unsized_into<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
}

if src_f.layout.ty == dst_f.layout.ty {
memcpy_ty(
bx,
dst_f.llval,
dst_f.align,
src_f.llval,
src_f.align,
src_f.layout,
MemFlags::empty(),
);
bx.typed_place_copy(dst_f, src_f);
} else {
coerce_unsized_into(bx, src_f, dst_f);
}
@@ -382,30 +374,6 @@ pub fn wants_new_eh_instructions(sess: &Session) -> bool {
wants_wasm_eh(sess) || wants_msvc_seh(sess)
}

pub fn memcpy_ty<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
bx: &mut Bx,
dst: Bx::Value,
dst_align: Align,
src: Bx::Value,
src_align: Align,
layout: TyAndLayout<'tcx>,
flags: MemFlags,
) {
let size = layout.size.bytes();
if size == 0 {
return;
}

if flags == MemFlags::empty()
&& let Some(bty) = bx.cx().scalar_copy_backend_type(layout)
{
let temp = bx.load(bty, src, src_align);
bx.store(temp, dst, dst_align);
} else {
bx.memcpy(dst, dst_align, src, src_align, bx.cx().const_usize(size), flags);
}
}

pub fn codegen_instance<'a, 'tcx: 'a, Bx: BuilderMethods<'a, 'tcx>>(
cx: &'a Bx::CodegenCx,
instance: Instance<'tcx>,
13 changes: 3 additions & 10 deletions compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -1459,7 +1459,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
}
_ => (op.immediate_or_packed_pair(bx), arg.layout.align.abi, false),
},
Ref(llval, _, align) => match arg.mode {
Ref(llval, llextra, align) => match arg.mode {
PassMode::Indirect { attrs, .. } => {
let required_align = match attrs.pointee_align {
Some(pointee_align) => cmp::max(pointee_align, arg.layout.align.abi),
@@ -1470,15 +1470,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
// alignment requirements may be higher than the type's alignment, so copy
// to a higher-aligned alloca.
let scratch = PlaceRef::alloca_aligned(bx, arg.layout, required_align);
base::memcpy_ty(
bx,
scratch.llval,
scratch.align,
llval,
align,
op.layout,
MemFlags::empty(),
);
let op_place = PlaceRef { llval, llextra, layout: op.layout, align };
bx.typed_place_copy(scratch, op_place);
(scratch.llval, scratch.align, true)
} else {
(llval, align, true)
16 changes: 5 additions & 11 deletions compiler/rustc_codegen_ssa/src/mir/operand.rs
@@ -1,7 +1,6 @@
use super::place::PlaceRef;
use super::{FunctionCx, LocalRef};

use crate::base;
use crate::size_of_val;
use crate::traits::*;
use crate::MemFlags;
@@ -398,7 +397,7 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
self.store_with_flags(bx, dest, MemFlags::NONTEMPORAL);
}

fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
pub(crate) fn store_with_flags<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
self,
bx: &mut Bx,
dest: PlaceRef<'tcx, V>,
@@ -410,16 +409,11 @@ impl<'a, 'tcx, V: CodegenObject> OperandValue<V> {
// Avoid generating stores of zero-sized values, because the only way to have a zero-sized
// value is through `undef`/`poison`, and the store itself is useless.
}
OperandValue::Ref(r, None, source_align) => {
OperandValue::Ref(llval, llextra @ None, source_align) => {
assert!(dest.layout.is_sized(), "cannot directly store unsized values");
if flags.contains(MemFlags::NONTEMPORAL) {
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
let ty = bx.backend_type(dest.layout);
let val = bx.load(ty, r, source_align);
bx.store_with_flags(val, dest.llval, dest.align, flags);
return;
}
base::memcpy_ty(bx, dest.llval, dest.align, r, source_align, dest.layout, flags)
let source_place =
PlaceRef { llval, llextra, align: source_align, layout: dest.layout };
bx.typed_place_copy_with_flags(dest, source_place, flags);
}
OperandValue::Ref(_, Some(_), _) => {
bug!("cannot directly store unsized values");
25 changes: 20 additions & 5 deletions compiler/rustc_codegen_ssa/src/traits/builder.rs
@@ -281,17 +281,32 @@ pub trait BuilderMethods<'a, 'tcx>:
dst: PlaceRef<'tcx, Self::Value>,
src: PlaceRef<'tcx, Self::Value>,
) {
debug_assert!(src.llextra.is_none());
debug_assert!(dst.llextra.is_none());
self.typed_place_copy_with_flags(dst, src, MemFlags::empty());
}

fn typed_place_copy_with_flags(
&mut self,
dst: PlaceRef<'tcx, Self::Value>,
src: PlaceRef<'tcx, Self::Value>,
flags: MemFlags,
) {
debug_assert!(src.llextra.is_none(), "cannot directly copy from unsized values");
debug_assert!(dst.llextra.is_none(), "cannot directly copy into unsized values");
debug_assert_eq!(dst.layout.size, src.layout.size);
if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout) {
if flags.contains(MemFlags::NONTEMPORAL) {
// HACK(nox): This is inefficient but there is no nontemporal memcpy.
let ty = self.backend_type(dst.layout);
let val = self.load(ty, src.llval, src.align);
self.store_with_flags(val, dst.llval, dst.align, flags);
} else if self.sess().opts.optimize == OptLevel::No && self.is_backend_immediate(dst.layout)
{
// If we're not optimizing, the aliasing information from `memcpy`
// isn't useful, so just load-store the value for smaller code.
let temp = self.load_operand(src);
temp.val.store(self, dst);
temp.val.store_with_flags(self, dst, flags);
} else if !dst.layout.is_zst() {
let bytes = self.const_usize(dst.layout.size.bytes());
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, MemFlags::empty());
self.memcpy(dst.llval, dst.align, src.llval, src.align, bytes, flags);
}
}

22 changes: 0 additions & 22 deletions compiler/rustc_codegen_ssa/src/traits/type_.rs
@@ -133,28 +133,6 @@ pub trait LayoutTypeMethods<'tcx>: Backend<'tcx> {
|| self.is_backend_immediate(layout)
|| self.is_backend_scalar_pair(layout))
}

/// A type that can be used in a [`super::BuilderMethods::load`] +
/// [`super::BuilderMethods::store`] pair to implement a *typed* copy,
/// such as a MIR `*_0 = *_1`.
///
/// It's always legal to return `None` here, as the provided impl does,
/// in which case callers should use [`super::BuilderMethods::memcpy`]
/// instead of the `load`+`store` pair.
///
/// This can be helpful for things like arrays, where the LLVM backend type
/// `[3 x i16]` optimizes to three separate loads and stores, but it can
/// instead be copied via an `i48` that stays as the single `load`+`store`.
/// (As of 2023-05 LLVM cannot necessarily optimize away a `memcpy` in these
/// cases, due to `poison` handling, but in codegen we have more information
/// about the type invariants, so can emit something better instead.)
///
/// This *should* return `None` for particularly-large types, where leaving
/// the `memcpy` may well be important to avoid code size explosion.
fn scalar_copy_backend_type(&self, layout: TyAndLayout<'tcx>) -> Option<Self::Type> {
let _ = layout;
None
}
}

// For backends that support CFI using type membership (i.e., testing whether a given pointer is
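
For context, a minimal Rust example of the kind of copy the removed hook targeted (mirroring the [3 x i16] case described in the doc comment above; illustrative only, not part of this change):

// With the hook, a small-array copy like this could lower to a single typed
// load+store (e.g. via an i48 or a small vector); without it, codegen now
// emits a memcpy and relies on LLVM to clean it up when optimizing.
pub fn copy_small(dst: &mut [u16; 3], src: &[u16; 3]) {
    *dst = *src;
}
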
44 changes: 25 additions & 19 deletions tests/codegen/array-codegen.rs
@@ -5,52 +5,58 @@
// CHECK-LABEL: @array_load
#[no_mangle]
pub fn array_load(a: &[u8; 4]) -> [u8; 4] {
// CHECK: %_0 = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %_0, align 1
// CHECK: %[[TEMP2:.+]] = load i32, ptr %_0, align 1
// CHECK: ret i32 %[[TEMP2]]
// CHECK-NOT: alloca
// CHECK: %[[ALLOCA:.+]] = alloca [4 x i8], align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[ALLOCA]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: %[[TEMP:.+]] = load i32, ptr %[[ALLOCA]], align 1
// CHECK: ret i32 %[[TEMP]]
*a
}

// CHECK-LABEL: @array_store
#[no_mangle]
pub fn array_store(a: [u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = alloca i32, [[TEMPALIGN:align [0-9]+]]
// CHECK-NOT: alloca
// CHECK: %a = alloca [4 x i8]
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK-NEXT: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK-NOT: alloca
// store i32 %0, ptr %[[TEMP]]
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %a, ptr [[TEMPALIGN]] %[[TEMP]], {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %a, {{.+}} 4, i1 false)
*p = a;
}

// CHECK-LABEL: @array_copy
#[no_mangle]
pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [4 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <4 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 4, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 4, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_1_element
#[no_mangle]
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
// CHECK: store i8 %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load i8, ptr %[[LOCAL]], align 1
// CHECK: store i8 %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 1, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 1, i1 false)
*p = *a;
}

// CHECK-LABEL: @array_copy_2_elements
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
// CHECK: %[[TEMP1:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP1]], ptr %[[LOCAL]], align 1
// CHECK: %[[TEMP2:.+]] = load <2 x i8>, ptr %[[LOCAL]], align 1
// CHECK: store <2 x i8> %[[TEMP2]], ptr %p, align 1
// CHECK-NOT: alloca
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %[[LOCAL]], ptr align 1 %a, {{.+}} 2, i1 false)
// CHECK: call void @llvm.memcpy.{{.+}}(ptr align 1 %p, ptr align 1 %[[LOCAL]], {{.+}} 2, i1 false)
*p = *a;
}
8 changes: 4 additions & 4 deletions tests/codegen/array-optimized.rs
@@ -16,8 +16,8 @@ pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
#[no_mangle]
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <2 x i8>, ptr %a, align 1
// CHECK: store <2 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i16, ptr %a, align 1
// CHECK: store i16 %[[TEMP]], ptr %p, align 1
Member Author:

Note that while we now generate an alloca and memcpys for this (as seen in the array-codegen file), LLVM is able to remove them.

If LLVM picks i16 for this (and not <2 x i8>), then great, let's do that.

// CHECK: ret
*p = *a;
}
@@ -26,8 +26,8 @@ pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
#[no_mangle]
pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
// CHECK-NOT: alloca
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
// CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
// CHECK: %[[TEMP:.+]] = load i32, ptr %a, align 1
// CHECK: store i32 %[[TEMP]], ptr %p, align 1
// CHECK: ret
*p = *a;
}
39 changes: 29 additions & 10 deletions tests/codegen/issues/issue-122805.rs
@@ -1,4 +1,11 @@
//@ compile-flags: -O
//@ revisions: OPT2 OPT3WINX64 OPT3LINX64
//@ [OPT2] compile-flags: -O
//@ [OPT3LINX64] compile-flags: -C opt-level=3
//@ [OPT3WINX64] compile-flags: -C opt-level=3
//@ [OPT3LINX64] only-linux
//@ [OPT3WINX64] only-windows
//@ [OPT3LINX64] only-x86_64
//@ [OPT3WINX64] only-x86_64
//@ min-llvm-version: 18.1.3

#![crate_type = "lib"]
@@ -9,15 +16,27 @@
// to avoid complicating the code.
// CHECK-LABEL: define{{.*}}void @convert(
// CHECK-NOT: shufflevector
// CHECK: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: insertelement <8 x i16>
// CHECK-NEXT: store <8 x i16>
// OPT2: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 2
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 4
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 6
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 8
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 10
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 12
// OPT2-NEXT: store i16
// OPT2-NEXT: getelementptr inbounds i8, {{.+}} 14
// OPT2-NEXT: store i16
// OPT3LINX64: load <8 x i16>
// OPT3LINX64-NEXT: call <8 x i16> @llvm.bswap
// OPT3LINX64-NEXT: store <8 x i16>
// OPT3WINX64: load <8 x i16>
// OPT3WINX64-NEXT: call <8 x i16> @llvm.bswap
// OPT3WINX64-NEXT: store <8 x i16>
// CHECK-NEXT: ret void
Member Author:

FYI @dianqk, I updated this test.

There's no vector operation generated at O2 anymore -- but I think separate stores are probably fine and similar enough to the old insertelement lines -- so I updated the test to ensure that it still gets vectorized on x64 at O3.

Member:

Ah, thanks! I hadn't noticed that different optimization levels yield different optimization effects.
I believe it makes sense to differentiate between O2 and O3 here.

The changes to O2 seem fine. I'll check again on this later.

Member @dianqk, Apr 13, 2024:

I confirm that this is fine. I was just curious why the results were the same for O2 and O3 in LLVM's opt before.
I've found that Rust's O2 disables LLVM's SLP vectorization pass:

vectorize_slp: !sess.opts.cg.no_vectorize_slp
    && sess.opts.optimize == config::OptLevel::Aggressive,

#[no_mangle]
#[cfg(target_endian = "little")]
6 changes: 2 additions & 4 deletions tests/codegen/mem-replace-simple-type.rs
@@ -45,9 +45,7 @@ pub fn replace_short_array_3(r: &mut [u32; 3], v: [u32; 3]) -> [u32; 3] {
// CHECK-LABEL: @replace_short_array_4(
pub fn replace_short_array_4(r: &mut [u32; 4], v: [u32; 4]) -> [u32; 4] {
// CHECK-NOT: alloca
// CHECK: %[[R:.+]] = load <4 x i32>, ptr %r, align 4
// CHECK: store <4 x i32> %[[R]], ptr %result
// CHECK: %[[V:.+]] = load <4 x i32>, ptr %v, align 4
// CHECK: store <4 x i32> %[[V]], ptr %r
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %result, ptr align 4 %r, i64 16, i1 false)
// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %r, ptr align 4 %v, i64 16, i1 false)
std::mem::replace(r, v)
}