[SelectionDAG] Optimize MPI for align(1) GEPs using base pointer #145309
Conversation
Summary: For align(1) loads/stores from GEPs:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve base pointer's stronger alignment
- Implemented directly in SelectionDAGBuilder::visitLoad/visitStore

Issue: llvm#143215

Future:
1. Optimize expandUnalignedLoad/Store
2. Improve computeKnownBits for phi(GEP)
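In short, the patch detects an align(1) access whose address is a GEP with a constant offset and rebuilds the MachinePointerInfo against the GEP's base pointer. Below is a minimal sketch of that logic; the helper name and by-reference packaging are illustrative only, while accumulateConstantOffset and getKnownAlignment are the existing LLVM utilities the diff actually calls (the sketch assumes SelectionDAGBuilder.cpp's usual includes).

// Sketch only: mirrors the logic the patch adds to visitLoad/visitStore.
// The helper name refineMemOperandBase is hypothetical.
static int64_t refineMemOperandBase(const Value *&Ptr, Align &Alignment,
                                    const DataLayout &DL, const Instruction &I,
                                    AssumptionCache *AC) {
  int64_t Offset = 0;
  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
      GEP && Alignment == Align(1)) {
    APInt Acc(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
    if (GEP->accumulateConstantOffset(DL, Acc)) {
      const Value *Base = GEP->getPointerOperand();
      Align BaseAlign =
          getKnownAlignment(const_cast<Value *>(Base), DL, &I, AC);
      if (BaseAlign > Alignment) {
        Ptr = Base;            // MPI is now described relative to the base
        Alignment = BaseAlign; // so the MMO keeps the stronger basealign
        Offset = Acc.getSExtValue();
      }
    }
  }
  return Offset; // added to Offsets[i] when building each MachinePointerInfo
}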
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-selectiondag

Author: Acthinks Yang (Acthinks)

Changes

Summary: For align(1) loads/stores from GEPs:
- Replace MPI(gep, 0) with MPI(base_ptr, const_offset)
- Preserve base pointer's stronger alignment
- Implemented directly in SelectionDAGBuilder::visitLoad/visitStore

Issue: #143215

Future:
1. Optimize expandUnalignedLoad/Store
2. Improve computeKnownBits for phi(GEP)
Full diff: https://github.com/llvm/llvm-project/pull/145309.diff

3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c01f1e7928477..3f5963a735ffb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4566,6 +4566,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);
+ const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4587,7 +4588,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4597,7 +4598,25 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
- TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+ TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+ // See visitStore comments.
+ int64_t Offset = 0;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(SV);
+ GEP && Alignment == Align(1)) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, &I, AC);
+ if (BaseAlignment > Alignment) {
+ SV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
SDValue Root;
bool ConstantMemory = false;
@@ -4647,7 +4666,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset)
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4734,6 +4753,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);
+ const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -4754,8 +4774,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
- ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
- SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+ &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4772,7 +4792,32 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();
- auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+ // refine MPI: V + Offset
+ // Example:
+ // align 4 %p
+ // %gep = getelementptr i8, ptr %p, i32 1
+ // store i32 %v, ptr %len, align 1
+ // ->
+ // MPI: V = %p, Offset = 1
+ // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+ int64_t Offset = 0;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+ GEP && Alignment == Align(1)) {
+ const Value *BasePtrV = GEP->getPointerOperand();
+ APInt OffsetAccumulated =
+ APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+ Align BaseAlignment =
+ getKnownAlignment(const_cast<Value *>(BasePtrV), DL, &I, AC);
+ if (BaseAlignment > Alignment) {
+ PtrV = BasePtrV;
+ Alignment = BaseAlignment;
+ Offset = OffsetAccumulated.getSExtValue();
+ }
+ }
+ }
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4787,7 +4832,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
- ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset)
: MachinePointerInfo();
SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..2010c350226dc 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,13 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT: s_load_dword s3, s[4:5], 0xd
; SI-NEXT: s_mov_b64 s[4:5], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
@@ -4725,10 +4725,10 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; SI-NEXT: s_lshl_b32 s0, s3, 24
; SI-NEXT: v_or_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v3, v3, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_or_b32_e32 v3, s0, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -4740,45 +4740,43 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: s_add_u32 s2, s4, 50
-; VI-NEXT: s_addc_u32 s3, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 3
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_load_ubyte v6, v[0:1]
+; VI-NEXT: s_load_dword s2, s[4:5], 0x34
+; VI-NEXT: s_add_u32 s0, s4, 50
+; VI-NEXT: s_addc_u32 s1, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s2, s2, 24
; VI-NEXT: s_add_u32 s0, s4, 51
+; VI-NEXT: flat_load_ubyte v7, v[0:1]
; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v7, s1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v8, v[0:1]
-; VI-NEXT: flat_load_ubyte v9, v[2:3]
-; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[4:5], 0x24
+; VI-NEXT: s_load_dword s3, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dword v[2:3], v7
+; VI-NEXT: flat_store_dword v[2:3], v9
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT: v_or_b32_e32 v4, v4, v9
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v7
+; VI-NEXT: v_or_b32_e32 v4, v4, v6
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; VI-NEXT: v_or_b32_e32 v5, s2, v5
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index c9c49e8f7f532..cde1b6af4717f 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -578,5 +578,97 @@ define void @store_large_constant(ptr %x) {
store i64 18364758544493064720, ptr %x, align 1
ret void
}
+
+define void @store_const_with_align_attribute(ptr align 2 %p) {
+; SLOW-LABEL: store_const_with_align_attribute:
+; SLOW: # %bb.0: # %entry
+; SLOW-NEXT: sb zero, 3(a0)
+; SLOW-NEXT: sh zero, 4(a0)
+; SLOW-NEXT: sb zero, 6(a0)
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: store_const_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: sw zero, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 0, ptr %len, align 1
+ ret void
+}
+
+; TODO: opt expandUnalignedStore
+define void @store_with_align_attribute(ptr align 2 %p, i32 %v) {
+; SLOW-LABEL: store_with_align_attribute:
+; SLOW: # %bb.0: # %entry
+; SLOW-NEXT: srli a2, a1, 24
+; SLOW-NEXT: srli a3, a1, 16
+; SLOW-NEXT: srli a4, a1, 8
+; SLOW-NEXT: sb a1, 3(a0)
+; SLOW-NEXT: sb a4, 4(a0)
+; SLOW-NEXT: sb a3, 5(a0)
+; SLOW-NEXT: sb a2, 6(a0)
+; SLOW-NEXT: ret
+;
+; FAST-LABEL: store_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: sw a1, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ store i32 %v, ptr %len, align 1
+ ret void
+}
+
+; TODO: opt expandUnalignedLoad
+define i32 @load_with_align_attribute(ptr align 2 %p) {
+; SLOWBASE-LABEL: load_with_align_attribute:
+; SLOWBASE: # %bb.0: # %entry
+; SLOWBASE-NEXT: lbu a1, 4(a0)
+; SLOWBASE-NEXT: lbu a2, 3(a0)
+; SLOWBASE-NEXT: lbu a3, 5(a0)
+; SLOWBASE-NEXT: lbu a0, 6(a0)
+; SLOWBASE-NEXT: slli a1, a1, 8
+; SLOWBASE-NEXT: or a1, a1, a2
+; SLOWBASE-NEXT: slli a3, a3, 16
+; SLOWBASE-NEXT: slli a0, a0, 24
+; SLOWBASE-NEXT: or a0, a0, a3
+; SLOWBASE-NEXT: or a0, a0, a1
+; SLOWBASE-NEXT: ret
+;
+; RV32IZBKB-LABEL: load_with_align_attribute:
+; RV32IZBKB: # %bb.0: # %entry
+; RV32IZBKB-NEXT: lbu a1, 4(a0)
+; RV32IZBKB-NEXT: lbu a2, 5(a0)
+; RV32IZBKB-NEXT: lbu a3, 6(a0)
+; RV32IZBKB-NEXT: lbu a0, 3(a0)
+; RV32IZBKB-NEXT: packh a2, a2, a3
+; RV32IZBKB-NEXT: packh a0, a0, a1
+; RV32IZBKB-NEXT: pack a0, a0, a2
+; RV32IZBKB-NEXT: ret
+;
+; RV64IZBKB-LABEL: load_with_align_attribute:
+; RV64IZBKB: # %bb.0: # %entry
+; RV64IZBKB-NEXT: lbu a1, 3(a0)
+; RV64IZBKB-NEXT: lbu a2, 4(a0)
+; RV64IZBKB-NEXT: lbu a3, 5(a0)
+; RV64IZBKB-NEXT: lbu a0, 6(a0)
+; RV64IZBKB-NEXT: packh a1, a1, a2
+; RV64IZBKB-NEXT: slli a3, a3, 16
+; RV64IZBKB-NEXT: slli a0, a0, 24
+; RV64IZBKB-NEXT: or a0, a0, a3
+; RV64IZBKB-NEXT: or a0, a0, a1
+; RV64IZBKB-NEXT: ret
+;
+; FAST-LABEL: load_with_align_attribute:
+; FAST: # %bb.0: # %entry
+; FAST-NEXT: lw a0, 3(a0)
+; FAST-NEXT: ret
+entry:
+ %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+ %v = load i32, ptr %len, align 1
+ ret i32 %v
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SLOWZBKB: {{.*}}
  // See visitStore comments.
  int64_t Offset = 0;
  if (auto *GEP = dyn_cast<GetElementPtrInst>(SV);
      GEP && Alignment == Align(1)) {
Align of 1 shouldn't be special. But isn't this already refined in the middle end? We shouldn't need to redo this analysis during codegen. Given the test changes, we probably didn't bother to refine the alignment of kernel argument loads or something
> we probably didn't bother to refine the alignment of kernel argument loads or something
After AMDGPU Lower Kernel Arguments, we get the following, which is basically the same as the RISCV load_with_align_attribute testcase:
define amdgpu_kernel void @packed_struct_argument_alignment(<{ i32, i64 }> %arg0, i8 %0, <{ i32, i64 }> %arg1) {
%packed_struct_argument_alignment.kernarg.segment = call nonnull align 16 dereferenceable(284) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
%arg0.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %packed_struct_argument_alignment.kernarg.segment, i64 36
%arg0.load = load <{ i32, i64 }>, ptr addrspace(4) %arg0.kernarg.offset, align 4, !invariant.load !0
%arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %packed_struct_argument_alignment.kernarg.segment, i64 49
%arg1.load = load <{ i32, i64 }>, ptr addrspace(4) %arg1.kernarg.offset, align 1, !invariant.load !0
> Align of 1 shouldn't be special.
Yes, this could be done without special-casing align(1). It is restricted to align(1) for now because missed optimizations are uncommon in the other cases.
> But isn't this already refined in the middle end? We shouldn't need to redo this analysis during codegen.
The middle end has already given the load/store its best alignment. Recovering the base pointer's alignment may still provide better optimization opportunities in expandUnalignedLoad/Store.
> Given the test changes, we probably didn't bother to refine the alignment of kernel argument loads or something
demo:
define void @store_const_with_align_attribute(ptr align 2 %p) {
entry:
%len = getelementptr inbounds nuw i8, ptr %p, i32 3
store i32 0, ptr %len, align 1
ret void
}
old:
sb zero, 3(a0)
sb zero, 4(a0)
sb zero, 5(a0)
sb zero, 6(a0)
ret
new:
sb zero, 3(a0)
sh zero, 4(a0)
sb zero, 6(a0)
ret