Skip to content

Commit 9abbec6

Browse files
LU-JOHN and arsenm authored
[AMDGPU] Reland "Remove redundant s_cmp_lg_* sX, 0" (#164201)
Reland PR #162352. Fix by excluding SI_PC_ADD_REL_OFFSET from instructions that set SCC = DST!=0. Passes check-libc-amdgcn-amd-amdhsa now.

Distribution of instructions that allowed a redundant S_CMP to be deleted in check-libc-amdgcn-amd-amdhsa test:

```
S_AND_B32 485
S_AND_B64 47
S_ANDN2_B32 42
S_ANDN2_B64 277492
S_CSELECT_B64 17631
S_LSHL_B32 6
S_OR_B64 11
```

---------

Signed-off-by: John Lu <[email protected]>
Co-authored-by: Matt Arsenault <[email protected]>
1 parent 411be14 commit 9abbec6

36 files changed

+2461
-3136
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1062810628
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
1062910629
return false;
1063010630

// optimizeCmpSelect: attempts to erase CmpInstr when it compares SrcReg with
// zero and the instruction defining SrcReg already leaves the wanted result
// in SCC. Returns true only if CmpInstr was erased; every `return false`
// path exits before anything is modified.
10631+
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
10632+
this]() -> bool {
// Only a comparison against the constant 0 is handled.
10633+
if (CmpValue != 0)
10634+
return false;
10635+
// SrcReg needs a unique defining instruction, and that instruction must
// have the same parent as CmpInstr (the scan below walks from Def to
// CmpInstr within that parent).
10636+
MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
10637+
if (!Def || Def->getParent() != CmpInstr.getParent())
10638+
return false;
10639+
// foldableSelect: true for S_CSELECT_B32 / S_CSELECT_B64 whose operand 1 is
// a non-zero immediate and whose operand 2 is the immediate 0.
10640+
const auto foldableSelect = [](MachineInstr *Def) -> bool {
10641+
if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
10642+
Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
10643+
bool Op1IsNonZeroImm =
10644+
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
10645+
bool Op2IsZeroImm =
10646+
Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
10647+
if (Op1IsNonZeroImm && Op2IsZeroImm)
10648+
return true;
10649+
}
10650+
return false;
10651+
};
10652+
10653+
// For S_OP that set SCC = DST!=0, do the transformation
10654+
//
10655+
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
10656+
10657+
// If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
10658+
// for S_CSELECT* already has the same value that will be calculated by
10659+
// s_cmp_lg_*
10660+
//
10661+
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
10662+
// imm), 0)
10663+
if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
10664+
return false;
10665+
// Scan the instructions strictly between Def and CmpInstr: give up if any
// of them modifies SCC, and remember the last one that carries a kill of
// SCC so its kill flag can be dropped afterwards.
10666+
MachineInstr *KillsSCC = nullptr;
10667+
for (MachineInstr &MI :
10668+
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10669+
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
10670+
return false;
10671+
if (MI.killsRegister(AMDGPU::SCC, &RI))
10672+
KillsSCC = &MI;
10673+
}
10674+
// If Def has an SCC def operand, clear its dead flag: once the compare is
// gone, the SCC value left by Def is the one later readers see. The lookup
// is null-checked; presumably the S_CSELECT* case has no SCC def operand --
// TODO confirm.
10675+
if (MachineOperand *SccDef =
10676+
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
10677+
SccDef->setIsDead(false);
// Drop the SCC kill flag recorded during the scan, then remove the compare.
10678+
if (KillsSCC)
10679+
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
10680+
CmpInstr.eraseFromParent();
10681+
return true;
10682+
};
10683+
1063110684
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
1063210685
this](int64_t ExpectedValue, unsigned SrcSize,
1063310686
bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1070210755
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
1070310756
return false;
1070410757

10705-
for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10706-
I != E; ++I) {
10707-
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10708-
I->killsRegister(AMDGPU::SCC, &RI))
10758+
MachineInstr *KillsSCC = nullptr;
10759+
for (MachineInstr &MI :
10760+
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
10761+
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
1070910762
return false;
10763+
if (MI.killsRegister(AMDGPU::SCC, &RI))
10764+
KillsSCC = &MI;
1071010765
}
1071110766

1071210767
MachineOperand *SccDef =
1071310768
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
1071410769
SccDef->setIsDead(false);
10770+
if (KillsSCC)
10771+
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
1071510772
CmpInstr.eraseFromParent();
1071610773

1071710774
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,15 +10812,15 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
1075510812
case AMDGPU::S_CMP_LG_I32:
1075610813
case AMDGPU::S_CMPK_LG_U32:
1075710814
case AMDGPU::S_CMPK_LG_I32:
10758-
return optimizeCmpAnd(0, 32, true, false);
10815+
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
1075910816
case AMDGPU::S_CMP_GT_U32:
1076010817
case AMDGPU::S_CMPK_GT_U32:
1076110818
return optimizeCmpAnd(0, 32, false, false);
1076210819
case AMDGPU::S_CMP_GT_I32:
1076310820
case AMDGPU::S_CMPK_GT_I32:
1076410821
return optimizeCmpAnd(0, 32, false, true);
1076510822
case AMDGPU::S_CMP_LG_U64:
10766-
return optimizeCmpAnd(0, 64, true, false);
10823+
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
1076710824
}
1076810825

1076910826
return false;

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,52 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
714714
}
715715
}
716716

/// Returns true if \p MI has one of the scalar opcodes enumerated below,
/// i.e. the opcodes this helper classifies as setting SCC = (DST != 0);
/// returns false for every other opcode.
/// NOTE(review): SI_PC_ADD_REL_OFFSET is not in this list; the commit
/// description states it was excluded on purpose.
717+
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
718+
switch (MI.getOpcode()) {
719+
case AMDGPU::S_ABSDIFF_I32:
720+
case AMDGPU::S_ABS_I32:
721+
case AMDGPU::S_AND_B32:
722+
case AMDGPU::S_AND_B64:
723+
case AMDGPU::S_ANDN2_B32:
724+
case AMDGPU::S_ANDN2_B64:
725+
case AMDGPU::S_ASHR_I32:
726+
case AMDGPU::S_ASHR_I64:
727+
case AMDGPU::S_BCNT0_I32_B32:
728+
case AMDGPU::S_BCNT0_I32_B64:
729+
case AMDGPU::S_BCNT1_I32_B32:
730+
case AMDGPU::S_BCNT1_I32_B64:
731+
case AMDGPU::S_BFE_I32:
732+
case AMDGPU::S_BFE_I64:
733+
case AMDGPU::S_BFE_U32:
734+
case AMDGPU::S_BFE_U64:
735+
case AMDGPU::S_LSHL_B32:
736+
case AMDGPU::S_LSHL_B64:
737+
case AMDGPU::S_LSHR_B32:
738+
case AMDGPU::S_LSHR_B64:
739+
case AMDGPU::S_NAND_B32:
740+
case AMDGPU::S_NAND_B64:
741+
case AMDGPU::S_NOR_B32:
742+
case AMDGPU::S_NOR_B64:
743+
case AMDGPU::S_NOT_B32:
744+
case AMDGPU::S_NOT_B64:
745+
case AMDGPU::S_OR_B32:
746+
case AMDGPU::S_OR_B64:
747+
case AMDGPU::S_ORN2_B32:
748+
case AMDGPU::S_ORN2_B64:
749+
case AMDGPU::S_QUADMASK_B32:
750+
case AMDGPU::S_QUADMASK_B64:
751+
case AMDGPU::S_WQM_B32:
752+
case AMDGPU::S_WQM_B64:
753+
case AMDGPU::S_XNOR_B32:
754+
case AMDGPU::S_XNOR_B64:
755+
case AMDGPU::S_XOR_B32:
756+
case AMDGPU::S_XOR_B64:
757+
return true;
758+
default:
759+
return false;
760+
}
761+
}
762+
717763
static bool isEXP(const MachineInstr &MI) {
718764
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
719765
}

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
140140
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
141141
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
142142
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
143-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
144143
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
145144
; CHECK-NEXT: ; %bb.1: ; %false
146145
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
345344
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
346345
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
347346
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
348-
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
349347
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
350348
; CHECK-NEXT: ; %bb.1: ; %false
351349
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
143143
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
144144
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
145145
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
146-
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
147146
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
148147
; CHECK-NEXT: ; %bb.1: ; %false
149148
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
348347
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
349348
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
350349
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
351-
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
352350
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
353351
; CHECK-NEXT: ; %bb.1: ; %false
354352
; CHECK-NEXT: s_mov_b32 s0, 33

llvm/test/CodeGen/AMDGPU/addsub64_carry.ll

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
180180
; CHECK-LABEL: s_add64_32:
181181
; CHECK: ; %bb.0:
182182
; CHECK-NEXT: s_add_u32 s0, s0, s2
183-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
184-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
185183
; CHECK-NEXT: s_addc_u32 s1, s1, s3
186-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
187-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
188184
; CHECK-NEXT: s_addc_u32 s2, s4, 0
189185
; CHECK-NEXT: ; return to shader part epilog
190186
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
199195
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
200196
; CHECK-LABEL: s_uadd_v2i64:
201197
; CHECK: ; %bb.0:
202-
; CHECK-NEXT: s_add_u32 s10, s2, s6
203-
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
204-
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
205-
; CHECK-NEXT: s_addc_u32 s8, s3, s7
198+
; CHECK-NEXT: s_add_u32 s6, s2, s6
199+
; CHECK-NEXT: s_addc_u32 s7, s3, s7
206200
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
207201
; CHECK-NEXT: s_add_u32 s0, s0, s4
208-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
209-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
210202
; CHECK-NEXT: s_addc_u32 s1, s1, s5
211203
; CHECK-NEXT: v_mov_b32_e32 v2, s0
212204
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
215207
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
216208
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
217209
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
218-
; CHECK-NEXT: v_mov_b32_e32 v4, s10
219-
; CHECK-NEXT: v_mov_b32_e32 v5, s8
210+
; CHECK-NEXT: v_mov_b32_e32 v4, s6
211+
; CHECK-NEXT: v_mov_b32_e32 v5, s7
220212
; CHECK-NEXT: s_mov_b32 s1, s0
221213
; CHECK-NEXT: s_mov_b32 s3, s2
222214
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
233225
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
234226
; CHECK-LABEL: s_usub_v2i64:
235227
; CHECK: ; %bb.0:
236-
; CHECK-NEXT: s_sub_u32 s10, s2, s6
237-
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
238-
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
239-
; CHECK-NEXT: s_subb_u32 s8, s3, s7
228+
; CHECK-NEXT: s_sub_u32 s6, s2, s6
229+
; CHECK-NEXT: s_subb_u32 s7, s3, s7
240230
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
241231
; CHECK-NEXT: s_sub_u32 s0, s0, s4
242-
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
243-
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
244232
; CHECK-NEXT: s_subb_u32 s1, s1, s5
245233
; CHECK-NEXT: v_mov_b32_e32 v2, s0
246234
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
249237
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
250238
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
251239
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
252-
; CHECK-NEXT: v_mov_b32_e32 v4, s10
253-
; CHECK-NEXT: v_mov_b32_e32 v5, s8
240+
; CHECK-NEXT: v_mov_b32_e32 v4, s6
241+
; CHECK-NEXT: v_mov_b32_e32 v5, s7
254242
; CHECK-NEXT: s_mov_b32 s1, s0
255243
; CHECK-NEXT: s_mov_b32 s3, s2
256244
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
268256
; CHECK-LABEL: s_uadd_i64:
269257
; CHECK: ; %bb.0:
270258
; CHECK-NEXT: s_add_u32 s0, s0, s2
271-
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
272-
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
273259
; CHECK-NEXT: s_addc_u32 s1, s1, s3
274260
; CHECK-NEXT: v_mov_b32_e32 v2, s0
275261
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
292278
; CHECK-LABEL: s_uadd_p1:
293279
; CHECK: ; %bb.0:
294280
; CHECK-NEXT: s_add_u32 s0, s0, 1
295-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
296-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
297281
; CHECK-NEXT: s_addc_u32 s1, s1, 0
298282
; CHECK-NEXT: v_mov_b32_e32 v2, s0
299283
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
339323
; CHECK-LABEL: s_usub_p1:
340324
; CHECK: ; %bb.0:
341325
; CHECK-NEXT: s_sub_u32 s0, s0, 1
342-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
343-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
344326
; CHECK-NEXT: s_subb_u32 s1, s1, 0
345327
; CHECK-NEXT: v_mov_b32_e32 v2, s0
346328
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
363345
; CHECK-LABEL: s_usub_n1:
364346
; CHECK: ; %bb.0:
365347
; CHECK-NEXT: s_sub_u32 s0, s0, -1
366-
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
367-
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
368348
; CHECK-NEXT: s_subb_u32 s1, s1, -1
369349
; CHECK-NEXT: v_mov_b32_e32 v2, s0
370350
; CHECK-NEXT: v_mov_b32_e32 v3, s1

0 commit comments

Comments
 (0)