AMDGPU/GlobalISel: Temporal divergence lowering i1 #124299
base: users/petar-avramovic/temporal-divergence
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes: Use of i1 outside of the cycle, both uniform and divergent, is a lane mask (in an SGPR) that contains the i1 value from the iteration in which the lane exited the cycle. Create a phi that merges the lane mask across all iterations.

Patch is 124.89 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/124299.diff

9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index d8cd1e7379c93f..7e8b9d5524be32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -80,6 +80,7 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
void constrainAsLaneMask(Incoming &In) override;
bool lowerTempDivergence();
+ bool lowerTempDivergenceI1();
};
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -221,6 +222,54 @@ bool DivergenceLoweringHelper::lowerTempDivergence() {
return false;
}
+bool DivergenceLoweringHelper::lowerTempDivergenceI1() {
+ MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
+ initializeLaneMaskRegisterAttributes(BoolS1);
+
+ for (auto [Inst, UseInst, Cycle] : MUI->get_TDCs()) {
+ Register Reg = Inst->getOperand(0).getReg();
+ if (MRI->getType(Reg) != LLT::scalar(1))
+ continue;
+
+ Register MergedMask = MRI->createVirtualRegister(BoolS1);
+ Register PrevIterMask = MRI->createVirtualRegister(BoolS1);
+
+ MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader();
+ SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
+ Cycle->getExitingBlocks(ExitingBlocks);
+ assert(ExitingBlocks.size() == 1);
+ MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0];
+
+ B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin());
+ auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask);
+
+ // We only care about the cycle iteration path - merge Reg with previous
+ // iteration. For other incomings use implicit def.
+ // Predecessors should be CyclePredecessor and CycleExitingMBB.
+ // In older versions of irreducible control flow lowering there could be
+ // cases with more predecessors. To keep this lowering as generic as
+ // possible also handle those cases.
+ for (auto MBB : CycleHeaderMBB->predecessors()) {
+ if (MBB == CycleExitingMBB) {
+ CrossIterPHI.addReg(MergedMask);
+ } else {
+ B.setInsertPt(*MBB, MBB->getFirstTerminator());
+ auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
+ CrossIterPHI.addReg(ImplDef.getReg(0));
+ }
+ CrossIterPHI.addMBB(MBB);
+ }
+
+ buildMergeLaneMasks(*CycleExitingMBB, CycleExitingMBB->getFirstTerminator(),
+ {}, MergedMask, PrevIterMask, Reg);
+
+ replaceUsesOfRegInInstWith(Reg, const_cast<MachineInstr *>(UseInst),
+ MergedMask);
+ }
+
+ return false;
+}
+
} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -260,6 +309,12 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
// Non-i1 temporal divergence lowering.
Changed |= Helper.lowerTempDivergence();
+ // This covers both uniform and divergent i1s. Lane masks are in sgpr and need
+ // to be updated in each iteration.
+ Changed |= Helper.lowerTempDivergenceI1();
+ // Temporal divergence lowering of divergent i1 phi used outside of the cycle
+ // could also be handled by lowerPhis but we do it in lowerTempDivergenceI1
+ // since in some case lowerPhis does unnecessary lane mask merging.
Changed |= Helper.lowerPhis();
return Changed;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 65c96a3db5bbfa..11acd451d98d7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -104,20 +104,25 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s5, 1
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
; GFX10-NEXT: s_xor_b32 s5, s5, 1
-; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: s_and_b32 s8, s5, 1
+; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
+; GFX10-NEXT: s_cselect_b32 s8, exec_lo, 0
+; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT: s_and_b32 s8, exec_lo, s8
+; GFX10-NEXT: s_or_b32 s7, s7, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cmp_lg_u32 s5, 0
-; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -147,29 +152,34 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB3_2
; GFX10-NEXT: .LBB3_1: ; %loop_body
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6
-; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: s_xor_b32 s4, s4, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT: s_and_b32 s8, exec_lo, s4
+; GFX10-NEXT: s_or_b32 s7, s7, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_cmpk_le_i32 s6, 0x3e8
-; GFX10-NEXT: s_mov_b32 s7, 1
+; GFX10-NEXT: s_mov_b32 s8, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: flat_store_dword v[6:7], v1
; GFX10-NEXT: .LBB3_4: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT: s_xor_b32 s7, s7, 1
-; GFX10-NEXT: s_cmp_lg_u32 s7, 0
+; GFX10-NEXT: s_xor_b32 s8, s8, 1
+; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10-NEXT: ; %bb.5: ; %if
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
@@ -177,7 +187,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
; GFX10-NEXT: s_branch .LBB3_1
; GFX10-NEXT: .LBB3_6: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index f244639ed97623..c0fbdb541ab9ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -201,20 +201,27 @@ body: |
; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
- ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+ ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
@@ -222,7 +229,7 @@ body: |
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
; GFX10-NEXT: SI_RETURN
bb.0:
@@ -285,17 +292,20 @@ body: |
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]]
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %37(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %44(s1), %bb.5
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %36(s1), %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
- ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C2]]
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C2]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; GFX10-NEXT: G_BR %bb.2
@@ -303,9 +313,9 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]]
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]]
; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5
; GFX10-NEXT: G_BR %bb.3
; GFX10-NEXT: {{ $}}
@@ -329,12 +339,16 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY9]], [[C8]]
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
- ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
+ ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.6
; GFX10-NEXT: {{ $}}
@@ -342,7 +356,7 @@ body: |
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR1]](s1), [[C11]], [[C10]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C11]], [[C10]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
; GFX10-NEXT: SI_RETURN
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 14d370dd9663f0..7f1dc03b118d77 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -13,24 +13,22 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
-; GFX10-NEXT: s_or_b32 s6, s5, s6
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s5
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX10-NEXT: s_mov_b32 s8, exec_lo
-; GFX10-NEXT: s_mov_b32 s7, s6
-; GFX10-NEXT: s_add_i32 s5, s5, 1
-; GFX10-NEXT: s_xor_b32 s6, s6, s8
+; GFX10-NEXT: s_add_i32 s6, s6, 1
+; GFX10-NEXT: s_xor_b32 s8, s5, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 s8, s7, exec_lo
-; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
-; GFX10-NEXT: s_or_b32 s6, s8, s6
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT: s_and_b32 s9, exec_lo, s5
+; GFX10-NEXT: s_mov_b32 s5, s8
+; GFX10-NEXT: s_or_b32 s7, s7, s9
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %exit
@@ -135,21 +133,26 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 1.0, v1
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s6, 0
+; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_add_i32 s6, s6, 1
-; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo
+; GFX10-NEXT: s_xor_b32 s5, s5, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v0
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT: s_and_b32 s8, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s7, s7, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -192,31 +195,35 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: ; implicit-def: $sgpr10
+; GFX10-NEXT: ; implicit-def: $sgpr11
+; GFX10-NEXT: ; implicit-def: $sgpr9
; GFX10-NEXT: s_branch .LBB3_3
; GFX10-NEXT: .LBB3_2: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_xor_b32 s5, s10, exec_lo
-; GFX10-NEXT: s_and_b32 s11, exec_lo, s9
-; GFX10-NEXT: s_or_b32 s8, s11, s8
+; GFX10-NEXT: s_xor_b32 s5, s11, exec_lo
+; GFX10-NEXT: s_and_b32 s12, exec_lo, s10
+; GFX10-NEXT: s_or_b32 s8, s12, s8
+; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
+; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT: s_or_b32 s9, s9, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execz .LBB3_5
; GFX10-NEXT: .LBB3_3: ; %loop.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_ashr_i32 s5, s4, 31
-; GFX10-NEXT: s_andn2_b32 s9, s9, exec_lo
+; GFX10-NEXT: s_andn2_b32 s10, s10, exec_lo
; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], 2
-; GFX10-NEXT: s_andn2_b32 s5, s10, exec_lo
+; GFX10-NEXT: s_andn2_b32 s5, s11, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v5, s12
; GFX10-NEXT: v_mov_b32_e32 v6, s13
-; GFX10-NEXT: s_and_b32 s10, exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 s11, exec_lo, exec_lo
-; GFX10-NEXT: s_or_b32 s10, s5, s10
+; GFX10-NEXT: s_and_b32 s12, exec_lo, exec_lo
+; GFX10-NEXT: s_or_b32 s11, s5, s11
; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v1, v5
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v2, v6, ...
[truncated]
Insert point of merging phi is changed to after Inst, not in the exiting block.
Thank you for moving the insert point.
An analogous comment applies here like in the non-i1 case: it would be good to cache the MergedMask register for re-use when the same Inst occurs with multiple UseInsts.
MachineBasicBlock *CycleHeaderMBB = Cycle->getHeader();
SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
Cycle->getExitingBlocks(ExitingBlocks);
assert(ExitingBlocks.size() == 1);
MachineBasicBlock *CycleExitingMBB = ExitingBlocks[0];

B.setInsertPt(*CycleHeaderMBB, CycleHeaderMBB->begin());
auto CrossIterPHI = B.buildInstr(AMDGPU::PHI).addDef(PrevIterMask);

// We only care about the cycle iteration path - merge Reg with previous
// iteration. For other incomings use implicit def.
// Predecessors should be CyclePredecessor and CycleExitingMBB.
// In older versions of irreducible control flow lowering there could be
// cases with more predecessors. To keep this lowering as generic as
// possible also handle those cases.
for (auto MBB : CycleHeaderMBB->predecessors()) {
  if (MBB == CycleExitingMBB) {
    CrossIterPHI.addReg(MergedMask);
  } else {
    B.setInsertPt(*MBB, MBB->getFirstTerminator());
    auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
    CrossIterPHI.addReg(ImplDef.getReg(0));
  }
  CrossIterPHI.addMBB(MBB);
}
This still feels vaguely off to me.
First of all, can you please add a Cycle->isReducible() assert here just in case, since you're relying on there only being one header? (Or apply what I wrote below to the predecessors of all entry blocks to the cycle.)
Second, I'm worried about that case distinction between different kinds of predecessors. It really doesn't feel properly generic to me. Latches and exiting blocks aren't necessarily the same thing. (The current structurizer tends to make it that way, but even with the structurizer we probably don't have a guarantee, let alone with a future different control flow lowering.)
Let's think through some cases:
- If the predecessor is outside the cycle, it should be an IMPLICIT_DEF.
- If it is inside the cycle, i.e. a latch, there are two cases:
  - If the predecessor is dominated by Inst, then clearly use MergedMask.
  - Otherwise, it's actually not clear: could it be possible that control flow leaves the dominance region of Inst and goes around the loop again?
On that last point, you could perhaps make a case that because the loop is temporally divergent, having the CFG in wave CFG form requires a certain structure. But I'm not convinced. In fact, consider:
flowchart TD
Start --> Header
Header --> B
Header --> Inst
Inst --> B
B --> Header
Inst --> C
C --> Header
C --> UseInst
Let's say the branches of Header and C are uniform; only the branch of the basic block containing Inst is divergent.
Then we have temporal divergence in Inst, and need to handle the i1. But the CFG is already reconvergent: the divergent branch is post-dominated by one of its immediate successors. This means it is possible to insert EXEC masking code for this CFG correctly without structuring it. Therefore, the code here should really handle this case correctly.
I think this calls for using MachineSSAUpdater for a robust solution:
- Initialize the updater with IMPLICIT_DEF in every predecessor of the header that is outside the cycle.
- Insert the MergedMask into the updater for the Inst->getParent()
- Then build the lane mask merging, querying the updater for the previous mask (i.e., let the updater create that register)
- The MergedMask can still be used by UseInst due to dominance
And note that with this solution, even the case of irreducible cycles would work correctly, if you apply the first bullet to the predecessors of all entry blocks.
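A rough sketch of that flow is below. It is illustrative, not the final patch code: it assumes the helpers already used in this pass (B, MRI, BoolS1, buildMergeLaneMasks), that the current MachineFunction is reachable as *MF, and that MergedMask is created as in the existing code.

#include "llvm/CodeGen/MachineSSAUpdater.h"

// Sketch: let MachineSSAUpdater build the cross-iteration phi(s).
MachineSSAUpdater Updater(*MF);
Updater.Initialize(MergedMask);

// 1) IMPLICIT_DEF in every cycle-external predecessor of every entry block
//    (for a reducible cycle that is just the header's outside predecessor).
for (MachineBasicBlock *Entry : Cycle->getEntries()) {
  for (MachineBasicBlock *Pred : Entry->predecessors()) {
    if (Cycle->contains(Pred))
      continue;
    B.setInsertPt(*Pred, Pred->getFirstTerminator());
    auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
    Updater.AddAvailableValue(Pred, ImplDef.getReg(0));
  }
}

// 2) The merged mask is the value of this "variable" in Inst's block.
MachineBasicBlock *MBB = Inst->getParent();
Updater.AddAvailableValue(MBB, MergedMask);

// 3) GetValueInMiddleOfBlock ignores the value registered for MBB itself and
//    materializes the phi(s) carrying the mask from the previous iteration,
//    so it can serve as the "previous mask" operand of the merge.
buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                    Updater.GetValueInMiddleOfBlock(MBB), Reg);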
Except now there's an additional issue with caching the MergedMask that doesn't exist for the non-i1 case: different UseInsts could appear with respect to different Cycles. The non-i1 case doesn't actually care about the cycle structure, but the code here does.
In particular, with my suggestions, it will use the cycle to determine how to initialize the updater.
We can still get away with a single MergedMask per Inst:
- The lazy way to do it would be to not initialize the updater with IMPLICIT_DEFs and just let it figure them out by itself. This would generate correct code.
- However, this has a compile-time cost and could end up being too pessimistic in terms of the generated code.
- The Inst and all its uses could be contained in a cycle. This naive way would lead to a phi node at the header of this cycle, and therefore needlessly extending the live range of the value to the entire outer cycle.
- In the following example, the values we create only need to live from InnerHeader to UseInst, but the naive use of the updater will extend them from OuterHeader all the way through OuterLatch:
flowchart TD
Start --> OuterHeader --> InnerHeader --> Inst --> InnerHeader
Inst --> UseInst --> OuterLatch --> OuterHeader
OuterLatch --> Exit
- The smarter way is to initialize the updater in the way I described, using the largest relevant cycle.
So instead of merely caching the MergedMask, we should collect all UseInsts and determine the largest relevant cycle (as returned by the uniformity analysis).
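One possible shape for that bookkeeping is sketched below. LRCCache is the name used in the snippets quoted later in this review; the loop body itself is illustrative, not the exact patch code. Pass 1 records the largest relevant cycle per i1 def; a later pass builds one MergedMask per def, and a final pass rewrites all UseInsts.

// Map each temporally-divergent i1 def to its largest relevant cycle and,
// eventually, to the merged mask built for it.
DenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;

for (auto [Inst, UseInst, Cycle] : MUI->get_TDCs()) {
  Register Reg = Inst->getOperand(0).getReg();
  if (MRI->getType(Reg) != LLT::scalar(1))
    continue;
  auto &Entry = LRCCache[Reg];       // default-initialized on first visit
  const MachineCycle *&LRC = Entry.first;
  if (!LRC || Cycle->contains(LRC))  // keep whichever cycle contains the other
    LRC = Cycle;
}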
MachineBasicBlock *MBB = Inst->getParent();
buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                    PrevIterMask, Reg);
In this case, it would be better to move the lane merging to directly after Inst.
The register pressure calculus is exactly opposite from the non-i1 case:
- In the non-i1 case, shifting the COPY down is good because it reduces the live range of the VGPR while potentially making the live range of the SGPR longer. VGPRs are more expensive than SGPRs, so this is a good trade-off.
- In the i1 case, we'll have a live range for the merged mask extending across the entire cycle anyway. By moving the lane merging closer to Inst, we leave the live range of the merged mask unchanged, but we (most likely) reduce the live range of the i1 value produced by Inst.
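For illustration, "directly after Inst" in MachineIRBuilder terms would look roughly like the sketch below. The const_cast mirrors how Inst is already handled in the existing replaceUsesOfRegInInstWith call, and SSAUpdater is the updater discussed above; this is a sketch, not the final code.

// Emit the lane-mask merge right after the defining instruction instead of
// at the first terminator of its block.
MachineBasicBlock *MBB = Inst->getParent();
MachineBasicBlock::iterator InsertPt =
    std::next(MachineBasicBlock::iterator(const_cast<MachineInstr *>(Inst)));
buildMergeLaneMasks(*MBB, InsertPt, {}, MergedMask,
                    SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);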
✅ With the latest revision this PR passed the C/C++ code formatter.
Thanks, this now looks good to me in terms of the overall flow. I have a bunch of nitpickier, mostly style-related comments.
if (MRI->getType(Reg) != LLT::scalar(1))
  continue;
This check is now redundant
buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                    SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

LRCCache[Reg].second = MergedMask;
Should be able to just keep LRCIter/Entry as a reference and update via that instead of repeating the cache lookup.
  LRCCache[Reg] = {LRC, {}};
}

for (auto LRCIter : LRCCache) {
Naming: this isn't an iterator, a more accurate generic name would be just "Entry" (or LRCCacheEntry, but that's long)
if (MRI->getType(Reg) != LLT::scalar(1))
  continue;

replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache[Reg].second);
Can use .lookup instead of operator[] for consistency with above.
const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
if (CachedLRC) {
  LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
  assert(LRC->contains(CachedLRC));
}

LRCCache[Reg] = {LRC, {}};
This is actually a great use case for try_emplace to do the cache lookup only once:
Suggested change:
-  const MachineCycle *CachedLRC = LRCCache.lookup(Reg).first;
-  if (CachedLRC) {
-    LRC = CachedLRC->contains(LRC) ? CachedLRC : LRC;
-    assert(LRC->contains(CachedLRC));
-  }
-
-  LRCCache[Reg] = {LRC, {}};
+  const MachineCycle *&CachedLRC = LRCCache.try_emplace(Reg);
+  if (!CachedLRC || !CachedLRC->contains(LRC))
+    CachedLRC = LRC;
Use of i1 outside of the cycle, both uniform and divergent, is a lane mask (in an SGPR) that contains the i1 value from the iteration in which the lane exited the cycle.
Create a phi that merges the lane mask across all iterations.
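For reference, the per-iteration update that the lane-mask merge builds (visible in the GFX10 checks above as the s_andn2_b32 / s_and_b32 / s_or_b32 triple) keeps, per lane, the value from the iteration in which that lane left the loop. A minimal sketch of that semantics for a 32-lane wave; the helper below is illustrative and not part of the patch:

#include <cstdint>

// Lanes still active in Exec take the current i1 value; lanes that already
// exited the cycle keep the value they had when they exited.
uint32_t mergeLaneMasks(uint32_t PrevIterMask, uint32_t CurVal, uint32_t Exec) {
  return (PrevIterMask & ~Exec) | (CurVal & Exec);
}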