AMDGPU/GlobalISel: Temporal divergence lowering i1 #124299

Merged (1 commit, Mar 12, 2025)
64 changes: 64 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -81,6 +81,7 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
void constrainAsLaneMask(Incoming &In) override;

bool lowerTemporalDivergence();
bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -228,6 +229,63 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
return false;
}

bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
initializeLaneMaskRegisterAttributes(BoolS1);
MachineSSAUpdater SSAUpdater(*MF);

// In case of a use outside multiple nested cycles or multiple uses, we only
// need to merge the lane mask across the largest relevant cycle.
SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
if (MRI->getType(Reg) != LLT::scalar(1))
continue;

auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
auto &CycleMergedMask = LRCCacheIter->getSecond();
const MachineCycle *&CachedLRC = CycleMergedMask.first;
if (RegNotCached || LRC->contains(CachedLRC)) {
CachedLRC = LRC;
}
}

for (auto &LRCCacheEntry : LRCCache) {
Register Reg = LRCCacheEntry.first;
auto &CycleMergedMask = LRCCacheEntry.getSecond();
const MachineCycle *Cycle = CycleMergedMask.first;

Register MergedMask = MRI->createVirtualRegister(BoolS1);
SSAUpdater.Initialize(MergedMask);

MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
SSAUpdater.AddAvailableValue(MBB, MergedMask);

for (auto Entry : Cycle->getEntries()) {
for (MachineBasicBlock *Pred : Entry->predecessors()) {
if (!Cycle->contains(Pred)) {
B.setInsertPt(*Pred, Pred->getFirstTerminator());
auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
}
}
}

buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

CycleMergedMask.second = MergedMask;
}

for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
if (MRI->getType(Reg) != LLT::scalar(1))
continue;

replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
}

return false;
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -267,6 +325,12 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(

// Non-i1 temporal divergence lowering.
Changed |= Helper.lowerTemporalDivergence();
// This covers both uniform and divergent i1s. Lane masks are in SGPRs and need
// to be updated in each iteration.
Changed |= Helper.lowerTemporalDivergenceI1();
// Temporal divergence lowering of a divergent i1 phi used outside of the cycle
// could also be handled by lowerPhis, but we do it in lowerTemporalDivergenceI1
// since in some cases lowerPhis does unnecessary lane mask merging.
Changed |= Helper.lowerPhis();
return Changed;
}
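
To make the pattern concrete, here is a reduced LLVM IR example of what the new lowering targets, reconstructed for illustration only (the function name and body are hypothetical, modeled on the tests updated below, not copied from any test file): an i1 produced inside a divergent loop and read after the loop.

; Hypothetical reduced example (modeled on the tests below, not verbatim).
; %flag flips uniformly on every iteration, but lanes leave the loop in
; different iterations, so the value a lane observes in %exit is the one from
; that lane's last active iteration - a temporally divergent i1.
define void @temporal_divergent_i1(float %val, ptr %addr) {
entry:
  br label %loop

loop:
  %counter = phi i32 [ 0, %entry ], [ %counter.next, %loop ]
  %flag = phi i1 [ true, %entry ], [ %flag.next, %loop ]
  %flag.next = xor i1 %flag, true
  %counter.f = uitofp i32 %counter to float
  %counter.next = add i32 %counter, 1
  %done = fcmp ogt float %counter.f, %val   ; divergent exit condition
  br i1 %done, label %exit, label %loop

exit:
  %sel = select i1 %flag.next, float 1.0, float 0.0  ; i1 used outside the loop
  store float %sel, ptr %addr
  ret void
}

The per-iteration s_andn2_b32/s_and_b32/s_or_b32 sequence added to the GFX10 checks below is the lane-mask merge this pass inserts so that the i1, kept in an SGPR, holds each lane's value from its last active iteration when it is read after the loop.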
@@ -104,20 +104,25 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: s_mov_b32 s5, 1
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: .LBB2_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s6
; GFX10-NEXT: s_xor_b32 s5, s5, 1
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: s_and_b32 s8, s5, 1
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v0
; GFX10-NEXT: s_cselect_b32 s8, exec_lo, 0
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, s8
; GFX10-NEXT: s_or_b32 s7, s7, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
; GFX10-NEXT: s_cselect_b32 s4, exec_lo, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -147,37 +152,42 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, float
; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, 0
; GFX10-NEXT: ; implicit-def: $sgpr7
; GFX10-NEXT: s_branch .LBB3_2
; GFX10-NEXT: .LBB3_1: ; %loop_body
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: v_cvt_f32_u32_e32 v8, s6
; GFX10-NEXT: s_xor_b32 s4, s4, exec_lo
; GFX10-NEXT: s_mov_b32 s8, exec_lo
; GFX10-NEXT: s_add_i32 s6, s6, 1
; GFX10-NEXT: s_xor_b32 s4, s4, s8
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v0
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo
; GFX10-NEXT: s_and_b32 s8, exec_lo, s4
; GFX10-NEXT: s_or_b32 s7, s7, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execz .LBB3_6
; GFX10-NEXT: .LBB3_2: ; %loop_start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_cmpk_le_i32 s6, 0x3e8
; GFX10-NEXT: s_mov_b32 s7, 1
; GFX10-NEXT: s_mov_b32 s8, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB3_4
; GFX10-NEXT: ; %bb.3: ; %else
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: flat_store_dword v[6:7], v1
; GFX10-NEXT: .LBB3_4: ; %Flow
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: s_xor_b32 s7, s7, 1
; GFX10-NEXT: s_cmp_lg_u32 s7, 0
; GFX10-NEXT: s_xor_b32 s8, s8, 1
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10-NEXT: ; %bb.5: ; %if
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX10-NEXT: flat_store_dword v[4:5], v1
; GFX10-NEXT: s_branch .LBB3_1
; GFX10-NEXT: .LBB3_6: ; %exit
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s7
; GFX10-NEXT: flat_store_dword v[2:3], v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -201,28 +201,35 @@ body: |
; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.1
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.1
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C1]](s1), %bb.0, %11(s1), %bb.1
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s1), [[C5]], [[C4]]
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C5]], [[C4]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
; GFX10-NEXT: SI_RETURN
bb.0:
@@ -285,27 +292,30 @@ body: |
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C]]
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.1:
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %37(s1), %bb.5
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %44(s1), %bb.5
; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %36(s1), %bb.5
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C1]](s32), %bb.0
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %17(s32), %bb.5
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C2]]
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C2]]
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1
; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C3]](s1), %bb.1
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]]
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]]
; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5
; GFX10-NEXT: G_BR %bb.3
; GFX10-NEXT: {{ $}}
@@ -329,20 +339,24 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY9]], [[C8]]
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.6
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.6:
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT]](s32)
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR1]](s1), [[C11]], [[C10]]
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[S_OR_B32_]](s1), [[C11]], [[C10]]
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
; GFX10-NEXT: SI_RETURN
bb.0: