AMDGPU/GlobalISel: Temporal divergence lowering (non i1) #124298
base: users/petar-avramovic/update-divergence-lowring-tests
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-globalisel

Author: Petar Avramovic (petar-avramovic)

Changes

Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy (v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of the original uniform source.

Patch is 23.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124298.diff

12 Files Affected:
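For illustration, this is the shape of the rewrite, pieced together from the updated MIR tests below (a simplified excerpt; register names are shortened here, and the exact check lines are in the test diffs):

  %phi1:_(s32) = G_PHI ...                            ; uniform def inside the cycle
  %copy:_(s32) = COPY %phi1(s32), implicit $exec_lo   ; copy inserted by this pass
  ...
  ; the use outside the cycle is rewritten to read the vgpr copy instead:
  G_STORE %copy(s32), %ptr(p1) :: (store (s32), addrspace 1)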
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index bd09f4fe43e087..91ee0e41332199 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -342,6 +342,10 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
typename SyncDependenceAnalysisT::DivergenceDescriptor;
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
+ // Use outside cycle with divergent exit
+ using UOCWDE =
+ std::tuple<const InstructionT *, const InstructionT *, const CycleT *>;
+
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI)
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
@@ -395,6 +399,14 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
}
void print(raw_ostream &out) const;
+ SmallVector<UOCWDE, 8> UsesOutsideCycleWithDivergentExit;
+ void recordUseOutsideCycleWithDivergentExit(const InstructionT *,
+ const InstructionT *,
+ const CycleT *);
+ inline iterator_range<UOCWDE *> getUsesOutsideCycleWithDivergentExit() const {
+ return make_range(UsesOutsideCycleWithDivergentExit.begin(),
+ UsesOutsideCycleWithDivergentExit.end());
+ }
protected:
/// \brief Value/block pair representing a single phi input.
@@ -1129,6 +1141,14 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
}
}
+template <typename ContextT>
+void GenericUniformityAnalysisImpl<
+ ContextT>::recordUseOutsideCycleWithDivergentExit(const InstructionT *Inst,
+ const InstructionT *User,
+ const CycleT *Cycle) {
+ UsesOutsideCycleWithDivergentExit.emplace_back(Inst, User, Cycle);
+}
+
template <typename ContextT>
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
const InstructionT &Instr) const {
@@ -1180,6 +1200,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
}
}
+ if (!UsesOutsideCycleWithDivergentExit.empty()) {
+ OS << "\nUSES OUTSIDE CYCLES WITH DIVERGENT EXIT:\n";
+
+ for (auto [Inst, UseInst, Cycle] : UsesOutsideCycleWithDivergentExit) {
+ OS << "Inst :" << Context.print(Inst)
+ << "Used by :" << Context.print(UseInst)
+ << "Outside cycle :" << Cycle->print(Context) << "\n\n";
+ }
+ }
+
for (auto &block : F) {
OS << "\nBLOCK " << Context.print(&block) << '\n';
@@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
}
}
+template <typename ContextT>
+iterator_range<typename GenericUniformityInfo<ContextT>::UOCWDE *>
+GenericUniformityInfo<ContextT>::getUsesOutsideCycleWithDivergentExit() const {
+ return make_range(DA->UsesOutsideCycleWithDivergentExit.begin(),
+ DA->UsesOutsideCycleWithDivergentExit.end());
+}
+
template <typename ContextT>
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
return DA->hasDivergence();
diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h
index e53afccc020b46..660fd6d46114d7 100644
--- a/llvm/include/llvm/ADT/GenericUniformityInfo.h
+++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h
@@ -40,6 +40,10 @@ template <typename ContextT> class GenericUniformityInfo {
using CycleInfoT = GenericCycleInfo<ContextT>;
using CycleT = typename CycleInfoT::CycleT;
+ // Use outside cycle with divergent exit
+ using UOCWDE =
+ std::tuple<const InstructionT *, const InstructionT *, const CycleT *>;
+
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
const TargetTransformInfo *TTI = nullptr);
GenericUniformityInfo() = default;
@@ -78,6 +82,8 @@ template <typename ContextT> class GenericUniformityInfo {
void print(raw_ostream &Out) const;
+ iterator_range<UOCWDE *> getUsesOutsideCycleWithDivergentExit() const;
+
private:
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
diff --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index 592de1067e191a..ad40b6294f3afb 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -79,13 +79,12 @@ template <>
void llvm::GenericUniformityAnalysisImpl<
SSAContext>::propagateTemporalDivergence(const Instruction &I,
const Cycle &DefCycle) {
- if (isDivergent(I))
- return;
for (auto *User : I.users()) {
auto *UserInstr = cast<Instruction>(User);
if (DefCycle.contains(UserInstr->getParent()))
continue;
markDivergent(*UserInstr);
+ recordUseOutsideCycleWithDivergentExit(&I, UserInstr, &DefCycle);
}
}
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index a4b78c1c75ceb0..fb5b19968ac78a 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
if (!Op.getReg().isVirtual())
continue;
auto Reg = Op.getReg();
- if (isDivergent(Reg))
- continue;
- for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
+ for (const MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
if (DefCycle.contains(UserInstr.getParent()))
continue;
markDivergent(UserInstr);
+
+ recordUseOutsideCycleWithDivergentExit(&I, &UserInstr, &DefCycle);
}
}
}
@@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineCycleInfoWrapperPass>();
+ AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
AU.addRequired<MachineDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index fb258547e8fb90..d8cd1e7379c93f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -16,6 +16,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -45,7 +46,6 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addRequired<MachinePostDominatorTreeWrapperPass>();
AU.addRequired<MachineUniformityAnalysisPass>();
@@ -78,6 +78,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
Register DstReg, Register PrevReg,
Register CurReg) override;
void constrainAsLaneMask(Incoming &In) override;
+
+ bool lowerTempDivergence();
};
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -188,6 +190,37 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
In.Reg = Copy.getReg(0);
}
+void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
+ Register NewReg) {
+ for (MachineOperand &Op : Inst->operands()) {
+ if (Op.isReg() && Op.getReg() == Reg)
+ Op.setReg(NewReg);
+ }
+}
+
+bool DivergenceLoweringHelper::lowerTempDivergence() {
+ AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
+
+ for (auto [Inst, UseInst, _] : MUI->getUsesOutsideCycleWithDivergentExit()) {
+ Register Reg = Inst->getOperand(0).getReg();
+ if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
+ ILMA.isS32S64LaneMask(Reg))
+ continue;
+
+ MachineInstr *MI = const_cast<MachineInstr *>(Inst);
+ MachineBasicBlock *MBB = MI->getParent();
+ B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+
+ Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
+ B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
+ .addUse(ExecReg, RegState::Implicit);
+
+ replaceUsesOfRegInInstWith(Reg, const_cast<MachineInstr *>(UseInst),
+ VgprReg);
+ }
+ return false;
+}
+
} // End anonymous namespace.
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -218,5 +251,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
- return Helper.lowerPhis();
+ bool Changed = false;
+ // Temporal divergence lowering needs to inspect list of instructions used
+ // outside cycle with divergent exit provided by uniformity analysis. Uniform
+ // instructions from the list require lowering, no instruction is deleted.
+ // Thus it needs to be run before lowerPhis that deletes phis that require
+ // lowering and replaces them with new instructions.
+
+ // Non-i1 temporal divergence lowering.
+ Changed |= Helper.lowerTempDivergence();
+ Changed |= Helper.lowerPhis();
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index abd7dcecc93ad0..452d75498545ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -83,6 +83,7 @@ class RegBankSelectHelper {
MachineRegisterInfo &MRI;
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
const MachineUniformityInfo &MUI;
+ const SIRegisterInfo &TRI;
const RegisterBank *SgprRB;
const RegisterBank *VgprRB;
const RegisterBank *VccRB;
@@ -91,14 +92,29 @@ class RegBankSelectHelper {
RegBankSelectHelper(MachineIRBuilder &B,
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
const MachineUniformityInfo &MUI,
- const RegisterBankInfo &RBI)
- : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
+ const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
+ : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
+ // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
+ // the cycle
+ // Note: uniformity analysis does not consider that registers with vgpr def
+ // are divergent (you can have uniform value in vgpr).
+ // - TODO: implicit use of $exec could be implemented as indicator that
+ // instruction is divergent
+ bool isTemporalDivergenceCopy(Register Reg) {
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
+ return false;
+
+ return MI->implicit_operands().begin()->getReg() == TRI.getExec();
+ }
+
const RegisterBank *getRegBankToAssign(Register Reg) {
- if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
+ if (!isTemporalDivergenceCopy(Reg) &&
+ (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
return SgprRB;
if (MRI.getType(Reg) == LLT::scalar(1))
return VccRB;
@@ -209,7 +225,7 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
MachineRegisterInfo &MRI = *B.getMRI();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
+ RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(), *ST.getRegBankInfo());
// Virtual registers at this point don't have register banks.
// Virtual registers in def and use operands of already inst-selected
// instruction have register class.
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
index a407e3e014edd7..fd90328c2b9269 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -15,6 +15,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
namespace llvm {
@@ -72,6 +73,11 @@ class PhiLoweringHelper {
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
}
+ void
+ initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
+ LaneMaskRegAttrs = Attrs;
+ }
+
bool isLaneMaskReg(Register Reg) const {
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index 6ce2f9b7a2c77c..f244639ed97623 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -471,7 +471,7 @@ body: |
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
@@ -486,6 +486,7 @@ body: |
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -497,11 +498,11 @@ body: |
; GFX10-NEXT: bb.4:
; GFX10-NEXT: successors: %bb.2(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
; GFX10-NEXT: G_BR %bb.2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.5:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 2299191d88b766..87e4f4b666d57e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -544,6 +544,7 @@ body: |
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.2:
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -567,8 +568,8 @@ body: |
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.5
; GFX10-NEXT: {{ $}}
@@ -578,19 +579,19 @@ body: |
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.6:
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
- ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32)
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX10-NEXT: G_BR %bb.7
; GFX10-NEXT: {{ $}}
@@ -604,7 +605,7 @@ body: |
; GFX10-NEXT: bb.8:
; GFX10-NEXT: successors: %bb.9(0x80000000)
; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: G_STORE [[PHI1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: bb.9:
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
index 6f1797455c6a83..1de0b52f36a48a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -5,20 +5,20 @@ define void @temporal_divergent_i32(float %val, ptr %addr) {
; GFX10-LABEL: temporal_divergent_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s5, -1
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB0_1: ; %loop
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_add_i32 s4, s4, 1
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX10-NEXT: s_ad...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
@@ -395,6 +399,14 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
  }

  void print(raw_ostream &out) const;
  SmallVector<UOCWDE, 8> UsesOutsideCycleWithDivergentExit;
  void recordUseOutsideCycleWithDivergentExit(const InstructionT *,
Everywhere in this patch, is there some reason to precisely say "UseOutsideCycleWithDivergentExit"? Can't we just say "TemporalDivergence"?
I was considering TemporalDivergenceCandidate. I did not find a strict definition of Temporal Divergence, so I ended up using UseOutsideCycleWithDivergentExit since it is more technical and, I assume, not target dependent.
It is not Temporal Divergence until we check the uniformity of the Src used outside the cycle with divergent exit and it turns out to be uniform, or, in the other case, check the type and it is i1. For us, divergent i1 is also technically Temporal Divergence since it will end up in an sgpr.
I am fine with using a different name instead of "UseOutsideCycleWithDivergentExit" if you think it is more appropriate.
You're right. The LLVM doc does not actually define the term "temporal divergence". It has always been used in a way that means "uniform inside cycle, divergent outside cycle, due to divergent cycle exit". But whether the value is uniform inside the cycle is less important. What matters is that values arrive at the use on exits from different iterations by different threads. I think we should use the name TemporalDivergence here. It's shorter and will show up when someone greps for temporal divergence. Let's also not add "Candidate" ... it just makes the name longer with only a little bit of new information.
@@ -342,6 +342,10 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
      typename SyncDependenceAnalysisT::DivergenceDescriptor;
  using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;

  // Use outside cycle with divergent exit
  using UOCWDE =
Use a less inscrutable type name?
Just a suggestion, I would consider giving the name "TemporalDivergenceList" to the entire type SmallVector<std::tuple<InstructionT * ..., and then derive other names from the public member types exposed by the SmallVector. In particular, the function that returns this vector as a range can benefit from simply saying auto as its return type. This instance of "really long derived names" is justification enough for just saying auto instead. There is no value in repeating all these names.
Alternatively, UOCWDE can be renamed to TemporalDivergenceTuple?
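To make the suggestion concrete, a minimal standalone sketch of the naming pattern proposed above (illustrative names only; std::vector stands in for SmallVector so the snippet is self-contained, and these are not the actual LLVM types):

#include <tuple>
#include <vector>

struct Instruction;
struct Cycle;

class UniformityImpl {
  // Name the whole container type once...
  using TemporalDivergenceList = std::vector<
      std::tuple<const Instruction *, const Instruction *, const Cycle *>>;
  TemporalDivergenceList TemporalDivergence;

public:
  // ...and let auto carry the derived type instead of respelling it.
  const auto &getTemporalDivergenceList() const { return TemporalDivergence; }
};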
        ILMA.isS32S64LaneMask(Reg))
      continue;

    MachineInstr *MI = const_cast<MachineInstr *>(Inst);
Fix the const_casts, why is this const?
These come out of the analysis. The analysis itself uses const pointers/references in its implementation, which I believe is a good idea for const correctness. I wouldn't change that.
So a const_cast is needed at some point. The only question is where.
I think here is as good a place as any, though perhaps grouping them together with a small explanation is in order. Something like:
// As an analysis, UniformityAnalysis treats instructions as const. We have the parent function
// as non-const, so casting const away here is inelegant but justified.
MachineInstr *MI = const_cast<MachineInstr *>(Inst);
MachineInstr *UseMI = const_cast<MachineInstr *>(UseInst);
I lean on the other side. If you look at LoopInfoBase or LoopBase, their functions take const pointers as arguments but return non-const pointers when asked. Sure, an analysis should treat its inputs as const, but when it returns something to the client, that client owns it anyway, so forcing that to be const is just an inconvenience. I would rather have the analysis do the const_cast before returning a list of pointers to something I already own.
This seems to be the first time that uniformity analysis is returning something. Until now, the public interface has simply been a bunch of predicates like "isUniform" that take a const pointer as arguments.
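To make the alternative concrete, a standalone sketch (not LLVM code, names invented here): the analysis keeps const pointers internally for const correctness, and does the const_cast once at the accessor boundary, the way LoopInfoBase hands back non-const loops:

#include <vector>

struct Instr;

class Analysis {
  std::vector<const Instr *> Divergent; // internal storage stays const-correct

public:
  void record(const Instr *I) { Divergent.push_back(I); }

  // Clients own the IR anyway, so hand back mutable pointers from here.
  std::vector<Instr *> getDivergentList() const {
    std::vector<Instr *> Out;
    for (const Instr *I : Divergent)
      Out.push_back(const_cast<Instr *>(I)); // cast once, at the boundary
    return Out;
  }
};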
@@ -1210,6 +1240,13 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
  }
}

template <typename ContextT>
iterator_range<typename GenericUniformityInfo<ContextT>::UOCWDE *>
Just say auto as the return type here? Or if this needs to be exposed in an outer header file, then name a new type such as temporal_divergence_range?
@@ -40,6 +40,10 @@ template <typename ContextT> class GenericUniformityInfo {
  using CycleInfoT = GenericCycleInfo<ContextT>;
  using CycleT = typename CycleInfoT::CycleT;

  // Use outside cycle with divergent exit
  using UOCWDE =
This declaration got repeated. One of them can be eliminated?
My guess is that GenericUniformityAnalysisImpl and GenericUniformityInfo repeat typedefs because of a terrible line break. This would work: typename GenericUniformityInfo::TemporalDivergenceTuple
I haven't done a detailed review of the code, but from a high-level algorithmic view this change already looks pretty reasonable to me.
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);

  for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
    Register Reg = Inst->getOperand(0).getReg();
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
  }
  return false;
}
I do have one high-level comment about this.
Every Inst may potentially appear with many UseInsts in the temporal divergence list. The current code will create multiple new registers and multiple COPY instructions, which seems wasteful even if downstream passes can often clean it up.
I would suggest capturing the created register in a DenseMap<Instruction *, Register> for re-use.
Also, how about inserting the COPY at the end of Inst->getParent()? That way, the live range of the VGPR is reduced.
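A rough sketch of the suggested caching, layered on the loop from this patch (hypothetical: CopyForDef is a name invented here, and the insertion point is kept as in the patch rather than moved to the end of the block):

DenseMap<MachineInstr *, Register> CopyForDef;
for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
  Register Reg = Inst->getOperand(0).getReg();
  if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
      ILMA.isS32S64LaneMask(Reg))
    continue;

  Register &VgprReg = CopyForDef[Inst]; // one COPY per def, reused by later uses
  if (!VgprReg.isValid()) {
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
    VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);
  }
  replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
}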
The changes to UA look good to me. I can't comment much about the actual patch itself.
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);

  for (auto [Inst, UseInst, _] : MUI->getTemporalDivergenceList()) {
I am not sure if it is ok to assume only the first def can be temporally divergent. Maybe hold a map from the Register to an array of temporal divergence users?
Yes, true, it should map Register instead of Inst.
Updated types for recording TemporalDivergence and its printing; improved newline printing.
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
I am not sure how it works in global-isel; can we set the RegisterClass of VgprReg to a vector register here, to make it more obvious this is a copy from sgpr to vgpr?
It unnecessarily complicates the new Reg bank select; regbankselect will set vgpr there. Also, the copy has an implicit exec, which should be special enough to indicate what we are doing.
How about this comment from earlier?
Thanks!
Record all uses outside cycle with divergent exit during
propagateTemporalDivergence in Uniformity analysis.
With this list of candidates for temporal divergence lowering,
excluding known lane masks from control flow intrinsics,
find sources from inside the cycle that are not i1 and uniform.
Temporal divergence lowering (non i1):
create copy(v_mov) to vgpr, with implicit exec (to stop other
passes from moving this copy outside of the cycle) and use this
vgpr outside of the cycle instead of original uniform source.