Commit 56d0ed2

[DA] GPUDivergenceAnalysis for unstructured GPU kernels
Summary:
This is patch #3 of the new DivergenceAnalysis
<https://lists.llvm.org/pipermail/llvm-dev/2018-May/123606.html>

The GPUDivergenceAnalysis is intended to eventually supersede the existing
LegacyDivergenceAnalysis. The existing LegacyDivergenceAnalysis produces
incorrect results on unstructured Control-Flow Graphs:
<https://bugs.llvm.org/show_bug.cgi?id=37185>

This patch adds the option -use-gpu-divergence-analysis to the
LegacyDivergenceAnalysis to turn it into a transparent wrapper for the
GPUDivergenceAnalysis.

Reviewers: nhaehnle

Reviewed By: nhaehnle

Subscribers: jholewinski, jvesely, jfb, llvm-commits, alex-t, sameerds, arsenm, nhaehnle

Differential Revision: https://reviews.llvm.org/D53493

llvm-svn: 348048
1 parent: 39298ca

23 files changed (+1359, −27 lines)

llvm/include/llvm/Analysis/DivergenceAnalysis.h

Lines changed: 27 additions & 0 deletions
@@ -173,6 +173,33 @@ class DivergenceAnalysis {
   std::vector<const Instruction *> Worklist;
 };
 
+/// \brief Divergence analysis frontend for GPU kernels.
+class GPUDivergenceAnalysis {
+  SyncDependenceAnalysis SDA;
+  DivergenceAnalysis DA;
+
+public:
+  /// Runs the divergence analysis on @F, a GPU kernel
+  GPUDivergenceAnalysis(Function &F, const DominatorTree &DT,
+                        const PostDominatorTree &PDT, const LoopInfo &LI,
+                        const TargetTransformInfo &TTI);
+
+  /// Whether any divergence was detected.
+  bool hasDivergence() const { return DA.hasDetectedDivergence(); }
+
+  /// The GPU kernel this analysis result is for
+  const Function &getFunction() const { return DA.getFunction(); }
+
+  /// Whether \p V is divergent.
+  bool isDivergent(const Value &V) const;
+
+  /// Whether \p V is uniform/non-divergent
+  bool isUniform(const Value &V) const { return !isDivergent(V); }
+
+  /// Print all divergent values in the kernel.
+  void print(raw_ostream &OS, const Module *) const;
+};
+
 } // namespace llvm
 
 #endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
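The front end added above is a plain analysis object rather than a pass, so a client constructs it directly from the usual analyses. A minimal sketch, assuming DT/PDT/LI/TTI are obtained from the corresponding (wrapper) passes; the helper name reportDivergence is illustrative and not part of this patch:

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Illustrative helper (not from this commit): run the GPU front end on a
// kernel and list the divergent instructions it found.
static void reportDivergence(Function &F, const DominatorTree &DT,
                             const PostDominatorTree &PDT, const LoopInfo &LI,
                             const TargetTransformInfo &TTI) {
  GPUDivergenceAnalysis GPUDA(F, DT, PDT, LI, TTI); // analysis runs in the ctor
  if (!GPUDA.hasDivergence())
    return; // every value in the kernel is uniform
  for (const Instruction &I : instructions(F))
    if (GPUDA.isDivergent(I))
      errs() << "DIVERGENT: " << I << "\n";
}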

llvm/include/llvm/Analysis/LegacyDivergenceAnalysis.h

Lines changed: 9 additions & 1 deletion
@@ -19,9 +19,11 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 
 namespace llvm {
 class Value;
+class GPUDivergenceAnalysis;
 class LegacyDivergenceAnalysis : public FunctionPass {
 public:
   static char ID;
@@ -41,7 +43,7 @@ class LegacyDivergenceAnalysis : public FunctionPass {
   //
   // Even if this function returns false, V may still be divergent when used
   // in a different basic block.
-  bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
+  bool isDivergent(const Value *V) const;
 
   // Returns true if V is uniform/non-divergent.
   //
@@ -53,6 +55,12 @@ class LegacyDivergenceAnalysis : public FunctionPass {
   void removeValue(const Value *V) { DivergentValues.erase(V); }
 
 private:
+  // Whether analysis should be performed by GPUDivergenceAnalysis.
+  bool shouldUseGPUDivergenceAnalysis(const Function &F) const;
+
+  // (optional) handle to new DivergenceAnalysis
+  std::unique_ptr<GPUDivergenceAnalysis> gpuDA;
+
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
 };
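From a consumer's perspective the pass interface above is unchanged: a client still requires LegacyDivergenceAnalysis and calls isDivergent()/isUniform(), and when -use-gpu-divergence-analysis is set the answers are transparently forwarded to gpuDA. A hypothetical consumer pass, for illustration only (CountDivergentValues is an invented name, not part of this patch):

#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

namespace {
// Hypothetical consumer: counts divergent instructions; works the same whether
// the result came from the old DivergencePropagator or the new
// GPUDivergenceAnalysis.
struct CountDivergentValues : public FunctionPass {
  static char ID;
  CountDivergentValues() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override {
    const auto &DA = getAnalysis<LegacyDivergenceAnalysis>();
    unsigned NumDivergent = 0;
    for (const Instruction &I : instructions(F))
      if (DA.isDivergent(&I))
        ++NumDivergent;
    errs() << F.getName() << ": " << NumDivergent << " divergent value(s)\n";
    return false;
  }
};
} // namespace

char CountDivergentValues::ID = 0;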

llvm/lib/Analysis/DivergenceAnalysis.cpp

Lines changed: 33 additions & 0 deletions
@@ -422,3 +422,36 @@ void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
     OS << "DIVERGENT:" << I << '\n';
   }
 }
+
+// class GPUDivergenceAnalysis
+GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
+                                             const DominatorTree &DT,
+                                             const PostDominatorTree &PDT,
+                                             const LoopInfo &LI,
+                                             const TargetTransformInfo &TTI)
+    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
+  for (auto &I : instructions(F)) {
+    if (TTI.isSourceOfDivergence(&I)) {
+      DA.markDivergent(I);
+    } else if (TTI.isAlwaysUniform(&I)) {
+      DA.addUniformOverride(I);
+    }
+  }
+  for (auto &Arg : F.args()) {
+    if (TTI.isSourceOfDivergence(&Arg)) {
+      DA.markDivergent(Arg);
+    }
+  }
+
+  DA.compute();
+}
+
+bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
+  return DA.isDivergent(val);
+}
+
+void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
+  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
+  DA.print(OS, mod);
+  OS << "}\n";
+}

llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp

Lines changed: 75 additions & 26 deletions
@@ -1,4 +1,5 @@
-//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis Implementation -==//
+//===- LegacyDivergenceAnalysis.cpp --------- Legacy Divergence Analysis
+//Implementation -==//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -64,6 +65,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -79,6 +83,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "divergence"
 
+// transparently use the GPUDivergenceAnalysis
+static cl::opt<bool> UseGPUDA("use-gpu-divergence-analysis", cl::init(false),
+                              cl::Hidden,
+                              cl::desc("turn the LegacyDivergenceAnalysis into "
+                                       "a wrapper for GPUDivergenceAnalysis"));
+
 namespace {
 
 class DivergencePropagator {
@@ -262,16 +272,17 @@ void DivergencePropagator::propagate() {
   }
 }
 
-} /// end namespace anonymous
+} // namespace
 
 // Register this pass.
 char LegacyDivergenceAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                      false, true)
+INITIALIZE_PASS_BEGIN(LegacyDivergenceAnalysis, "divergence",
+                      "Legacy Divergence Analysis", false, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence", "Legacy Divergence Analysis",
-                    false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LegacyDivergenceAnalysis, "divergence",
+                    "Legacy Divergence Analysis", false, true)
 
 FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
   return new LegacyDivergenceAnalysis();
@@ -280,9 +291,24 @@ FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
 void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTreeWrapperPass>();
+  if (UseGPUDA)
+    AU.addRequired<LoopInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
+bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
+    const Function &F) const {
+  if (!UseGPUDA)
+    return false;
+
+  // GPUDivergenceAnalysis requires a reducible CFG.
+  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+  RPOTraversal FuncRPOT(&F);
+  return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
+                                 const LoopInfo>(FuncRPOT, LI);
+}
+
 bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
   auto *TTIWP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
   if (TTIWP == nullptr)
@@ -295,44 +321,67 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
     return false;
 
   DivergentValues.clear();
+  gpuDA = nullptr;
+
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-  DivergencePropagator DP(F, TTI,
-                          getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                          PDT, DivergentValues);
-  DP.populateWithSourcesOfDivergence();
-  DP.propagate();
-  LLVM_DEBUG(
-    dbgs() << "\nAfter divergence analysis on " << F.getName() << ":\n";
-    print(dbgs(), F.getParent())
-  );
+
+  if (shouldUseGPUDivergenceAnalysis(F)) {
+    // run the new GPU divergence analysis
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    gpuDA = llvm::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);
+
+  } else {
+    // run LLVM's existing DivergenceAnalysis
+    DivergencePropagator DP(F, TTI, DT, PDT, DivergentValues);
+    DP.populateWithSourcesOfDivergence();
+    DP.propagate();
+  }
+
+  LLVM_DEBUG(dbgs() << "\nAfter divergence analysis on " << F.getName()
                    << ":\n";
             print(dbgs(), F.getParent()));
+
   return false;
 }
 
+bool LegacyDivergenceAnalysis::isDivergent(const Value *V) const {
+  if (gpuDA) {
+    return gpuDA->isDivergent(*V);
+  }
+  return DivergentValues.count(V);
+}
+
 void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
-  if (DivergentValues.empty())
+  if ((!gpuDA || !gpuDA->hasDivergence()) && DivergentValues.empty())
     return;
-  const Value *FirstDivergentValue = *DivergentValues.begin();
+
   const Function *F;
-  if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
-    F = Arg->getParent();
-  } else if (const Instruction *I =
-                 dyn_cast<Instruction>(FirstDivergentValue)) {
-    F = I->getParent()->getParent();
-  } else {
-    llvm_unreachable("Only arguments and instructions can be divergent");
+  if (!DivergentValues.empty()) {
+    const Value *FirstDivergentValue = *DivergentValues.begin();
+    if (const Argument *Arg = dyn_cast<Argument>(FirstDivergentValue)) {
+      F = Arg->getParent();
+    } else if (const Instruction *I =
+                   dyn_cast<Instruction>(FirstDivergentValue)) {
+      F = I->getParent()->getParent();
+    } else {
+      llvm_unreachable("Only arguments and instructions can be divergent");
+    }
+  } else if (gpuDA) {
+    F = &gpuDA->getFunction();
  }
 
   // Dumps all divergent values in F, arguments and then instructions.
   for (auto &Arg : F->args()) {
-    OS << (DivergentValues.count(&Arg) ? "DIVERGENT: " : " ");
+    OS << (isDivergent(&Arg) ? "DIVERGENT: " : " ");
     OS << Arg << "\n";
   }
   // Iterate instructions using instructions() to ensure a deterministic order.
   for (auto BI = F->begin(), BE = F->end(); BI != BE; ++BI) {
     auto &BB = *BI;
     OS << "\n " << BB.getName() << ":\n";
     for (auto &I : BB.instructionsWithoutDebug()) {
-      OS << (DivergentValues.count(&I) ? "DIVERGENT: " : " ");
+      OS << (isDivergent(&I) ? "DIVERGENT: " : " ");
       OS << I << "\n";
     }
   }

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @workitem_id_x() #1 {
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+; CHECK-NOT: DIVERGENT: %first.lane = call i32 @llvm.amdgcn.readfirstlane(i32 %id.x)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #0
+
+attributes #0 = { nounwind readnone }

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+; CHECK: DIVERGENT: %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+define i32 @test1(i32* %ptr, i32 %val) #0 {
+  %orig = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+  ret i32 %orig
+}
+
+; CHECK: DIVERGENT: %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
+  %orig = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret {i32, i1} %orig
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+
+define amdgpu_kernel void @hidden_diverge(i32 %n, i32 %a, i32 %b) #0 {
+; CHECK-LABEL: Printing analysis 'Legacy Divergence Analysis' for function 'hidden_diverge'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cond.var = icmp slt i32 %tid, 0
+  br i1 %cond.var, label %B, label %C ; divergent
+; CHECK: DIVERGENT: br i1 %cond.var,
+B:
+  %cond.uni = icmp slt i32 %n, 0
+  br i1 %cond.uni, label %C, label %merge ; uniform
+; CHECK-NOT: DIVERGENT: br i1 %cond.uni,
+C:
+  %phi.var.hidden = phi i32 [ 1, %entry ], [ 2, %B ]
+; CHECK: DIVERGENT: %phi.var.hidden = phi i32
+  br label %merge
+merge:
+  %phi.ipd = phi i32 [ %a, %B ], [ %b, %C ]
+; CHECK: DIVERGENT: %phi.ipd = phi i32
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
