
[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm #152161

Open: wants to merge 2 commits into base branch users/ro-i/callbr-amdgpu_1.
89 changes: 54 additions & 35 deletions llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
  return NewRetBlock;
}

+static BasicBlock *
+createDummyReturnBlock(Function &F,
+                       SmallVector<BasicBlock *, 4> &ReturningBlocks) {
+  BasicBlock *DummyReturnBB =
+      BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+  Type *RetTy = F.getReturnType();
+  Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+  ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+  ReturningBlocks.push_back(DummyReturnBB);
+  return DummyReturnBB;
+}

+/// Handle conditional branch instructions (-> 2 targets) and callbr
+/// instructions with N targets.
+static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
+                          BasicBlock *DummyReturnBB,
+                          std::vector<DominatorTree::UpdateType> &Updates) {
+  SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+  // Create a new transition block to hold the conditional branch.
+  BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+  Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+  // 'Successors' become successors of TransitionBB instead of BB,
+  // and TransitionBB becomes a single successor of BB.
+  Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+  for (BasicBlock *Successor : Successors) {
+    Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+    Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+  }
+
+  // Create a branch that will always branch to the transition block and
+  // references DummyReturnBB.
+  BB->getTerminator()->eraseFromParent();
+  BranchInst::Create(TransitionBB, DummyReturnBB,
+                     ConstantInt::getTrue(F.getContext()), BB);
+  Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
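
For intuition, here is a minimal before/after sketch of what handleNBranch does, on hypothetical IR (only the block names TransitionBlock and DummyReturnBlock come from the strings in the code above):

; Before: %bb ends in a two-target callbr and is a divergent exit candidate.
define void @before() {
bb:
  callbr void asm "", "!i"() to label %cont [label %indirect]
cont:
  ret void
indirect:
  ret void
}

; After: the callbr has moved into TransitionBlock, and %bb now ends in an
; always-true conditional branch whose false edge references DummyReturnBlock,
; giving the function a dummy return edge that unification can use.
define void @after() {
bb:
  br i1 true, label %TransitionBlock, label %DummyReturnBlock
TransitionBlock:
  callbr void asm "", "!i"() to label %cont [label %indirect]
cont:
  ret void
indirect:
  ret void
DummyReturnBlock:
  ret void
}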

bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                            const PostDominatorTree &PDT,
                                            const UniformityInfo &UA) {
  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");

  if (PDT.root_size() == 0 ||
      (PDT.root_size() == 1 &&
-       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+       !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
    return false;

  // Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
      if (HasDivergentExitBlock)
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
-      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
-      if (DummyReturnBB == nullptr) {
-        DummyReturnBB = BasicBlock::Create(F.getContext(),
-                                           "DummyReturnBlock", &F);
-        Type *RetTy = F.getReturnType();
-        Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
-        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
-        ReturningBlocks.push_back(DummyReturnBB);
-      }
+      if (DummyReturnBB == nullptr)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
-        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
-        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-      } else { // Conditional branch.
-        SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
-        // Create a new transition block to hold the conditional branch.
-        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
-        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
-        // 'Successors' become successors of TransitionBB instead of BB,
-        // and TransitionBB becomes a single successor of BB.
-        Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
-        for (BasicBlock *Successor : Successors) {
-          Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-          Updates.emplace_back(DominatorTree::Delete, BB, Successor);
-        }
-
-        // Create a branch that will always branch to the transition block and
-        // references DummyReturnBB.
-        BB->getTerminator()->eraseFromParent();
-        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+        BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+                           ConstantInt::getTrue(F.getContext()), BB);
        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+      } else {
+        handleNBranch(F, BB, BI, DummyReturnBB, Updates);
      }
      Changed = true;
+    } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      if (DummyReturnBB == nullptr)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+      handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+    } else {
+      llvm_unreachable("unsupported block terminator");
    }
  }

10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16476,12 +16476,12 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,

  const TargetRegisterClass *RC = nullptr;
Review thread:
Contributor: Everything in this file looks like unrelated style changes

Contributor Author: No, this is actually important to support the asm label constraint "!i". This should fall through to the default case (imho) without causing VT.getSizeInBits() to crash.

Contributor: Can you do this separately? You should be able to refer to labels in regular inline asm

  if (Constraint.size() == 1) {
-    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
-    case 'r':
+    case 'r': {
+      const unsigned BitWidth = VT.getSizeInBits();
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
@@ -16496,7 +16496,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
        break;
      }
      break;
-    case 'v':
+    }
+    case 'v': {
+      const unsigned BitWidth = VT.getSizeInBits();
      switch (BitWidth) {
      case 16:
        RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
@@ -16509,9 +16511,11 @@
        break;
      }
      break;
+    }
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
+      const unsigned BitWidth = VT.getSizeInBits();
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
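
Regarding the "!i" fall-through discussed in the thread above, a minimal sketch in hypothetical IR (not part of this diff): the label operand %target is matched by the "!i" constraint and, per the author's comment, should reach the default case of getRegForInlineAsmConstraint; keeping VT.getSizeInBits() scoped inside the register cases avoids evaluating it for a label operand that has no register type.

define void @label_constraint(i32 %x) {
  ; The "r" constraint binds %x to a register; the "!i" constraint names the
  ; indirect destination %target rather than a register operand.
  callbr void asm "s_cbranch_vccnz ${1:l}", "r,!i"(i32 %x) to label %cont [label %target]
cont:
  ret void
target:
  ret void
}
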
15 changes: 7 additions & 8 deletions llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -480,11 +480,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
  } else {
    // Test for successors as back edge
    BasicBlock *BB = N->getNodeAs<BasicBlock>();
-    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
-    for (BasicBlock *Succ : Term->successors())
-      if (Visited.count(Succ))
-        Loops[Succ] = BB;
+    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
+      for (BasicBlock *Succ : Term->successors())
+        if (Visited.count(Succ))
+          Loops[Succ] = BB;
  }
}
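
The guard above matters for blocks like the following hypothetical self-loop (the same shape as the callbr_self_loop test added below): previously cast<BranchInst> would assert on the callbr terminator; now dyn_cast<BranchInst> fails cleanly, so analyzeLoops records no back edge for the block and leaves it alone.

define void @self_loop() {
entry:
  br label %loop
loop:
  ; Not a BranchInst, so the %loop -> %loop edge is no longer recorded in Loops.
  callbr void asm "", "!i"() to label %loop [label %exit]
exit:
  ret void
}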

@@ -516,7 +515,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {

  for (BasicBlock *P : predecessors(BB)) {
    // Ignore it if it's a branch from outside into our region entry
-    if (!ParentRegion->contains(P))
+    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
      continue;

    Region *R = RI->getRegionFor(P);
@@ -1284,13 +1283,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {

/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
-  if (R->isTopLevelRegion())
+  // CallBr and its corresponding blocks must not be modified by this pass.
Review thread:
Collaborator: I think this is saying that if a region is headed by a CallBr, then anything inside it is separately structurized by visiting the nested regions. Is that correct? Are there any assertions we could put here about the current region R, to demonstrate that there is nothing to structurize?

+  if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator()))
    return false;

  this->DT = DT;

  Func = R->getEntry()->getParent();
-  assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator.");

  ParentRegion = R;

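
To illustrate the new early-out in run, a hypothetical region headed by a callbr (not from this patch): StructurizeCFG now returns false for such a region instead of rewriting its edges, since the callbr's indirect targets are fixed by the inline asm; any nested regions are presumably still visited and structurized on their own, as the review thread above discusses.

define amdgpu_kernel void @callbr_region(i32 %c) {
entry:
  ; The region entry terminates in a callbr, so this region is left unmodified.
  callbr void asm "", "!i"() to label %then [label %indirect]
then:
  br label %exit
indirect:
  br label %exit
exit:
  ret void
}
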
54 changes: 54 additions & 0 deletions llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -0,0 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s

define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
; CHECK-LABEL: callbr_inline_asm:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_load_dword v0, v[0:1]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; %bb.1: ; %fallthrough
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dword v[2:3], v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target
; CHECK-NEXT: ; %indirect
; CHECK-NEXT: ; Label of block must be emitted
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_dword v[4:5], v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %a = load i32, ptr %src, align 4
  callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
fallthrough:
  store i32 %a, ptr %dst1, align 4
  br label %ret
indirect:
  store i32 %a, ptr %dst2, align 4
  br label %ret
ret:
  ret void
}

define void @callbr_self_loop(i1 %c) {
; CHECK-LABEL: callbr_self_loop:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: .LBB1_1: ; %callbr
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_branch .LBB1_1
; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target
; CHECK-NEXT: ; %callbr.target.ret
; CHECK-NEXT: ; Label of block must be emitted
; CHECK-NEXT: s_setpc_b64 s[30:31]
  br label %callbr
callbr:
  callbr void asm "", "!i"() to label %callbr [label %ret]
ret:
  ret void
}
@@ -3,6 +3,7 @@

declare void @foo(ptr)
declare i1 @bar(ptr)
+declare i32 @bar32(ptr)

define void @musttail_call_without_return_value(ptr %p) {
; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -28,6 +29,31 @@
  ret void
}

+define void @musttail_call_without_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: musttail call void @foo(ptr [[P]])
+; CHECK-NEXT: ret void
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+  %load = load i32, ptr %p, align 1
+  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+  musttail call void @foo(ptr %p)
+  ret void
+
+bb.1:
+  ret void
+}

define i1 @musttail_call_with_return_value(ptr %p) {
; CHECK-LABEL: define i1 @musttail_call_with_return_value(
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -51,3 +77,28 @@ bb.0:
bb.1:
  ret i1 %load
}

+define i32 @musttail_call_with_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
+; CHECK-NEXT: ret i32 [[RET]]
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+  %load = load i32, ptr %p, align 1
+  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+  %ret = musttail call i32 @bar32(ptr %p)
+  ret i32 %ret
+
+bb.1:
+  ret i32 %load
+}