Skip to content

Commit 6b7bd0e

Browse files
author
Changpeng Fang
committed
AMDGPU/SI: Move the local memory usage related checking after calling convention checking in PromoteAlloca
Summary: Promoting an alloca to a vector and promoting an alloca to LDS are two independent transformations and should not affect each other. As a result, we should not give up on promoting to a vector just because there is not enough LDS. This patch factors the local-memory-usage checks out into a separate function and performs them after the calling-convention check. Reviewer: arsenm Differential Revision: http://reviews.llvm.org/D33139 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303684 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 8eeef87 commit 6b7bd0e

File tree

2 files changed

+136
-99
lines changed

2 files changed

+136
-99
lines changed

lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

+114-99
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
9797
Instruction *UseInst,
9898
int OpIdx0, int OpIdx1) const;
9999

100+
/// Check whether we have enough local memory for promotion.
101+
bool hasSufficientLocalMem(const Function &F);
102+
100103
public:
101104
static char ID;
102105

@@ -107,7 +110,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
107110

108111
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
109112

110-
void handleAlloca(AllocaInst &I);
113+
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
111114

112115
void getAnalysisUsage(AnalysisUsage &AU) const override {
113116
AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
147150
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
148151
if (!ST.isPromoteAllocaEnabled())
149152
return false;
150-
AS = AMDGPU::getAMDGPUAS(*F.getParent());
151-
152-
FunctionType *FTy = F.getFunctionType();
153-
154-
// If the function has any arguments in the local address space, then it's
155-
// possible these arguments require the entire local memory space, so
156-
// we cannot use local memory in the pass.
157-
for (Type *ParamTy : FTy->params()) {
158-
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
159-
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
160-
LocalMemLimit = 0;
161-
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
162-
"local memory disabled.\n");
163-
return false;
164-
}
165-
}
166-
167-
LocalMemLimit = ST.getLocalMemorySize();
168-
if (LocalMemLimit == 0)
169-
return false;
170-
171-
const DataLayout &DL = Mod->getDataLayout();
172-
173-
// Check how much local memory is being used by global objects
174-
CurrentLocalMemUsage = 0;
175-
for (GlobalVariable &GV : Mod->globals()) {
176-
if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
177-
continue;
178-
179-
for (const User *U : GV.users()) {
180-
const Instruction *Use = dyn_cast<Instruction>(U);
181-
if (!Use)
182-
continue;
183-
184-
if (Use->getParent()->getParent() == &F) {
185-
unsigned Align = GV.getAlignment();
186-
if (Align == 0)
187-
Align = DL.getABITypeAlignment(GV.getValueType());
188153

189-
// FIXME: Try to account for padding here. The padding is currently
190-
// determined from the inverse order of uses in the function. I'm not
191-
// sure if the use list order is in any way connected to this, so the
192-
// total reported size is likely incorrect.
193-
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
194-
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
195-
CurrentLocalMemUsage += AllocSize;
196-
break;
197-
}
198-
}
199-
}
200-
201-
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
202-
F);
203-
204-
// Restrict local memory usage so that we don't drastically reduce occupancy,
205-
// unless it is already significantly reduced.
206-
207-
// TODO: Have some sort of hint or other heuristics to guess occupancy based
208-
// on other factors..
209-
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
210-
if (OccupancyHint == 0)
211-
OccupancyHint = 7;
212-
213-
// Clamp to max value.
214-
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
215-
216-
// Check the hint but ignore it if it's obviously wrong from the existing LDS
217-
// usage.
218-
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
219-
220-
221-
// Round up to the next tier of usage.
222-
unsigned MaxSizeWithWaveCount
223-
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
224-
225-
// Program is possibly broken by using more local mem than available.
226-
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
227-
return false;
228-
229-
LocalMemLimit = MaxSizeWithWaveCount;
230-
231-
DEBUG(
232-
dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
233-
<< " Rounding size to " << MaxSizeWithWaveCount
234-
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
235-
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
236-
<< " available for promotion\n"
237-
);
154+
AS = AMDGPU::getAMDGPUAS(*F.getParent());
238155

156+
bool SufficientLDS = hasSufficientLocalMem(F);
157+
bool Changed = false;
239158
BasicBlock &EntryBB = *F.begin();
240159
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
241160
AllocaInst *AI = dyn_cast<AllocaInst>(I);
242161

243162
++I;
244163
if (AI)
245-
handleAlloca(*AI);
164+
Changed |= handleAlloca(*AI, SufficientLDS);
246165
}
247166

248-
return true;
167+
return Changed;
249168
}
250169

251170
std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
661580
return true;
662581
}
663582

583+
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
584+
585+
FunctionType *FTy = F.getFunctionType();
586+
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
587+
588+
// If the function has any arguments in the local address space, then it's
589+
// possible these arguments require the entire local memory space, so
590+
// we cannot use local memory in the pass.
591+
for (Type *ParamTy : FTy->params()) {
592+
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
593+
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
594+
LocalMemLimit = 0;
595+
DEBUG(dbgs() << "Function has local memory argument. Promoting to "
596+
"local memory disabled.\n");
597+
return false;
598+
}
599+
}
600+
601+
LocalMemLimit = ST.getLocalMemorySize();
602+
if (LocalMemLimit == 0)
603+
return false;
604+
605+
const DataLayout &DL = Mod->getDataLayout();
606+
607+
// Check how much local memory is being used by global objects
608+
CurrentLocalMemUsage = 0;
609+
for (GlobalVariable &GV : Mod->globals()) {
610+
if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
611+
continue;
612+
613+
for (const User *U : GV.users()) {
614+
const Instruction *Use = dyn_cast<Instruction>(U);
615+
if (!Use)
616+
continue;
617+
618+
if (Use->getParent()->getParent() == &F) {
619+
unsigned Align = GV.getAlignment();
620+
if (Align == 0)
621+
Align = DL.getABITypeAlignment(GV.getValueType());
622+
623+
// FIXME: Try to account for padding here. The padding is currently
624+
// determined from the inverse order of uses in the function. I'm not
625+
// sure if the use list order is in any way connected to this, so the
626+
// total reported size is likely incorrect.
627+
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
628+
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
629+
CurrentLocalMemUsage += AllocSize;
630+
break;
631+
}
632+
}
633+
}
634+
635+
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
636+
F);
637+
638+
// Restrict local memory usage so that we don't drastically reduce occupancy,
639+
// unless it is already significantly reduced.
640+
641+
// TODO: Have some sort of hint or other heuristics to guess occupancy based
642+
// on other factors..
643+
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
644+
if (OccupancyHint == 0)
645+
OccupancyHint = 7;
646+
647+
// Clamp to max value.
648+
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
649+
650+
// Check the hint but ignore it if it's obviously wrong from the existing LDS
651+
// usage.
652+
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
653+
654+
655+
// Round up to the next tier of usage.
656+
unsigned MaxSizeWithWaveCount
657+
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
658+
659+
// Program is possibly broken by using more local mem than available.
660+
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
661+
return false;
662+
663+
LocalMemLimit = MaxSizeWithWaveCount;
664+
665+
DEBUG(
666+
dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
667+
<< " Rounding size to " << MaxSizeWithWaveCount
668+
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
669+
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
670+
<< " available for promotion\n"
671+
);
672+
673+
return true;
674+
}
675+
664676
// FIXME: Should try to pick the most likely to be profitable allocas first.
665-
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
677+
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
666678
// Array allocations are probably not worth handling, since an allocation of
667679
// the array type is the canonical form.
668680
if (!I.isStaticAlloca() || I.isArrayAllocation())
669-
return;
681+
return false;
670682

671683
IRBuilder<> Builder(&I);
672684

@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
675687

676688
DEBUG(dbgs() << "Trying to promote " << I << '\n');
677689

678-
if (tryPromoteAllocaToVector(&I, AS)) {
679-
DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
680-
return;
681-
}
690+
if (tryPromoteAllocaToVector(&I, AS))
691+
return true; // Promoted to vector.
682692

683693
const Function &ContainingFunction = *I.getParent()->getParent();
684694
CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
692702
break;
693703
default:
694704
DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
695-
return;
705+
return false;
696706
}
697707

708+
// Not likely to have sufficient local memory for promotion.
709+
if (!SufficientLDS)
710+
return false;
711+
698712
const AMDGPUSubtarget &ST =
699713
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
700714
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
718732
if (NewSize > LocalMemLimit) {
719733
DEBUG(dbgs() << " " << AllocSize
720734
<< " bytes of local memory not available to promote\n");
721-
return;
735+
return false;
722736
}
723737

724738
CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
727741

728742
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
729743
DEBUG(dbgs() << " Do not know how to convert all uses\n");
730-
return;
744+
return false;
731745
}
732746

733747
DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
873887
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
874888
}
875889
}
890+
return true;
876891
}
877892

878893
FunctionPass *llvm::createAMDGPUPromoteAlloca() {

test/CodeGen/AMDGPU/vector-alloca.ll

+22
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,25 @@ entry:
138138
store float %tmp2, float addrspace(1)* %out
139139
ret void
140140
}
141+
142+
; The pointer arguments in local address space should not affect promotion to vector.
143+
144+
; OPT-LABEL: @vector_read_with_local_arg(
145+
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
146+
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
147+
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
148+
entry:
149+
%tmp = alloca [4 x i32]
150+
%x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
151+
%y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
152+
%z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
153+
%w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
154+
store i32 0, i32* %x
155+
store i32 1, i32* %y
156+
store i32 2, i32* %z
157+
store i32 3, i32* %w
158+
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
159+
%tmp2 = load i32, i32* %tmp1
160+
store i32 %tmp2, i32 addrspace(1)* %out
161+
ret void
162+
}

0 commit comments

Comments
 (0)