@@ -97,6 +97,9 @@ class AMDGPUPromoteAlloca : public FunctionPass {
97
97
Instruction *UseInst,
98
98
int OpIdx0, int OpIdx1) const ;
99
99
100
+ /// Check whether we have enough local memory for promotion.
101
+ bool hasSufficientLocalMem (const Function &F);
102
+
100
103
public:
101
104
static char ID;
102
105
@@ -107,7 +110,7 @@ class AMDGPUPromoteAlloca : public FunctionPass {
107
110
108
111
StringRef getPassName () const override { return " AMDGPU Promote Alloca" ; }
109
112
110
- void handleAlloca (AllocaInst &I);
113
+ bool handleAlloca (AllocaInst &I, bool SufficientLDS );
111
114
112
115
void getAnalysisUsage (AnalysisUsage &AU) const override {
113
116
AU.setPreservesCFG ();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
147
150
const AMDGPUSubtarget &ST = TM->getSubtarget <AMDGPUSubtarget>(F);
148
151
if (!ST.isPromoteAllocaEnabled ())
149
152
return false ;
150
- AS = AMDGPU::getAMDGPUAS (*F.getParent ());
151
-
152
- FunctionType *FTy = F.getFunctionType ();
153
-
154
- // If the function has any arguments in the local address space, then it's
155
- // possible these arguments require the entire local memory space, so
156
- // we cannot use local memory in the pass.
157
- for (Type *ParamTy : FTy->params ()) {
158
- PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
159
- if (PtrTy && PtrTy->getAddressSpace () == AS.LOCAL_ADDRESS ) {
160
- LocalMemLimit = 0 ;
161
- DEBUG (dbgs () << " Function has local memory argument. Promoting to "
162
- " local memory disabled.\n " );
163
- return false ;
164
- }
165
- }
166
-
167
- LocalMemLimit = ST.getLocalMemorySize ();
168
- if (LocalMemLimit == 0 )
169
- return false ;
170
-
171
- const DataLayout &DL = Mod->getDataLayout ();
172
-
173
- // Check how much local memory is being used by global objects
174
- CurrentLocalMemUsage = 0 ;
175
- for (GlobalVariable &GV : Mod->globals ()) {
176
- if (GV.getType ()->getAddressSpace () != AS.LOCAL_ADDRESS )
177
- continue ;
178
-
179
- for (const User *U : GV.users ()) {
180
- const Instruction *Use = dyn_cast<Instruction>(U);
181
- if (!Use)
182
- continue ;
183
-
184
- if (Use->getParent ()->getParent () == &F) {
185
- unsigned Align = GV.getAlignment ();
186
- if (Align == 0 )
187
- Align = DL.getABITypeAlignment (GV.getValueType ());
188
153
189
- // FIXME: Try to account for padding here. The padding is currently
190
- // determined from the inverse order of uses in the function. I'm not
191
- // sure if the use list order is in any way connected to this, so the
192
- // total reported size is likely incorrect.
193
- uint64_t AllocSize = DL.getTypeAllocSize (GV.getValueType ());
194
- CurrentLocalMemUsage = alignTo (CurrentLocalMemUsage, Align);
195
- CurrentLocalMemUsage += AllocSize;
196
- break ;
197
- }
198
- }
199
- }
200
-
201
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize (CurrentLocalMemUsage,
202
- F);
203
-
204
- // Restrict local memory usage so that we don't drastically reduce occupancy,
205
- // unless it is already significantly reduced.
206
-
207
- // TODO: Have some sort of hint or other heuristics to guess occupancy based
208
- // on other factors..
209
- unsigned OccupancyHint = ST.getWavesPerEU (F).second ;
210
- if (OccupancyHint == 0 )
211
- OccupancyHint = 7 ;
212
-
213
- // Clamp to max value.
214
- OccupancyHint = std::min (OccupancyHint, ST.getMaxWavesPerEU ());
215
-
216
- // Check the hint but ignore it if it's obviously wrong from the existing LDS
217
- // usage.
218
- MaxOccupancy = std::min (OccupancyHint, MaxOccupancy);
219
-
220
-
221
- // Round up to the next tier of usage.
222
- unsigned MaxSizeWithWaveCount
223
- = ST.getMaxLocalMemSizeWithWaveCount (MaxOccupancy, F);
224
-
225
- // Program is possibly broken by using more local mem than available.
226
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
227
- return false ;
228
-
229
- LocalMemLimit = MaxSizeWithWaveCount;
230
-
231
- DEBUG (
232
- dbgs () << F.getName () << " uses " << CurrentLocalMemUsage << " bytes of LDS\n "
233
- << " Rounding size to " << MaxSizeWithWaveCount
234
- << " with a maximum occupancy of " << MaxOccupancy << ' \n '
235
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
236
- << " available for promotion\n "
237
- );
154
+ AS = AMDGPU::getAMDGPUAS (*F.getParent ());
238
155
156
+ bool SufficientLDS = hasSufficientLocalMem (F);
157
+ bool Changed = false ;
239
158
BasicBlock &EntryBB = *F.begin ();
240
159
for (auto I = EntryBB.begin (), E = EntryBB.end (); I != E; ) {
241
160
AllocaInst *AI = dyn_cast<AllocaInst>(I);
242
161
243
162
++I;
244
163
if (AI)
245
- handleAlloca (*AI);
164
+ Changed |= handleAlloca (*AI, SufficientLDS );
246
165
}
247
166
248
- return true ;
167
+ return Changed ;
249
168
}
250
169
251
170
std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
661
580
return true ;
662
581
}
663
582
583
+ bool AMDGPUPromoteAlloca::hasSufficientLocalMem (const Function &F) {
584
+ // Updates the LocalMemLimit and CurrentLocalMemUsage members for F; returns false on any of the three bail-out paths below, true otherwise.
585
+ FunctionType *FTy = F.getFunctionType ();
586
+ const AMDGPUSubtarget &ST = TM->getSubtarget <AMDGPUSubtarget>(F);
587
+
588
+ // If the function has any arguments in the local address space, then it's
589
+ // possible these arguments require the entire local memory space, so
590
+ // we cannot use local memory in the pass.
591
+ for (Type *ParamTy : FTy->params ()) {
592
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
593
+ if (PtrTy && PtrTy->getAddressSpace () == AS.LOCAL_ADDRESS ) {
594
+ LocalMemLimit = 0 ;
595
+ DEBUG (dbgs () << " Function has local memory argument. Promoting to "
596
+ " local memory disabled.\n " );
597
+ return false ;
598
+ }
599
+ }
600
+ // Start from the local memory size the subtarget reports; a size of zero ends the check with false.
601
+ LocalMemLimit = ST.getLocalMemorySize ();
602
+ if (LocalMemLimit == 0 )
603
+ return false ;
604
+
605
+ const DataLayout &DL = Mod->getDataLayout ();
606
+
607
+ // Check how much local memory is being used by global objects
608
+ CurrentLocalMemUsage = 0 ;
609
+ for (GlobalVariable &GV : Mod->globals ()) {
610
+ if (GV.getType ()->getAddressSpace () != AS.LOCAL_ADDRESS )
611
+ continue ;
612
+ // Each such global is added at most once (note the break); users that are not Instructions are skipped.
613
+ for (const User *U : GV.users ()) {
614
+ const Instruction *Use = dyn_cast<Instruction>(U);
615
+ if (!Use)
616
+ continue ;
617
+ // Only instructions whose parent function is F count towards F's usage.
618
+ if (Use->getParent ()->getParent () == &F) {
619
+ unsigned Align = GV.getAlignment ();
620
+ if (Align == 0 )
621
+ Align = DL.getABITypeAlignment (GV.getValueType ());
622
+
623
+ // FIXME: Try to account for padding here. The padding is currently
624
+ // determined from the inverse order of uses in the function. I'm not
625
+ // sure if the use list order is in any way connected to this, so the
626
+ // total reported size is likely incorrect.
627
+ uint64_t AllocSize = DL.getTypeAllocSize (GV.getValueType ());
628
+ CurrentLocalMemUsage = alignTo (CurrentLocalMemUsage, Align);
629
+ CurrentLocalMemUsage += AllocSize;
630
+ break ;
631
+ }
632
+ }
633
+ }
634
+ // Occupancy value the subtarget returns for the usage computed above.
635
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize (CurrentLocalMemUsage,
636
+ F);
637
+
638
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
639
+ // unless it is already significantly reduced.
640
+
641
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
642
+ // on other factors..
643
+ unsigned OccupancyHint = ST.getWavesPerEU (F).second ;
644
+ if (OccupancyHint == 0 )
645
+ OccupancyHint = 7 ; // NOTE(review): fallback hint of 7 is not explained here; confirm the rationale.
646
+
647
+ // Clamp to max value.
648
+ OccupancyHint = std::min (OccupancyHint, ST.getMaxWavesPerEU ());
649
+
650
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
651
+ // usage.
652
+ MaxOccupancy = std::min (OccupancyHint, MaxOccupancy);
653
+
654
+
655
+ // Round up to the next tier of usage.
656
+ unsigned MaxSizeWithWaveCount
657
+ = ST.getMaxLocalMemSizeWithWaveCount (MaxOccupancy, F);
658
+
659
+ // Program is possibly broken by using more local mem than available.
660
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
661
+ return false ;
662
+ // Replace the raw hardware size with the rounded-up tier size; handleAlloca compares new usage against this limit.
663
+ LocalMemLimit = MaxSizeWithWaveCount;
664
+
665
+ DEBUG (
666
+ dbgs () << F.getName () << " uses " << CurrentLocalMemUsage << " bytes of LDS\n "
667
+ << " Rounding size to " << MaxSizeWithWaveCount
668
+ << " with a maximum occupancy of " << MaxOccupancy << ' \n '
669
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
670
+ << " available for promotion\n "
671
+ );
672
+
673
+ return true ;
674
+ }
675
+
664
676
// FIXME: Should try to pick the most likely to be profitable allocas first.
665
- void AMDGPUPromoteAlloca::handleAlloca (AllocaInst &I) {
677
+ bool AMDGPUPromoteAlloca::handleAlloca (AllocaInst &I, bool SufficientLDS ) {
666
678
// Array allocations are probably not worth handling, since an allocation of
667
679
// the array type is the canonical form.
668
680
if (!I.isStaticAlloca () || I.isArrayAllocation ())
669
- return ;
681
+ return false ;
670
682
671
683
IRBuilder<> Builder (&I);
672
684
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
675
687
676
688
DEBUG (dbgs () << " Trying to promote " << I << ' \n ' );
677
689
678
- if (tryPromoteAllocaToVector (&I, AS)) {
679
- DEBUG (dbgs () << " alloca is not a candidate for vectorization.\n " );
680
- return ;
681
- }
690
+ if (tryPromoteAllocaToVector (&I, AS))
691
+ return true ; // Promoted to vector.
682
692
683
693
const Function &ContainingFunction = *I.getParent ()->getParent ();
684
694
CallingConv::ID CC = ContainingFunction.getCallingConv ();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
692
702
break ;
693
703
default :
694
704
DEBUG (dbgs () << " promote alloca to LDS not supported with calling convention.\n " );
695
- return ;
705
+ return false ;
696
706
}
697
707
708
+ // Not likely to have sufficient local memory for promotion.
709
+ if (!SufficientLDS)
710
+ return false ;
711
+
698
712
const AMDGPUSubtarget &ST =
699
713
TM->getSubtarget <AMDGPUSubtarget>(ContainingFunction);
700
714
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes (ContainingFunction).second ;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
718
732
if (NewSize > LocalMemLimit) {
719
733
DEBUG (dbgs () << " " << AllocSize
720
734
<< " bytes of local memory not available to promote\n " );
721
- return ;
735
+ return false ;
722
736
}
723
737
724
738
CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
727
741
728
742
if (!collectUsesWithPtrTypes (&I, &I, WorkList)) {
729
743
DEBUG (dbgs () << " Do not know how to convert all uses\n " );
730
- return ;
744
+ return false ;
731
745
}
732
746
733
747
DEBUG (dbgs () << " Promoting alloca to local memory\n " );
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
873
887
llvm_unreachable (" Don't know how to promote alloca intrinsic use." );
874
888
}
875
889
}
890
+ return true ;
876
891
}
877
892
878
893
FunctionPass *llvm::createAMDGPUPromoteAlloca () {
0 commit comments