@@ -376,26 +376,24 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
376
376
CallInst *const SetInactive =
377
377
B.CreateIntrinsic (Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
378
378
379
- CallInst * const FirstDPP =
379
+ ExclScan =
380
380
B.CreateIntrinsic (Intrinsic::amdgcn_update_dpp, Ty,
381
381
{Identity, SetInactive, B.getInt32 (DPP_WF_SR1),
382
382
B.getInt32 (0xf ), B.getInt32 (0xf ), B.getFalse ()});
383
- ExclScan = FirstDPP;
384
383
385
- const unsigned Iters = 7 ;
386
- const unsigned DPPCtrl[Iters] = {
387
- DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
388
- DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
389
- const unsigned RowMask[Iters] = {0xf , 0xf , 0xf , 0xf , 0xf , 0xa , 0xc };
390
- const unsigned BankMask[Iters] = {0xf , 0xf , 0xf , 0xe , 0xc , 0xf , 0xf };
384
+ const unsigned Iters = 6 ;
385
+ const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
386
+ DPP_ROW_SR4, DPP_ROW_SR8 ,
387
+ DPP_ROW_BCAST15, DPP_ROW_BCAST31};
388
+ const unsigned RowMask[Iters] = {0xf , 0xf , 0xf , 0xf , 0xa , 0xc };
389
+ const unsigned BankMask[Iters] = {0xf , 0xf , 0xe , 0xc , 0xf , 0xf };
391
390
392
391
// This loop performs an exclusive scan across the wavefront, with all lanes
393
392
// active (by using the WWM intrinsic).
394
393
for (unsigned Idx = 0 ; Idx < Iters; Idx++) {
395
- Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
396
394
CallInst *const DPP = B.CreateIntrinsic (
397
395
Intrinsic::amdgcn_update_dpp, Ty,
398
- {Identity, UpdateValue , B.getInt32 (DPPCtrl[Idx]),
396
+ {Identity, ExclScan , B.getInt32 (DPPCtrl[Idx]),
399
397
B.getInt32 (RowMask[Idx]), B.getInt32 (BankMask[Idx]), B.getFalse ()});
400
398
401
399
ExclScan = buildNonAtomicBinOp (B, Op, ExclScan, DPP);
0 commit comments