Skip to content

Commit 0d3ba08

Browse files
committed
[LV] Move IV bypass value creation out of ILV (NFC)
createInductionAdditionalBypassValues is only used for epilogue vectorization now. Move it out of ILV, which means we do not have to thread through ExpandedSCEVs and also don't have to track the bypass values in ILV. Instead, directly create them if needed after executing the epilogue plan. This moves more of the epilogue-specific logic out of the generic executePlan.
1 parent 5d54043 commit 0d3ba08

File tree

3 files changed

+69
-107
lines changed

3 files changed

+69
-107
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -446,17 +446,15 @@ class LoopVectorizationPlanner {
446446
/// TODO: \p VectorizingEpilogue indicates if the executed VPlan is for the
447447
/// epilogue vector loop. It should be removed once the re-use issue has been
448448
/// fixed.
449-
/// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
450-
/// to re-use expansion results generated during main plan execution.
451449
///
452450
/// Returns a mapping of SCEVs to their expanded IR values.
453451
/// Note that this is a temporary workaround needed due to the current
454452
/// epilogue handling.
455-
DenseMap<const SCEV *, Value *>
456-
executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
457-
InnerLoopVectorizer &LB, DominatorTree *DT,
458-
bool VectorizingEpilogue,
459-
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
453+
DenseMap<const SCEV *, Value *> executePlan(ElementCount VF, unsigned UF,
454+
VPlan &BestPlan,
455+
InnerLoopVectorizer &LB,
456+
DominatorTree *DT,
457+
bool VectorizingEpilogue);
460458

461459
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
462460
void printPlans(raw_ostream &O);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 57 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -503,11 +503,8 @@ class InnerLoopVectorizer {
503503
/// is generated around the vectorized (and scalar epilogue) loops consisting
504504
/// of various checks and bypasses. Return the pre-header block of the new
505505
/// loop. In the case of epilogue vectorization, this function is overridden to
506-
/// handle the more complex control flow around the loops. \p ExpandedSCEVs is
507-
/// used to look up SCEV expansions for expressions needed during skeleton
508-
/// creation.
509-
virtual BasicBlock *
510-
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
506+
/// handle the more complex control flow around the loops.
507+
virtual BasicBlock *createVectorizedLoopSkeleton();
511508

512509
/// Fix the vectorized code, taking care of header phi's, and more.
513510
void fixVectorizedLoop(VPTransformState &State);
@@ -535,12 +532,6 @@ class InnerLoopVectorizer {
535532
/// count of the original loop for both main loop and epilogue vectorization.
536533
void setTripCount(Value *TC) { TripCount = TC; }
537534

538-
// Retrieve the additional bypass value associated with an original
539-
/// induction header phi.
540-
Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
541-
return Induction2AdditionalBypassValue.at(OrigPhi);
542-
}
543-
544535
/// Return the additional bypass block which targets the scalar loop by
545536
/// skipping the epilogue loop after completing the main loop.
546537
BasicBlock *getAdditionalBypassBlock() const {
@@ -577,11 +568,6 @@ class InnerLoopVectorizer {
577568
/// vector loop preheader, middle block and scalar preheader.
578569
void createVectorLoopSkeleton(StringRef Prefix);
579570

580-
/// Create and record the values for induction variables to resume coming from
581-
/// the additional bypass block.
582-
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
583-
Value *MainVectorTripCount);
584-
585571
/// Allow subclasses to override and print debug traces before/after vplan
586572
/// execution, when trace information is requested.
587573
virtual void printDebugTracesAtStart() {}
@@ -671,11 +657,6 @@ class InnerLoopVectorizer {
671657
/// for cleaning the checks, if vectorization turns out unprofitable.
672658
GeneratedRTChecks &RTChecks;
673659

674-
/// Mapping of induction phis to their additional bypass values. They
675-
/// need to be added as operands to phi nodes in the scalar loop preheader
676-
/// after the epilogue skeleton has been created.
677-
DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678-
679660
/// The additional bypass block which conditionally skips over the epilogue
680661
/// loop after executing the main loop. Needed to resume inductions and
681662
/// reductions during epilogue vectorization.
@@ -738,16 +719,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
738719

739720
// Override this function to handle the more complex control flow around the
740721
// three loops.
741-
BasicBlock *
742-
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743-
return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
722+
BasicBlock *createVectorizedLoopSkeleton() final {
723+
return createEpilogueVectorizedLoopSkeleton();
744724
}
745725

746726
/// The interface for creating a vectorized skeleton using one of two
747727
/// different strategies, each corresponding to one execution of the vplan
748728
/// as described above.
749-
virtual BasicBlock *
750-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
729+
virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
751730

752731
/// Holds and updates state information required to vectorize the main loop
753732
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -775,8 +754,7 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
775754
EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776755
/// Implements the interface for creating a vectorized skeleton using the
777756
/// *main loop* strategy (ie the first pass of vplan execution).
778-
BasicBlock *
779-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
757+
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
780758

781759
protected:
782760
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -806,8 +784,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
806784
}
807785
/// Implements the interface for creating a vectorized skeleton using the
808786
/// *epilogue loop* strategy (ie the second pass of vplan execution).
809-
BasicBlock *
810-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
787+
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
811788

812789
protected:
813790
/// Emits an iteration count bypass check after the main vector loop has
@@ -2722,44 +2699,7 @@ static void addFullyUnrolledInstructionsToIgnore(
27222699
}
27232700
}
27242701

2725-
void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2726-
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2727-
assert(MainVectorTripCount && "Must have bypass information");
2728-
2729-
Instruction *OldInduction = Legal->getPrimaryInduction();
2730-
IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2731-
getAdditionalBypassBlock()->getFirstInsertionPt());
2732-
for (const auto &InductionEntry : Legal->getInductionVars()) {
2733-
PHINode *OrigPhi = InductionEntry.first;
2734-
const InductionDescriptor &II = InductionEntry.second;
2735-
Value *Step = getExpandedStep(II, ExpandedSCEVs);
2736-
// For the primary induction the additional bypass end value is known.
2737-
// Otherwise it is computed.
2738-
Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2739-
if (OrigPhi != OldInduction) {
2740-
auto *BinOp = II.getInductionBinOp();
2741-
// Fast-math-flags propagate from the original induction instruction.
2742-
if (isa_and_nonnull<FPMathOperator>(BinOp))
2743-
BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2744-
2745-
// Compute the end value for the additional bypass.
2746-
EndValueFromAdditionalBypass =
2747-
emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2748-
II.getStartValue(), Step, II.getKind(), BinOp);
2749-
EndValueFromAdditionalBypass->setName("ind.end");
2750-
}
2751-
2752-
// Store the bypass value here, as it needs to be added as operand to its
2753-
// scalar preheader phi node after the epilogue skeleton has been created.
2754-
// TODO: Directly add as extra operand to the VPResumePHI recipe.
2755-
assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2756-
"entry for OrigPhi already exits");
2757-
Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2758-
}
2759-
}
2760-
2761-
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2762-
const SCEV2ValueTy &ExpandedSCEVs) {
2702+
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
27632703
/*
27642704
In this function we generate a new loop. The new loop will contain
27652705
the vectorized instructions while the old loop will continue to run the
@@ -7726,16 +7666,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
77267666

77277667
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77287668
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7729-
InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7730-
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7669+
InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
77317670
assert(BestVPlan.hasVF(BestVF) &&
77327671
"Trying to execute plan with unsupported VF");
77337672
assert(BestVPlan.hasUF(BestUF) &&
77347673
"Trying to execute plan with unsupported UF");
7735-
assert(
7736-
((VectorizingEpilogue && ExpandedSCEVs) ||
7737-
(!VectorizingEpilogue && !ExpandedSCEVs)) &&
7738-
"expanded SCEVs to reuse can only be used during epilogue vectorization");
77397674
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
77407675
// cost model is complete for better cost estimates.
77417676
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
@@ -7773,8 +7708,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77737708
// middle block. The vector loop is created during VPlan execution.
77747709
VPBasicBlock *VectorPH =
77757710
cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7776-
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7777-
ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7711+
7712+
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
77787713
if (VectorizingEpilogue)
77797714
VPlanTransforms::removeDeadRecipes(BestVPlan);
77807715

@@ -7815,8 +7750,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78157750
BestVPlan.execute(&State);
78167751

78177752
auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7818-
// 2.5 When vectorizing the epilogue, fix reduction and induction resume
7819-
// values from the additional bypass block.
7753+
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
7754+
// additional bypass block.
78207755
if (VectorizingEpilogue) {
78217756
assert(!ILV.Legal->hasUncountableEarlyExit() &&
78227757
"Epilogue vectorisation not yet supported with early exits");
@@ -7834,11 +7769,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78347769
fixReductionScalarResumeWhenVectorizingEpilog(
78357770
&R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
78367771
}
7837-
for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7838-
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7839-
Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7840-
Inc->setIncomingValueForBlock(BypassBlock, V);
7841-
}
78427772
}
78437773

78447774
// 2.6. Maintain Loop Hints
@@ -7900,8 +7830,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
79007830

79017831
/// This function is partially responsible for generating the control flow
79027832
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7903-
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7904-
const SCEV2ValueTy &ExpandedSCEVs) {
7833+
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
79057834
createVectorLoopSkeleton("");
79067835

79077836
// Generate the code to check the minimum iteration count of the vector
@@ -8011,8 +7940,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
80117940
/// This function is partially responsible for generating the control flow
80127941
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
80137942
BasicBlock *
8014-
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8015-
const SCEV2ValueTy &ExpandedSCEVs) {
7943+
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
80167944
createVectorLoopSkeleton("vec.epilog.");
80177945

80187946
// Now, compare the remaining count and if there aren't enough iterations to
@@ -8080,11 +8008,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
80808008
Phi->removeIncomingValue(EPI.MemSafetyCheck);
80818009
}
80828010

8083-
// Generate bypass values from the additional bypass block. Note that when the
8084-
// vectorized epilogue is skipped due to iteration count check, then the
8085-
// resume value for the induction variable comes from the trip count of the
8086-
// main vector loop, passed as the second argument.
8087-
createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
80888011
return LoopVectorPreHeader;
80898012
}
80908013

@@ -10529,6 +10452,33 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1052910452
}
1053010453
}
1053110454

10455+
// Generate bypass values from the additional bypass block. Note that when the
10456+
// vectorized epilogue is skipped due to iteration count check, then the
10457+
// resume value for the induction variable comes from the trip count of the
10458+
// main vector loop, passed as \p MainVectorTripCount.
10459+
static Value *createInductionAdditionalBypassValues(
10460+
PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
10461+
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
10462+
Instruction *OldInduction) {
10463+
Value *Step = getExpandedStep(II, ExpandedSCEVs);
10464+
// For the primary induction the additional bypass end value is known.
10465+
// Otherwise it is computed.
10466+
Value *EndValueFromAdditionalBypass = MainVectorTripCount;
10467+
if (OrigPhi != OldInduction) {
10468+
auto *BinOp = II.getInductionBinOp();
10469+
// Fast-math-flags propagate from the original induction instruction.
10470+
if (isa_and_nonnull<FPMathOperator>(BinOp))
10471+
BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
10472+
10473+
// Compute the end value for the additional bypass.
10474+
EndValueFromAdditionalBypass =
10475+
emitTransformedIndex(BypassBuilder, MainVectorTripCount,
10476+
II.getStartValue(), Step, II.getKind(), BinOp);
10477+
EndValueFromAdditionalBypass->setName("ind.end");
10478+
}
10479+
return EndValueFromAdditionalBypass;
10480+
}
10481+
1053210482
bool LoopVectorizePass::processLoop(Loop *L) {
1053310483
assert((EnableVPlanNativePath || L->isInnermost()) &&
1053410484
"VPlan-native path is not enabled. Only process inner loops.");
@@ -10912,7 +10862,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1091210862
preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
1091310863

1091410864
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10915-
DT, true, &ExpandedSCEVs);
10865+
DT, true);
10866+
10867+
// Fix induction resume values from the additional bypass block.
10868+
BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
10869+
IRBuilder<> BypassBuilder(BypassBlock,
10870+
BypassBlock->getFirstInsertionPt());
10871+
BasicBlock *PH = L->getLoopPreheader();
10872+
for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
10873+
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
10874+
Value *V = createInductionAdditionalBypassValues(
10875+
IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
10876+
LVL.getPrimaryInduction());
10877+
// TODO: Directly add as extra operand to the VPResumePHI recipe.
10878+
Inc->setIncomingValueForBlock(BypassBlock, V);
10879+
}
1091610880
++LoopsEpilogueVectorized;
1091710881

1091810882
if (!MainILV.areSafetyChecksAdded())

llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,11 @@ define void @_Z3fn1v() #0 {
182182
; CHECK-NEXT: [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
183183
; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
184184
; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
185-
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY51:%.*]]
186-
; CHECK: vec.epilog.vector.body51:
187-
; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
188-
; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
189-
; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
185+
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY49:%.*]]
186+
; CHECK: vec.epilog.vector.body49:
187+
; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
188+
; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
189+
; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
190190
; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
191191
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
192192
; CHECK-NEXT: [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
@@ -205,8 +205,8 @@ define void @_Z3fn1v() #0 {
205205
; CHECK-NEXT: [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
206206
; CHECK-NEXT: [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
207207
; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
208-
; CHECK-NEXT: br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY51]], !llvm.loop [[LOOP5:![0-9]+]]
209-
; CHECK: vec.epilog.middle.block64:
208+
; CHECK-NEXT: br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP5:![0-9]+]]
209+
; CHECK: vec.epilog.middle.block62:
210210
; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
211211
; CHECK-NEXT: br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]]
212212
; CHECK: vec.epilog.scalar.ph40:

0 commit comments

Comments
 (0)