-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VPlan] Emit VPVectorEndPointerRecipe for reverse interleave pointer adjustment #144864
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Mel Chen (Mel-Chen) ChangesThis patch introduces VPReverseInterleavePtrRecipe, a new recipe that adjusts the pointer of a reverse interleave group. It takes the pointer of member 0 and the VF as operands and computes the pointer to the last vector lane. The final goal is to support EVL tail folding for interleaved accesses. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustion (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively. One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective. Full diff: https://github.com/llvm/llvm-project/pull/144864.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f887b34e76422..ce40c6ccba92e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4256,6 +4256,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPDerivedIVSC:
case VPDef::VPScalarIVStepsSC:
case VPDef::VPReplicateSC:
+ case VPDef::VPReverseInterleavePtrSC:
case VPDef::VPInstructionSC:
case VPDef::VPCanonicalIVPHISC:
case VPDef::VPVectorPointerSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f3306ad7cb8ec..daef26fe86d79 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -531,6 +531,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInstructionSC:
case VPRecipeBase::VPReductionEVLSC:
case VPRecipeBase::VPReductionSC:
+ case VPRecipeBase::VPReverseInterleavePtrSC:
case VPRecipeBase::VPMulAccumulateReductionSC:
case VPRecipeBase::VPExtendedReductionSC:
case VPRecipeBase::VPReplicateSC:
@@ -851,6 +852,7 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
R->getVPDefID() == VPRecipeBase::VPReductionSC ||
R->getVPDefID() == VPRecipeBase::VPReductionEVLSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
+ R->getVPDefID() == VPRecipeBase::VPReverseInterleavePtrSC ||
R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC ||
R->getVPDefID() == VPRecipeBase::VPVectorPointerSC ||
R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC ||
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
+class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags {
+ Type *IndexedTy;
+ unsigned Factor;
+
+public:
+ VPReverseInterleavePtrRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
+ unsigned Factor, GEPNoWrapFlags GEPFlags,
+ DebugLoc DL)
+ : VPRecipeWithIRFlags(VPDef::VPReverseInterleavePtrSC,
+ ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
+ IndexedTy(IndexedTy), Factor(Factor) {
+ assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPReverseInterleavePtrSC)
+
+ VPValue *getPtr() const { return getOperand(0); }
+
+ VPValue *getVFValue() const { return getOperand(1); }
+
+ void execute(VPTransformState &State) override;
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ // TODO: Compute accurate cost after retiring the legacy cost model.
+ return 0;
+ }
+
+ VPReverseInterleavePtrRecipe *clone() override {
+ return new VPReverseInterleavePtrRecipe(getPtr(), getVFValue(), IndexedTy,
+ Factor, getGEPNoWrapFlags(),
+ getDebugLoc());
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A pure virtual base class for all recipes modeling header phis, including
/// phis for first order recurrences, pointer inductions and reductions. The
/// start value is the first operand of the recipe and the incoming value from
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 76da5b0314a8e..98889cb5c520c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -282,9 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPReverseInterleavePtrRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1ed0b97849a8d..40dde8cfaea73 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -150,6 +150,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPDerivedIVSC:
case VPFirstOrderRecurrencePHISC:
case VPPredInstPHISC:
+ case VPReverseInterleavePtrSC:
case VPVectorEndPointerSC:
return false;
case VPInstructionSC:
@@ -2262,6 +2263,33 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPReverseInterleavePtrRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ Value *Ptr = State.get(getPtr(), /*IsScalar*/ true);
+ Value *RuntimeVF = State.get(getVFValue(), /*IsScalar*/ true);
+ Type *IndexTy = Builder.getInt32Ty();
+ if (RuntimeVF->getType() != IndexTy)
+ RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, IndexTy);
+ Value *Index = Builder.CreateSub(RuntimeVF, Builder.getInt32(1));
+ Index = Builder.CreateMul(Index, Builder.getInt32(Factor));
+ Index = Builder.CreateNeg(Index);
+ Value *ReversePtr =
+ Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags());
+
+ State.set(this, ReversePtr, /*IsScalar*/ true);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPReverseInterleavePtrRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent;
+ printAsOperand(O, SlotTracker);
+ O << " = reverse-interleave-ptr";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
void VPBlendRecipe::execute(VPTransformState &State) {
assert(isNormalized() && "Expected blend to be normalized!");
// We know that all PHIs in non-header blocks are converted into
@@ -3223,25 +3251,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform.
- if (Group->isReverse()) {
- Value *RuntimeVF =
- getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
- Value *Index =
- State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
- Index = State.Builder.CreateMul(Index,
- State.Builder.getInt32(Group->getFactor()));
- Index = State.Builder.CreateNeg(Index);
-
- bool InBounds = false;
- if (auto *Gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
- InBounds = Gep->isInBounds();
- ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);
- }
-
State.setDebugLocFrom(getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 11f0f2a930329..6068b87663047 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2489,6 +2489,21 @@ void VPlanTransforms::createInterleaveGroups(
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
+ // If the group is reverse, adjust the index to refer to the last vector
+ // lane instead of the first. We adjust the index from the first vector
+ // lane, rather than directly getting the pointer for lane VF - 1, because
+ // the pointer operand of the interleaved access is supposed to be uniform.
+ if (IG->isReverse()) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(
+ getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
+ auto *ReversePtr = new VPReverseInterleavePtrRecipe(
+ Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
+ GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
+ : GEPNoWrapFlags::none(),
+ InsertPos->getDebugLoc());
+ ReversePtr->insertBefore(InsertPos);
+ Addr = ReversePtr;
+ }
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc());
VPIG->insertBefore(InsertPos);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index a0d3dc9b934cc..83f6ac223af1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -335,6 +335,7 @@ class VPDef {
VPInterleaveSC,
VPReductionEVLSC,
VPReductionSC,
+ VPReverseInterleavePtrSC,
VPMulAccumulateReductionSC,
VPExtendedReductionSC,
VPPartialReductionSC,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 7e4edf739695a..0333035a4b0bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -367,8 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3
+; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
@@ -381,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
@@ -1579,8 +1579,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
@@ -1599,8 +1599,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4
+; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
|
As mentioned in this patch, supporting interleaved access with EVL tail folding could be complex. The another simpler approach might be to directly create a VPInterleaveEVLRecipe and extract as much shared code as possible into static functions. I'd like to open a discussion to clarify which direction would be preferable. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to somehow reuse VPVectorEndPointerRecipe
and add a Factor field on it? The two recipes look quite similar
Mel-Chen@9e1df0f |
@@ -1796,6 +1798,53 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, | |||
#endif | |||
}; | |||
|
|||
class VPReverseInterleavePtrRecipe : public VPRecipeWithIRFlags { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add comments to the class and the its member functions
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I’ve removed this recipe. On second thought, it’s better to reuse VPVectorEndPointer so we don’t need to convert one more recipe when VPlanTransform::tryAddExplicitVectorLength.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the other
auto *GEP = dyn_cast<GetElementPtrInst>( | ||
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
would be good to share the logic to determine inbounds from the GEP with similar code above
class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, | ||
public VPUnrollPartAccessor<2> { | ||
Type *IndexedTy; | ||
|
||
int64_t Stride; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs a comment
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC, | ||
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL), | ||
IndexedTy(IndexedTy) {} | ||
IndexedTy(IndexedTy), Stride(Stride) { | ||
assert(Stride != 0 && "Unexpected stride"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(Stride != 0 && "Unexpected stride"); | |
assert(Stride != 0 && "Stride cannot be zero"); |
A reverse interleave access is essentially composed of multiple load/store operations with same negative stride, and their addresses are based on the last lane address of member 0 in the interleaved group.
Currently, we already have VPVectorEndPointerRecipe for computing the last lane address of consecutive reverse memory accesses. This patch extends VPVectorEndPointerRecipe to support constant stride and extracts the reverse interleave group address adjustment from VPInterleaveRecipe::execute, replacing it with a VPVectorEndPointerRecipe.
The final goal is to support interleaved accesses with EVL tail folding. Given that VPInterleaveRecipe is large and tightly coupled — combining both load and store, and embedding operations like reverse pointer adjustion (GEP), widen load/store, deinterleave/interleave, and reversal — breaking it down into smaller, dedicated recipes may allow VPlanTransforms::tryAddExplicitVectorLength to lower them into EVL-aware form more effectively.
One foreseeable challenge is that VPlanTransforms::convertToConcreteRecipes currently runs after tryAddExplicitVectorLength, so decomposing VPInterleaveRecipe will likely need to happen earlier in the pipeline to be effective.