Skip to content

[VPlan] Introduces explicit broadcast for live-in constants. #133213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1399,6 +1399,17 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags,
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

bool onlyFirstLaneUsed(const VPValue *Op) const override {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it's worth pulling the VPlan.h and VPlanRecipes.cpp changes out into a separate PR as they make sense on their own? That way we can see what test changes are due to which code changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Split off to #145449, thanks!

assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
switch (Opcode) {
default:
return false;
case Instruction::ExtractValue:
return Op == getOperand(1);
Comment on lines +1404 to +1410
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simpler to check for the single opcode we are interested in with an if?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, updated in #145449. Thanks!

}
}
};

/// VPWidenCastRecipe is a recipe to create vector cast instructions.
Expand Down Expand Up @@ -1594,6 +1605,14 @@ class LLVM_ABI_FOR_TEST VPWidenCallRecipe : public VPRecipeWithIRFlags,
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// Scalar called function cannot be vectorized.
return Op == getOperand(getNumOperands() - 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand correctly, the last operand is the called scalar function, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is the function pointer of the scalar function.

}
};

/// A recipe representing a sequence of load -> update -> store as part of
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ template <typename Pred, unsigned BitWidth = 0> struct int_pred_ty {
int_pred_ty() : P() {}

bool match(VPValue *VPV) const {
auto *VPI = dyn_cast<VPInstruction>(VPV);
if (VPI && VPI->getOpcode() == VPInstruction::Broadcast &&
VPI->getOperand(0)->isLiveIn())
VPV = VPI->getOperand(0);
if (!VPV->isLiveIn())
return false;
Value *V = VPV->getLiveInIRValue();
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ExtractElement:
case Instruction::ExtractValue:
return Op == getOperand(1);
case Instruction::PHI:
return true;
Expand All @@ -1132,8 +1133,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::WidePtrAdd:
return Op == getOperand(0);
case VPInstruction::ComputeAnyOfResult:
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1);
case VPInstruction::ComputeFindIVResult:
return Op == getOperand(1) || Op == getOperand(2);
case VPInstruction::ExtractLane:
return Op == getOperand(0);
};
Expand Down
26 changes: 20 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3193,10 +3193,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {

auto *VectorPreheader = Plan.getVectorPreheader();
for (VPValue *VPV : VPValues) {
if (all_of(VPV->users(),
[VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
(VPV->isLiveIn() && VPV->getLiveInIRValue() &&
isa<Constant>(VPV->getLiveInIRValue())))
if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }))
continue;

// Add explicit broadcast at the insert point that dominates all users.
Expand All @@ -3213,8 +3210,25 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
"All users must be in the vector preheader or dominated by it");
}

VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
VPInstruction *Broadcast;
if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) {
// We cannot replace the constant live-ins for PHIs by broadcast in the
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand the relevance of PHIs in this code, since nothing else in this function seems to use or create them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the broadcasts of the constant live-ins are put in the same VPBB. Replacing the live-in from a predecessor with a broadcast in the same block would break the definition of the PHI.

// same VPBB because it will break PHI. Also cannot replace the
// VPWidenGEPRecipe since it broadcasts the generated pointer instead of
// operands.
if (auto *R = dyn_cast_if_present<VPRecipeBase>(*(VPV->users().begin()));
R && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPWidenGEPRecipe>(R) &&
!VPV->hasMoreThanOneUniqueUser()) {
Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV});
// Insert just before the user to reduce register pressure.
Broadcast->insertBefore(R);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there a problem here? It looks like you're assuming that the first user of VPV dominates all users of VPV, but I don't think that is always true. The HoistPoint above was chosen as a location to dominate all users so why not just use that instead?

Copy link
Contributor Author

@ElvisWang123 ElvisWang123 Jun 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For register pressure: if all of the constant broadcasts are hoisted to the preheader, the register pressure will be high in the loop body.
A register allocated in the preheader will be marked as used across the entire vector body (perhaps not now, but more accurate in the future), which is not that accurate. If the broadcast instruction is only in the loop body, the register can be freed once its use is reached.

We probably need to require that all users of the constant are in the same BB.

} else {
continue;
}
} else {
VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
}
VPV->replaceUsesWithIf(Broadcast,
[VPV, Broadcast](VPUser &U, unsigned Idx) {
return Broadcast != &U && !U.usesScalars(VPV);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -383,15 +383,15 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX24:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX24]]
; CHECK-NEXT: [[TMP47:%.*]] = load double, ptr [[TMP45]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP47]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP45]], align 8
; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[BROADCAST_SPLAT]], splat (double 5.000000e+00)
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX24]]
; CHECK-NEXT: store <2 x double> [[TMP48]], ptr [[TMP49]], align 8
; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <2 x double> [[TMP48]], <2 x double> [[TMP48]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[INTERLEAVED_VEC26:%.*]] = shufflevector <4 x double> [[TMP52]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC26]], ptr [[TMP49]], align 8
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX24]]
; CHECK-NEXT: store <2 x double> [[TMP48]], ptr [[TMP50]], align 8
; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX24]], 1
; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC26]], ptr [[TMP50]], align 8
; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX24]], 2
; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC23]]
; CHECK-NEXT: br i1 [[TMP51]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
Expand Down
Loading