[IA] Support vp.load in lowerInterleavedLoad [nfc-ish] #149174


Merged (3 commits, Jul 18, 2025)
19 changes: 6 additions & 13 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3201,11 +3201,15 @@ class LLVM_ABI TargetLoweringBase {
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
/// \p LI is the vector load instruction.
/// \p Load is the vector load instruction. Can be either a plain load
/// instruction or a vp.load intrinsic.
/// \p Mask is a per-segment (i.e. number of lanes equal to that of one
/// component being interwoven) mask. Can be nullptr, in which case the
/// result is unconditional.
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
virtual bool lowerInterleavedLoad(LoadInst *LI,
virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
@@ -3223,17 +3227,6 @@
return false;
}

/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
/// \p Load is a vp.load instruction.
/// \p Mask is a mask value
/// \p DeinterleaveRes is a list of deinterleaved results.
virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveRes) const {
return false;
}

/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
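For reference, a minimal sketch of how a target can override the updated hook, assuming a hypothetical MyTargetLowering class (this only illustrates the contract documented above, it is not code from the patch): a plain LoadInst arrives with Mask == nullptr, while a vp.load arrives as a VPIntrinsic together with the per-segment mask derived by InterleavedAccessPass.

// Illustrative only: hypothetical override of the unified hook.
bool MyTargetLowering::lowerInterleavedLoad(
    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  if (auto *LI = dyn_cast<LoadInst>(Load)) {
    assert(!Mask && "plain loads are unconditional");
    // ... emit the target's ldN / segment-load intrinsic from LI ...
    return true;
  }
  if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load);
      VPLoad && VPLoad->getIntrinsicID() == Intrinsic::vp_load) {
    assert(Mask && "vp.load carries a per-segment mask here");
    // ... emit a masked segment load using Mask and the scaled EVL ...
    return true;
  }
  return false; // unsupported form; keep the original IR
}

Targets that only handle plain loads (e.g. AArch64 and ARM below) simply return false when Load is not a LoadInst.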
31 changes: 10 additions & 21 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
bool BinOpShuffleChanged =
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

Value *Mask = nullptr;
if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
Value *LaneMask =
getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
if (!LaneMask)
Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
if (!Mask)
return false;

LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");

// Sometimes the number of Shuffles might be less than Factor, we have to
// fill the gaps with null. Also, lowerInterleavedVPLoad
// expects them to be sorted.
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
} else {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");

// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
Factor))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
}

// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
Indices, Factor))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;

DeadInsts.insert_range(Shuffles);

DeadInsts.insert(Load);
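To clarify what "per-segment" means for the Mask handed to the hook above: the vp.load's mask has Factor times as many lanes as one de-interleaved component, and getMask only succeeds when it can reduce that wide mask to a single narrow mask that applies to every component. Below is a simplified, constant-mask-only sketch of that idea; it is not the pass's actual getMask helper, which handles additional cases.

// Simplified illustration (constant masks only): wide lanes i*Factor+j must
// agree for all j, so the same i-th bit can guard each de-interleaved
// component. Returns the <SegLanes x i1> per-segment mask, or nullptr.
static Constant *getSegmentMaskIfUniform(Value *WideMask, unsigned Factor,
                                         unsigned SegLanes) {
  auto *C = dyn_cast<Constant>(WideMask);
  if (!C)
    return nullptr;
  SmallVector<Constant *, 16> Lanes;
  for (unsigned i = 0; i < SegLanes; ++i) {
    Constant *Lane = C->getAggregateElement(i * Factor);
    if (!Lane)
      return nullptr;
    for (unsigned j = 1; j < Factor; ++j)
      if (C->getAggregateElement(i * Factor + j) != Lane)
        return nullptr;
    Lanes.push_back(Lane);
  }
  return ConstantVector::get(Lanes);
}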
7 changes: 6 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17155,14 +17155,19 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");

auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
assert(!Mask && "Unexpected mask on a load");

const DataLayout &DL = LI->getDataLayout();

VectorType *VTy = Shuffles[0]->getType();
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -211,7 +211,7 @@ class AArch64TargetLowering : public TargetLowering {

unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

bool lowerInterleavedLoad(LoadInst *LI,
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
7 changes: 6 additions & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21585,14 +21585,19 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");

auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
assert(!Mask && "Unexpected mask on a load");

auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();

2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMISelLowering.h
@@ -681,7 +681,7 @@ class VectorType;

unsigned getMaxSupportedInterleaveFactor() const override;

bool lowerInterleavedLoad(LoadInst *LI,
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
5 changes: 1 addition & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -429,7 +429,7 @@ class RISCVTargetLowering : public TargetLowering {

bool fallBackToDAGISel(const Instruction &Inst) const override;

bool lowerInterleavedLoad(LoadInst *LI,
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
@@ -444,9 +444,6 @@
Instruction *Store, Value *Mask,
ArrayRef<Value *> InterleaveValues) const override;

bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveRes) const override;

bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
ArrayRef<Value *> InterleaveOps) const override;

181 changes: 45 additions & 136 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -115,21 +115,49 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Indices.size() == Shuffles.size());

IRBuilder<> Builder(LI);

const DataLayout &DL = LI->getDataLayout();
IRBuilder<> Builder(Load);

const DataLayout &DL = Load->getDataLayout();
auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
LI->getPointerAddressSpace(), DL))
return false;
auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());

Value *Ptr, *VL;
Align Alignment;
if (auto *LI = dyn_cast<LoadInst>(Load)) {
[Inline review comment from the PR author] Just to note, this block of code is now repeated across two different reviews with slight differences, and one case is already in tree. Once at least one more of these lands, I'm going to extract a helper and post that for review. I'm picturing something like a getMemOperands(I, Ptr, Alignment, Mask, VL)-like interface where the latter three are updated by reference.

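A rough sketch of what the suggested helper could look like; the name, the extra Builder/type parameters, and the exact division of labor with the caller are guesses here, and the eventual refactoring may differ:

// Hypothetical getMemOperands-style helper (illustrative only). Fills Ptr,
// Alignment, Mask and VL by reference from either a plain load or a vp.load.
// For a vp.load, the caller remains responsible for checking that the wide
// EVL is a multiple of the interleave factor and for dividing it.
static bool getMemOperands(Instruction *I, IRBuilderBase &Builder,
                           VectorType *VTy, Type *XLenTy, const DataLayout &DL,
                           Value *&Ptr, Align &Alignment, Value *&Mask,
                           Value *&VL) {
  if (auto *LI = dyn_cast<LoadInst>(I)) {
    if (!LI->isSimple())
      return false;
    Ptr = LI->getPointerOperand();
    Alignment = LI->getAlign();
    // An unmasked load is equivalent to an all-ones mask over the full vector.
    Mask = Builder.getAllOnesMask(VTy->getElementCount());
    VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
    return true;
  }
  auto *VPLoad = cast<VPIntrinsic>(I);
  if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
    return false;
  Ptr = VPLoad->getMemoryPointerParam();
  Alignment = VPLoad->getPointerAlignment().value_or(
      DL.getABITypeAlign(VTy->getElementType()));
  // The per-segment Mask is supplied by the caller in this case.
  VL = VPLoad->getVectorLengthParam();
  return true;
}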
assert(LI->isSimple());
Ptr = LI->getPointerOperand();
Alignment = LI->getAlign();
assert(!Mask && "Unexpected mask on a load\n");
Mask = Builder.getAllOnesMask(VTy->getElementCount());
VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
} else {
auto *VPLoad = cast<VPIntrinsic>(Load);
assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
"Unexpected intrinsic");
Ptr = VPLoad->getMemoryPointerParam();
Alignment = VPLoad->getPointerAlignment().value_or(
DL.getABITypeAlign(VTy->getElementType()));

auto *PtrTy = LI->getPointerOperandType();
auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
assert(Mask && "vp.load needs a mask!");

Value *WideEVL = VPLoad->getVectorLengthParam();
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, DL, Factor))
return false;

auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
}

Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;

// If the segment load is going to be performed segment at a time anyways
// and there's only one element used, use a strided load instead. This
@@ -138,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
VTy->getElementCount());

Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
// Note: Same VL as above, but i32 not xlen due to signature of
// vp.strided.load
VL = Builder.CreateElementCount(Builder.getInt32Ty(),
VTy->getElementCount());
CallInst *CI =
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{VTy, BasePtr->getType(), Stride->getType()},
{BasePtr, Stride, Mask, VL});
CI->addParamAttr(
0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
CI->addParamAttr(0,
Attribute::getWithAlignment(CI->getContext(), Alignment));
Shuffles[0]->replaceAllUsesWith(CI);
return true;
};

Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
{LI->getPointerOperand(), Mask, VL});
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});

for (unsigned i = 0; i < Shuffles.size(); i++) {
Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -426,122 +451,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
return true;
}

/// Lower an interleaved vp.load into a vlsegN intrinsic.
///
/// E.g. Lower an interleaved vp.load (Factor = 2):
/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
/// %mask,
/// i32 %wide.rvl)
/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
/// @llvm.vector.deinterleave2.nxv64i8(
/// <vscale x 64 x i8> %l)
/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
///
/// Into:
/// %rvl = udiv %wide.rvl, 2
/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
/// <vscale x 32 x i8> undef,
/// ptr %ptr,
/// %mask,
/// i64 %rvl,
/// i64 1)
/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
///
/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
/// removed by the caller
/// TODO: We probably can loosen the dependency on matching extractvalue when
/// dealing with factor of 2 (extractvalue is still required for most of other
/// factors though).
bool RISCVTargetLowering::lowerInterleavedVPLoad(
VPIntrinsic *Load, Value *Mask,
ArrayRef<Value *> DeinterleaveResults) const {
const unsigned Factor = DeinterleaveResults.size();
assert(Mask && "Expect a valid mask");
assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
"Unexpected intrinsic");

Value *FirstActive = *llvm::find_if(DeinterleaveResults,
[](Value *V) { return V != nullptr; });
VectorType *VTy = cast<VectorType>(FirstActive->getType());

auto &DL = Load->getModule()->getDataLayout();
Align Alignment = Load->getParamAlign(0).value_or(
DL.getABITypeAlign(VTy->getElementType()));
if (!isLegalInterleavedAccessType(
VTy, Factor, Alignment,
Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
return false;

IRBuilder<> Builder(Load);

Value *WideEVL = Load->getVectorLengthParam();
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
return false;

auto *PtrTy = Load->getArgOperand(0)->getType();
auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
Value *EVL =
Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);

Value *Return = nullptr;
if (isa<FixedVectorType>(VTy)) {
Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
{VTy, PtrTy, XLenTy},
{Load->getArgOperand(0), Mask, EVL});
} else {
unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
unsigned NumElts = VTy->getElementCount().getKnownMinValue();
Type *VecTupTy = TargetExtType::get(
Load->getContext(), "riscv.vector.tuple",
ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
NumElts * SEW / 8),
Factor);

Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
{VecTupTy, PtrTy, Mask->getType(), EVL->getType()});

Value *Operands[] = {
PoisonValue::get(VecTupTy),
Load->getArgOperand(0),
Mask,
EVL,
ConstantInt::get(XLenTy,
RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
ConstantInt::get(XLenTy, Log2_64(SEW))};

CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);

SmallVector<Type *, 8> AggrTypes{Factor, VTy};
Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
for (unsigned i = 0; i < Factor; ++i) {
Value *VecExtract =
Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
Return = Builder.CreateInsertValue(Return, VecExtract, i);
}
}

for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
if (!DIO)
continue;
// We have to create a brand new ExtractValue to replace each
// of these old ExtractValue instructions.
Value *NewEV =
Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
DIO->replaceAllUsesWith(NewEV);
}

return true;
}

/// Lower an interleaved vp.store into a vssegN intrinsic.
///
/// E.g. Lower an interleaved vp.store (Factor = 2):
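One concrete instance of the single-shuffle strided-load fast path earlier in this file's diff (illustrative numbers, not part of the patch): with Factor = 3, i32 elements, and Indices[0] == 1, the emitted access reads one element per segment.

// Worked example for the strided-load fast path (Factor = 3, i32 elements,
// only component Indices[0] == 1 is used):
//   ScalarSizeInBytes = 4
//   Stride  = Factor * ScalarSizeInBytes     = 12 bytes (one full segment)
//   Offset  = Indices[0] * ScalarSizeInBytes = 4 bytes  (skip component 0)
//   BasePtr = Ptr + 4
// which becomes roughly:
//   llvm.experimental.vp.strided.load(BasePtr, /*Stride=*/12, Mask, VL)
// with the load's alignment propagated onto the pointer argument.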
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86ISelLowering.h
@@ -1661,7 +1661,7 @@ namespace llvm {

/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;