
Commit 28417e6

[IA] Support vp.load in lowerInterleavedLoad [nfc-ish] (#149174)
This continues in the direction started by commit 4b81dc7. It essentially merges the handling for VPLoad (currently in lowerInterleavedVPLoad) into the existing lowerInterleavedLoad routine. This removes the last use of the dedicated lowerInterleavedVPLoad, which can therefore be deleted. This isn't quite NFC: the main callback supports the strided-load optimization, whereas the VPLoad-specific version didn't. So this adds the ability to form a strided load for a vp.load deinterleave with only one shuffle used.
1 parent cd6311b commit 28417e6
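To make the not-quite-NFC part concrete: a minimal sketch in LLVM IR of the rewrite this enables, with illustrative names and types not taken from the commit or its tests. A factor-2 vp.load whose deinterleave uses only one shuffle can now become a single strided load, assuming a per-segment mask can be derived from the wide mask.

    ; Before: only the odd elements of the factor-2 vp.load are used.
    %wide = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> %wide.mask, i32 8)
    %odd  = shufflevector <8 x i32> %wide, <8 x i32> poison,
                          <4 x i32> <i32 1, i32 3, i32 5, i32 7>

    ; After: one strided load of the used component. The base pointer is
    ; advanced by Indices[0] * element size (1 * 4 bytes), the stride is
    ; Factor * element size (2 * 4 = 8 bytes), and %seg.mask is the assumed
    ; per-segment mask derived from %wide.mask.
    %p.odd = getelementptr i8, ptr %p, i64 4
    %odd   = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
                 ptr %p.odd, i64 8, <4 x i1> %seg.mask, i32 4)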

File tree

11 files changed: +87 −182 lines


llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 13 deletions
@@ -3201,11 +3201,15 @@ class LLVM_ABI TargetLoweringBase {
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p LI is the vector load instruction.
+  /// \p Load is the vector load instruction. Can be either a plain load
+  /// instruction or a vp.load intrinsic.
+  /// \p Mask is a per-segment (i.e. number of lanes equal to that of one
+  /// component being interwoven) mask. Can be nullptr, in which case the
+  /// result is unconditional.
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedLoad(LoadInst *LI,
+  virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
                                     ArrayRef<unsigned> Indices,
                                     unsigned Factor) const {
@@ -3223,17 +3227,6 @@ class LLVM_ABI TargetLoweringBase {
     return false;
   }
 
-  /// Lower an interleaved load to target specific intrinsics. Return
-  /// true on success.
-  ///
-  /// \p Load is a vp.load instruction.
-  /// \p Mask is a mask value
-  /// \p DeinterleaveRes is a list of deinterleaved results.
-  virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
-    return false;
-  }
-
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
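To illustrate the new Mask contract, a hedged example with assumed names (not from the commit): for Factor = 2 deinterleaving an <8 x i32> access into two <4 x i32> components, the hook receives a <4 x i1> mask, one lane per lane of a single component, or nullptr for an unconditional plain load.

    ; Factor-2 interleaved vp.load with an all-true wide mask. The pass would
    ; derive the per-segment mask <4 x i1> splat (i1 true) and pass it to
    ; lowerInterleavedLoad as Mask; a plain LoadInst instead gets Mask == nullptr.
    %wide = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> splat (i1 true), i32 8)
    %c0 = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %c1 = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>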

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 10 additions & 21 deletions
@@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   bool BinOpShuffleChanged =
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
+  Value *Mask = nullptr;
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
-    Value *LaneMask =
-        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
-    if (!LaneMask)
+    Mask = getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
+    if (!Mask)
       return false;
-
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");
-
-    // Sometimes the number of Shuffles might be less than Factor, we have to
-    // fill the gaps with null. Also, lowerInterleavedVPLoad
-    // expects them to be sorted.
-    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
-    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
-      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
-    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
-      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-      return !Extracts.empty() || BinOpShuffleChanged;
   } else {
     LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
-
-    // Try to create target specific intrinsics to replace the load and
-    // shuffles.
-    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
-                                   Factor))
-      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-      return !Extracts.empty() || BinOpShuffleChanged;
   }
 
+  // Try to create target specific intrinsics to replace the load and
+  // shuffles.
+  if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
+                                 Indices, Factor))
+    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+    return !Extracts.empty() || BinOpShuffleChanged;
+
   DeadInsts.insert_range(Shuffles);
 
   DeadInsts.insert(Load);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 6 additions & 1 deletion
@@ -17155,14 +17155,19 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool AArch64TargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   assert(!Shuffles.empty() && "Empty shufflevector input");
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   const DataLayout &DL = LI->getDataLayout();
 
   VectorType *VTy = Shuffles[0]->getType();
llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 1 deletion
@@ -211,7 +211,7 @@ class AArch64TargetLowering : public TargetLowering {
 
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 6 additions & 1 deletion
@@ -21584,14 +21584,19 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
 bool ARMTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   assert(!Shuffles.empty() && "Empty shufflevector input");
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
+  auto *LI = dyn_cast<LoadInst>(Load);
+  if (!LI)
+    return false;
+  assert(!Mask && "Unexpected mask on a load");
+
   auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
   Type *EltTy = VecTy->getElementType();
llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 1 addition & 1 deletion
@@ -681,7 +681,7 @@ class VectorType;
 
   unsigned getMaxSupportedInterleaveFactor() const override;
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;

llvm/lib/Target/RISCV/RISCVISelLowering.h

Lines changed: 1 addition & 4 deletions
@@ -429,7 +429,7 @@ class RISCVTargetLowering : public TargetLowering {
 
   bool fallBackToDAGISel(const Instruction &Inst) const override;
 
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
@@ -444,9 +444,6 @@ class RISCVTargetLowering : public TargetLowering {
                                        Instruction *Store, Value *Mask,
                                        ArrayRef<Value *> InterleaveValues) const override;
 
-  bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
-                              ArrayRef<Value *> DeinterleaveRes) const override;
-
   bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
                                ArrayRef<Value *> InterleaveOps) const override;

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 45 additions & 136 deletions
@@ -115,21 +115,49 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool RISCVTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Indices.size() == Shuffles.size());
 
-  IRBuilder<> Builder(LI);
-
-  const DataLayout &DL = LI->getDataLayout();
+  IRBuilder<> Builder(Load);
 
+  const DataLayout &DL = Load->getDataLayout();
   auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
-  if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
-                                    LI->getPointerAddressSpace(), DL))
-    return false;
+  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+
+  Value *Ptr, *VL;
+  Align Alignment;
+  if (auto *LI = dyn_cast<LoadInst>(Load)) {
+    assert(LI->isSimple());
+    Ptr = LI->getPointerOperand();
+    Alignment = LI->getAlign();
+    assert(!Mask && "Unexpected mask on a load\n");
+    Mask = Builder.getAllOnesMask(VTy->getElementCount());
+    VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
+  } else {
+    auto *VPLoad = cast<VPIntrinsic>(Load);
+    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load &&
+           "Unexpected intrinsic");
+    Ptr = VPLoad->getMemoryPointerParam();
+    Alignment = VPLoad->getPointerAlignment().value_or(
+        DL.getABITypeAlign(VTy->getElementType()));
 
-  auto *PtrTy = LI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+    assert(Mask && "vp.load needs a mask!");
+
+    Value *WideEVL = VPLoad->getVectorLengthParam();
+    // Conservatively check if EVL is a multiple of factor, otherwise some
+    // (trailing) elements might be lost after the transformation.
+    if (!isMultipleOfN(WideEVL, DL, Factor))
+      return false;
+
+    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+  }
+
+  Type *PtrTy = Ptr->getType();
+  unsigned AS = PtrTy->getPointerAddressSpace();
+  if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
+    return false;
 
   // If the segment load is going to be performed segment at a time anyways
   // and there's only one element used, use a strided load instead. This
@@ -138,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
     unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
     Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
-    Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(),
-                                           VTy->getElementCount());
-
+    Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
+    // Note: Same VL as above, but i32 not xlen due to signature of
+    // vp.strided.load
+    VL = Builder.CreateElementCount(Builder.getInt32Ty(),
+                                    VTy->getElementCount());
     CallInst *CI =
         Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
                                 {VTy, BasePtr->getType(), Stride->getType()},
                                 {BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(
-        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    CI->addParamAttr(0,
+                     Attribute::getWithAlignment(CI->getContext(), Alignment));
     Shuffles[0]->replaceAllUsesWith(CI);
     return true;
   };
 
-  Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount());
-  Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
   CallInst *VlsegN = Builder.CreateIntrinsic(
-      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
-      {LI->getPointerOperand(), Mask, VL});
+      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
 
   for (unsigned i = 0; i < Shuffles.size(); i++) {
     Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -426,122 +451,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   return true;
 }
 
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-///   %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-///                                                         %mask,
-///                                                         i32 %wide.rvl)
-///   %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.vector.deinterleave2.nxv64i8(
-///               <vscale x 64 x i8> %l)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-///   %rvl = udiv %wide.rvl, 2
-///   %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-///                                                 <vscale x 32 x i8> undef,
-///                                                 ptr %ptr,
-///                                                 %mask,
-///                                                 i64 %rvl,
-///                                                 i64 1)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
-  const unsigned Factor = DeinterleaveResults.size();
-  assert(Mask && "Expect a valid mask");
-  assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
-         "Unexpected intrinsic");
-
-  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
-  auto &DL = Load->getModule()->getDataLayout();
-  Align Alignment = Load->getParamAlign(0).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Load);
-
-  Value *WideEVL = Load->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Load->getArgOperand(0)->getType();
-  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  Value *EVL =
-      Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
-
-  Value *Return = nullptr;
-  if (isa<FixedVectorType>(VTy)) {
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {VTy, PtrTy, XLenTy},
-                                     {Load->getArgOperand(0), Mask, EVL});
-  } else {
-    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        Load->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
-        {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-    Value *Operands[] = {
-        PoisonValue::get(VecTupTy),
-        Load->getArgOperand(0),
-        Mask,
-        EVL,
-        ConstantInt::get(XLenTy,
-                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
-        ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
-    SmallVector<Type *, 8> AggrTypes{Factor, VTy};
-    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
-    for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract =
-          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
-      Return = Builder.CreateInsertValue(Return, VecExtract, i);
-    }
-  }
-
-  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
-    if (!DIO)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIO->replaceAllUsesWith(NewEV);
-  }
-
-  return true;
-}
-
 /// Lower an interleaved vp.store into a vssegN intrinsic.
 ///
 /// E.g. Lower an interleaved vp.store (Factor = 2):
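For the non-strided path, a hedged sketch of what the unified routine now emits for a masked factor-2 vp.load where both components are used. Names are illustrative, and the exact fixed-vector segment-load intrinsic behind FixedVlsegIntrIds is an assumption here; only the overall shape is asserted.

    ; Input: the EVL must be provably a multiple of the factor (here 2).
    %wide = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> %wm, i32 %evl)
    %even = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd  = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

    ; Output: one masked segment load with VL = EVL / 2, zero-extended to XLEN,
    ; using the per-segment mask %sm derived from %wm.
    %vl   = udiv exact i32 %evl, 2
    %vl.x = zext i32 %vl to i64
    %seg  = call { <4 x i32>, <4 x i32> } @llvm.riscv.seg2.load.mask.v4i32.i64(
                ptr %p, <4 x i1> %sm, i64 %vl.x)
    %even = extractvalue { <4 x i32>, <4 x i32> } %seg, 0
    %odd  = extractvalue { <4 x i32>, <4 x i32> } %seg, 1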

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 1 addition & 1 deletion
@@ -1661,7 +1661,7 @@ namespace llvm {
 
   /// Lower interleaved load(s) into target specific
   /// instructions/intrinsics.
-  bool lowerInterleavedLoad(LoadInst *LI,
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
