Skip to content

Commit e160c70

Browse files
committed
Return lower cost for dupq
1 parent a42ffba commit e160c70

File tree

2 files changed

+25
-5
lines changed

2 files changed

+25
-5
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5599,6 +5599,26 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
55995599
SrcTy = DstTy;
56005600
}
56015601

5602+
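// Segmented shuffle matching: treat the fixed-length vector as a sequence of
// 128-bit segments and look for masks that permute within each segment.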
5603+
if (ST->hasSVE2p1() && CostKind == TTI::TCK_RecipThroughput &&
5604+
Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Tp) &&
5605+
Tp->getPrimitiveSizeInBits().isKnownMultipleOf(128)) {
5606+
5607+
FixedVectorType *VTy = cast<FixedVectorType>(Tp);
5608+
unsigned Segments = VTy->getPrimitiveSizeInBits() / 128;
5609+
unsigned SegmentElts = VTy->getNumElements() / Segments;
5610+
5611+
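// dupq zd.t, zn.t[idx]: broadcast the indexed lane within each 128-bit
// segment, i.e. every mask element must select lane `idx` of its own segment.
// NOTE(review): Mask[0] is read below before Mask.size() is checked; confirm
// that an empty Mask cannot reach this point.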
5612+
unsigned Lane = (unsigned)Mask[0];
5613+
if (SegmentElts * Segments == Mask.size() && Lane < SegmentElts) {
5614+
bool IsDupQ = true;
5615+
for (unsigned I = 1; I < Mask.size(); ++I)
5616+
IsDupQ &= (unsigned)Mask[I] == Lane + ((I / SegmentElts) * SegmentElts);
5617+
if (IsDupQ)
5618+
return LT.first;
5619+
}
5620+
}
5621+
56025622
// Check for broadcast loads, which are supported by the LD1R instruction.
56035623
// In terms of code-size, the shuffle vector is free when a load + dup get
56045624
// folded into a LD1R. That's what we check and return here. For performance

llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
;; Broadcast indexed lane within 128b segments (dupq zd.t, zn.t[idx])
55
define void @dup_within_each_segment() #0 {
66
; CHECK-LABEL: 'dup_within_each_segment'
7-
; CHECK-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
8-
; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_h2 = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_s3 = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dupq_d0 = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %dupq_512b_d1 = shufflevector <8 x i64> poison, <8 x i64> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
1313
;
1414
%dupq_b11 = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,

0 commit comments

Comments
 (0)