Skip to content

Commit 20aa7b1

Browse files
committed
[VectorCombine] New folding pattern for extract/binop/shuffle chains
Resolves #144654 Part of #143088 This adds a new `foldShuffleChainsToReduce` for horizontal reduction of patterns like: ```llvm define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 { %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) %7 = extractelement <8 x i16> %6, i64 0 ret i16 %7 } ``` ...which can be reduced to a llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. Similar transformation for other ops when costs permit to do so.
1 parent d4826cd commit 20aa7b1

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ class VectorCombine {
129129
bool foldShuffleOfIntrinsics(Instruction &I);
130130
bool foldShuffleToIdentity(Instruction &I);
131131
bool foldShuffleFromReductions(Instruction &I);
132+
bool foldShuffleChainsToReduce(Instruction &I);
132133
bool foldCastFromReductions(Instruction &I);
133134
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
134135
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2910,6 +2911,130 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
29102911
return foldSelectShuffle(*Shuffle, true);
29112912
}
29122913

2914+
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
2915+
auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
2916+
if (!SVI)
2917+
return false;
2918+
2919+
std::queue<Value *> Worklist;
2920+
SmallVector<Instruction *> ToEraseFromParent;
2921+
2922+
SmallVector<int> ShuffleMask;
2923+
bool IsShuffleOp = true;
2924+
2925+
Worklist.push(SVI);
2926+
SVI->getShuffleMask(ShuffleMask);
2927+
2928+
if (ShuffleMask.size() < 2)
2929+
return false;
2930+
2931+
Instruction *Prev0 = nullptr, *Prev1 = nullptr;
2932+
Instruction *LastOp = nullptr;
2933+
2934+
int MaskHalfPos = ShuffleMask.size() / 2;
2935+
bool IsFirst = true;
2936+
2937+
while (!Worklist.empty()) {
2938+
Value *V = Worklist.front();
2939+
Worklist.pop();
2940+
2941+
auto *CI = dyn_cast<Instruction>(V);
2942+
if (!CI)
2943+
return false;
2944+
2945+
if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
2946+
if (!IsShuffleOp || MaskHalfPos < 1 || (!Prev1 && !IsFirst))
2947+
return false;
2948+
2949+
auto *Op0 = SV->getOperand(0);
2950+
auto *Op1 = SV->getOperand(1);
2951+
if (!Op0 || !Op1)
2952+
return false;
2953+
2954+
auto *FVT = dyn_cast<FixedVectorType>(Op1->getType());
2955+
if (!FVT || !isa<PoisonValue>(Op1))
2956+
return false;
2957+
2958+
SmallVector<int> CurrentMask;
2959+
SV->getShuffleMask(CurrentMask);
2960+
2961+
int64_t MaskSize = CurrentMask.size();
2962+
for (int MaskPos = 0; MaskPos != MaskSize; ++MaskPos) {
2963+
if (MaskPos < MaskHalfPos && CurrentMask[MaskPos] != MaskHalfPos + MaskPos)
2964+
return false;
2965+
if (MaskPos >= MaskHalfPos && CurrentMask[MaskPos] != -1)
2966+
return false;
2967+
}
2968+
MaskHalfPos /= 2;
2969+
Prev0 = SV;
2970+
} else if (auto *Call = dyn_cast<CallInst>(V)) {
2971+
if (IsShuffleOp || !Prev0)
2972+
return false;
2973+
2974+
auto *II = dyn_cast<IntrinsicInst>(Call);
2975+
if (!II)
2976+
return false;
2977+
2978+
switch (II->getIntrinsicID()) {
2979+
case Intrinsic::umin: {
2980+
auto *Op0 = Call->getOperand(0);
2981+
auto *Op1 = Call->getOperand(1);
2982+
if (!(Op0 == Prev0 && Op1 == Prev1) && !(Op0 == Prev1 && Op1 == Prev0) && !IsFirst)
2983+
return false;
2984+
2985+
if (!IsFirst)
2986+
Prev0 = Prev1;
2987+
else
2988+
IsFirst = false;
2989+
Prev1 = Call;
2990+
break;
2991+
}
2992+
default:
2993+
return false;
2994+
}
2995+
} else if (auto *ExtractElement = dyn_cast<ExtractElementInst>(CI)) {
2996+
if (!IsShuffleOp || !Prev0 || !Prev1 || MaskHalfPos != 0)
2997+
return false;
2998+
2999+
auto *Op0 = ExtractElement->getOperand(0);
3000+
auto *Op1 = ExtractElement->getOperand(1);
3001+
if (Op0 != Prev1)
3002+
return false;
3003+
3004+
if (auto *Op1Idx = dyn_cast<ConstantInt>(Op1)) {
3005+
if (Op1Idx->getValue() != 0)
3006+
return false;
3007+
} else {
3008+
return false;
3009+
}
3010+
LastOp = ExtractElement;
3011+
break;
3012+
}
3013+
IsShuffleOp ^= 1;
3014+
ToEraseFromParent.push_back(CI);
3015+
3016+
auto *NextI = CI->getNextNode();
3017+
if (!NextI)
3018+
return false;
3019+
Worklist.push(NextI);
3020+
}
3021+
3022+
if (!LastOp)
3023+
return false;
3024+
3025+
auto *ReducedResult = Builder.CreateIntrinsic(Intrinsic::vector_reduce_umin, {SVI->getType()}, {SVI->getOperand(0)});
3026+
replaceValue(*LastOp, *ReducedResult);
3027+
3028+
ToEraseFromParent.push_back(LastOp);
3029+
3030+
std::reverse(ToEraseFromParent.begin(), ToEraseFromParent.end());
3031+
// for (auto &Instr : ToEraseFromParent)
3032+
// eraseInstruction(*Instr);
3033+
// Instr->eraseFromParent();
3034+
3035+
return true;
3036+
}
3037+
29133038
/// Determine if its more efficient to fold:
29143039
/// reduce(trunc(x)) -> trunc(reduce(x)).
29153040
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3607,6 +3732,7 @@ bool VectorCombine::run() {
36073732
MadeChange |= foldShuffleOfIntrinsics(I);
36083733
MadeChange |= foldSelectShuffle(I);
36093734
MadeChange |= foldShuffleToIdentity(I);
3735+
MadeChange |= foldShuffleChainsToReduce(I);
36103736
break;
36113737
case Instruction::BitCast:
36123738
MadeChange |= foldBitcastShuffle(I);
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
3+
4+
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
5+
; CHECK-LABEL: define i16 @test_reduce_v8i16(
6+
; CHECK-SAME: <8 x i16> [[A0:%.*]]) local_unnamed_addr {
7+
; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
8+
; CHECK-NEXT: ret i16 [[TMP1]]
9+
;
10+
%1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
11+
%2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
12+
%3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
13+
%4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
14+
%5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
15+
%6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
16+
%7 = extractelement <8 x i16> %6, i64 0
17+
ret i16 %7
18+
}

0 commit comments

Comments
 (0)