Skip to content

Commit 708edb1

Browse files
committed
[VPlan] Handle FirstActiveLane when unrolling.
Currently, FirstActiveLane is not handled correctly during unrolling, which causes mis-compiles when vectorizing early-exit loops with interleaving forced. This patch updates the handling of FirstActiveLane to be analogous to computing final reduction results: during unrolling, the created copies of its original operand are added as additional operands, and FirstActiveLane will always produce the index of the first active lane across all unrolled iterations. Note that some of the generated code is still incorrect, as we also need to handle ExtractElement with FirstActiveLane operands; I will share patches for those soon as well.
1 parent e391301 commit 708edb1

File tree

6 files changed

+344
-17
lines changed

6 files changed

+344
-17
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
955955
// Returns a scalar boolean value, which is true if any lane of its (only
956956
// boolean) vector operand is true.
957957
AnyOf,
958-
// Calculates the first active lane index of the vector predicate operand.
958+
// Calculates the first active lane index of the vector predicate operands.
959+
// It produces the lane index across all unrolled iterations. Unrolling will
960+
// add all copies of its original operand as additional operands.
959961
FirstActiveLane,
960962

961963
// The opcodes below are used for VPInstructionWithType.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -765,9 +765,35 @@ Value *VPInstruction::generate(VPTransformState &State) {
765765
return Builder.CreateOrReduce(A);
766766
}
767767
case VPInstruction::FirstActiveLane: {
768-
Value *Mask = State.get(getOperand(0));
769-
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
770-
true, Name);
768+
if (getNumOperands() == 1) {
769+
Value *Mask = State.get(getOperand(0));
770+
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
771+
true, Name);
772+
}
773+
// If there are multiple operands, create a chain of selects to pick the
774+
// first operand with an active lane and add the number of lanes of the
775+
// preceding operands.
776+
Value *RuntimeVF =
777+
getRuntimeVF(State.Builder, State.Builder.getInt64Ty(), State.VF);
778+
Type *ElemTy = State.TypeAnalysis.inferScalarType(getOperand(0));
779+
Value *RuntimeBitwidth = Builder.CreateMul(
780+
Builder.getInt64(ElemTy->getScalarSizeInBits()), RuntimeVF);
781+
unsigned LastOpIdx = getNumOperands() - 1;
782+
Value *Res = nullptr;
783+
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
784+
Value *Current = Builder.CreateCountTrailingZeroElems(
785+
Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
786+
Current = Builder.CreateAdd(
787+
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), Current);
788+
if (Res) {
789+
Value *Cmp = Builder.CreateICmpNE(Current, RuntimeBitwidth);
790+
Res = Builder.CreateSelect(Cmp, Current, Res);
791+
} else {
792+
Res = Current;
793+
}
794+
}
795+
796+
return Res;
771797
}
772798
default:
773799
llvm_unreachable("Unsupported opcode for instruction");

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,10 +344,12 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
344344
if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
345345
continue;
346346

347-
// Add all VPValues for all parts to ComputeReductionResult which combines
348-
// the parts to compute the final reduction value.
347+
// Add all VPValues for all parts to Compute*Result and FirstActiveLane,
348+
// which combine the parts to compute the final value.
349349
VPValue *Op1;
350-
if (match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
350+
if (match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
351+
m_VPValue(Op1))) ||
352+
match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
351353
m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
352354
match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
353355
m_VPValue(), m_VPValue(Op1))) ||

llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,38 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
3131
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX1]]
3232
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[OFFSET_IDX]]
3333
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
34+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
35+
; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 16
36+
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP19]]
37+
; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
38+
; CHECK-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 32
39+
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP34]]
40+
; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
41+
; CHECK-NEXT: [[TMP37:%.*]] = mul nuw i64 [[TMP36]], 48
42+
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP37]]
3443
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
44+
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP29]], align 1
45+
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP35]], align 1
46+
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP38]], align 1
3547
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[OFFSET_IDX]]
3648
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
49+
; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
50+
; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 16
51+
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP21]]
52+
; CHECK-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
53+
; CHECK-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 32
54+
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP24]]
55+
; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
56+
; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 48
57+
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP27]]
3758
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
59+
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 16 x i8>, ptr [[TMP22]], align 1
60+
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
61+
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
3862
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
63+
; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD6]]
64+
; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
65+
; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD8]]
3966
; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
4067
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP11]])
4168
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
@@ -47,8 +74,28 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
4774
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 510, [[N_VEC]]
4875
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
4976
; CHECK: vector.early.exit:
77+
; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64()
78+
; CHECK-NEXT: [[TMP42:%.*]] = mul nuw i64 [[TMP63]], 16
79+
; CHECK-NEXT: [[TMP43:%.*]] = mul i64 1, [[TMP42]]
80+
; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
81+
; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP42]], 3
82+
; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[TMP62]], [[TMP44]]
83+
; CHECK-NEXT: [[TMP46:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
84+
; CHECK-NEXT: [[TMP58:%.*]] = mul i64 [[TMP42]], 2
85+
; CHECK-NEXT: [[TMP50:%.*]] = add i64 [[TMP58]], [[TMP46]]
86+
; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP50]], [[TMP43]]
87+
; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP47]], i64 [[TMP50]], i64 [[TMP45]]
88+
; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
89+
; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP42]], 1
90+
; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP64]], [[TMP52]]
91+
; CHECK-NEXT: [[TMP53:%.*]] = icmp ne i64 [[TMP56]], [[TMP43]]
92+
; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP53]], i64 [[TMP56]], i64 [[TMP51]]
5093
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP11]], i1 true)
51-
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP15]]
94+
; CHECK-NEXT: [[TMP65:%.*]] = mul i64 [[TMP42]], 0
95+
; CHECK-NEXT: [[TMP60:%.*]] = add i64 [[TMP65]], [[TMP15]]
96+
; CHECK-NEXT: [[TMP59:%.*]] = icmp ne i64 [[TMP60]], [[TMP43]]
97+
; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP59]], i64 [[TMP60]], i64 [[TMP57]]
98+
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX1]], [[TMP61]]
5299
; CHECK-NEXT: [[TMP17:%.*]] = add i64 3, [[TMP16]]
53100
; CHECK-NEXT: br label [[LOOP_END]]
54101
; CHECK: scalar.ph:

0 commit comments

Comments (0)