Skip to content

Commit 0f45bcd

Browse files
davemgreen
authored and
memfrob
committed
[ARM] Turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
Under MVE we can use VADDV/VADDVA's to perform integer add reductions, so it can be beneficial to use more reductions than summing subvectors and reducing once. Especially for VMLAV/VMLAVA the mul can be incorporated into the reduction, producing fewer instructions. Some of the test cases currently get larger due to extra integer adds, but will be improved in a followup patch.

Differential Revision: https://reviews.llvm.org/D106531
1 parent 679b748 commit 0f45bcd

File tree

5 files changed

+446
-453
lines changed

5 files changed

+446
-453
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15976,6 +15976,15 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
1597615976
SDValue N0 = N->getOperand(0);
1597715977
SDLoc dl(N);
1597815978

15979+
// Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
15980+
if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
15981+
(N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
15982+
N0.getValueType() == MVT::v16i8)) {
15983+
SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
15984+
SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
15985+
return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
15986+
}
15987+
1597915988
// We are looking for something that will have illegal types if left alone,
1598015989
// but that we can convert to a single instruction under MVE. For example
1598115990
// vecreduce_add(sext(A, v8i32)) => VADDV.s16 A

llvm/test/CodeGen/Thumb2/mve-vaddv.ll

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ entry:
3535
define arm_aapcs_vfpcc i32 @vaddv_v8i32_i32(<8 x i32> %s1) {
3636
; CHECK-LABEL: vaddv_v8i32_i32:
3737
; CHECK: @ %bb.0: @ %entry
38-
; CHECK-NEXT: vadd.i32 q0, q0, q1
39-
; CHECK-NEXT: vaddv.u32 r0, q0
38+
; CHECK-NEXT: vaddv.u32 r0, q1
39+
; CHECK-NEXT: vaddva.u32 r0, q0
4040
; CHECK-NEXT: bx lr
4141
entry:
4242
%r = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -56,8 +56,8 @@ entry:
5656
define arm_aapcs_vfpcc i16 @vaddv_v16i16_i16(<16 x i16> %s1) {
5757
; CHECK-LABEL: vaddv_v16i16_i16:
5858
; CHECK: @ %bb.0: @ %entry
59-
; CHECK-NEXT: vadd.i16 q0, q0, q1
60-
; CHECK-NEXT: vaddv.u16 r0, q0
59+
; CHECK-NEXT: vaddv.u16 r0, q1
60+
; CHECK-NEXT: vaddva.u16 r0, q0
6161
; CHECK-NEXT: bx lr
6262
entry:
6363
%r = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -77,8 +77,8 @@ entry:
7777
define arm_aapcs_vfpcc i8 @vaddv_v32i8_i8(<32 x i8> %s1) {
7878
; CHECK-LABEL: vaddv_v32i8_i8:
7979
; CHECK: @ %bb.0: @ %entry
80-
; CHECK-NEXT: vadd.i8 q0, q0, q1
81-
; CHECK-NEXT: vaddv.u8 r0, q0
80+
; CHECK-NEXT: vaddv.u8 r0, q1
81+
; CHECK-NEXT: vaddva.u8 r0, q0
8282
; CHECK-NEXT: bx lr
8383
entry:
8484
%r = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)
@@ -117,8 +117,9 @@ entry:
117117
define arm_aapcs_vfpcc i32 @vaddva_v8i32_i32(<8 x i32> %s1, i32 %x) {
118118
; CHECK-LABEL: vaddva_v8i32_i32:
119119
; CHECK: @ %bb.0: @ %entry
120-
; CHECK-NEXT: vadd.i32 q0, q0, q1
121-
; CHECK-NEXT: vaddva.u32 r0, q0
120+
; CHECK-NEXT: vaddv.u32 r2, q1
121+
; CHECK-NEXT: vaddva.u32 r2, q0
122+
; CHECK-NEXT: add r0, r2
122123
; CHECK-NEXT: bx lr
123124
entry:
124125
%t = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %s1)
@@ -140,8 +141,9 @@ entry:
140141
define arm_aapcs_vfpcc i16 @vaddva_v16i16_i16(<16 x i16> %s1, i16 %x) {
141142
; CHECK-LABEL: vaddva_v16i16_i16:
142143
; CHECK: @ %bb.0: @ %entry
143-
; CHECK-NEXT: vadd.i16 q0, q0, q1
144-
; CHECK-NEXT: vaddva.u16 r0, q0
144+
; CHECK-NEXT: vaddv.u16 r2, q1
145+
; CHECK-NEXT: vaddva.u16 r2, q0
146+
; CHECK-NEXT: add r0, r2
145147
; CHECK-NEXT: bx lr
146148
entry:
147149
%t = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %s1)
@@ -163,8 +165,9 @@ entry:
163165
define arm_aapcs_vfpcc i8 @vaddva_v32i8_i8(<32 x i8> %s1, i8 %x) {
164166
; CHECK-LABEL: vaddva_v32i8_i8:
165167
; CHECK: @ %bb.0: @ %entry
166-
; CHECK-NEXT: vadd.i8 q0, q0, q1
167-
; CHECK-NEXT: vaddva.u8 r0, q0
168+
; CHECK-NEXT: vaddv.u8 r2, q1
169+
; CHECK-NEXT: vaddva.u8 r2, q0
170+
; CHECK-NEXT: add r0, r2
168171
; CHECK-NEXT: bx lr
169172
entry:
170173
%t = call i8 @llvm.vector.reduce.add.i8.v32i8(<32 x i8> %s1)

llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -522,13 +522,11 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i
522522
; CHECK-NEXT: vstrw.32 q0, [r1]
523523
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
524524
; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
525-
; CHECK-NEXT: vldrb.s16 q2, [r1]
526-
; CHECK-NEXT: vmul.i16 q0, q1, q0
527-
; CHECK-NEXT: vldrb.u16 q1, [r0]
528-
; CHECK-NEXT: vmul.i16 q1, q2, q1
529-
; CHECK-NEXT: vadd.i16 q0, q1, q0
530-
; CHECK-NEXT: vaddv.u16 r0, q0
531-
; CHECK-NEXT: sxth r0, r0
525+
; CHECK-NEXT: vmlav.u16 r2, q1, q0
526+
; CHECK-NEXT: vldrb.u16 q0, [r0]
527+
; CHECK-NEXT: vldrb.s16 q1, [r1]
528+
; CHECK-NEXT: vmlava.u16 r2, q1, q0
529+
; CHECK-NEXT: sxth r0, r2
532530
; CHECK-NEXT: add sp, #32
533531
; CHECK-NEXT: bx lr
534532
entry:

llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll

Lines changed: 32 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -703,53 +703,50 @@ define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i
703703
; CHECK-NEXT: vcmp.i8 eq, q2, zr
704704
; CHECK-NEXT: vmov.i8 q0, #0x0
705705
; CHECK-NEXT: vmov.i8 q1, #0xff
706-
; CHECK-NEXT: vldrb.u16 q2, [r0]
706+
; CHECK-NEXT: vldrb.s16 q2, [r1, #8]
707707
; CHECK-NEXT: vpsel q0, q1, q0
708-
; CHECK-NEXT: vldrb.s16 q3, [r1]
709-
; CHECK-NEXT: vmov.u8 r2, q0[0]
708+
; CHECK-NEXT: vmov.u8 r2, q0[8]
709+
; CHECK-NEXT: vmov.u8 r3, q0[0]
710710
; CHECK-NEXT: vmov.16 q1[0], r2
711-
; CHECK-NEXT: vmov.u8 r2, q0[1]
711+
; CHECK-NEXT: vmov.u8 r2, q0[9]
712712
; CHECK-NEXT: vmov.16 q1[1], r2
713-
; CHECK-NEXT: vmov.u8 r2, q0[2]
713+
; CHECK-NEXT: vmov.u8 r2, q0[10]
714714
; CHECK-NEXT: vmov.16 q1[2], r2
715-
; CHECK-NEXT: vmov.u8 r2, q0[3]
715+
; CHECK-NEXT: vmov.u8 r2, q0[11]
716716
; CHECK-NEXT: vmov.16 q1[3], r2
717-
; CHECK-NEXT: vmov.u8 r2, q0[4]
717+
; CHECK-NEXT: vmov.u8 r2, q0[12]
718718
; CHECK-NEXT: vmov.16 q1[4], r2
719-
; CHECK-NEXT: vmov.u8 r2, q0[5]
719+
; CHECK-NEXT: vmov.u8 r2, q0[13]
720720
; CHECK-NEXT: vmov.16 q1[5], r2
721-
; CHECK-NEXT: vmov.u8 r2, q0[6]
721+
; CHECK-NEXT: vmov.u8 r2, q0[14]
722722
; CHECK-NEXT: vmov.16 q1[6], r2
723-
; CHECK-NEXT: vmov.u8 r2, q0[7]
723+
; CHECK-NEXT: vmov.u8 r2, q0[15]
724724
; CHECK-NEXT: vmov.16 q1[7], r2
725-
; CHECK-NEXT: vmov.u8 r2, q0[8]
726725
; CHECK-NEXT: vcmp.i16 ne, q1, zr
727-
; CHECK-NEXT: vmov.i32 q1, #0x0
726+
; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
728727
; CHECK-NEXT: vpst
729-
; CHECK-NEXT: vmult.i16 q1, q3, q2
730-
; CHECK-NEXT: vmov.16 q2[0], r2
731-
; CHECK-NEXT: vmov.u8 r2, q0[9]
732-
; CHECK-NEXT: vmov.16 q2[1], r2
733-
; CHECK-NEXT: vmov.u8 r2, q0[10]
734-
; CHECK-NEXT: vmov.16 q2[2], r2
735-
; CHECK-NEXT: vmov.u8 r2, q0[11]
736-
; CHECK-NEXT: vmov.16 q2[3], r2
737-
; CHECK-NEXT: vmov.u8 r2, q0[12]
738-
; CHECK-NEXT: vmov.16 q2[4], r2
739-
; CHECK-NEXT: vmov.u8 r2, q0[13]
740-
; CHECK-NEXT: vmov.16 q2[5], r2
741-
; CHECK-NEXT: vmov.u8 r2, q0[14]
742-
; CHECK-NEXT: vmov.16 q2[6], r2
743-
; CHECK-NEXT: vmov.u8 r2, q0[15]
744-
; CHECK-NEXT: vmov.16 q2[7], r2
745-
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
746-
; CHECK-NEXT: vcmp.i16 ne, q2, zr
747-
; CHECK-NEXT: vldrb.s16 q2, [r1, #8]
748-
; CHECK-NEXT: vmul.i16 q0, q2, q0
728+
; CHECK-NEXT: vmlavt.u16 r2, q2, q1
729+
; CHECK-NEXT: vmov.16 q1[0], r3
730+
; CHECK-NEXT: vmov.u8 r3, q0[1]
731+
; CHECK-NEXT: vmov.16 q1[1], r3
732+
; CHECK-NEXT: vmov.u8 r3, q0[2]
733+
; CHECK-NEXT: vmov.16 q1[2], r3
734+
; CHECK-NEXT: vmov.u8 r3, q0[3]
735+
; CHECK-NEXT: vmov.16 q1[3], r3
736+
; CHECK-NEXT: vmov.u8 r3, q0[4]
737+
; CHECK-NEXT: vmov.16 q1[4], r3
738+
; CHECK-NEXT: vmov.u8 r3, q0[5]
739+
; CHECK-NEXT: vmov.16 q1[5], r3
740+
; CHECK-NEXT: vmov.u8 r3, q0[6]
741+
; CHECK-NEXT: vmov.16 q1[6], r3
742+
; CHECK-NEXT: vmov.u8 r3, q0[7]
743+
; CHECK-NEXT: vmov.16 q1[7], r3
744+
; CHECK-NEXT: vldrb.u16 q0, [r0]
745+
; CHECK-NEXT: vcmp.i16 ne, q1, zr
746+
; CHECK-NEXT: vldrb.s16 q1, [r1]
749747
; CHECK-NEXT: vpst
750-
; CHECK-NEXT: vaddt.i16 q1, q1, q0
751-
; CHECK-NEXT: vaddv.u16 r0, q1
752-
; CHECK-NEXT: sxth r0, r0
748+
; CHECK-NEXT: vmlavat.u16 r2, q1, q0
749+
; CHECK-NEXT: sxth r0, r2
753750
; CHECK-NEXT: add sp, #32
754751
; CHECK-NEXT: bx lr
755752
entry:

0 commit comments

Comments (0)