Skip to content

Commit 779aea4

Browse files
jgreenhalgh-armJames Greenhalgh
authored andcommitted
[AArch64] Implement vmul<q>_lane<q>_<fsu><16,32,64> intrinsics in C
gcc/ * config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New. (aarch64_mul3_elt_<vswap_width_name><mode>): Likewise. (aarch64_mul3_elt_to_128df): Likewise. (aarch64_mul3_elt_to_64v2df): Likewise. * config/aarch64/iterators.md (VEL): Also handle DFmode. (VMUL): New. (VMUL_CHANGE_NLANES) Likewise. (h_con): Likewise. (f): Likewise. * config/aarch64/arm_neon.h (vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation. gcc/testsuite/ * gcc.target/aarch64/mul_intrinsic_1.c: New. * gcc.target/aarch64/fmul_intrinsic_1.c: Likewise. From-SVN: r202624
1 parent a407a75 commit 779aea4

File tree

7 files changed

+446
-287
lines changed

7 files changed

+446
-287
lines changed

gcc/ChangeLog

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
2013-09-16 James Greenhalgh <[email protected]>
2+
3+
* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
4+
(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
5+
(aarch64_mul3_elt_to_128df): Likewise.
6+
(aarch64_mul3_elt_to_64v2df): Likewise.
7+
* config/aarch64/iterators.md (VEL): Also handle DFmode.
8+
(VMUL): New.
9+
(VMUL_CHANGE_NLANES) Likewise.
10+
(h_con): Likewise.
11+
(f): Likewise.
12+
* config/aarch64/arm_neon.h
13+
(vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.
14+
115
2013-09-16 James Greenhalgh <[email protected]>
216

317
* config/aarch64/arm_neon.h

gcc/config/aarch64/aarch64-simd.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,59 @@
582582
(set_attr "simd_mode" "<MODE>")]
583583
)
584584

585+
(define_insn "*aarch64_mul3_elt<mode>"
586+
[(set (match_operand:VMUL 0 "register_operand" "=w")
587+
(mult:VMUL
588+
(vec_duplicate:VMUL
589+
(vec_select:<VEL>
590+
(match_operand:VMUL 1 "register_operand" "<h_con>")
591+
(parallel [(match_operand:SI 2 "immediate_operand")])))
592+
(match_operand:VMUL 3 "register_operand" "w")))]
593+
"TARGET_SIMD"
594+
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
595+
[(set_attr "simd_type" "simd_<f>mul_elt")
596+
(set_attr "simd_mode" "<MODE>")]
597+
)
598+
599+
(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
600+
[(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
601+
(mult:VMUL_CHANGE_NLANES
602+
(vec_duplicate:VMUL_CHANGE_NLANES
603+
(vec_select:<VEL>
604+
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
605+
(parallel [(match_operand:SI 2 "immediate_operand")])))
606+
(match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
607+
"TARGET_SIMD"
608+
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
609+
[(set_attr "simd_type" "simd_<f>mul_elt")
610+
(set_attr "simd_mode" "<MODE>")]
611+
)
612+
613+
(define_insn "*aarch64_mul3_elt_to_128df"
614+
[(set (match_operand:V2DF 0 "register_operand" "=w")
615+
(mult:V2DF
616+
(vec_duplicate:V2DF
617+
(match_operand:DF 2 "register_operand" "w"))
618+
(match_operand:V2DF 1 "register_operand" "w")))]
619+
"TARGET_SIMD"
620+
"fmul\\t%0.2d, %1.2d, %2.d[0]"
621+
[(set_attr "simd_type" "simd_fmul_elt")
622+
(set_attr "simd_mode" "V2DF")]
623+
)
624+
625+
(define_insn "*aarch64_mul3_elt_to_64v2df"
626+
[(set (match_operand:DF 0 "register_operand" "=w")
627+
(mult:DF
628+
(vec_select:DF
629+
(match_operand:V2DF 1 "register_operand" "w")
630+
(parallel [(match_operand:SI 2 "immediate_operand")]))
631+
(match_operand:DF 3 "register_operand" "w")))]
632+
"TARGET_SIMD"
633+
"fmul\\t%0.2d, %3.2d, %1.d[%2]"
634+
[(set_attr "simd_type" "simd_fmul_elt")
635+
(set_attr "simd_mode" "V2DF")]
636+
)
637+
585638
(define_insn "neg<mode>2"
586639
[(set (match_operand:VDQ 0 "register_operand" "=w")
587640
(neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))]

gcc/config/aarch64/arm_neon.h

Lines changed: 152 additions & 286 deletions
Large diffs are not rendered by default.

gcc/config/aarch64/iterators.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,12 @@
169169
;; Double scalar modes
170170
(define_mode_iterator DX [DI DF])
171171

172+
;; Modes available for <f>mul lane operations.
173+
(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
174+
175+
;; Modes available for <f>mul lane operations changing lane count.
176+
(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
177+
172178
;; ------------------------------------------------------------------
173179
;; Unspec enumerations for Advance SIMD. These could well go into
174180
;; aarch64.md but for their use in int_iterators here.
@@ -358,7 +364,7 @@
358364
(V2SI "SI") (V4SI "SI")
359365
(DI "DI") (V2DI "DI")
360366
(V2SF "SF") (V4SF "SF")
361-
(V2DF "DF")
367+
(V2DF "DF") (DF "DF")
362368
(SI "SI") (HI "HI")
363369
(QI "QI")])
364370

@@ -541,6 +547,22 @@
541547
(V2SF "to_128") (V4SF "to_64")
542548
(DF "to_128") (V2DF "to_64")])
543549

550+
;; For certain vector-by-element multiplication instructions we must
551+
;; constrain the HI cases to use only V0-V15. This is covered by
552+
;; the 'x' constraint. All other modes may use the 'w' constraint.
553+
(define_mode_attr h_con [(V2SI "w") (V4SI "w")
554+
(V4HI "x") (V8HI "x")
555+
(V2SF "w") (V4SF "w")
556+
(V2DF "w") (DF "w")])
557+
558+
;; Defined to 'f' for types whose element type is a float type.
559+
(define_mode_attr f [(V8QI "") (V16QI "")
560+
(V4HI "") (V8HI "")
561+
(V2SI "") (V4SI "")
562+
(DI "") (V2DI "")
563+
(V2SF "f") (V4SF "f")
564+
(V2DF "f") (DF "f")])
565+
544566
;; -------------------------------------------------------------------
545567
;; Code Iterators
546568
;; -------------------------------------------------------------------

gcc/testsuite/ChangeLog

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
2013-09-16 James Greenhalgh <[email protected]>
2+
3+
* gcc.target/aarch64/mul_intrinsic_1.c: New.
4+
* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
5+
16
2013-09-16 Richard Biener <[email protected]>
27

38
* gcc.dg/tree-ssa/ldist-22.c: New testcase.
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/* { dg-do run } */
2+
/* { dg-options "-O3 --save-temps" } */
3+
4+
#include <arm_neon.h>
5+
6+
#define DELTA 0.0001
7+
extern void abort (void);
8+
extern double fabs (double);
9+
10+
#define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes) \
11+
static void \
12+
test_vmul##q1##_lane##q2##_f##size (float##size##_t * res, \
13+
const float##size##_t *in1, \
14+
const float##size##_t *in2) \
15+
{ \
16+
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
17+
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
18+
float##size##x##in2_lanes##_t c; \
19+
if (in2_lanes > 1) \
20+
{ \
21+
c = vld1##q2##_f##size (in2); \
22+
a = vmul##q1##_lane##q2##_f##size (b, c, 1); \
23+
} \
24+
else \
25+
{ \
26+
c = vld1##q2##_f##size (in2 + 1); \
27+
a = vmul##q1##_lane##q2##_f##size (b, c, 0); \
28+
} \
29+
vst1##q1##_f##size (res, a); \
30+
}
31+
32+
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
33+
TEST_VMUL ( , , width, n_half_lanes, n_half_lanes) \
34+
TEST_VMUL (q, , width, n_lanes, n_half_lanes) \
35+
TEST_VMUL ( , q, width, n_half_lanes, n_lanes) \
36+
TEST_VMUL (q, q, width, n_lanes, n_lanes)
37+
38+
BUILD_VARS (32, 4, 2)
39+
BUILD_VARS (64, 2, 1)
40+
41+
#define POOL2 {0.0, 1.0}
42+
#define POOL4 {0.0, 1.0, 2.0, 3.0}
43+
#define EMPTY2 {0.0, 0.0}
44+
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
45+
46+
#define BUILD_TEST(size, lanes) \
47+
static void \
48+
test_f##size (void) \
49+
{ \
50+
int i; \
51+
float##size##_t pool[lanes] = POOL##lanes; \
52+
float##size##_t res[lanes] = EMPTY##lanes; \
53+
float##size##_t res2[lanes] = EMPTY##lanes; \
54+
float##size##_t res3[lanes] = EMPTY##lanes; \
55+
float##size##_t res4[lanes] = EMPTY##lanes; \
56+
\
57+
/* Avoid constant folding the multiplication. */ \
58+
asm volatile ("" : : : "memory"); \
59+
test_vmul_lane_f##size (res, pool, pool); \
60+
/* Avoid fusing multiplication and subtraction. */ \
61+
asm volatile ("" : :"Q" (res) : "memory"); \
62+
for (i = 0; i < lanes / 2; i++) \
63+
if (fabs (res[i] - pool[i]) > DELTA) \
64+
abort (); \
65+
\
66+
test_vmulq_lane_f##size (res2, pool, pool); \
67+
/* Avoid fusing multiplication and subtraction. */ \
68+
asm volatile ("" : :"Q" (res2) : "memory"); \
69+
for (i = 0; i < lanes; i++) \
70+
if (fabs (res2[i] - pool[i]) > DELTA) \
71+
abort (); \
72+
\
73+
test_vmul_laneq_f##size (res3, pool, pool); \
74+
/* Avoid fusing multiplication and subtraction. */ \
75+
asm volatile ("" : :"Q" (res3) : "memory"); \
76+
for (i = 0; i < lanes / 2; i++) \
77+
if (fabs (res3[i] - pool[i]) > DELTA) \
78+
abort (); \
79+
\
80+
test_vmulq_laneq_f##size (res4, pool, pool); \
81+
/* Avoid fusing multiplication and subtraction. */ \
82+
asm volatile ("" : :"Q" (res4) : "memory"); \
83+
for (i = 0; i < lanes; i++) \
84+
if (fabs (res4[i] - pool[i]) > DELTA) \
85+
abort (); \
86+
}
87+
88+
BUILD_TEST (32, 4)
89+
BUILD_TEST (64, 2)
90+
91+
int
92+
main (int argc, char **argv)
93+
{
94+
test_f32 ();
95+
test_f64 ();
96+
return 0;
97+
}
98+
99+
/* vmul_laneq_f32.
100+
vmul_lane_f32. */
101+
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
102+
103+
/* vmulq_lane_f32.
104+
vmulq_laneq_f32. */
105+
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
106+
107+
/* vmul_lane_f64. */
108+
/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 1 } } */
109+
110+
/* vmul_laneq_f64.
111+
vmulq_lane_f64.
112+
vmulq_laneq_f64. */
113+
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
114+
115+
/* { dg-final { cleanup-saved-temps } } */
116+
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/* { dg-do run } */
2+
/* { dg-options "-O3 --save-temps" } */
3+
4+
#include <arm_neon.h>
5+
6+
extern void abort (void);
7+
8+
#define MAPs(size, xx) int##size##xx##_t
9+
#define MAPu(size, xx) uint##size##xx##_t
10+
11+
12+
#define TEST_VMUL(q, su, size, in1_lanes, in2_lanes) \
13+
static void \
14+
test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res, \
15+
const MAP##su(size, ) *in1, \
16+
const MAP##su(size, ) *in2) \
17+
{ \
18+
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (in1); \
19+
MAP##su (size, x##in2_lanes) b = vld1##q##_##su##size (in2); \
20+
a = vmulq_lane##q##_##su##size (a, b, 1); \
21+
vst1q_##su##size (res, a); \
22+
}
23+
24+
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
25+
TEST_VMUL (, s, width, n_lanes, n_half_lanes) \
26+
TEST_VMUL (q, s, width, n_lanes, n_lanes) \
27+
TEST_VMUL (, u, width, n_lanes, n_half_lanes) \
28+
TEST_VMUL (q, u, width, n_lanes, n_lanes) \
29+
30+
BUILD_VARS (32, 4, 2)
31+
BUILD_VARS (16, 8, 4)
32+
33+
#define POOL4 {0, 1, 2, 3}
34+
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
35+
#define EMPTY4 {0, 0, 0, 0}
36+
#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
37+
38+
#define BUILD_TEST(su, size, lanes) \
39+
static void \
40+
test_##su##size (void) \
41+
{ \
42+
int i; \
43+
MAP##su (size,) pool[lanes] = POOL##lanes; \
44+
MAP##su (size,) res[lanes] = EMPTY##lanes; \
45+
MAP##su (size,) res2[lanes] = EMPTY##lanes; \
46+
\
47+
/* Forecfully avoid optimization. */ \
48+
asm volatile ("" : : : "memory"); \
49+
test_vmulq_lane_##su##size (res, pool, pool); \
50+
for (i = 0; i < lanes; i++) \
51+
if (res[i] != pool[i]) \
52+
abort (); \
53+
\
54+
/* Forecfully avoid optimization. */ \
55+
asm volatile ("" : : : "memory"); \
56+
test_vmulq_laneq_##su##size (res2, pool, pool); \
57+
for (i = 0; i < lanes; i++) \
58+
if (res2[i] != pool[i]) \
59+
abort (); \
60+
}
61+
62+
#undef BUILD_VARS
63+
#define BUILD_VARS(size, lanes) \
64+
BUILD_TEST (s, size, lanes) \
65+
BUILD_TEST (u, size, lanes)
66+
67+
BUILD_VARS (32, 4)
68+
BUILD_VARS (16, 8)
69+
70+
int
71+
main (int argc, char **argv)
72+
{
73+
test_s32 ();
74+
test_u32 ();
75+
test_s16 ();
76+
test_u16 ();
77+
return 0;
78+
}
79+
80+
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 4 } } */
81+
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[\[0-9\]+\\\]" 4 } } */
82+
/* { dg-final { cleanup-saved-temps } } */
83+

0 commit comments

Comments
 (0)