Skip to content

Commit a17acf4

Browse files
committed
Support vector float_truncate for SF to BF.
Generate native instruction whenever possible, otherwise use vector permutation with odd indices. gcc/ChangeLog: * config/i386/i386-expand.cc (ix86_expand_vector_sf2bf_with_vec_perm): New function. * config/i386/i386-protos.h (ix86_expand_vector_sf2bf_with_vec_perm): New declare. * config/i386/mmx.md (truncv2sfv2bf2): New expander. * config/i386/sse.md (truncv4sfv4bf2): Ditto. (truncv8sfv8bf2): Ditto. (truncv16sfv16bf2): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512bf16-truncsfbf.c: New test. * gcc.target/i386/avx512bw-truncsfbf.c: New test. * gcc.target/i386/ssse3-truncsfbf.c: New test.
1 parent c1bbad0 commit a17acf4

File tree

7 files changed

+172
-0
lines changed

7 files changed

+172
-0
lines changed

gcc/config/i386/i386-expand.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26842,4 +26842,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_m
2684226842
emit_move_insn (output, gen_lowpart (out_mode, d.target));
2684326843
}
2684426844

26845+
/* Expand a vector SF -> BF conversion with a constant vector permutation.
   SRC must be in V4SFmode, V8SFmode or V16SFmode; any other mode hits
   gcc_unreachable.  SRC is viewed as a BF vector with twice as many
   elements, its odd-indexed BF elements are gathered by the permutation,
   and the low part of the permuted vector is moved into DEST.
   NOTE(review): selecting existing 16-bit elements performs no rounding;
   presumably the odd elements are the high halves of each SF on this
   little-endian target -- confirm callers accept that in place of a
   rounding conversion.  */
26846+
void
26847+
ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
26848+
{
26849+
machine_mode vperm_mode, src_mode = GET_MODE (src);
26850+
/* Pick the BF vector mode that has twice as many elements as SRC_MODE.  */
switch (src_mode)
26851+
{
26852+
case V16SFmode:
26853+
vperm_mode = V32BFmode;
26854+
break;
26855+
case V8SFmode:
26856+
vperm_mode = V16BFmode;
26857+
break;
26858+
case V4SFmode:
26859+
vperm_mode = V8BFmode;
26860+
break;
26861+
default:
26862+
gcc_unreachable ();
26863+
}
26864+
26865+
int nelt = GET_MODE_NUNITS (vperm_mode);
26866+
vec_perm_builder sel (nelt, nelt, 1);
26867+
sel.quick_grow (nelt);
26868+
/* Selector is 1, 3, 5, ...; the index wraps modulo NELT, so the upper
   half of the selector repeats the lower half.  */
for (int i = 0; i != nelt; i++)
26869+
sel[i] = (2 * i + 1) % nelt;
26870+
vec_perm_indices indices (sel, 1, nelt);
26871+
26872+
rtx target = gen_reg_rtx (vperm_mode);
26873+
/* Reinterpret SRC (forced into a register) in the BF permutation mode.  */
rtx op0 = lowpart_subreg (vperm_mode,
26874+
force_reg (src_mode, src),
26875+
src_mode);
26876+
/* OP0 is passed as both permutation inputs; a false return from the
   target hook is treated as an internal error.  */
bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
26877+
target, op0, op0, indices);
26878+
gcc_assert (ok);
26879+
/* Only the low part of the permuted vector, in DEST's mode, is kept.  */
emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
26880+
}
26881+
26882+
2684526883
#include "gt-i386-expand.h"

gcc/config/i386/i386-protos.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args);
258258
extern bool ix86_ternlog_operand_p (rtx op);
259259
extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
260260
int idx, rtx target);
261+
extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
261262

262263
#ifdef TREE_CODE
263264
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);

gcc/config/i386/mmx.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2994,6 +2994,24 @@
29942994
DONE;
29952995
})
29962996

2997+
;; Truncate V2SF to V2BF.  The V2SF input is viewed as the low part of a
;; V4SF register, converted with the truncv4sfv4bf2 expander, and the low
;; V2BF part of the V4BF result is moved to operand 0.
(define_expand "truncv2sfv2bf2"
2998+
[(set (match_operand:V2BF 0 "register_operand")
2999+
(float_truncate:V2BF
3000+
(match_operand:V2SF 1 "nonimmediate_operand")))]
3001+
"TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
3002+
{
3003+
/* Temporaries in the wider modes handled by truncv4sfv4bf2.  */
rtx op1 = gen_reg_rtx (V4SFmode);
3004+
rtx op0 = gen_reg_rtx (V4BFmode);
3005+
3006+
/* Force the input into a register and view it as V4SF.  */
emit_move_insn (op1, lowpart_subreg (V4SFmode,
3007+
force_reg (V2SFmode, operands[1]),
3008+
V2SFmode));
3009+
emit_insn (gen_truncv4sfv4bf2 (op0, op1));
3010+
3011+
/* Only the low V2BF part of the V4BF result is the answer.  */
emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode));
3012+
DONE;
3013+
})
3014+
29973015
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29983016
;;
29993017
;; Parallel integral arithmetic

gcc/config/i386/sse.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30984,6 +30984,24 @@
3098430984
"TARGET_AVX512BF16"
3098530985
"vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
3098630986

30987+
;; Truncate V4SF to V4BF.  Without AVXNECONVERT or AVX512BF16+AVX512VL the
;; conversion goes through ix86_expand_vector_sf2bf_with_vec_perm;
;; otherwise vcvtneps2bf16_v4sf fills a V8BF register whose low V4BF part
;; is moved to operand 0.
(define_expand "truncv4sfv4bf2"
30988+
[(set (match_operand:V4BF 0 "register_operand")
30989+
(float_truncate:V4BF
30990+
(match_operand:V4SF 1 "nonimmediate_operand")))]
30991+
"TARGET_SSSE3"
30992+
{
30993+
/* Fall back to the vector permutation when neither ISA extension that
   enables the vcvtneps2bf16 path is available.  */
if (!TARGET_AVXNECONVERT
30994+
&& !(TARGET_AVX512BF16 && TARGET_AVX512VL))
30995+
ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
30996+
else
30997+
{
30998+
/* vcvtneps2bf16_v4sf takes a V8BF destination; keep its low part.  */
rtx dest = gen_reg_rtx (V8BFmode);
30999+
emit_insn (gen_vcvtneps2bf16_v4sf (dest, operands[1]));
31000+
emit_move_insn (operands[0], lowpart_subreg (V4BFmode, dest, V8BFmode));
31001+
}
31002+
DONE;
31003+
})
31004+
3098731005
(define_expand "vcvtneps2bf16_v4sf"
3098831006
[(set (match_operand:V8BF 0 "register_operand")
3098931007
(vec_concat:V8BF
@@ -31059,6 +31077,20 @@
3105931077
DONE;
3106031078
})
3106131079

31080+
;; Truncate V8SF to V8BF.  Without AVXNECONVERT or AVX512BF16+AVX512VL the
;; conversion is expanded with a vector permutation and the expander
;; finishes with DONE.  Otherwise the preparation code emits nothing and
;; the float_truncate template above is used (presumably matched by the
;; vcvtneps2bf16_v8sf insn that follows -- confirm).
(define_expand "truncv8sfv8bf2"
31081+
[(set (match_operand:V8BF 0 "register_operand")
31082+
(float_truncate:V8BF
31083+
(match_operand:V8SF 1 "nonimmediate_operand")))]
31084+
"TARGET_AVX2"
31085+
{
31086+
/* Same fallback condition as the V4SF expander.  */
if (!TARGET_AVXNECONVERT
31087+
&& !(TARGET_AVX512BF16 && TARGET_AVX512VL))
31088+
{
31089+
ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
31090+
DONE;
31091+
}
31092+
})
31093+
3106231094
(define_insn "vcvtneps2bf16_v8sf"
3106331095
[(set (match_operand:V8BF 0 "register_operand" "=x,v")
3106431096
(float_truncate:V8BF
@@ -31071,6 +31103,18 @@
3107131103
(set_attr "addr" "gpr16,*")
3107231104
(set_attr "prefix" "vex,evex")])
3107331105

31106+
;; Truncate V16SF to V16BF.  Without AVX512BF16 the conversion is expanded
;; with a vector permutation and the expander finishes with DONE.
;; Otherwise the preparation code emits nothing and the float_truncate
;; template above is used (presumably matched by an AVX512BF16 insn
;; pattern elsewhere in this file -- confirm).
(define_expand "truncv16sfv16bf2"
31107+
[(set (match_operand:V16BF 0 "register_operand")
31108+
(float_truncate:V16BF
31109+
(match_operand:V16SF 1 "nonimmediate_operand")))]
31110+
"TARGET_AVX512BW && TARGET_EVEX512"
31111+
{
31112+
/* Only the permutation fallback is expanded by hand.  */
if (!TARGET_AVX512BF16)
31113+
{
31114+
ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
31115+
DONE;
31116+
}
31117+
})
3107431118

3107531119
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
3107631120
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
3+
/* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
4+
5+
#include "avx512bw-truncsfbf.c"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
3+
/* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
4+
5+
typedef float v4sf __attribute__((vector_size(16)));
6+
typedef float v8sf __attribute__((vector_size(32)));
7+
typedef float v16sf __attribute__((vector_size(64)));
8+
typedef __bf16 v4bf __attribute__((vector_size(8)));
9+
typedef __bf16 v8bf __attribute__((vector_size(16)));
10+
typedef __bf16 v16bf __attribute__((vector_size(32)));
11+
12+
/* Convert the 4-float vector A to a 4-element __bf16 vector; B is unused.  */
v4bf
13+
foo (v4sf b, v4sf a)
14+
{
15+
return __builtin_convertvector (a, v4bf);
16+
}
17+
18+
/* Convert the 8-float vector A to an 8-element __bf16 vector; B is unused.  */
v8bf
19+
foo2 (v8sf b, v8sf a)
20+
{
21+
return __builtin_convertvector (a, v8bf);
22+
}
23+
24+
/* Convert the 16-float vector A to a 16-element __bf16 vector; B is
   unused.  */
v16bf
25+
foo3 (v16sf b, v16sf a)
26+
{
27+
return __builtin_convertvector (a, v16bf);
28+
}
29+
30+
/* Same conversion as foo, with the 4-float source read through pointer A.  */
v4bf
31+
foo_mem (v4sf* a)
32+
{
33+
return __builtin_convertvector (*a, v4bf);
34+
}
35+
36+
/* Same conversion as foo2, with the 8-float source read through pointer A.  */
v8bf
37+
foo2_mem (v8sf* a)
38+
{
39+
return __builtin_convertvector (*a, v8bf);
40+
}
41+
42+
/* Same conversion as foo3, with the 16-float source read through
   pointer A.  */
v16bf
43+
foo3_mem (v16sf* a)
44+
{
45+
return __builtin_convertvector (*a, v16bf);
46+
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
3+
/* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
4+
5+
typedef float v2sf __attribute__((vector_size(8)));
6+
typedef __bf16 v2bf __attribute__((vector_size(4)));
7+
8+
/* Convert the 2-float vector A to a 2-element __bf16 vector; B is unused.  */
v2bf
9+
foo (v2sf b, v2sf a)
10+
{
11+
return __builtin_convertvector (a, v2bf);
12+
}
13+
14+
15+
/* Same conversion as foo, with the 2-float source read through pointer A.  */
v2bf
16+
foo_mem (v2sf* a)
17+
{
18+
return __builtin_convertvector (*a, v2bf);
19+
}
20+

0 commit comments

Comments
 (0)