Support vector float_truncate for SF to BF.

algebra84 · algebra84 · commit a17acf4f25f0 · 2024-11-05T00:20:04.000-08:00
Generate native instruction whenever possible, otherwise use vector
permutation with odd indices.

gcc/ChangeLog:

	* config/i386/i386-expand.cc
	(ix86_expand_vector_sf2bf_with_vec_perm): New function.
	* config/i386/i386-protos.h
	(ix86_expand_vector_sf2bf_with_vec_perm): New declare.
	* config/i386/mmx.md (truncv2sfv2bf2): New expander.
	* config/i386/sse.md (truncv4sfv4bf2): Ditto.
	(truncv8sfv8bf2): Ditto.
	(truncv16sfv16bf2): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bf16-truncsfbf.c: New test.
	* gcc.target/i386/avx512bw-truncsfbf.c: New test.
	* gcc.target/i386/ssse3-truncsfbf.c: New test.
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
@@ -26842,4 +26842,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_m
   emit_move_insn (output, gen_lowpart (out_mode, d.target));
 }
 
+/* Implement truncv8sfv8bf2 with vector permutation.  */
+void
+ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
+{
+  machine_mode vperm_mode, src_mode = GET_MODE (src);
+  switch (src_mode)
+    {
+    case V16SFmode:
+      vperm_mode = V32BFmode;
+      break;
+    case V8SFmode:
+      vperm_mode = V16BFmode;
+      break;
+    case V4SFmode:
+      vperm_mode = V8BFmode;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  int nelt = GET_MODE_NUNITS (vperm_mode);
+  vec_perm_builder sel (nelt, nelt, 1);
+  sel.quick_grow (nelt);
+  for (int i = 0; i != nelt; i++)
+    sel[i] = (2 * i + 1) % nelt;
+  vec_perm_indices indices (sel, 1, nelt);
+
+  rtx target = gen_reg_rtx (vperm_mode);
+  rtx op0 = lowpart_subreg (vperm_mode,
+			    force_reg (src_mode, src),
+			    src_mode);
+  bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
+					      target, op0, op0, indices);
+  gcc_assert (ok);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
+}
+
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
@@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args);
 extern bool ix86_ternlog_operand_p (rtx op);
 extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
 				int idx, rtx target);
+extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
@@ -2994,6 +2994,24 @@
   DONE;
 })
 
+(define_expand "truncv2sfv2bf2"
+  [(set (match_operand:V2BF 0 "register_operand")
+	(float_truncate:V2BF
+	  (match_operand:V2SF 1 "nonimmediate_operand")))]
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+{
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4BFmode);
+
+  emit_move_insn (op1, lowpart_subreg (V4SFmode,
+				       force_reg (V2SFmode, operands[1]),
+				       V2SFmode));
+  emit_insn (gen_truncv4sfv4bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral arithmetic
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
@@ -30984,6 +30984,24 @@
   "TARGET_AVX512BF16"
   "vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
 
+(define_expand "truncv4sfv4bf2"
+  [(set (match_operand:V4BF 0 "register_operand")
+	  (float_truncate:V4BF
+	    (match_operand:V4SF 1 "nonimmediate_operand")))]
+  "TARGET_SSSE3"
+{
+  if (!TARGET_AVXNECONVERT
+      && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+    ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+  else
+    {
+      rtx dest = gen_reg_rtx (V8BFmode);
+      emit_insn (gen_vcvtneps2bf16_v4sf (dest, operands[1]));
+      emit_move_insn (operands[0], lowpart_subreg (V4BFmode, dest, V8BFmode));
+    }
+  DONE;
+})
+
 (define_expand "vcvtneps2bf16_v4sf"
   [(set (match_operand:V8BF 0 "register_operand")
 	(vec_concat:V8BF
@@ -31059,6 +31077,20 @@
   DONE;
 })
 
+(define_expand "truncv8sfv8bf2"
+  [(set (match_operand:V8BF 0 "register_operand")
+	(float_truncate:V8BF
+	  (match_operand:V8SF 1 "nonimmediate_operand")))]
+  "TARGET_AVX2"
+{
+  if (!TARGET_AVXNECONVERT
+      && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+    {
+      ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "vcvtneps2bf16_v8sf"
   [(set (match_operand:V8BF 0 "register_operand" "=x,v")
 	(float_truncate:V8BF
@@ -31071,6 +31103,18 @@
    (set_attr "addr" "gpr16,*")
    (set_attr "prefix" "vex,evex")])
 
+(define_expand "truncv16sfv16bf2"
+  [(set (match_operand:V16BF 0 "register_operand")
+	(float_truncate:V16BF
+	  (match_operand:V16SF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512BW && TARGET_EVEX512"
+{
+  if (!TARGET_AVX512BF16)
+    {
+      ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+      DONE;
+    }
+})
 
 (define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
   [(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
+
+#include "avx512bw-truncsfbf.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
+
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v16sf __attribute__((vector_size(64)));
+typedef __bf16 v4bf __attribute__((vector_size(8)));
+typedef __bf16 v8bf __attribute__((vector_size(16)));
+typedef __bf16 v16bf __attribute__((vector_size(32)));
+
+v4bf
+foo (v4sf b, v4sf a)
+{
+  return __builtin_convertvector (a, v4bf);
+}
+
+v8bf
+foo2 (v8sf b, v8sf a)
+{
+  return __builtin_convertvector (a, v8bf);
+}
+
+v16bf
+foo3 (v16sf b, v16sf a)
+{
+  return __builtin_convertvector (a, v16bf);
+}
+
+v4bf
+foo_mem (v4sf* a)
+{
+  return __builtin_convertvector (*a, v4bf);
+}
+
+v8bf
+foo2_mem (v8sf* a)
+{
+  return __builtin_convertvector (*a, v8bf);
+}
+
+v16bf
+foo3_mem (v16sf* a)
+{
+  return __builtin_convertvector (*a, v16bf);
+}
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
+
+typedef float v2sf __attribute__((vector_size(8)));
+typedef __bf16 v2bf __attribute__((vector_size(4)));
+
+v2bf
+foo (v2sf b, v2sf a)
+{
+  return __builtin_convertvector (a, v2bf);
+}
+
+
+v2bf
+foo_mem (v2sf* a)
+{
+  return __builtin_convertvector (*a, v2bf);
+}
+