@@ -2642,25 +2642,74 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2642
2642
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2643
2643
#[ inline]
2644
2644
#[ target_feature( enable = "avx2" ) ]
2645
- #[ cfg_attr( test, assert_instr( vpermilps, MASK = 9 ) ) ]
2646
- #[ rustc_legacy_const_generics ( 1 ) ]
2645
+ #[ cfg_attr( test, assert_instr( vpermilps, imm8 = 9 ) ) ]
2646
+ #[ rustc_args_required_const ( 1 ) ]
2647
2647
#[ stable( feature = "simd_x86" , since = "1.27.0" ) ]
// NOTE(review): this span is a scraped diff hunk, not plain Rust. Lines
// prefixed `-` belong to one version of the function, lines prefixed `+` to
// the other, unprefixed lines are shared context, and the bare numbers are the
// diff viewer's old/new line-number columns.
//
// `-` version: the selector is the const generic `MASK`. It is passed to
// `static_assert_imm8!`, and the eight shuffle indices are computed inline
// from it in a single `simd_shuffle8` call.
2648
- pub unsafe fn _mm256_shuffle_epi32 < const MASK : i32 > ( a : __m256i ) -> __m256i {
2649
- static_assert_imm8 ! ( MASK ) ;
2650
- let r: i32x8 = simd_shuffle8 (
2651
- a. as_i32x8 ( ) ,
2652
- a. as_i32x8 ( ) ,
// Index array: entries 0-3 are the four 2-bit fields of MASK (bits 0-1, 2-3,
// 4-5 and 6-7, in that order); entries 4-7 repeat the same fields with 4 added.
2653
- [
2654
- MASK as u32 & 0b11 ,
2655
- ( MASK as u32 >> 2 ) & 0b11 ,
2656
- ( MASK as u32 >> 4 ) & 0b11 ,
2657
- ( MASK as u32 >> 6 ) & 0b11 ,
2658
- ( MASK as u32 & 0b11 ) + 4 ,
2659
- ( ( MASK as u32 >> 2 ) & 0b11 ) + 4 ,
2660
- ( ( MASK as u32 >> 4 ) & 0b11 ) + 4 ,
2661
- ( ( MASK as u32 >> 6 ) & 0b11 ) + 4 ,
2662
- ] ,
2663
- ) ;
// `+` version: the selector is the ordinary `i32` argument `imm8`. The nested
// `match` macros below expand to one `simd_shuffle8` call with literal indices
// for every combination of the four 2-bit fields (4 arms at each of 4 levels).
2648
+ pub unsafe fn _mm256_shuffle_epi32 ( a : __m256i , imm8 : i32 ) -> __m256i {
2649
+ // simd_shuffleX requires that its selector parameter be made up of
2650
+ // constant values, but we can't enforce that here. In spirit, we need
2651
+ // to write a `match` on all possible values of a byte, and for each value,
2652
+ // hard-code the correct `simd_shuffleX` call using only constants. We
2653
+ // then hope for LLVM to do the rest.
2654
+ //
2655
+ // Of course, that's... awful. So we try to use macros to do it for us.
// Keep only the low 8 bits of the argument and shadow the `i32` parameter with
// a `u8` of the same name.
2656
+ let imm8 = ( imm8 & 0xFF ) as u8 ;
2657
+
2658
+ let a = a. as_i32x8 ( ) ;
// Innermost step: all four selectors arrive as literals, so the index array
// contains only constants. Entries 4-7 are the same four selectors plus 4.
2659
+ macro_rules! shuffle_done {
2660
+ ( $x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2661
+ simd_shuffle8(
2662
+ a,
2663
+ a,
2664
+ [
2665
+ $x01,
2666
+ $x23,
2667
+ $x45,
2668
+ $x67,
2669
+ 4 + $x01,
2670
+ 4 + $x23,
2671
+ 4 + $x45,
2672
+ 4 + $x67,
2673
+ ] ,
2674
+ )
2675
+ } ;
2676
+ }
// Dispatches on bits 6-7 of `imm8` to fix the fourth selector as a literal.
// The `_` arm stands for 0b11, the only value left after masking to two bits.
2677
+ macro_rules! shuffle_x67 {
2678
+ ( $x01: expr, $x23: expr, $x45: expr) => {
2679
+ match ( imm8 >> 6 ) & 0b11 {
2680
+ 0b00 => shuffle_done!( $x01, $x23, $x45, 0 ) ,
2681
+ 0b01 => shuffle_done!( $x01, $x23, $x45, 1 ) ,
2682
+ 0b10 => shuffle_done!( $x01, $x23, $x45, 2 ) ,
2683
+ _ => shuffle_done!( $x01, $x23, $x45, 3 ) ,
2684
+ }
2685
+ } ;
2686
+ }
// Dispatches on bits 4-5 of `imm8` to fix the third selector, then hands off
// to `shuffle_x67!`.
2687
+ macro_rules! shuffle_x45 {
2688
+ ( $x01: expr, $x23: expr) => {
2689
+ match ( imm8 >> 4 ) & 0b11 {
2690
+ 0b00 => shuffle_x67!( $x01, $x23, 0 ) ,
2691
+ 0b01 => shuffle_x67!( $x01, $x23, 1 ) ,
2692
+ 0b10 => shuffle_x67!( $x01, $x23, 2 ) ,
2693
+ _ => shuffle_x67!( $x01, $x23, 3 ) ,
2694
+ }
2695
+ } ;
2696
+ }
// Dispatches on bits 2-3 of `imm8` to fix the second selector, then hands off
// to `shuffle_x45!`.
2697
+ macro_rules! shuffle_x23 {
2698
+ ( $x01: expr) => {
2699
+ match ( imm8 >> 2 ) & 0b11 {
2700
+ 0b00 => shuffle_x45!( $x01, 0 ) ,
2701
+ 0b01 => shuffle_x45!( $x01, 1 ) ,
2702
+ 0b10 => shuffle_x45!( $x01, 2 ) ,
2703
+ _ => shuffle_x45!( $x01, 3 ) ,
2704
+ }
2705
+ } ;
2706
+ }
// Entry point of the macro chain: bits 0-1 of `imm8` fix the first selector.
2707
+ let r: i32x8 = match imm8 & 0b11 {
2708
+ 0b00 => shuffle_x23 ! ( 0 ) ,
2709
+ 0b01 => shuffle_x23 ! ( 1 ) ,
2710
+ 0b10 => shuffle_x23 ! ( 2 ) ,
2711
+ _ => shuffle_x23 ! ( 3 ) ,
2712
+ } ;
// Shared tail of both versions: reinterpret the `i32x8` result `r` as the
// declared return type `__m256i`.
2664
2713
transmute ( r)
2665
2714
}
2666
2715
0 commit comments