Commit 3559569

mm256_srli,slli_si256; mm256_bsrli,bslli_epi128 to const generics (#1067)
1 parent 1d4c3dd commit 3559569
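
This commit moves the byte-shift count of these intrinsics from a `#[rustc_args_required_const(1)]` value argument to a const generic parameter; `#[rustc_legacy_const_generics(1)]` keeps the old positional call syntax compiling. A minimal sketch of the call-site difference (the helper name is illustrative):

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn call_site_demo(a: __m256i) -> __m256i {
        // Pre-commit spelling, still accepted thanks to
        // #[rustc_legacy_const_generics(1)]:
        //     _mm256_slli_si256(a, 3)
        // Post-commit spelling: the shift count is a const generic.
        _mm256_slli_si256::<3>(a)
    }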

4 files changed: +381, -445 lines

crates/core_arch/src/x86/avx2.rs: +195, -42
@@ -2565,35 +2565,65 @@ pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpslldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    _mm256_bslli_epi128::<IMM8>(a)
 }

 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpslldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    let a = a.as_i8x32();
+    let zero = _mm256_setzero_si256().as_i8x32();
+    let r: i8x32 = simd_shuffle32(
+        zero,
+        a,
+        [
+            32 - (IMM8 as u32 & 0xff),
+            33 - (IMM8 as u32 & 0xff),
+            34 - (IMM8 as u32 & 0xff),
+            35 - (IMM8 as u32 & 0xff),
+            36 - (IMM8 as u32 & 0xff),
+            37 - (IMM8 as u32 & 0xff),
+            38 - (IMM8 as u32 & 0xff),
+            39 - (IMM8 as u32 & 0xff),
+            40 - (IMM8 as u32 & 0xff),
+            41 - (IMM8 as u32 & 0xff),
+            42 - (IMM8 as u32 & 0xff),
+            43 - (IMM8 as u32 & 0xff),
+            44 - (IMM8 as u32 & 0xff),
+            45 - (IMM8 as u32 & 0xff),
+            46 - (IMM8 as u32 & 0xff),
+            47 - (IMM8 as u32 & 0xff),
+            48 - (IMM8 as u32 & 0xff) - 16,
+            49 - (IMM8 as u32 & 0xff) - 16,
+            50 - (IMM8 as u32 & 0xff) - 16,
+            51 - (IMM8 as u32 & 0xff) - 16,
+            52 - (IMM8 as u32 & 0xff) - 16,
+            53 - (IMM8 as u32 & 0xff) - 16,
+            54 - (IMM8 as u32 & 0xff) - 16,
+            55 - (IMM8 as u32 & 0xff) - 16,
+            56 - (IMM8 as u32 & 0xff) - 16,
+            57 - (IMM8 as u32 & 0xff) - 16,
+            58 - (IMM8 as u32 & 0xff) - 16,
+            59 - (IMM8 as u32 & 0xff) - 16,
+            60 - (IMM8 as u32 & 0xff) - 16,
+            61 - (IMM8 as u32 & 0xff) - 16,
+            62 - (IMM8 as u32 & 0xff) - 16,
+            63 - (IMM8 as u32 & 0xff) - 16,
+        ],
+    );
+    transmute(r)
 }

 /// Shifts packed 32-bit integers in `a` left by the amount
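
In the new `_mm256_bslli_epi128` body, `simd_shuffle32` indexes the concatenation of its two operands: indices 0..=31 select from `zero` and 32..=63 select from `a`, so subtracting `IMM8` from each constant drags the bottom positions of a lane down into the zero vector. A plain-Rust sketch of the per-lane semantics the intrinsic documents (illustrative model, not code from the crate):

    /// Scalar model of a 128-bit-lane left byte shift: each 16-byte half
    /// of the 32-byte vector shifts independently, zero-filling at the bottom.
    fn bslli_epi128_model(a: [u8; 32], shift: usize) -> [u8; 32] {
        let mut r = [0u8; 32];
        for lane in 0..2 {
            let base = lane * 16;
            for i in shift.min(16)..16 {
                // Output byte `i` of a lane comes from input byte `i - shift`.
                r[base + i] = a[base + i - shift];
            }
        }
        r
    }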
@@ -2729,35 +2759,158 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpsrldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    _mm256_bsrli_epi128::<IMM8>(a)
 }

 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpsrldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    let a = a.as_i8x32();
+    let zero = _mm256_setzero_si256().as_i8x32();
+    let r: i8x32 = match IMM8 % 16 {
+        0 => simd_shuffle32(
+            a,
+            zero,
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            ],
+        ),
+        1 => simd_shuffle32(
+            a,
+            zero,
+            [
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+                24, 25, 26, 27, 28, 29, 30, 31, 32,
+            ],
+        ),
+        2 => simd_shuffle32(
+            a,
+            zero,
+            [
+                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 32, 32,
+            ],
+        ),
+        3 => simd_shuffle32(
+            a,
+            zero,
+            [
+                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
+            ],
+        ),
+        4 => simd_shuffle32(
+            a,
+            zero,
+            [
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25,
+                26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
+            ],
+        ),
+        5 => simd_shuffle32(
+            a,
+            zero,
+            [
+                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26,
+                27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
+            ],
+        ),
+        6 => simd_shuffle32(
+            a,
+            zero,
+            [
+                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        7 => simd_shuffle32(
+            a,
+            zero,
+            [
+                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        8 => simd_shuffle32(
+            a,
+            zero,
+            [
+                8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28,
+                29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        9 => simd_shuffle32(
+            a,
+            zero,
+            [
+                9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29,
+                30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        10 => simd_shuffle32(
+            a,
+            zero,
+            [
+                10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30,
+                31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        11 => simd_shuffle32(
+            a,
+            zero,
+            [
+                11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        12 => simd_shuffle32(
+            a,
+            zero,
+            [
+                12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        13 => simd_shuffle32(
+            a,
+            zero,
+            [
+                13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        14 => simd_shuffle32(
+            a,
+            zero,
+            [
+                14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        15 => simd_shuffle32(
+            a,
+            zero,
+            [
+                14, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        _ => zero,
+    };
+    transmute(r)
 }

 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
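
Each arm of the `match` in `_mm256_bsrli_epi128` hard-codes the shuffle indices for one shift amount: surviving bytes come from `a` (indices 0..=31), and index 32, the first byte of `zero`, fills the positions vacated at the top of each lane. The documented per-lane behaviour as a plain-Rust sketch (illustrative model only):

    /// Scalar model of a 128-bit-lane right byte shift: each 16-byte half
    /// of the 32-byte vector shifts independently, zero-filling at the top.
    fn bsrli_epi128_model(a: [u8; 32], shift: usize) -> [u8; 32] {
        let mut r = [0u8; 32];
        let kept = 16usize.saturating_sub(shift); // bytes kept per lane
        for lane in 0..2 {
            let base = lane * 16;
            for i in 0..kept {
                // Output byte `i` of a lane comes from input byte `i + shift`.
                r[base + i] = a[base + i + shift];
            }
        }
        r
    }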
@@ -4824,7 +4977,7 @@ mod tests {
     #[simd_test(enable = "avx2")]
     unsafe fn test_mm256_slli_si256() {
         let a = _mm256_set1_epi64x(0xFFFFFFFF);
-        let r = _mm256_slli_si256(a, 3);
+        let r = _mm256_slli_si256::<3>(a);
         assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
     }

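The expected value in this test can be re-derived by hand from the little-endian byte layout; a small self-contained check (hypothetical helper, mirroring the shift by 3 in the test):

    fn rederive_slli_si256_expectation() {
        // _mm256_set1_epi64x(0xFFFFFFFF): each u64 is FF FF FF FF 00 00 00 00.
        let mut lane = [0u8; 16];
        lane[0..4].fill(0xFF);
        lane[8..12].fill(0xFF);
        // Shift the 16-byte lane left by 3 bytes, zero-filling from the bottom.
        let mut shifted = [0u8; 16];
        shifted[3..16].copy_from_slice(&lane[0..13]);
        // Both u64 elements of the lane now read 0xFFFFFFFF000000.
        let lo = u64::from_le_bytes(shifted[0..8].try_into().unwrap());
        let hi = u64::from_le_bytes(shifted[8..16].try_into().unwrap());
        assert_eq!(lo, 0x00FF_FFFF_FF00_0000);
        assert_eq!(hi, 0x00FF_FFFF_FF00_0000);
    }
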
@@ -4923,7 +5076,7 @@ mod tests {
             17, 18, 19, 20, 21, 22, 23, 24,
             25, 26, 27, 28, 29, 30, 31, 32,
         );
-        let r = _mm256_srli_si256(a, 3);
+        let r = _mm256_srli_si256::<3>(a);
         #[rustfmt::skip]
         let e = _mm256_setr_epi8(
             4, 5, 6, 7, 8, 9, 10, 11,
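
A behavioural consequence of the conversion: the shift count must now be a compile-time constant, and `static_assert_imm8!` restricts it to an 8-bit immediate, so an out-of-range count is rejected when the caller is compiled. A sketch (assuming an AVX2-capable x86_64 target; the helper name is illustrative):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn const_count_demo(a: std::arch::x86_64::__m256i) -> std::arch::x86_64::__m256i {
        use std::arch::x86_64::_mm256_srli_si256;
        const BYTES: i32 = 3;
        // OK: BYTES is a constant expression.
        let r = _mm256_srli_si256::<BYTES>(a);
        // Would not compile: a runtime value cannot instantiate IMM8.
        //     let n = 3; _mm256_srli_si256::<n>(a);
        // Would fail static_assert_imm8!: 256 does not fit in 8 bits.
        //     _mm256_srli_si256::<256>(a);
        r
    }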
