Commit 4f0d822
Implement dynamic byte-swizzle prototype (#334)
This is meant to serve as a reference implementation to test a Rust intrinsic against; the intrinsic will eventually replace it. The interface is fairly direct and doesn't address the more nuanced or interesting permutations one can do, never mind on types other than bytes. The ultimate goal is direct LLVM support for this.
1 parent f916add commit 4f0d822

3 files changed: +231 −0 lines changed

crates/core_simd/src/mod.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@ mod masks;
 mod ops;
 mod ord;
 mod select;
+mod swizzle_dyn;
 mod vector;
 mod vendor;
@@ -32,5 +33,6 @@ pub mod simd {
     pub use crate::core_simd::masks::*;
     pub use crate::core_simd::ord::*;
     pub use crate::core_simd::swizzle::*;
+    pub use crate::core_simd::swizzle_dyn::*;
     pub use crate::core_simd::vector::*;
 }
```

crates/core_simd/src/swizzle_dyn.rs

Lines changed: 155 additions & 0 deletions (new file)
```rust
use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected at build time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(target_arch = "aarch64")]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(target_arch = "arm", target_feature = "v7"))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(target_feature = "neon")]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                #[cfg(target_feature = "avx512vbmi")]
                64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                // Fallback: scalar loop with the documented OOB-is-0 semantics
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        }
                    }
                    array.into()
                }
            }
        }
    }
}
```
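For orientation, a minimal sketch of calling the new API (not part of the commit; it assumes a nightly toolchain with this crate available as `core_simd`, and `demo` is a made-up name):

```rust
#![feature(portable_simd)]
use core_simd::simd::Simd;

fn demo() {
    let bytes = Simd::from_array([10u8, 11, 12, 13, 14, 15, 16, 17]);
    let idxs = Simd::from_array([3u8, 3, 0, 7, 255, 1, 2, 6]);
    // Lane i of the result is bytes[idxs[i]]; 255 is out of bounds
    // for 8 lanes, so that lane becomes 0.
    let out = bytes.swizzle_dyn(idxs);
    assert_eq!(out.to_array(), [13, 13, 10, 17, 0, 11, 12, 16]);
}
```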
```rust
/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering-sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this.
        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from the previous compose step
        idxs.simd_lt(mid).select(lo_shuf, compose)
    }
}
```
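`_mm256_shuffle_epi8` only shuffles within each 16-byte half of the vector, consulting the low 4 bits of each index (plus the sign bit for zeroing), which is why the function above needs two duplicated-half shuffles plus two selects. A scalar model of that dataflow, added purely for illustration:

```rust
// Scalar model of avx2_pshufb (illustration only, not in the commit).
fn avx2_pshufb_model(bytes: [u8; 32], idxs: [u8; 32]) -> [u8; 32] {
    let mut out = [0u8; 32];
    for i in 0..32 {
        let k = idxs[i] as usize;
        // hi_shuf: both 128-bit halves hold bytes[16..32], so the low
        // 4 index bits address the top half of the original vector.
        let hi_shuf = bytes[16 + (k & 0xf)];
        // First compose: keep hi_shuf where the index is in 0..32, else 0.
        let compose = if k < 32 { hi_shuf } else { 0 };
        // lo_shuf: both halves hold bytes[0..16].
        let lo_shuf = bytes[k & 0xf];
        // Second compose: indices in 0..16 take lo_shuf instead.
        out[i] = if k < 16 { lo_shuf } else { compose };
    }
    out
}
```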
```rust
/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
```
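Concretely, the SSSE3 arm above ends up doing something like the following hypothetical monomorphization for `N = 16` on x86-64 (`transize_16` is a sketch, not code from the commit):

```rust
unsafe fn transize_16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::x86_64::{__m128i, _mm_shuffle_epi8};
    // Map out-of-range indices to 0xFF first (see zeroing_idxs below).
    let idxs = zeroing_idxs(idxs);
    // Simd<u8, 16> and __m128i are both 16 bytes, so transmute_copy
    // merely reinterprets the bits in each direction.
    let a: __m128i = mem::transmute_copy(&bytes);
    let b: __m128i = mem::transmute_copy(&idxs);
    mem::transmute_copy(&_mm_shuffle_epi8(a, b))
}
```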
```rust
/// Make indices that yield 0 for this architecture
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, make sure the top bit is set.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::SimdPartialOrd;
        idxs.simd_lt(Simd::splat(N as u8))
            .select(idxs, Simd::splat(u8::MAX))
    };
    // Simply do nothing on most architectures.
    idxs
}
```
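x86's byte shuffles zero an output lane whenever bit 7 of its index byte is set, so saturating every out-of-range index to `u8::MAX` reproduces the documented OOB-is-0 behavior. A quick illustration with made-up values (the mapping only fires when targeting x86/x86-64):

```rust
fn zeroing_idxs_demo() {
    // For N = 4: 2 and 3 are in range and pass through, while 200 and
    // 17 become 0xFF, whose set top bit makes pshufb-family
    // instructions write 0 to that lane.
    let idxs = Simd::from_array([2u8, 200, 3, 17]);
    assert_eq!(zeroing_idxs(idxs).to_array(), [2, 0xFF, 3, 0xFF]);
}
```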
```rust
/// As `transize`, but with no implicit call to `zeroing_idxs`.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
```

crates/core_simd/tests/swizzle_dyn.rs

Lines changed: 74 additions & 0 deletions (new file)
```rust
#![feature(portable_simd)]
use core::{fmt, ops::RangeInclusive};
use proptest;
use test_helpers::{self, biteq, make_runner, prop_assert_biteq};

/// Scalar reference implementation with the same OOB-is-0 semantics.
fn swizzle_dyn_scalar_ver<const N: usize>(values: [u8; N], idxs: [u8; N]) -> [u8; N] {
    let mut array = [0; N];
    for (i, k) in idxs.into_iter().enumerate() {
        if (k as usize) < N {
            array[i] = values[k as usize];
        }
    }
    array
}

test_helpers::test_lanes! {
    fn swizzle_dyn<const N: usize>() {
        match_simd_with_fallback(
            &core_simd::simd::Simd::<u8, N>::swizzle_dyn,
            &swizzle_dyn_scalar_ver,
            &|_, _| true,
        );
    }
}

fn match_simd_with_fallback<Scalar, ScalarResult, Vector, VectorResult, const N: usize>(
    fv: &dyn Fn(Vector, Vector) -> VectorResult,
    fs: &dyn Fn([Scalar; N], [Scalar; N]) -> [ScalarResult; N],
    check: &dyn Fn([Scalar; N], [Scalar; N]) -> bool,
) where
    Scalar: Copy + fmt::Debug + SwizzleStrategy,
    ScalarResult: Copy + biteq::BitEq + fmt::Debug + SwizzleStrategy,
    Vector: Into<[Scalar; N]> + From<[Scalar; N]> + Copy,
    VectorResult: Into<[ScalarResult; N]> + From<[ScalarResult; N]> + Copy,
{
    test_swizzles_2(&|x: [Scalar; N], y: [Scalar; N]| {
        proptest::prop_assume!(check(x, y));
        let result_v: [ScalarResult; N] = fv(x.into(), y.into()).into();
        let result_s: [ScalarResult; N] = fs(x, y);
        crate::prop_assert_biteq!(result_v, result_s);
        Ok(())
    });
}

fn test_swizzles_2<A: fmt::Debug + SwizzleStrategy, B: fmt::Debug + SwizzleStrategy>(
    f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult,
) {
    let mut runner = make_runner();
    runner
        .run(
            &(A::swizzled_strategy(), B::swizzled_strategy()),
            |(a, b)| f(a, b),
        )
        .unwrap();
}

pub trait SwizzleStrategy {
    type Strategy: proptest::strategy::Strategy<Value = Self>;
    fn swizzled_strategy() -> Self::Strategy;
}

impl SwizzleStrategy for u8 {
    type Strategy = RangeInclusive<u8>;
    fn swizzled_strategy() -> Self::Strategy {
        0..=64
    }
}

impl<T: fmt::Debug + SwizzleStrategy, const N: usize> SwizzleStrategy for [T; N] {
    type Strategy = test_helpers::array::UniformArrayStrategy<T::Strategy, Self>;
    fn swizzled_strategy() -> Self::Strategy {
        Self::Strategy::new(T::swizzled_strategy())
    }
}
```
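Note the index strategy `0..=64`: the largest supported lane count is 64, so sampled indices regularly land at or above N and exercise the out-of-bounds path for every lane count. A hand check of the scalar reference, added for illustration only (`scalar_ver_demo` is not in the commit):

```rust
fn scalar_ver_demo() {
    // For N = 4, indices 4 and 64 are out of range and must yield 0.
    let out = swizzle_dyn_scalar_ver([10u8, 11, 12, 13], [1, 4, 0, 64]);
    assert_eq!(out, [11, 0, 10, 0]);
}
```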
