use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;

impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected during build-time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
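    ///
    /// # Examples
    ///
    /// A minimal usage sketch (illustrative values; assumes the nightly
    /// `portable_simd` feature):
    ///
    /// ```
    /// # #![feature(portable_simd)]
    /// # use core::simd::Simd;
    /// let bytes = Simd::<u8, 8>::from_array([10, 20, 30, 40, 50, 60, 70, 80]);
    /// let idxs = Simd::<u8, 8>::from_array([3, 0, 1, 9, 2, 2, 7, 255]);
    /// // In-range indices pick the matching byte; 9 and 255 are out of bounds and yield 0.
    /// assert_eq!(bytes.swizzle_dyn(idxs).to_array(), [40, 10, 20, 0, 30, 30, 80, 0]);
    /// ```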
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(target_arch = "aarch64")]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(target_arch = "arm", target_feature = "v7"))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(target_feature = "neon")]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize_raw(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                // Notable absence: avx512bw shuffle
                // If avx512bw is available, odds of avx512vbmi are good
                #[cfg(target_feature = "avx512vbmi")]
                64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                _ => {
                    // Scalar fallback: lanes whose index is out of bounds stay 0.
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        }
                    }
                    array.into()
                }
            }
        }
    }
}

/// "vpshufb like it was meant to be" on AVX2
///
/// Unlike a bare `vpshufb`, this indexes across the entire 32-byte vector,
/// and any index of 32 or more yields 0.
///
/// # Safety
/// The caller must ensure AVX2 is available.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this

        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}

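// A hedged reference sketch (not used by anything above): the lane-wise behavior
// `avx2_pshufb` is built to match, written out as scalar code. The helper name
// `avx2_pshufb_scalar_model` is invented for this illustration.
#[allow(dead_code)]
fn avx2_pshufb_scalar_model(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    let (bytes, idxs) = (bytes.to_array(), idxs.to_array());
    let mut out = [0u8; 32];
    for (i, &k) in idxs.iter().enumerate() {
        // In-range indices pick from the whole 32-byte vector; anything else stays 0.
        if (k as usize) < 32 {
            out[i] = bytes[k as usize];
        }
    }
    out.into()
}
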
/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    let idxs = zeroing_idxs(idxs);
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}
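
// A hedged sketch of what one monomorphized `transize` call boils down to, here for
// `N = 16` with SSSE3 `pshufb` on x86_64: `Simd<u8, 16>` and `__m128i` are both
// 16 bytes, so `transmute_copy` reinterprets one as the other around the intrinsic.
// The helper name `transize_pshufb_16_sketch` is invented for this illustration.
#[cfg(all(target_arch = "x86_64", target_feature = "ssse3"))]
#[allow(dead_code)]
#[inline]
unsafe fn transize_pshufb_16_sketch(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::x86_64::{__m128i, _mm_shuffle_epi8};
    // Same index fixup that `transize` performs before transmuting.
    let idxs = zeroing_idxs(idxs);
    // SAFETY: the caller ensures SSSE3; both types are exactly 16 bytes wide.
    unsafe {
        let raw: __m128i =
            _mm_shuffle_epi8(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs));
        mem::transmute_copy(&raw)
    }
}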

/// Make indices that yield 0 for this architecture
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // On x86, `pshufb` zeroes a byte whenever the top bit of its index is set,
    // so map out-of-range indices to `u8::MAX` to guarantee the top bit is set.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    let idxs = {
        use crate::simd::SimdPartialOrd;
        idxs.simd_lt(Simd::splat(N as u8))
            .select(idxs, Simd::splat(u8::MAX))
    };
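    // For instance (illustrative values): with `N = 8`, indices
    // [0, 3, 7, 8, 200, 255, 1, 2] become [0, 3, 7, 255, 255, 255, 1, 2],
    // so `pshufb` zeroes those out-of-range lanes.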
    // Simply do nothing on most architectures.
    idxs
}

/// Like `transize`, but with no implicit call to `zeroing_idxs`, for callees
/// that already provide the out-of-bounds-is-0 behavior themselves.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize_raw<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    bytes: Simd<u8, N>,
    idxs: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
}