Skip to content

AVX512 implementation #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ const_xxh64 = [] # Enable const xxh64 implementation
xxh3 = [] # Enable xxh3 implementation
const_xxh3 = [] # Enable const xxh3 implementation

unstable = [] # Enables AVX-512, which requires nightly compiler

[dev-dependencies]
getrandom = "0.2"
xxhash-c-sys = "0.8.6"

[package.metadata.docs.rs]
features = ["xxh32", "const_xxh32", "xxh64", "const_xxh64", "xxh3", "const_xxh3"]
features = ["xxh32", "const_xxh32", "xxh64", "const_xxh64", "xxh3", "const_xxh3", "unstable"]
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ By default all features are off.
- `const_xxh64` - `const fn` version of `xxh64` algorithm
- `xxh3` - Enables `xxh3` family of algorithms, superior to `xxh32` and `xxh64` in terms of performance.
- `const_xxh3` - `const fn` version of `xxh3` algorithm
- `unstable` - Enables AVX512 implementation which requires an unstable nightly feature

## HW acceleration

Expand All @@ -58,6 +59,7 @@ Used SIMD acceleration:

- SSE2 - widely available, can be safely enabled in 99% of cases. Enabled by default in `x86_64` targets.
- AVX2;
- AVX512 - needs nightly compiler and enabling the `unstable` feature.
- Neon - Enabled by default on aarch64 targets (most likely)
- Wasm SIMD128 - Has to be enabled via rust flag: `-Ctarget-feature=+simd128`

Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
#![no_std]
#![warn(missing_docs)]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::style))]
#![cfg_attr(feature = "unstable", feature(stdarch_x86_avx512))]

#[cfg(feature = "std")]
extern crate std;
Expand Down
106 changes: 86 additions & 20 deletions src/xxh3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,19 @@ use crate::utils::{Buffer, get_unaligned_chunk, get_aligned_chunk_ref};
// Code is as close to original C implementation as possible
// It does make it look ugly, but it is fast and easy to update once xxhash gets new version.

#[cfg(all(any(target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128")), not(target_feature = "avx2")))]
#[cfg(all(any(target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128")), not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
#[repr(align(16))]
#[derive(Clone)]
struct Acc([u64; ACC_NB]);
#[cfg(target_feature = "avx2")]
#[cfg(all(target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
#[repr(align(32))]
#[derive(Clone)]
struct Acc([u64; ACC_NB]);
#[cfg(not(any(target_feature = "avx2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"), target_feature = "sse2")))]
#[cfg(all(feature = "unstable", target_feature = "avx512f"))]
#[repr(align(64))]
#[derive(Clone)]
struct Acc([u64; ACC_NB]);
#[cfg(not(any(all(feature = "unstable", target_feature = "avx512f"), target_feature = "avx2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"), target_feature = "sse2")))]
#[repr(align(8))]
#[derive(Clone)]
struct Acc([u64; ACC_NB]);
Expand All @@ -36,18 +40,24 @@ type LongHashFn128 = fn(&[u8], u64, &[u8]) -> u128;

#[cfg(all(target_family = "wasm", target_feature = "simd128"))]
type StripeLanes = [[u8; mem::size_of::<core::arch::wasm32::v128>()]; STRIPE_LEN / mem::size_of::<core::arch::wasm32::v128>()];
#[cfg(all(target_arch = "x86", target_feature = "avx2"))]
#[cfg(all(feature = "unstable", target_arch = "x86", target_feature = "avx512f"))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86::__m512i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86::__m512i>()];
#[cfg(all(feature = "unstable", target_arch = "x86_64", target_feature = "avx512f"))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86_64::__m512i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86_64::__m512i>()];
#[cfg(all(target_arch = "x86", target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86::__m256i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86::__m256i>()];
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86_64::__m256i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86_64::__m256i>()];
#[cfg(all(target_arch = "x86", target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_arch = "x86", target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86::__m128i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86::__m128i>()];
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_arch = "x86_64", target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
type StripeLanes = [[u8; mem::size_of::<core::arch::x86_64::__m128i>()]; STRIPE_LEN / mem::size_of::<core::arch::x86_64::__m128i>()];
#[cfg(target_feature = "neon")]
type StripeLanes = [[u8; mem::size_of::<core::arch::aarch64::uint8x16_t>()]; STRIPE_LEN / mem::size_of::<core::arch::aarch64::uint8x16_t>()];

#[cfg(any(target_feature = "sse2", target_feature = "avx2"))]
// TODO: replace with [`core::arch::x86::_MM_SHUFFLE`](https://doc.rust-lang.org/core/arch/x86/fn._MM_SHUFFLE.html)
// when it stabilizes
#[cfg(any(target_feature = "sse2", target_feature = "avx2", target_feature = "avx512f"))]
#[inline]
const fn _mm_shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
((z << 6) | (y << 4) | (x << 2) | w) as i32
Expand Down Expand Up @@ -273,7 +283,7 @@ fn accumulate_512_neon(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes)
}
}

#[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
fn accumulate_512_sse2(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
Expand All @@ -298,7 +308,7 @@ fn accumulate_512_sse2(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes)
}
}

#[cfg(target_feature = "avx2")]
#[cfg(all(target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
fn accumulate_512_avx2(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
Expand All @@ -323,7 +333,32 @@ fn accumulate_512_avx2(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes)
}
}

#[cfg(not(any(target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
#[cfg(all(feature = "unstable", target_feature = "avx512f"))]
fn accumulate_512_avx512(acc: &mut Acc, input: &StripeLanes, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

let xacc = acc.0.as_mut_ptr() as *mut __m512i;

let idx = 0;

let data_vec = _mm512_loadu_si512(input[idx].as_ptr() as _);
let key_vec = _mm512_loadu_si512(secret[idx].as_ptr() as _);
let data_key = _mm512_xor_si512(data_vec, key_vec);

let data_key_lo = _mm512_srli_epi64(data_key, 32);
let product = _mm512_mul_epu32(data_key, data_key_lo);

let data_swap = _mm512_shuffle_epi32(data_vec, _mm_shuffle(1, 0, 3, 2));
let sum = _mm512_add_epi64(*xacc.add(idx), data_swap);
xacc.add(idx).write(_mm512_add_epi64(product, sum));
}
}

#[cfg(not(any(all(feature = "unstable", target_feature = "avx512f"), target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
fn accumulate_512_scalar(acc: &mut Acc, input: &[[u8; 8]; ACC_NB], secret: &[[u8; 8]; ACC_NB]) {
for idx in 0..ACC_NB {
let data_val = u64::from_ne_bytes(input[idx]).to_le();
Expand All @@ -338,11 +373,13 @@ fn accumulate_512_scalar(acc: &mut Acc, input: &[[u8; 8]; ACC_NB], secret: &[[u8
use accumulate_512_wasm as accumulate_512;
#[cfg(target_feature = "neon")]
use accumulate_512_neon as accumulate_512;
#[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
use accumulate_512_sse2 as accumulate_512;
#[cfg(target_feature = "avx2")]
#[cfg(all(target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
use accumulate_512_avx2 as accumulate_512;
#[cfg(not(any(target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
#[cfg(all(feature = "unstable", target_feature = "avx512f"))]
use accumulate_512_avx512 as accumulate_512;
#[cfg(not(any(all(feature = "unstable", target_feature = "avx512f"), target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
use accumulate_512_scalar as accumulate_512;

#[cfg(all(target_family = "wasm", target_feature = "simd128"))]
Expand Down Expand Up @@ -397,7 +434,7 @@ fn scramble_acc_neon(acc: &mut Acc, secret: &StripeLanes) {
}
}

#[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
fn scramble_acc_sse2(acc: &mut Acc, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
Expand All @@ -424,7 +461,7 @@ fn scramble_acc_sse2(acc: &mut Acc, secret: &StripeLanes) {
}
}

#[cfg(target_feature = "avx2")]
#[cfg(all(target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
fn scramble_acc_avx2(acc: &mut Acc, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
Expand All @@ -451,7 +488,33 @@ fn scramble_acc_avx2(acc: &mut Acc, secret: &StripeLanes) {
}
}

#[cfg(not(any(target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
#[cfg(all(feature = "unstable", target_feature = "avx512f"))]
fn scramble_acc_avx512(acc: &mut Acc, secret: &StripeLanes) {
unsafe {
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

let xacc = acc.0.as_mut_ptr() as *mut __m512i;
let prime32 = _mm512_set1_epi32(xxh32::PRIME_1 as i32);

let idx = 0;

let acc_vec = *xacc.add(idx);
let shifted = _mm512_srli_epi64(acc_vec, 47);

let key_vec = _mm512_loadu_si512(secret[idx].as_ptr() as _);
let data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96);

let data_key_hi = _mm512_srli_epi64(data_key, 32);
let prod_lo = _mm512_mul_epu32(data_key, prime32);
let prod_hi = _mm512_mul_epu32(data_key_hi, prime32);
xacc.add(idx).write(_mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)));
}
}

#[cfg(not(any(all(feature = "unstable", target_feature = "avx512f"), target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
fn scramble_acc_scalar(acc: &mut Acc, secret: &[[u8; 8]; ACC_NB]) {
for idx in 0..secret.len() {
let key = u64::from_ne_bytes(secret[idx]).to_le();
Expand All @@ -467,13 +530,16 @@ use scramble_acc_wasm as scramble_acc;
#[cfg(target_feature = "neon")]
use scramble_acc_neon as scramble_acc;

#[cfg(all(target_feature = "sse2", not(target_feature = "avx2")))]
#[cfg(all(target_feature = "sse2", not(any(target_feature = "avx2", all(feature = "unstable", target_feature = "avx512f")))))]
use scramble_acc_sse2 as scramble_acc;

#[cfg(target_feature = "avx2")]
#[cfg(all(target_feature = "avx2", not(all(feature = "unstable", target_feature = "avx512f"))))]
use scramble_acc_avx2 as scramble_acc;

#[cfg(not(any(target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
#[cfg(all(feature = "unstable", target_feature = "avx512f"))]
use scramble_acc_avx512 as scramble_acc;

#[cfg(not(any(all(feature = "unstable", target_feature = "avx512f"), target_feature = "avx2", target_feature = "sse2", target_feature = "neon", all(target_family = "wasm", target_feature = "simd128"))))]
use scramble_acc_scalar as scramble_acc;

#[inline(always)]
Expand Down