Skip to content

Commit 1fa5587

Browse files
committed
hs1-siv: add SSE2 version of Hasher::update_block
Despite my best efforts, I seem unable to get LLVM to emit vectorized code, even though it should be obviously beneficial. I suspect LLVM is thrown off by the 64-bit multiply, which is missing in the SSE2 instruction set. It did take me a while to figure out that casting an array of __m128i to [u64; 2] would end up being the most performant. The SSE2 version is about 20% faster for me, so it is a substantial improvement. Also, inline(always) on pretty much everything is now beneficial, whereas before it led to significant regressions. It does create a fair bit of code bloat though.
1 parent 7f412fe commit 1fa5587

File tree

2 files changed

+80
-27
lines changed

2 files changed

+80
-27
lines changed

hs1-siv/src/hash.rs

Lines changed: 31 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
use super::{
2-
mask, Array, ArraySize, False, Gr, Hs1HashKey, Hs1Params, PhantomData, Quot, True, B16, U4,
3-
};
1+
use super::{mask, Array, False, Gr, Hs1HashKey, Hs1Params, PhantomData, Quot, True, B16, U4};
42
use aead::array::typenum::Unsigned;
53
use core::mem;
64

5+
#[cfg(target_feature = "sse2")]
6+
mod sse2;
7+
78
#[derive(Clone)]
89
pub struct Hasher<P: Hs1Params> {
910
k: Hs1HashKey<P>,
@@ -53,41 +54,52 @@ impl<P: Hs1Params> Hasher<P> {
5354
pub fn new(k: &Hs1HashKey<P>) -> Self {
5455
Self {
5556
k: k.clone(),
56-
h: array_from_iter(core::iter::repeat(1)),
57-
block: Array::default(),
57+
h: Array::from_fn(|_| 1),
58+
block: Default::default(),
5859
bytes: 0,
5960
_marker: PhantomData,
6061
}
6162
}
6263

64+
#[inline(always)]
6365
fn update_block(&mut self) -> &mut Self {
6466
assert!(usize::from(self.bytes) <= self.block_u8().len());
6567

68+
#[cfg(target_feature = "sse2")]
69+
if true {
70+
// SAFETY: sse2 is supported
71+
unsafe {
72+
return self.update_block_sse2();
73+
}
74+
}
75+
6676
#[inline(always)]
67-
fn nh_step(&[ax, bx, cx, dx]: &[u32; 4], &[ay, by, cy, dy]: &[u32; 4]) -> u64 {
68-
let d = u64::from(dx.wrapping_add(dy));
69-
let c = u64::from(cx.wrapping_add(cy));
70-
let b = u64::from(bx.wrapping_add(by));
77+
fn nh_step(&[ax, bx, cx, dx]: &[u32; 4], &[ay, by, cy, dy]: &[u32; 4]) -> [u64; 2] {
7178
let a = u64::from(ax.wrapping_add(ay));
72-
(a * c).wrapping_add(b * d)
79+
let b = u64::from(bx.wrapping_add(by));
80+
let c = u64::from(cx.wrapping_add(cy));
81+
let d = u64::from(dx.wrapping_add(dy));
82+
[a * c, b * d]
7383
}
7484

7585
let m_ints = &self.block;
7686

7787
let block16_count = usize::from(((self.bytes + 15) / 16).max(1));
7888

79-
let mut nh = Array::<u64, P::T>::default();
89+
let mut nh = Array::<[u64; 2], P::T>::default();
8090
for (i0, m_ints_i) in m_ints.chunks_exact(4).enumerate().take(block16_count) {
81-
for (nh_i, k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
91+
for ([nh_i0, nh_i1], k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
8292
let k_n_i_i = k_n_i_i.try_into().expect("exactly 4 elements");
8393
let m_ints_i = m_ints_i.try_into().expect("exactly 4 elements");
84-
let s = nh_step(k_n_i_i, m_ints_i);
85-
*nh_i = nh_i.wrapping_add(s);
94+
let [s0, s1] = nh_step(k_n_i_i, m_ints_i);
95+
*nh_i0 = nh_i0.wrapping_add(s0);
96+
*nh_i1 = nh_i1.wrapping_add(s1);
8697
}
8798
}
8899

89100
nh.iter()
90-
.map(|nh_i| (nh_i + (u64::from(self.bytes) & mask(4))) & mask(60))
101+
.map(|&[ac, bd]| ac.wrapping_add(bd))
102+
.map(|nh_i| (nh_i.wrapping_add(u64::from(self.bytes) & mask(4))) & mask(60))
91103
.zip(self.k.poly.iter())
92104
.zip(self.h.iter_mut())
93105
.for_each(|((a_i, &k_p_i), h_i)| *h_i = poly_step(*h_i, a_i, k_p_i));
@@ -97,6 +109,7 @@ impl<P: Hs1Params> Hasher<P> {
97109
self
98110
}
99111

112+
#[inline(always)]
100113
pub fn update<'a>(&'a mut self, bytes: &[u8]) -> &'a mut Self {
101114
assert!(usize::from(self.bytes) < self.block_u8().len());
102115
let start = usize::from(self.bytes);
@@ -123,6 +136,7 @@ impl<P: Hs1Params> Hasher<P> {
123136
self
124137
}
125138

139+
#[inline(always)]
126140
pub(crate) fn pad_to(&mut self, bits: u8) -> &mut Self {
127141
debug_assert!(1 << bits <= B16::<P>::to_u8());
128142
let m = mask(bits) as u8;
@@ -131,6 +145,7 @@ impl<P: Hs1Params> Hasher<P> {
131145
}
132146

133147
// TODO &mut self helps avoid needing to clone(), but might be unintuitive
148+
#[inline(always)]
134149
pub fn finalize(&mut self) -> Array<Output<P>, P::T> {
135150
// TODO we need to handle empty data properly
136151
// However, see the note in crate::test::test_vectors::hash_me_empty
@@ -146,6 +161,7 @@ impl<P: Hs1Params> Hasher<P> {
146161
out
147162
}
148163

164+
#[inline(always)]
149165
fn block_u8(&mut self) -> &mut Array<u8, B16<P>> {
150166
const {
151167
assert!(
@@ -177,18 +193,6 @@ const fn poly_finalize(a: u64) -> u64 {
177193
a & c
178194
}
179195

180-
#[inline(always)]
181-
fn array_from_iter<I, L>(it: I) -> Array<I::Item, L>
182-
where
183-
I: IntoIterator,
184-
L: ArraySize,
185-
I::Item: Default,
186-
{
187-
let mut v = Array::<I::Item, L>::default();
188-
v.iter_mut().zip(it).for_each(|(w, r)| *w = r);
189-
v
190-
}
191-
192196
#[cfg(test)]
193197
mod test {
194198
#[test]

hs1-siv/src/hash/sse2.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
use super::{mask, poly_step, Array, Hasher, Hs1Params};
2+
use core::arch::x86_64::*;
3+
4+
impl<P: Hs1Params> Hasher<P> {
5+
#[inline(always)]
6+
#[cfg(target_feature = "sse2")]
7+
pub(super) unsafe fn update_block_sse2(&mut self) -> &mut Self {
8+
assert!(usize::from(self.bytes) <= self.block_u8().len());
9+
10+
#[inline(always)]
11+
unsafe fn nh_step(x: &[u32; 4], y: &[u32; 4]) -> __m128i {
12+
let x = x.as_ptr().cast::<__m128i>().read_unaligned();
13+
let y = y.as_ptr().cast::<__m128i>().read_unaligned();
14+
let xy = _mm_add_epi32(x, y);
15+
16+
let a_b = _mm_shuffle_epi32::<0b00_01_00_00>(xy);
17+
let c_d = _mm_shuffle_epi32::<0b00_11_00_10>(xy);
18+
_mm_mul_epu32(a_b, c_d)
19+
}
20+
21+
let m_ints = &self.block;
22+
23+
let block16_count = usize::from(((self.bytes + 15) / 16).max(1));
24+
25+
let mut nh: Array<__m128i, P::T> = Array::from_fn(|_| _mm_setzero_si128());
26+
for (i0, m_ints_i) in m_ints.chunks_exact(4).enumerate().take(block16_count) {
27+
for (nh_i, k_n_i_i) in nh.iter_mut().zip(self.k.nh.chunks_exact(4).skip(i0)) {
28+
let k_n_i_i = k_n_i_i.try_into().expect("exactly 4 elements");
29+
let m_ints_i = m_ints_i.try_into().expect("exactly 4 elements");
30+
let s = nh_step(k_n_i_i, m_ints_i);
31+
*nh_i = _mm_add_epi64(*nh_i, s);
32+
}
33+
}
34+
35+
nh.iter()
36+
.map(|nh_i| {
37+
let &[ac, bd] = &*(nh_i as *const _ as *const [u64; 2]);
38+
ac.wrapping_add(bd)
39+
})
40+
.map(|nh_i| (nh_i.wrapping_add(u64::from(self.bytes) & mask(4))) & mask(60))
41+
.zip(self.k.poly.iter())
42+
.zip(self.h.iter_mut())
43+
.for_each(|((a_i, &k_p_i), h_i)| *h_i = poly_step(*h_i, a_i, k_p_i));
44+
45+
self.bytes = 0;
46+
47+
self
48+
}
49+
}

0 commit comments

Comments
 (0)