Skip to content

Commit 8a47574

Browse files
committed
chacha20: Switch to 4-block buffer for SSE2 / AVX2 backend
1 parent 972b633 commit 8a47574

File tree

2 files changed

+74
-26
lines changed

2 files changed

+74
-26
lines changed

chacha20/src/backend/autodetect.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ use core::mem::ManuallyDrop;
88

99
/// Size of buffers passed to `generate` and `apply_keystream` for this
1010
/// backend, which operates on two blocks in parallel for optimal performance.
11-
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 2;
11+
/// The backend consumes four blocks at a time, so that the AVX2 implementation
12+
/// can additionally pipeline the pairs of blocks for better ILP.
13+
pub(crate) const BUFFER_SIZE: usize = BLOCK_SIZE * 4;
1214

1315
cpufeatures::new!(avx2_cpuid, "avx2");
1416

chacha20/src/backend/avx2.rs

Lines changed: 71 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,69 +18,104 @@ use core::arch::x86::*;
1818
use core::arch::x86_64::*;
1919

2020
/// The number of blocks processed per invocation by this backend.
21-
const BLOCKS: usize = 2;
21+
const BLOCKS: usize = 4;
2222

2323
/// Helper union for accessing per-block state.
2424
///
2525
/// ChaCha20 block state is stored in four 32-bit words, so we can process two blocks in
2626
/// parallel. We store the state words as a union to enable cheap transformations between
2727
/// their interpretations.
28+
///
29+
/// Additionally, we process four blocks at a time to take advantage of ILP.
2830
#[derive(Clone, Copy)]
2931
union StateWord {
3032
blocks: [__m128i; BLOCKS],
31-
avx: __m256i,
33+
avx: [__m256i; BLOCKS / 2],
3234
}
3335

3436
impl StateWord {
3537
#[inline]
3638
#[target_feature(enable = "avx2")]
3739
unsafe fn add_assign_epi32(&mut self, rhs: &Self) {
38-
self.avx = _mm256_add_epi32(self.avx, rhs.avx);
40+
self.avx = [
41+
_mm256_add_epi32(self.avx[0], rhs.avx[0]),
42+
_mm256_add_epi32(self.avx[1], rhs.avx[1]),
43+
];
3944
}
4045

4146
#[inline]
4247
#[target_feature(enable = "avx2")]
4348
unsafe fn xor_assign(&mut self, rhs: &Self) {
44-
self.avx = _mm256_xor_si256(self.avx, rhs.avx);
49+
self.avx = [
50+
_mm256_xor_si256(self.avx[0], rhs.avx[0]),
51+
_mm256_xor_si256(self.avx[1], rhs.avx[1]),
52+
];
4553
}
4654

4755
#[inline]
4856
#[target_feature(enable = "avx2")]
4957
unsafe fn shuffle_epi32<const MASK: i32>(&mut self) {
50-
self.avx = _mm256_shuffle_epi32(self.avx, MASK);
58+
self.avx = [
59+
_mm256_shuffle_epi32(self.avx[0], MASK),
60+
_mm256_shuffle_epi32(self.avx[1], MASK),
61+
];
5162
}
5263

5364
#[inline]
5465
#[target_feature(enable = "avx2")]
5566
unsafe fn rol<const BY: i32, const REST: i32>(&mut self) {
56-
self.avx = _mm256_xor_si256(
57-
_mm256_slli_epi32(self.avx, BY),
58-
_mm256_srli_epi32(self.avx, REST),
59-
);
67+
self.avx = [
68+
_mm256_xor_si256(
69+
_mm256_slli_epi32(self.avx[0], BY),
70+
_mm256_srli_epi32(self.avx[0], REST),
71+
),
72+
_mm256_xor_si256(
73+
_mm256_slli_epi32(self.avx[1], BY),
74+
_mm256_srli_epi32(self.avx[1], REST),
75+
),
76+
];
6077
}
6178

6279
#[inline]
6380
#[target_feature(enable = "avx2")]
6481
unsafe fn rol_8(&mut self) {
65-
self.avx = _mm256_shuffle_epi8(
66-
self.avx,
67-
_mm256_set_epi8(
68-
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15, 10, 9, 8, 11,
69-
6, 5, 4, 7, 2, 1, 0, 3,
82+
self.avx = [
83+
_mm256_shuffle_epi8(
84+
self.avx[0],
85+
_mm256_set_epi8(
86+
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15, 10, 9, 8,
87+
11, 6, 5, 4, 7, 2, 1, 0, 3,
88+
),
89+
),
90+
_mm256_shuffle_epi8(
91+
self.avx[1],
92+
_mm256_set_epi8(
93+
14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15, 10, 9, 8,
94+
11, 6, 5, 4, 7, 2, 1, 0, 3,
95+
),
7096
),
71-
);
97+
];
7298
}
7399

74100
#[inline]
75101
#[target_feature(enable = "avx2")]
76102
unsafe fn rol_16(&mut self) {
77-
self.avx = _mm256_shuffle_epi8(
78-
self.avx,
79-
_mm256_set_epi8(
80-
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10,
81-
5, 4, 7, 6, 1, 0, 3, 2,
103+
self.avx = [
104+
_mm256_shuffle_epi8(
105+
self.avx[0],
106+
_mm256_set_epi8(
107+
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11,
108+
10, 5, 4, 7, 6, 1, 0, 3, 2,
109+
),
82110
),
83-
);
111+
_mm256_shuffle_epi8(
112+
self.avx[1],
113+
_mm256_set_epi8(
114+
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11,
115+
10, 5, 4, 7, 6, 1, 0, 3, 2,
116+
),
117+
),
118+
];
84119
}
85120
}
86121

@@ -179,9 +214,15 @@ unsafe fn key_setup(key: &[u8; KEY_SIZE]) -> (StateWord, StateWord, StateWord) {
179214
let v2 = _mm_loadu_si128(key.as_ptr().offset(0x10) as *const __m128i);
180215

181216
(
182-
StateWord { blocks: [v0, v0] },
183-
StateWord { blocks: [v1, v1] },
184-
StateWord { blocks: [v2, v2] },
217+
StateWord {
218+
blocks: [v0, v0, v0, v0],
219+
},
220+
StateWord {
221+
blocks: [v1, v1, v1, v1],
222+
},
223+
StateWord {
224+
blocks: [v2, v2, v2, v2],
225+
},
185226
)
186227
}
187228

@@ -196,7 +237,12 @@ unsafe fn iv_setup(iv: [i32; 2], counter: u64) -> StateWord {
196237
);
197238

198239
StateWord {
199-
blocks: [s3, _mm_add_epi64(s3, _mm_set_epi64x(0, 1))],
240+
blocks: [
241+
s3,
242+
_mm_add_epi64(s3, _mm_set_epi64x(0, 1)),
243+
_mm_add_epi64(s3, _mm_set_epi64x(0, 2)),
244+
_mm_add_epi64(s3, _mm_set_epi64x(0, 3)),
245+
],
200246
}
201247
}
202248

0 commit comments

Comments
 (0)