@@ -18,69 +18,104 @@ use core::arch::x86::*;
18
18
use core:: arch:: x86_64:: * ;
19
19
20
20
/// The number of blocks processed per invocation by this backend.
///
/// `StateWord` sizes its `blocks` array with this value and its `avx` array with
/// `BLOCKS / 2`. The `StateWord` methods build two-element `avx` arrays by hand, so
/// changing this value requires updating them as well.
const BLOCKS: usize = 4;
22
22
23
23
/// Helper union for accessing per-block state.
24
24
///
25
25
/// ChaCha20 block state is stored in four 32-bit words, so we can process two blocks in
26
26
/// parallel. We store the state words as a union to enable cheap transformations between
27
27
/// their interpretations.
28
+ ///
29
+ /// Additionally, we process four blocks at a time to take advantage of ILP.
28
30
#[ derive( Clone , Copy ) ]
29
31
union StateWord {
30
32
blocks : [ __m128i ; BLOCKS ] ,
31
- avx : __m256i ,
33
+ avx : [ __m256i ; BLOCKS / 2 ] ,
32
34
}
33
35
34
36
impl StateWord {
35
37
#[ inline]
36
38
#[ target_feature( enable = "avx2" ) ]
37
39
unsafe fn add_assign_epi32 ( & mut self , rhs : & Self ) {
38
- self . avx = _mm256_add_epi32 ( self . avx , rhs. avx ) ;
40
+ self . avx = [
41
+ _mm256_add_epi32 ( self . avx [ 0 ] , rhs. avx [ 0 ] ) ,
42
+ _mm256_add_epi32 ( self . avx [ 1 ] , rhs. avx [ 1 ] ) ,
43
+ ] ;
39
44
}
40
45
41
46
#[ inline]
42
47
#[ target_feature( enable = "avx2" ) ]
43
48
unsafe fn xor_assign ( & mut self , rhs : & Self ) {
44
- self . avx = _mm256_xor_si256 ( self . avx , rhs. avx ) ;
49
+ self . avx = [
50
+ _mm256_xor_si256 ( self . avx [ 0 ] , rhs. avx [ 0 ] ) ,
51
+ _mm256_xor_si256 ( self . avx [ 1 ] , rhs. avx [ 1 ] ) ,
52
+ ] ;
45
53
}
46
54
47
55
#[ inline]
48
56
#[ target_feature( enable = "avx2" ) ]
49
57
unsafe fn shuffle_epi32 < const MASK : i32 > ( & mut self ) {
50
- self . avx = _mm256_shuffle_epi32 ( self . avx , MASK ) ;
58
+ self . avx = [
59
+ _mm256_shuffle_epi32 ( self . avx [ 0 ] , MASK ) ,
60
+ _mm256_shuffle_epi32 ( self . avx [ 1 ] , MASK ) ,
61
+ ] ;
51
62
}
52
63
53
64
#[ inline]
54
65
#[ target_feature( enable = "avx2" ) ]
55
66
unsafe fn rol < const BY : i32 , const REST : i32 > ( & mut self ) {
56
- self . avx = _mm256_xor_si256 (
57
- _mm256_slli_epi32 ( self . avx , BY ) ,
58
- _mm256_srli_epi32 ( self . avx , REST ) ,
59
- ) ;
67
+ self . avx = [
68
+ _mm256_xor_si256 (
69
+ _mm256_slli_epi32 ( self . avx [ 0 ] , BY ) ,
70
+ _mm256_srli_epi32 ( self . avx [ 0 ] , REST ) ,
71
+ ) ,
72
+ _mm256_xor_si256 (
73
+ _mm256_slli_epi32 ( self . avx [ 1 ] , BY ) ,
74
+ _mm256_srli_epi32 ( self . avx [ 1 ] , REST ) ,
75
+ ) ,
76
+ ] ;
60
77
}
61
78
62
79
#[ inline]
63
80
#[ target_feature( enable = "avx2" ) ]
64
81
unsafe fn rol_8 ( & mut self ) {
65
- self . avx = _mm256_shuffle_epi8 (
66
- self . avx ,
67
- _mm256_set_epi8 (
68
- 14 , 13 , 12 , 15 , 10 , 9 , 8 , 11 , 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 , 14 , 13 , 12 , 15 , 10 , 9 , 8 , 11 ,
69
- 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 ,
82
+ self . avx = [
83
+ _mm256_shuffle_epi8 (
84
+ self . avx [ 0 ] ,
85
+ _mm256_set_epi8 (
86
+ 14 , 13 , 12 , 15 , 10 , 9 , 8 , 11 , 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 , 14 , 13 , 12 , 15 , 10 , 9 , 8 ,
87
+ 11 , 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 ,
88
+ ) ,
89
+ ) ,
90
+ _mm256_shuffle_epi8 (
91
+ self . avx [ 1 ] ,
92
+ _mm256_set_epi8 (
93
+ 14 , 13 , 12 , 15 , 10 , 9 , 8 , 11 , 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 , 14 , 13 , 12 , 15 , 10 , 9 , 8 ,
94
+ 11 , 6 , 5 , 4 , 7 , 2 , 1 , 0 , 3 ,
95
+ ) ,
70
96
) ,
71
- ) ;
97
+ ] ;
72
98
}
73
99
74
100
#[ inline]
75
101
#[ target_feature( enable = "avx2" ) ]
76
102
unsafe fn rol_16 ( & mut self ) {
77
- self . avx = _mm256_shuffle_epi8 (
78
- self . avx ,
79
- _mm256_set_epi8 (
80
- 13 , 12 , 15 , 14 , 9 , 8 , 11 , 10 , 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 , 13 , 12 , 15 , 14 , 9 , 8 , 11 , 10 ,
81
- 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 ,
103
+ self . avx = [
104
+ _mm256_shuffle_epi8 (
105
+ self . avx [ 0 ] ,
106
+ _mm256_set_epi8 (
107
+ 13 , 12 , 15 , 14 , 9 , 8 , 11 , 10 , 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 , 13 , 12 , 15 , 14 , 9 , 8 , 11 ,
108
+ 10 , 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 ,
109
+ ) ,
82
110
) ,
83
- ) ;
111
+ _mm256_shuffle_epi8 (
112
+ self . avx [ 1 ] ,
113
+ _mm256_set_epi8 (
114
+ 13 , 12 , 15 , 14 , 9 , 8 , 11 , 10 , 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 , 13 , 12 , 15 , 14 , 9 , 8 , 11 ,
115
+ 10 , 5 , 4 , 7 , 6 , 1 , 0 , 3 , 2 ,
116
+ ) ,
117
+ ) ,
118
+ ] ;
84
119
}
85
120
}
86
121
@@ -179,9 +214,15 @@ unsafe fn key_setup(key: &[u8; KEY_SIZE]) -> (StateWord, StateWord, StateWord) {
179
214
let v2 = _mm_loadu_si128 ( key. as_ptr ( ) . offset ( 0x10 ) as * const __m128i ) ;
180
215
181
216
(
182
- StateWord { blocks : [ v0, v0] } ,
183
- StateWord { blocks : [ v1, v1] } ,
184
- StateWord { blocks : [ v2, v2] } ,
217
+ StateWord {
218
+ blocks : [ v0, v0, v0, v0] ,
219
+ } ,
220
+ StateWord {
221
+ blocks : [ v1, v1, v1, v1] ,
222
+ } ,
223
+ StateWord {
224
+ blocks : [ v2, v2, v2, v2] ,
225
+ } ,
185
226
)
186
227
}
187
228
@@ -196,7 +237,12 @@ unsafe fn iv_setup(iv: [i32; 2], counter: u64) -> StateWord {
196
237
) ;
197
238
198
239
StateWord {
199
- blocks : [ s3, _mm_add_epi64 ( s3, _mm_set_epi64x ( 0 , 1 ) ) ] ,
240
+ blocks : [
241
+ s3,
242
+ _mm_add_epi64 ( s3, _mm_set_epi64x ( 0 , 1 ) ) ,
243
+ _mm_add_epi64 ( s3, _mm_set_epi64x ( 0 , 2 ) ) ,
244
+ _mm_add_epi64 ( s3, _mm_set_epi64x ( 0 , 3 ) ) ,
245
+ ] ,
200
246
}
201
247
}
202
248
0 commit comments