Skip to content

Commit 4ab9f09

Browse files
committed
Extend prefetch option.
1 parent ec1e41c commit 4ab9f09

File tree

1 file changed

+22
-12
lines changed

1 file changed

+22
-12
lines changed

crypto/cryptonight_aesni.h

+22 -12
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
149149
*x7 = soft_aesenc(*x7, key);
150150
}
151151

152-
template<size_t MEM, bool SOFT_AES>
152+
template<size_t MEM, bool SOFT_AES, bool PREFETCH>
153153
void cn_explode_scratchpad(const __m128i* input, __m128i* output)
154154
{
155155
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -200,16 +200,21 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
200200
_mm_store_si128(output + i + 1, xin1);
201201
_mm_store_si128(output + i + 2, xin2);
202202
_mm_store_si128(output + i + 3, xin3);
203-
_mm_prefetch((const char*)output + i + 0, _MM_HINT_T2);
203+
204+
if(PREFETCH)
205+
_mm_prefetch((const char*)output + i + 0, _MM_HINT_T2);
206+
204207
_mm_store_si128(output + i + 4, xin4);
205208
_mm_store_si128(output + i + 5, xin5);
206209
_mm_store_si128(output + i + 6, xin6);
207210
_mm_store_si128(output + i + 7, xin7);
208-
_mm_prefetch((const char*)output + i + 4, _MM_HINT_T2);
211+
212+
if(PREFETCH)
213+
_mm_prefetch((const char*)output + i + 4, _MM_HINT_T2);
209214
}
210215
}
211216

212-
template<size_t MEM, bool SOFT_AES>
217+
template<size_t MEM, bool SOFT_AES, bool PREFETCH>
213218
void cn_implode_scratchpad(const __m128i* input, __m128i* output)
214219
{
215220
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -229,12 +234,17 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
229234

230235
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
231236
{
232-
_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
237+
if(PREFETCH)
238+
_mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
239+
233240
xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
234241
xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
235242
xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
236243
xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
237-
_mm_prefetch((const char*)input + i + 4, _MM_HINT_NTA);
244+
245+
if(PREFETCH)
246+
_mm_prefetch((const char*)input + i + 4, _MM_HINT_NTA);
247+
238248
xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
239249
xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
240250
xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
@@ -284,7 +294,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
284294
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
285295

286296
// Optim - 99% time boundary
287-
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
297+
cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
288298

289299
uint8_t* l0 = ctx0->long_state;
290300
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -332,7 +342,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
332342
}
333343

334344
// Optim - 90% time boundary
335-
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
345+
cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
336346

337347
// Optim - 99% time boundary
338348

@@ -350,8 +360,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
350360
keccak((const uint8_t *)input+len, len, ctx1->hash_state, 200);
351361

352362
// Optim - 99% time boundary
353-
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
354-
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
363+
cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
364+
cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->hash_state, (__m128i*)ctx1->long_state);
355365

356366
uint8_t* l0 = ctx0->long_state;
357367
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -425,8 +435,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
425435
}
426436

427437
// Optim - 90% time boundary
428-
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
429-
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
438+
cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
439+
cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH>((__m128i*)ctx1->long_state, (__m128i*)ctx1->hash_state);
430440

431441
// Optim - 99% time boundary
432442

0 commit comments

Comments
 (0)