@@ -149,7 +149,7 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
149
149
*x7 = soft_aesenc (*x7, key);
150
150
}
151
151
152
- template <size_t MEM, bool SOFT_AES>
152
+ template <size_t MEM, bool SOFT_AES, bool PREFETCH >
153
153
void cn_explode_scratchpad (const __m128i* input, __m128i* output)
154
154
{
155
155
// This is more than we have registers for; the compiler will place 2 keys on the stack
@@ -200,16 +200,21 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
200
200
_mm_store_si128 (output + i + 1 , xin1);
201
201
_mm_store_si128 (output + i + 2 , xin2);
202
202
_mm_store_si128 (output + i + 3 , xin3);
203
- _mm_prefetch ((const char *)output + i + 0 , _MM_HINT_T2);
203
+
204
+ if (PREFETCH)
205
+ _mm_prefetch ((const char *)output + i + 0 , _MM_HINT_T2);
206
+
204
207
_mm_store_si128 (output + i + 4 , xin4);
205
208
_mm_store_si128 (output + i + 5 , xin5);
206
209
_mm_store_si128 (output + i + 6 , xin6);
207
210
_mm_store_si128 (output + i + 7 , xin7);
208
- _mm_prefetch ((const char *)output + i + 4 , _MM_HINT_T2);
211
+
212
+ if (PREFETCH)
213
+ _mm_prefetch ((const char *)output + i + 4 , _MM_HINT_T2);
209
214
}
210
215
}
211
216
212
- template <size_t MEM, bool SOFT_AES>
217
+ template <size_t MEM, bool SOFT_AES, bool PREFETCH >
213
218
void cn_implode_scratchpad (const __m128i* input, __m128i* output)
214
219
{
215
220
// This is more than we have registers for; the compiler will place 2 keys on the stack
@@ -229,12 +234,17 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
229
234
230
235
for (size_t i = 0 ; i < MEM / sizeof (__m128i); i += 8 )
231
236
{
232
- _mm_prefetch ((const char *)input + i + 0 , _MM_HINT_NTA);
237
+ if (PREFETCH)
238
+ _mm_prefetch ((const char *)input + i + 0 , _MM_HINT_NTA);
239
+
233
240
xout0 = _mm_xor_si128 (_mm_load_si128 (input + i + 0 ), xout0);
234
241
xout1 = _mm_xor_si128 (_mm_load_si128 (input + i + 1 ), xout1);
235
242
xout2 = _mm_xor_si128 (_mm_load_si128 (input + i + 2 ), xout2);
236
243
xout3 = _mm_xor_si128 (_mm_load_si128 (input + i + 3 ), xout3);
237
- _mm_prefetch ((const char *)input + i + 4 , _MM_HINT_NTA);
244
+
245
+ if (PREFETCH)
246
+ _mm_prefetch ((const char *)input + i + 4 , _MM_HINT_NTA);
247
+
238
248
xout4 = _mm_xor_si128 (_mm_load_si128 (input + i + 4 ), xout4);
239
249
xout5 = _mm_xor_si128 (_mm_load_si128 (input + i + 5 ), xout5);
240
250
xout6 = _mm_xor_si128 (_mm_load_si128 (input + i + 6 ), xout6);
@@ -284,7 +294,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
284
294
keccak ((const uint8_t *)input, len, ctx0->hash_state , 200 );
285
295
286
296
// Optim - 99% time boundary
287
- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
297
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
288
298
289
299
uint8_t * l0 = ctx0->long_state ;
290
300
uint64_t * h0 = (uint64_t *)ctx0->hash_state ;
@@ -332,7 +342,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
332
342
}
333
343
334
344
// Optim - 90% time boundary
335
- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
345
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
336
346
337
347
// Optim - 99% time boundary
338
348
@@ -350,8 +360,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
350
360
keccak ((const uint8_t *)input+len, len, ctx1->hash_state , 200 );
351
361
352
362
// Optim - 99% time boundary
353
- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
354
- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state , (__m128i*)ctx1->long_state );
363
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
364
+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx1->hash_state , (__m128i*)ctx1->long_state );
355
365
356
366
uint8_t * l0 = ctx0->long_state ;
357
367
uint64_t * h0 = (uint64_t *)ctx0->hash_state ;
@@ -425,8 +435,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
425
435
}
426
436
427
437
// Optim - 90% time boundary
428
- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
429
- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state , (__m128i*)ctx1->hash_state );
438
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
439
+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx1->long_state , (__m128i*)ctx1->hash_state );
430
440
431
441
// Optim - 99% time boundary
432
442
0 commit comments