@@ -149,7 +149,7 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
149149 *x7 = soft_aesenc (*x7, key);
150150}
151151
152- template <size_t MEM, bool SOFT_AES>
152+ template <size_t MEM, bool SOFT_AES, bool PREFETCH >
153153void cn_explode_scratchpad (const __m128i* input, __m128i* output)
154154{
155155 // This needs more keys than we have registers; the compiler will place 2 keys on the stack
@@ -200,16 +200,21 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
200200 _mm_store_si128 (output + i + 1 , xin1);
201201 _mm_store_si128 (output + i + 2 , xin2);
202202 _mm_store_si128 (output + i + 3 , xin3);
203- _mm_prefetch ((const char *)output + i + 0 , _MM_HINT_T2);
203+
204+ if (PREFETCH)
205+ _mm_prefetch ((const char *)output + i + 0 , _MM_HINT_T2);
206+
204207 _mm_store_si128 (output + i + 4 , xin4);
205208 _mm_store_si128 (output + i + 5 , xin5);
206209 _mm_store_si128 (output + i + 6 , xin6);
207210 _mm_store_si128 (output + i + 7 , xin7);
208- _mm_prefetch ((const char *)output + i + 4 , _MM_HINT_T2);
211+
212+ if (PREFETCH)
213+ _mm_prefetch ((const char *)output + i + 4 , _MM_HINT_T2);
209214 }
210215}
211216
212- template <size_t MEM, bool SOFT_AES>
217+ template <size_t MEM, bool SOFT_AES, bool PREFETCH >
213218void cn_implode_scratchpad (const __m128i* input, __m128i* output)
214219{
215220 // This needs more keys than we have registers; the compiler will place 2 keys on the stack
@@ -229,12 +234,17 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
229234
230235 for (size_t i = 0 ; i < MEM / sizeof (__m128i); i += 8 )
231236 {
232- _mm_prefetch ((const char *)input + i + 0 , _MM_HINT_NTA);
237+ if (PREFETCH)
238+ _mm_prefetch ((const char *)input + i + 0 , _MM_HINT_NTA);
239+
233240 xout0 = _mm_xor_si128 (_mm_load_si128 (input + i + 0 ), xout0);
234241 xout1 = _mm_xor_si128 (_mm_load_si128 (input + i + 1 ), xout1);
235242 xout2 = _mm_xor_si128 (_mm_load_si128 (input + i + 2 ), xout2);
236243 xout3 = _mm_xor_si128 (_mm_load_si128 (input + i + 3 ), xout3);
237- _mm_prefetch ((const char *)input + i + 4 , _MM_HINT_NTA);
244+
245+ if (PREFETCH)
246+ _mm_prefetch ((const char *)input + i + 4 , _MM_HINT_NTA);
247+
238248 xout4 = _mm_xor_si128 (_mm_load_si128 (input + i + 4 ), xout4);
239249 xout5 = _mm_xor_si128 (_mm_load_si128 (input + i + 5 ), xout5);
240250 xout6 = _mm_xor_si128 (_mm_load_si128 (input + i + 6 ), xout6);
@@ -284,7 +294,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
284294 keccak ((const uint8_t *)input, len, ctx0->hash_state , 200 );
285295
286296 // Optim - 99% time boundary
287- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
297+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
288298
289299 uint8_t * l0 = ctx0->long_state ;
290300 uint64_t * h0 = (uint64_t *)ctx0->hash_state ;
@@ -332,7 +342,7 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
332342 }
333343
334344 // Optim - 90% time boundary
335- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
345+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
336346
337347 // Optim - 99% time boundary
338348
@@ -350,8 +360,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
350360 keccak ((const uint8_t *)input+len, len, ctx1->hash_state , 200 );
351361
352362 // Optim - 99% time boundary
353- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
354- cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->hash_state , (__m128i*)ctx1->long_state );
363+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->hash_state , (__m128i*)ctx0->long_state );
364+ cn_explode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx1->hash_state , (__m128i*)ctx1->long_state );
355365
356366 uint8_t * l0 = ctx0->long_state ;
357367 uint64_t * h0 = (uint64_t *)ctx0->hash_state ;
@@ -425,8 +435,8 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
425435 }
426436
427437 // Optim - 90% time boundary
428- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
429- cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*)ctx1->long_state , (__m128i*)ctx1->hash_state );
438+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx0->long_state , (__m128i*)ctx0->hash_state );
439+ cn_implode_scratchpad<MEM, SOFT_AES, PREFETCH >((__m128i*)ctx1->long_state , (__m128i*)ctx1->hash_state );
430440
431441 // Optim - 99% time boundary
432442
0 commit comments