-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPrimes.cxx
652 lines (604 loc) · 26.6 KB
/
Primes.cxx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
#include "sys.h"
#include "Primes.h"
#include "utils/ctz.h"
#include "utils/cpu_relax.h"
#include "threadpool/AIThreadPool.h"
#include <tuple>
#include <cmath>
#include <cstring>
#include <thread>
#if defined(__OPTIMIZE__)
#define USE_STOPWATCH 0 // Set to 1 to write execution time information to std::cout.
#endif
#if USE_STOPWATCH
#include "cwds/benchmark.h"
#endif
#ifdef CWDEBUG
#define CHECK_PRIMES 0 // Set to 1 to check that the first 189961811 generated primes are correct.
#define CHECK_SIEVING 0 // Set to 1 to check that bits set in the sieve correspond to integers that are divisible by the current prime.
#else
#define CHECK_PRIMES 0 // Leave at 0.
#define CHECK_SIEVING 0 // idem
#endif
namespace fastprimes {
namespace {
int modular_inverse(int64_t n, int64_t m)
{
int64_t x0 = 1, x1 = 0;
int64_t y0 = n, y1 = m;
while (y1 > 0)
{
int64_t q = y0 / y1;
std::tie(x0, x1) = std::make_tuple(x1, x0 - q * x1);
std::tie(y0, y1) = std::make_tuple(y1, y0 - q * y1);
}
// Ensure the result is positive.
return (x0 + m) % m;
}
#ifdef CWDEBUG
constexpr int compression_first_prime_second_row = Primes::compression_first_prime + Primes::compression_primorial;
#endif
constexpr std::array<int, compression_repeat> calc_row0()
{
std::array<int, compression_repeat> row0 = {};
int col = 0;
int candidate = Primes::compression_first_prime;
do
{
bool is_divisible = false;
for (int i = 1; i < Primes::compression; ++i)
if (candidate % small_primes[i] == 0)
{
is_divisible = true;
break;
}
if (!is_divisible)
row0[col++] = candidate;
candidate += 2;
}
while (col < compression_repeat);
return row0;
}
constexpr std::array<int, compression_repeat> row0 = calc_row0();
// This is the largest multiplier that is required to keep `offset - row0[col]` positive.
// Since offset = compression_offset_multiplier * prime, that value is maximal when prime
// is as small as possible (prime = compression_first_prime) and row0[col] is as large
// as possible (row0.back()).
constexpr int compression_offset_multiplier = (row0.back() + Primes::compression_first_prime - 1) / Primes::compression_first_prime;
inline prime_t sieve_row_column_to_prime(int row, int column)
{
return row * static_cast<prime_t>(Primes::compression_primorial) + row0[column];
}
#if CHECK_PRIMES
std::vector<uint32_t> debug_primes;
void debug_init_primes()
{
std::ifstream ifs("primes_till_4000000000", std::ios::binary);
if (!ifs.is_open()) {
DoutFatal(dc::fatal, "Failed to open file primes_till_4000000000 for reading.");
}
// Read the size of the vector first.
size_t size;
ifs.read(reinterpret_cast<char*>(&size), sizeof(size));
// Resize the vector and read the data.
debug_primes.resize(size);
ifs.read(reinterpret_cast<char*>(debug_primes.data()), size * sizeof(uint32_t));
ifs.close();
}
#define CHECK_PRIME(prime) \
ASSERT(debug_pi_ > uint64_t{189961811} || prime == debug_primes[debug_pi_++]); // We can't test primes larger than 3,999,999,979.
#else
#define CHECK_PRIME(prime) do {} while(0)
#endif // CWDEBUG
#if USE_STOPWATCH
int const cpu = benchmark::Stopwatch::cpu_any; // The CPU to run Stopwatch on.
double const cpu_frequency = 3612059050.0; // In cycles per second.
size_t const loopsize = 1000;
size_t const minimum_of = 3;
#endif
std::array<uint16_t, Primes::row0_to_column_size> calc_row0_to_column()
{
std::array<uint16_t, Primes::row0_to_column_size> result;
#if CW_DEBUG
std::memset(result.data(), 0, sizeof(result));
#endif
ASSERT(row0.size() <= std::numeric_limits<uint16_t>::max());
for (uint16_t i = 0; i < row0.size(); ++i)
{
int ri = Primes::n_to_row0_to_column_index(row0[i]);
ASSERT(result[ri] == 0);
result[ri] = i;
}
return result;
}
} // namespace
// Returns an upper bound on the number of primes that are smaller than or equal to n.
// Just a random function that I thought matched quite well (for n > 1000 or so).
// For n larger than 500,000,000 it is off on average 0.0079% (too large, thus).
//static
integer_t Primes::calc_upper_bound_number_of_primes(integer_t n)
{
ASSERT(n > 54); // So that log(n) > 4.
double logn = std::log(n);
return std::exp(0.3125 * std::pow(1 / (logn - 4), 1.655) + logn - std::log(logn - 1)) - 4;
}
// COMMENT "precision":
//
// The largest value of `offset - row0[col]` is when `offset` has its largest value
// and row0[col] is at its smallest. The latter happens when col = 0 (at which point
// row0[col] equals compression_first_prime). The former, `offset` at its largest,
// happens when `prime` is at its largest, which is `compression_first_prime_second_row`.
// Note that compression_first_prime_second_row = compression_first_prime + compression_primorial.
//
// Let P = compression_primorial, F = compression_first_prime.
// Then compression_first_prime_second_row = P + F.
// Note that compression_offset_multiplier is more or less (P + F) / F.
//
// And we can write for the largest values involved:
// prime = P + F,
// offset = floor((P + F) / F) * (P + F);
// -row0[col] = -F
//
// The largest possible value of compression_primorial_inverse is prime - 1, or P + F - 1.
//
// Thus the largest possible value of ((offset - row0[col]) * compression_primorial_inverse) is (less than or) equal
//
// M = (floor((P + F) / F) * (P + F) - F) * (P + F - 1)
//
// which becomes larger than what fits in an int when compression is 6:
// compression F P M
// 2 5 6 170
// 3 7 30 6408
// 4 11 210 969980
// 5 13 2310 960102882
// 6 17 30030 1595233239472
//
// This means that for compression = 6 we need 64 bit precision when multiplying with the compression_primorial_inverse.
// Calculate the first row that has a multiple of this prime in colum `col`.
template<bool is_row0>
int Primes::wipe_primes_of_row(int row, int number_of_threads)
{
AIThreadPool& thread_pool = AIThreadPool::instance();
integer_t const sieve_size = sieve_rows_ * words_per_row; // The size of the sieve_ in words.
bool const use_thread_pool = !queue_handle_.undefined();
sieve_word_t* row_ptr = sieve_ + row;
int column = 0;
// Loop over all words in a row, by running over the word index offsets relative to the start of the current row.
for (unsigned int word_index_offset = 0; word_index_offset < sieve_size; word_index_offset += sieve_rows_)
{
// Loop over all bits in a word.
for (sieve_word_t column_mask = 1; column_mask != 0; column_mask <<= 1, ++column)
{
// Did we find the next prime?
if ((row_ptr[word_index_offset] & column_mask))
{
struct WipeWordColData
{
prime_t const prime_;
int const offset_;
int const compression_primorial_inverse_;
std::atomic_int col_word_;
WipeWordColData(prime_t prime) :
prime_(prime),
// For row >= 1, prime >= row0[col] so we can simply subtract row0 from prime; this then avoids an overflow.
// Also see "precision" comment above, for row 0.
offset_(is_row0 ? compression_offset_multiplier * prime : prime),
compression_primorial_inverse_(modular_inverse(Primes::compression_primorial, prime)),
col_word_(0)
{ }
};
WipeWordColData data(sieve_row_column_to_prime(row, column));
ASSERT(is_row0 || data.prime_ > Primes::compression_primorial);
CHECK_PRIME(data.prime_);
if constexpr (!is_row0)
{
if (data.prime_ > sqrt_max_value_)
{
// We're done.
return column;
}
}
auto wipe_word_column = [this, &data]() -> bool {
// Run over unprocessed word-columns.
for (;;)
{
// Get a word-column that isn't being wiped yet.
int col_word = data.col_word_.fetch_add(1, std::memory_order::release);
// Exit if all word-column have been processed.
if (col_word >= words_per_row)
return false;
unsigned int col_word_offset = col_word * sieve_rows_;
int col = col_word * sieve_word_bits;
// Run over all bits in the word(-column).
for (sieve_word_t col_mask = 1; col_mask != 0; col_mask <<= 1, ++col)
{
// Calculate the first row that contains a multiple of data.prime_ in this col(umn).
uint64_t first_row_with_prime_multiple64 = data.offset_ - row0[col];
first_row_with_prime_multiple64 *= data.compression_primorial_inverse_;
int first_row_with_prime_multiple = first_row_with_prime_multiple64 % data.prime_;
// Run wi (word index) over all words in this word-column that have a bit in this col(umn)
// that represents an integer that is divisible by prime and reset it.
for (unsigned int wi = first_row_with_prime_multiple + col_word_offset; wi < sieve_rows_ + col_word_offset; wi += data.prime_)
{
sieve_[wi] &= ~col_mask;
#if CHECK_SIEVING
int debug_row = wi % sieve_rows_;
int debug_col = (wi / sieve_rows_) * sieve_word_bits + (col % sieve_word_bits);
prime_t debug_prime = sieve_row_column_to_prime(debug_row, debug_col);
ASSERT(debug_col == col);
ASSERT(debug_prime % data.prime_ == 0);
// Dout(dc::notice, "Inner loop: setting " << debug_prime << " to 0 because it is " << (debug_prime / data.prime_) << " * " << data.prime_);
#endif
}
}
}
};
for (int thread = 0; thread < number_of_threads - 1; ++thread)
{
// Get read access to AIThreadPool::m_queues.
auto queues_access = thread_pool.queues_read_access();
// Get a reference to one of the queues in m_queues.
auto& queue = thread_pool.get_queue(queues_access, queue_handle_);
{
// Get producer accesses to this queue.
auto queue_access = queue.producer_access();
// The queue must be larger than or equal to the number of threads you use.
ASSERT(queue_access.length() < queue.capacity());
// Place a lambda in the queue.
queue_access.move_in(wipe_word_column);
// Release producer accesses, so another thread can write to this queue again.
}
// This function must be called every time move_in was called on a queue that was returned by thread_pool.get_queue.
queue.notify_one();
// Release read access to AIThreadPool::m_queues so another thread can use AIThreadPool::new_queue again.
}
wipe_word_column();
// Wait till all threads have finished.
while (data.col_word_.load(std::memory_order::acquire) < words_per_row + number_of_threads)
cpu_relax();
row_ptr[word_index_offset] |= column_mask;
}
}
}
// Continue with the next row.
return -1;
}
// Construct a sieve initialized for all primes up till and including max_value.
Primes::Primes(integer_t max_value, int number_of_threads, AIQueueHandle queue_handle) :
sieve_(nullptr), max_value_(max_value), sqrt_max_value_(std::sqrt(max_value)),
// Calculate how many integers are not divisible by the first `compression` number of primes that are less than or equal max_value.
sieve_rows_((max_value - compression_first_prime) / compression_primorial + 1),
index_(-compression - 1), queue_handle_(queue_handle)
{
ASSERT(max_value >= compression_first_prime_second_row);
#if USE_STOPWATCH
benchmark::Stopwatch stopwatch(cpu); // Declare stopwatch and configure on which CPU it must run.
#endif
#if CHECK_PRIMES
Debug(debug_init_primes());
debug_pi_ = compression;
#endif
#if USE_STOPWATCH
stopwatch.start();
#endif
// Lets allocate whole rows, even if we only partially need the last one.
integer_t sieve_size = sieve_rows_ * words_per_row; // The size of the sieve_ in words.
index_end_ = sieve_size * sieve_word_bits; // The size of the sieve_ in bits.
// Allocate the sieve and fill it with ones.
sieve_ = (sieve_word_t*)std::malloc(sieve_size * sizeof(sieve_word_t));
if (!sieve_)
throw std::bad_alloc();
// This assumes that unused_bits is zero.
std::memset(sieve_, 0xff, sieve_size * sizeof(sieve_word_t));
#if USE_STOPWATCH
stopwatch.stop();
uint64_t cycles = stopwatch.diff_cycles() - benchmark::Stopwatch::s_stopwatch_overhead;
float delta = cycles / cpu_frequency;
std::cout << "Time spent allocating and initializing sieve: " << delta << " seconds." << std::endl;
#endif
// sieve_ is a one dimensional vector, but can best be throught of as a two dimensional
// table with width compression_repeat.
// For example, if compression = 3 then compression_repeat = 8, and sieve_ represents
// the following integers (all integers not divisible by 2, 3 or 5):
//
// row |col: 0 1 2 3 4 5 6 7
// -----+--------------------------------------------------------
// 0 : | ! 7| @ 11| # 13| $ 17| % 19| ^ 23| & 29| * 31| <-- row0
// 1 : | 37| 41| 43| 47| ! 49| 53| 59| 61|
// 2 : | 67| 71| 73| @! 77| 79| 83| 89| #! 91|
// 3 : | 97| 101| 103| 107| 109| 113| $!119| @121|
// 4 : | 127| 131| %!133| 137| 139| #@143| 149| 151|
// 5 : | 157| ^!161| 163| 167| #169| 173| 179| 181|
// 6 : | $@187| 191| 193| 197| 199| &!203| %@209| 211|
// 7 : | *!217| $#221| 223| 227| 229| 233| 239| 241|
// 8 : | %#247| 251| ^@253| 257| !259| 263| 269| 271|
// 9 : | 277| 281| 283| !287| $289| 293| ^#299| !301|
// 10 : | 307| 311| 313| 317| &@319| %$323| !329| 331|
// 11 : | 337| *@341| !343| 347| 349| 353| 359| %361|
// 12 : | 367| !371| 373| Ź| 379| 383| 389| ^$391|
// 13 : | 397| 401| *#403| @407| 409| !413| 419| 421|
// 14 : | !427| 431| 433| ^%437| 439| 443| 449| @451|
// 15 : | 457| 461| 463| 467| !469| @473| 479| #481|
// 16 : | 487| 491| &$493| !497| 499| 503| 509| !511|
// 17 : | @517| 521| 523| *$527| ^529| #533| @!539| 541|
// 18 : | 547| &%551| !553| 557| #559| 563| 569| 571|
// 19 : | 577| !581| @583| 587| *%589| 593| 599| 601|
// 20 : | 607| #611| 613| 617| 619| !623| $629| 631|
// 21 : | #!637| 641| 643| 647| @649| 653| 659| 661|
// 22 : | &^667| @671| 673| 677| !679| 683| #689| 691|
// 23 : | $697| 701| %703| !707| 709| *^713| 719| !721|
// 24 : | 727| $731| 733| @737| 739| 743| !749| 751|
// 25 : | 757| 761| !763| #767| 769| 773| %779| @781|
// 26 : | 787| !791| #793| 797| $799| @803| 809| 811|
// 27 : | %817| 821| 823| 827| 829| $!833| 839| &841|
// 28 : | @!847| ^851| 853| 857| 859| 863| @869| #871|
// 29 : | 877| 881| 883| 887| !889| %893| *&899| $901|
// 30 : | 907| 911| @913| !917| 919| #923| 929| %!931|
// 31 : | 937| 941| ^943| 947| #949| 953| !959| *961|
// 32 : | ... etc
//
// Here every field contains the integer that is represented by the corresponding bit in sieve_.
// All such integers divisible by 7 are marked with a '!', those that are divisible by 11 are
// marked with a '@', and so on. Below we'll say "contains a number" while
// really it's just a single bit in sieve_ that represents that number.
//
// All integers in row 0 have already been calculated during compile time
// and are stored in row0.
//
// In the loop(s) below, we find the next prime (p) and set its corresponding
// bits to zero. This is done per column; for each column we determine
// the first row that contains an integer in that column that is divisible
// by p. For example, for p=13 and col=1, we find that the first number that
// is divisible by 13 is 221 (= 13*17). While the first number in column 3
// that is divisible by the prime 11 is 77, row 2.
//
// Every subsequent multiple of that prime in the given colum is precisely
// prime rows lower every time. For example, the vertical distance between
// every '!' is 7, and between every '$' is 17, etc.
//
// Note that the first row does NOT necessarily only contain primes.
// In this case, the first number that is not a prime is 7^2 = 49 which is
// larger than 31, the last number in the first row; and thus all numbers in
// the first row are primes in this case. But that is not the case for
// compression = 4 or larger: for compression = 4 the first
// (top-left) number in the table is 11 and the first non-prime is 11^2 = 121.
// The first number of row 1 is 11 + (2 * 3 * 5 * 7) = 221. Therefore 121 is
// part of the first row (and so are 11*13=143, 13*13=169, 11*17=187 and
// 11*19=209).
//
// The first row for a given column that contains a number that is divisible
// by a prime p is given by: -row0[col] / compression_primorial [MODULO p].
//
// For example, we want to know the row in column 2 that contains the first
// number that is divisible by 19. Note that each time we go one row down,
// the number is incremented with compression_primorial.
//
// row col=2 MOD 19
// 0 13 13 = 13 + 0 * 30
// 1 43 5 = 13 + 1 * 30
// 2 73 16 = 13 + 2 * 30
// 3 103 8 = 13 + 3 * 30
// 4 133 0 = 13 + 4 * 30 --> 4 * 30 = -13 --> 4 = -13 / 30 [MOD 19]
//
// The compression_primorial is a constant (only depending on the compression used).
// But the modulo (p) is not. The row index can be calculated slightly more
// efficient when p > compression_primorial, which is at least the case for every
// prime in row 1 or larger (for example, in the case of the above table, with
// compression = 3, the prime 31 is already larger than the compression_primorial=30,
// but also 37, the first prime of row 1, is).
// The first loop deals with these primes (less than compression_primorial) using the
// fact that row=0, while the second loop makes use of the fact that each prime will
// be larger than compression_primorial.
// The sieve_ is a 1D array of words (of type sieve_word_t), having `words_per_row` words per row.
// For example, for compression = 6 and sieve_word_t = uint64_t, words_per_row = 90 and we
// can picture the sieve_ as: __ compression_repeat = 5760 (number of 'columns').
// /
// 55...55
// 11 11...11 66...77
// 00...66 66...22 22...99 ... 99...55
// column: 01...23 45...67 89...01 67...89
// row=0: [word_00][word_01][word_02]...[word_89]
// row=1: [word_90][word_91][word_92]...
// row=2: 010..00 <-- column_mask belonging to column 129.
// ... <--------2------->
// \__ column_word_offset = 2 (belonging to column 129) (called word_index for row 0).
//
#if USE_STOPWATCH
stopwatch.start();
#endif
// Reset all bits in sieve_ corresponding to composite numbers divisible by all primes found in row 0.
wipe_primes_of_row<true>(0, number_of_threads);
#if USE_STOPWATCH
stopwatch.stop();
cycles = stopwatch.diff_cycles() - benchmark::Stopwatch::s_stopwatch_overhead;
delta = cycles / cpu_frequency;
std::cout << "Time spent sieving row0: " << delta << " seconds." << std::endl;
#endif
#if USE_STOPWATCH
stopwatch.start();
#endif
int row = 0;
// Loop over all rows, starting with row 1 and find all primes up till
// and including the first prime that is larger than sqrt_max_value.
// For all primes less than or equal sqrt_max_value, remove multiples from the sieve_.
int column;
while ((column = wipe_primes_of_row<false>(++row, number_of_threads)) == -1)
;
#if USE_STOPWATCH
stopwatch.stop();
cycles = stopwatch.diff_cycles() - benchmark::Stopwatch::s_stopwatch_overhead;
delta = cycles / cpu_frequency;
std::cout << "Time spent sieving row1-" << row << ": " << delta << " seconds." << std::endl;
#endif
#if CHECK_PRIMES
// Find the word containing the last prime that is less than or equal max_value.
//
// For example, with sieve_size = 12 (words)
// sieve_rows_ = 4
// words_per_row = 3
//
// cols: 0-63 | 64-127|128-191
// wc:0 | wc:1 | wc:2
// r: 0 0 4 8
// r: 1 1 5 9
// r: 2 2 6 10
// r: 3 3 7 11 <-- word_index
int last_row = sieve_rows_;
for (;;)
{
int last_word_index = sieve_size - sieve_rows_ + --last_row;
// Run over all columns, right-to-left.
for (int wc = words_per_row - 1; wc >= 0; --wc)
{
int col = wc * sieve_word_bits;
if (sieve_[last_word_index] != 0 &&
sieve_row_column_to_prime(last_row, col) <= max_value)
{
for (sieve_word_t column_mask = 1; column_mask != 0; column_mask <<= 1, ++col)
{
if ((sieve_[last_word_index] & column_mask))
{
if (sieve_row_column_to_prime(last_row, col) <= max_value)
goto found;
else
break;
}
}
}
last_word_index -= sieve_rows_;
}
}
found:
#if USE_STOPWATCH
stopwatch.start();
#endif
// Add all primes larger than sqrt(max_value) and less than or equal max_value.
++column;
while (row < last_row)
{
for (; column < compression_repeat; ++column)
{
int column_word_offset = (column / sieve_word_bits) * sieve_rows_;
sieve_word_t* next_word = sieve_ + row + column_word_offset;
sieve_word_t column_mask = sieve_word_t{1} << (column % sieve_word_bits);
if ((*next_word & column_mask))
{
prime_t prime = sieve_row_column_to_prime(row, column);
CHECK_PRIME(prime);
}
}
// Next row.
++row;
column = 0;
}
#if USE_STOPWATCH
stopwatch.stop();
cycles = stopwatch.diff_cycles() - benchmark::Stopwatch::s_stopwatch_overhead;
delta = cycles / cpu_frequency;
std::cout << "Time spent adding all primes up till row " << last_row << ": " << delta << " seconds." << std::endl;
#endif
for (; column < compression_repeat; ++column)
{
int column_word_offset = (column / sieve_word_bits) * sieve_rows_;
sieve_word_t* next_word = sieve_ + row + column_word_offset;
sieve_word_t column_mask = sieve_word_t{1} << (column % sieve_word_bits);
if ((*next_word & column_mask))
{
prime_t prime = sieve_row_column_to_prime(row, column);
if (prime > max_value)
break;
CHECK_PRIME(prime);
}
}
#endif // CHECK_PRIMES
}
prime_t Primes::next_prime()
{
if (AI_UNLIKELY(++index_ < 0))
return small_primes[compression + index_];
// Sieve words, as stored in memory, run top to bottom, left to right.
// Each "word column" has sieve_rows_ (SR) rows.
// One row has compression_repeat/sieve_word_bits words (RW) (and compression_repeat columns/bits).
//
// column: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 16 17 18 19 20 21 22 23
// row ---------------------------------------------------------------------------------
// 0: [ word_0 ] [ word_{SR} ] ... [ word_{(RW - 1)*SR} ]
// 1: [ word_1 ] [ word_{SR+1} ] ... [ word_{(RW - 1)*SR + 1}]
// 2: [ word_2 ] [ word_{SR+2} ] ... [ word_{(RW - 1)*SR + 2}]
// ... ... ...
// SR: [ word_{SR-1} ] [ word_{2*SR-1} ] ... [ word_{(RW)*SR - 1} ]
//
// index_ runs over all bits of sieve_ from left to right (words left to right, bits inside a word from lsb to msb), top to bottom.
// For example, if compression_repeat = 24 and sieve_word_bits = 8 we'd have:
//
// column: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 16 17 18 19 20 21 22 23
// row ---------------------------------------------------------------------------------
// 0: [ 0 1 2 3 4 5 6 7] [ 8 9 10 11 12 13 14 15] ... [16 17 18 19 20 21 22 23]
// 1: [24 25 26 27 28 29 30 31] [32 33 34 35 36 37 38 39] ... [40 41 42 43 44 45 46 47]
// 2: [48 49 50 51 52 53 54 55] [56 57 58 59 60 61 62 63] ... [64 65 66 67 68 69 70 71]
// ...
// SR: [24*{SR-1} ... ] [24*{SR-1}+8 ...] ... [... 24*{SR-1}+23]
//
// A single word stores the lowest column (the left-most) in the least significant bit:
//
// column: 15 14 13 12 11 10 9 8
// bits: 1 1 0 1 0 1 1 1 = 0xd7
// 0 0 0 1 0 0 0 0 <-- col_mask example (for column = 12).
//
// col_mask is word with a single bit set that represents the current column inside the current word.
// col_word_offset is the word_index corresponding to the current column:
//
// column: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 16 17 18 19 20 21 22 23
// [ 0 ] [ SR ] ... [ (RW - 1)*SR ] <-- col_word_offset
//
// The current is therefore given by sieve_[row + col_word_offset].
static constexpr int words_per_row = compression_repeat / sieve_word_bits;
int column = index_ % compression_repeat;
int row = index_ / compression_repeat;
sieve_word_t col_mask = sieve_word_t{1} << (column % sieve_word_bits);
integer_t word_col = column / sieve_word_bits;
sieve_word_t word = sieve_[row + word_col * sieve_rows_];
word &= ~(col_mask - 1); // Remove all bits that represent integers less than the current one.
// Run over wi (row + word_col * sieve_rows_).
for (;;)
{
if (word)
{
column = word_col * sieve_word_bits + utils::ctz(word);
index_ = int64_t{row} * compression_repeat + column;
return sieve_row_column_to_prime(row, column);
}
if (words_per_row == 1 ||
AI_UNLIKELY(++word_col == words_per_row))
{
if (AI_UNLIKELY(++row == sieve_rows_))
throw std::out_of_range("Primes: reached end of sieve.");
if constexpr (words_per_row > 2)
word_col = 0;
}
word = sieve_[row + word_col * sieve_rows_];
}
}
std::vector<prime_t> Primes::make_vector()
{
std::vector<prime_t> result;
result.reserve(calc_upper_bound_number_of_primes(max_value_));
reset();
try
{
for (;;)
{
prime_t p = next_prime();
if (p > max_value_)
break;
result.push_back(p);
}
}
catch (std::out_of_range const&)
{
}
return result;
}
//static
std::array<uint16_t, Primes::row0_to_column_size> const Primes::row0_to_column = calc_row0_to_column();
} // namespace fastprimes