Skip to content

Commit 9b63c84

Browse files
llogiq authored and TeXitoi committed
A (hopefully faster) non-SIMD version of fannkuch
1 parent 5336228 commit 9b63c84

File tree

1 file changed

+36
-57
lines changed

1 file changed

+36
-57
lines changed

src/fannkuch_redux.rs

Lines changed: 36 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,39 @@ extern crate rayon;
1010

1111
use std::cmp;
1212
use rayon::prelude::*;
13-
use std::arch::x86_64::*;
1413

15-
#[cfg(target_arch = "x86_64")]
16-
#[derive(Copy, Clone)]
17-
pub struct U8x16(__m128i);
18-
19-
#[cfg(all(target_arch = "x86_64", target_feature = "sse2",
20-
target_feature = "ssse3"))]
21-
impl U8x16 {
22-
pub fn zero() -> U8x16 { U8x16(unsafe { _mm_setzero_si128() }) }
23-
pub fn from_slice_unaligned(s: &[u8; 16]) -> U8x16 {
24-
U8x16(unsafe { _mm_loadu_si128(s.as_ptr() as *const _) })
25-
}
26-
pub fn write_to_slice_unaligned(self, s: &mut [u8; 16]) {
27-
unsafe { _mm_storeu_si128(s.as_mut_ptr() as *mut _, self.0) }
28-
}
29-
pub fn extract0(self) -> i32 {
30-
unsafe { _mm_extract_epi16(self.0, 0i32) & 0xFF }
14+
/// Packs a 16-element permutation into one `u64`, four bits per element.
///
/// `perm[0]` ends up in the least-significant nibble and `perm[15]` in the
/// most-significant one, so `pack(p) & 0xf` reads back the first element.
///
/// Every entry is expected to be below 16: the nibbles are combined with `+`,
/// so a larger entry would carry into the neighbouring fields.
pub fn pack(perm: &[u8; 16]) -> u64 {
    // Walk from the last element to the first so element 0 is shifted in last
    // and therefore occupies the lowest four bits.
    perm.iter().rev().fold(0, |acc, &i| (acc << 4) + i as u64)
}
17+
/// Counts the prefix reversals ("flips") needed until the first element of a
/// nibble-packed permutation becomes 0.
///
/// `perm` holds one element per 4-bit nibble with element 0 in the lowest
/// nibble (the layout produced by `pack`). On each step the first element `k`
/// is read and the first `k + 1` elements are reversed; the loop stops as soon
/// as the first element is 0. Returns the number of reversals performed.
///
/// The input is expected to be a valid permutation; for other bit patterns the
/// first nibble may never become 0.
pub fn flips(perm: u64) -> i32 {
    // Selects the low nibble of every byte.
    const LOWER: u64 = 0x0f0f_0f0f_0f0f_0f0f;
    let (mut flip, mut flip_count) = (perm, 0);
    loop {
        let flip_index = (flip & 0xf) as usize;
        if flip_index == 0 {
            break;
        }
        let n4 = flip_index * 4;
        // Reverse all 16 nibbles: reverse the bytes, then swap the two nibbles
        // inside each byte.
        let s = flip.swap_bytes();
        let reversed = ((s & LOWER) << 4) | ((s >> 4) & LOWER);
        // Keep only the nibbles above `flip_index` ...
        flip &= !0xf_u64 << n4;
        // ... and drop the reversed prefix (which sits at the top of
        // `reversed`) back into the low `flip_index + 1` nibbles.
        flip |= reversed >> (60 - n4);
        flip_count += 1;
    }
    flip_count
}
30+
pub fn permute(perm: u64, count: &mut [u8; 16]) -> u64 {
31+
let mut perm = rotate(perm, 1);
32+
// Generate the next permutation.
33+
let mut i = 1;
34+
while count[i] >= i as u8 {
35+
count[i] = 0;
36+
i += 1;
37+
perm = rotate(perm, i);
3438
}
39+
count[i] += 1;
40+
perm
41+
}
42+
/// Rotates the first `n + 1` elements of a nibble-packed permutation left by
/// one position.
///
/// Element 0 moves to position `n`, elements `1..=n` each move down one slot,
/// and every element above position `n` is left untouched. `n` must be at
/// most 15; a larger value makes the shift amount reach the width of `u64`.
fn rotate(perm: u64, n: usize) -> u64 {
    let n4 = n * 4;
    // Bits belonging to the elements above position `n`.
    let mask = !0xf_u64 << n4;
    let untouched = perm & mask;
    // Elements 1..=n slide down one nibble; the old element 0 is shifted out.
    let shifted_down = (perm & !mask) >> 4;
    // The old element 0 is re-inserted at position `n`.
    let wrapped = (perm & 0xf) << n4;
    untouched | shifted_down | wrapped
}
3647

3748
// This value controls the preferred maximum number of blocks the workload is
@@ -59,25 +70,10 @@ fn fannkuch(n: i32) -> (i32, i32) {
5970
perm_max / NUM_BLOCKS)
6071
};
6172

62-
// precompute flips and rotations
63-
let mut flip_masks = [U8x16::zero(); 16];
64-
let mut rotate_masks = [U8x16::zero(); 16];
65-
let mut mask = [0u8; 16];
66-
for i in 0..16 {
67-
mask.iter_mut().enumerate().for_each(|(j, m)| *m = j as u8);
68-
mask[0..i + 1].reverse();
69-
flip_masks[i] = U8x16::from_slice_unaligned(&mask);
70-
mask.iter_mut().enumerate().for_each(|(j, m)| *m = j as u8);
71-
let c = mask[0];
72-
(0..i).for_each(|i| mask[i] = mask[i + 1]);
73-
mask[i] = c;
74-
rotate_masks[i] = U8x16::from_slice_unaligned(&mask);
75-
}
76-
7773
// Compute the `checksum` and `maxflips` for each block in parallel.
7874
(0..num_blocks).into_par_iter().map(|i_block| {
7975
let initial = i_block * block_size;
80-
let mut count = [0i32; 16];
76+
let mut count = [0u8; 16];
8177
let mut temp = [0u8; 16];
8278
let mut current = [0u8; 16];
8379

@@ -89,7 +85,7 @@ fn fannkuch(n: i32) -> (i32, i32) {
8985
let factorial = factorials[i] as i32;
9086
let d = permutation_index / factorial;
9187
permutation_index %= factorial;
92-
count[i] = d;
88+
count[i] = d as u8;
9389

9490
temp.copy_from_slice(&current);
9591
let d = d as usize;
@@ -98,25 +94,16 @@ fn fannkuch(n: i32) -> (i32, i32) {
9894
}
9995

10096
// Iterate over each permutation in the block.
101-
let mut perm = U8x16::from_slice_unaligned(&current);
97+
let mut perm = pack(&current);
10298
let last_permutation_in_block = cmp::min(initial + block_size,
10399
perm_max) - 1;
104100
let mut permutation_index = initial;
105101
let (mut checksum, mut maxflips) = (0, 0);
106102
loop {
107103
// If the first value in the current permutation is not 1 (0) then
108104
// we will need to do at least one flip for `current`.
109-
if perm.extract0() > 0 {
110-
// Copy the current permutation to work on it.
111-
let mut flip_count = 0;
112-
let mut flip = perm;
113-
loop {
114-
let flip_index = flip.extract0() as usize;
115-
if flip_index == 0 { break; }
116-
flip = flip.permute_dyn(flip_masks[flip_index]);
117-
flip_count += 1;
118-
}
119-
105+
if perm & 0xf > 0 {
106+
let flip_count = flips(perm);
120107
// Update the `checksum` and `maxflips` of this block.
121108
checksum += if permutation_index % 2 == 0 {
122109
flip_count
@@ -133,15 +120,7 @@ fn fannkuch(n: i32) -> (i32, i32) {
133120
return (checksum, maxflips);
134121
}
135122
permutation_index += 1;
136-
perm = perm.permute_dyn(rotate_masks[1]);
137-
// Generate the next permutation.
138-
let mut i = 1;
139-
while count[i] >= i as i32 {
140-
count[i] = 0;
141-
i += 1;
142-
perm = perm.permute_dyn(rotate_masks[i]);
143-
}
144-
count[i] += 1;
123+
perm = permute(perm, &mut count);
145124
}
146125
}).reduce(|| (0, 0),
147126
|(cs1, mf1), (cs2, mf2)| (cs1 + cs2, cmp::max(mf1, mf2)))

0 commit comments

Comments
 (0)