Skip to content

Commit 5336228

Browse files
llogiq authored and TeXitoi committed
SIMDify fannkuch_redux
This time I had to use actual SIMD – LLVM's autovectorizer doesn't do shuffles for now. Awesome speedups ahead. I'll submit it, too.
1 parent 0d0ed75 commit 5336228

File tree

1 file changed

+56
-44
lines changed

1 file changed

+56
-44
lines changed

src/fannkuch_redux.rs

Lines changed: 56 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,35 @@
44
// contributed by the Rust Project Developers
55
// contributed by TeXitoi
66
// contributed by Cristi Cobzarenco (@cristicbz)
7+
// contributed by Andre Bogus
78

89
extern crate rayon;
910

10-
use std::{cmp, mem};
11+
use std::cmp;
1112
use rayon::prelude::*;
13+
use std::arch::x86_64::*;
14+
15+
/// Newtype wrapping a 128-bit SSE integer vector (`__m128i`). The inherent
/// methods defined for it load and store it as a `[u8; 16]`, i.e. they treat
/// it as 16 byte-sized lanes.
///
/// NOTE(review): this type only exists on `x86_64`, while the
/// `use std::arch::x86_64::*;` import is not `cfg`-gated — confirm that
/// other targets are intentionally unsupported.
#[cfg(target_arch = "x86_64")]
16+
#[derive(Copy, Clone)]
17+
pub struct U8x16(__m128i);
18+
19+
// Thin wrappers around the SSE2/SSSE3 intrinsics used on `U8x16`.
//
// The whole impl is compiled only when the build enables both `sse2` and
// `ssse3` on `x86_64` (see the `cfg` below); that compile-time gate is what
// makes calling the intrinsics inside the `unsafe` blocks sound.
// NOTE(review): code that calls these methods without the same `cfg` will
// fail to compile when `ssse3` is not enabled (e.g. without
// `-C target-feature=+ssse3` or `-C target-cpu=native`) — confirm the build
// flags.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2",
20+
target_feature = "ssse3"))]
21+
impl U8x16 {
22+
/// Returns a vector with all 128 bits (all 16 bytes) set to zero.
pub fn zero() -> U8x16 { U8x16(unsafe { _mm_setzero_si128() }) }
23+
/// Loads the 16 bytes of `s` into a vector using an unaligned load, so `s`
/// needs no particular alignment.
pub fn from_slice_unaligned(s: &[u8; 16]) -> U8x16 {
24+
// SAFETY: `s` refers to exactly 16 readable bytes (the size of `__m128i`)
// and `_mm_loadu_si128` has no alignment requirement.
U8x16(unsafe { _mm_loadu_si128(s.as_ptr() as *const _) })
25+
}
26+
/// Stores the 16 bytes of this vector into `s` using an unaligned store.
pub fn write_to_slice_unaligned(self, s: &mut [u8; 16]) {
27+
// SAFETY: `s` refers to exactly 16 writable bytes and `_mm_storeu_si128`
// has no alignment requirement.
unsafe { _mm_storeu_si128(s.as_mut_ptr() as *mut _, self.0) }
28+
}
29+
/// Returns byte 0 (the lowest byte) of the vector as an `i32`.
pub fn extract0(self) -> i32 {
30+
// The intrinsic extracts the lowest 16-bit lane (bytes 0 and 1); the
// `& 0xFF` mask discards byte 1 so only byte 0 remains.
unsafe { _mm_extract_epi16(self.0, 0i32) & 0xFF }
31+
}
32+
/// Shuffles the bytes of `self` using the per-byte selectors in `indices`
/// (the SSSE3 `pshufb` operation): output byte `k` is the byte of `self`
/// selected by the low 4 bits of `indices[k]`, or zero when the high bit of
/// `indices[k]` is set.
pub fn permute_dyn(self, indices: U8x16) -> U8x16 {
33+
U8x16(unsafe { _mm_shuffle_epi8(self.0, indices.0) })
34+
}
35+
}
1236

1337
// This value controls the preferred maximum number of blocks the workload is
1438
// broken up into. The actual value may be one higher (if the number of
@@ -19,7 +43,7 @@ const NUM_BLOCKS: u32 = 24;
1943
fn fannkuch(n: i32) -> (i32, i32) {
2044
// Precompute a table of factorials to reuse all over the place.
2145
let mut factorials = [1; 16];
22-
for i in 1..n as usize + 1 {
46+
for i in 1..=n as usize {
2347
factorials[i] = factorials[i - 1] * i as u32;
2448
}
2549
let perm_max = factorials[n as usize];
@@ -35,17 +59,30 @@ fn fannkuch(n: i32) -> (i32, i32) {
3559
perm_max / NUM_BLOCKS)
3660
};
3761

62+
// precompute flips and rotations
63+
let mut flip_masks = [U8x16::zero(); 16];
64+
let mut rotate_masks = [U8x16::zero(); 16];
65+
let mut mask = [0u8; 16];
66+
for i in 0..16 {
67+
mask.iter_mut().enumerate().for_each(|(j, m)| *m = j as u8);
68+
mask[0..i + 1].reverse();
69+
flip_masks[i] = U8x16::from_slice_unaligned(&mask);
70+
mask.iter_mut().enumerate().for_each(|(j, m)| *m = j as u8);
71+
let c = mask[0];
72+
(0..i).for_each(|i| mask[i] = mask[i + 1]);
73+
mask[i] = c;
74+
rotate_masks[i] = U8x16::from_slice_unaligned(&mask);
75+
}
76+
3877
// Compute the `checksum` and `maxflips` for each block in parallel.
3978
(0..num_blocks).into_par_iter().map(|i_block| {
4079
let initial = i_block * block_size;
4180
let mut count = [0i32; 16];
42-
let mut temp = [0i32; 16];
43-
let mut current = [0i32; 16];
81+
let mut temp = [0u8; 16];
82+
let mut current = [0u8; 16];
4483

4584
// Initialise `count` and the current permutation (`current`)
46-
for (i, value) in current.iter_mut().enumerate() {
47-
*value = i as i32;
48-
}
85+
current.iter_mut().enumerate().for_each(|(i, value)| *value = i as u8);
4986

5087
let mut permutation_index = initial as i32;
5188
for i in (1..n as usize).rev() {
@@ -56,44 +93,27 @@ fn fannkuch(n: i32) -> (i32, i32) {
5693

5794
temp.copy_from_slice(&current);
5895
let d = d as usize;
59-
for j in 0..i + 1 {
60-
current[j] = if j + d <= i {
61-
temp[j + d]
62-
} else {
63-
temp[j + d - i - 1]
64-
};
65-
}
96+
current[0..=i - d].copy_from_slice(&temp[d..=i]);
97+
current[i - d + 1..=i].copy_from_slice(&temp[0..d])
6698
}
6799

68100
// Iterate over each permutation in the block.
101+
let mut perm = U8x16::from_slice_unaligned(&current);
69102
let last_permutation_in_block = cmp::min(initial + block_size,
70103
perm_max) - 1;
71104
let mut permutation_index = initial;
72105
let (mut checksum, mut maxflips) = (0, 0);
73106
loop {
74107
// If the first value in the current permutation is not 1 (0) then
75108
// we will need to do at least one flip for `current`.
76-
if current[0] > 0 {
109+
if perm.extract0() > 0 {
77110
// Copy the current permutation to work on it.
78-
temp.copy_from_slice(&current);
79-
80-
// Flip `temp` (the copy of the current permutation) until its
81-
// first element is 1 (0).
82-
let mut flip_count = 1;
83-
let mut first_value = current[0] as usize;
84-
while temp[first_value] != 0 {
85-
let new_first_value = mem::replace(&mut temp[first_value],
86-
first_value as i32);
87-
// If the first value is greater than 3 (2), then we are
88-
// flipping a series of four or more values so we will need
89-
// to flip additional elements in the middle of `temp`.
90-
if first_value > 2 {
91-
temp[1..first_value].reverse();
92-
}
93-
94-
// Update `first_value` to the value we saved earlier and
95-
// record a flip in `flip_count`.
96-
first_value = new_first_value as usize;
111+
let mut flip_count = 0;
112+
let mut flip = perm;
113+
loop {
114+
let flip_index = flip.extract0() as usize;
115+
if flip_index == 0 { break; }
116+
flip = flip.permute_dyn(flip_masks[flip_index]);
97117
flip_count += 1;
98118
}
99119

@@ -113,21 +133,13 @@ fn fannkuch(n: i32) -> (i32, i32) {
113133
return (checksum, maxflips);
114134
}
115135
permutation_index += 1;
116-
136+
perm = perm.permute_dyn(rotate_masks[1]);
117137
// Generate the next permutation.
118-
let mut first_value = current[1];
119-
current[1] = current[0];
120-
current[0] = first_value;
121138
let mut i = 1;
122139
while count[i] >= i as i32 {
123140
count[i] = 0;
124141
i += 1;
125-
let new_first_value = current[1];
126-
current[0] = new_first_value;
127-
for j in 1..i {
128-
current[j] = current[j + 1];
129-
}
130-
current[i] = mem::replace(&mut first_value, new_first_value);
142+
perm = perm.permute_dyn(rotate_masks[i]);
131143
}
132144
count[i] += 1;
133145
}

0 commit comments

Comments
 (0)