4
4
// contributed by the Rust Project Developers
5
5
// contributed by TeXitoi
6
6
// contributed by Cristi Cobzarenco (@cristicbz)
7
+ // contributed by Andre Bogus
7
8
8
9
extern crate rayon;
9
10
10
- use std:: { cmp, mem } ;
11
+ use std:: cmp;
11
12
use rayon:: prelude:: * ;
13
+ use std:: arch:: x86_64:: * ;
14
+
15
// Newtype wrapper around a single `__m128i` SIMD value. The methods defined
// on it in this file load and store it as a `[u8; 16]`, i.e. it is used as
// 16 unsigned byte lanes.
// Only compiled when targeting x86_64. `Copy`/`Clone` are derived so values
// can be passed to and returned from methods by value.
+ #[ cfg( target_arch = "x86_64" ) ]
16
+ #[ derive( Copy , Clone ) ]
17
+ pub struct U8x16 ( __m128i ) ;
18
+
19
// Byte-vector operations on `U8x16`, implemented with x86 SSE intrinsics.
// This impl is only compiled when the target is x86_64 AND both the `sse2`
// and `ssse3` target features are enabled at compile time; that gate is what
// makes the `unsafe` intrinsic calls below sound with respect to CPU feature
// availability.
// NOTE(review): the struct itself is gated on x86_64 only, so an x86_64 build
// without `ssse3` gets the type but none of these methods.
+ #[ cfg( all( target_arch = "x86_64" , target_feature = "sse2" ,
20
+ target_feature = "ssse3" ) ) ]
21
+ impl U8x16 {
22
// Returns a vector with all 16 bytes set to zero.
+ pub fn zero ( ) -> U8x16 { U8x16 ( unsafe { _mm_setzero_si128 ( ) } ) }
23
// Loads the 16 bytes of `s` into a vector using an unaligned load, so `s`
// needs no particular alignment.
+ pub fn from_slice_unaligned ( s : & [ u8 ; 16 ] ) -> U8x16 {
24
// SAFETY: `s` is a reference to exactly 16 bytes, so the pointer is valid
// for a 16-byte read.
+ U8x16 ( unsafe { _mm_loadu_si128 ( s. as_ptr ( ) as * const _ ) } )
25
+ }
26
// Stores the 16 bytes of this vector into `s` using an unaligned store.
+ pub fn write_to_slice_unaligned ( self , s : & mut [ u8 ; 16 ] ) {
27
// SAFETY: `s` is an exclusive reference to exactly 16 bytes, so the pointer
// is valid for a 16-byte write.
+ unsafe { _mm_storeu_si128 ( s. as_mut_ptr ( ) as * mut _ , self . 0 ) }
28
+ }
29
// Returns byte 0 of the vector as an `i32` in the range 0..=255.
+ pub fn extract0 ( self ) -> i32 {
30
// Extracts 16-bit lane 0 and masks off everything but its low 8 bits; on
// little-endian x86 those low 8 bits are byte 0 of the vector.
+ unsafe { _mm_extract_epi16 ( self . 0 , 0i32 ) & 0xFF }
31
+ }
32
// Byte shuffle with run-time indices (SSSE3 `pshufb`): output byte `k` is
// taken from the byte of `self` selected by the low 4 bits of `indices[k]`,
// or is zero if the high bit of `indices[k]` is set.
+ pub fn permute_dyn ( self , indices : U8x16 ) -> U8x16 {
33
+ U8x16 ( unsafe { _mm_shuffle_epi8 ( self . 0 , indices. 0 ) } )
34
+ }
35
+ }
12
36
13
37
// This value controls the preferred maximum number of blocks the workload is
14
38
// broken up into. The actual value may be one higher (if the number of
@@ -19,7 +43,7 @@ const NUM_BLOCKS: u32 = 24;
19
43
fn fannkuch ( n : i32 ) -> ( i32 , i32 ) {
20
44
// Precompute a table of factorials to reuse all over the place.
21
45
let mut factorials = [ 1 ; 16 ] ;
22
- for i in 1 ..n as usize + 1 {
46
+ for i in 1 ..= n as usize {
23
47
factorials[ i] = factorials[ i - 1 ] * i as u32 ;
24
48
}
25
49
let perm_max = factorials[ n as usize ] ;
@@ -35,17 +59,30 @@ fn fannkuch(n: i32) -> (i32, i32) {
35
59
perm_max / NUM_BLOCKS )
36
60
} ;
37
61
62
+ // precompute flips and rotations
63
+ let mut flip_masks = [ U8x16 :: zero ( ) ; 16 ] ;
64
+ let mut rotate_masks = [ U8x16 :: zero ( ) ; 16 ] ;
65
+ let mut mask = [ 0u8 ; 16 ] ;
66
+ for i in 0 ..16 {
67
+ mask. iter_mut ( ) . enumerate ( ) . for_each ( |( j, m) | * m = j as u8 ) ;
68
+ mask[ 0 ..i + 1 ] . reverse ( ) ;
69
+ flip_masks[ i] = U8x16 :: from_slice_unaligned ( & mask) ;
70
+ mask. iter_mut ( ) . enumerate ( ) . for_each ( |( j, m) | * m = j as u8 ) ;
71
+ let c = mask[ 0 ] ;
72
+ ( 0 ..i) . for_each ( |i| mask[ i] = mask[ i + 1 ] ) ;
73
+ mask[ i] = c;
74
+ rotate_masks[ i] = U8x16 :: from_slice_unaligned ( & mask) ;
75
+ }
76
+
38
77
// Compute the `checksum` and `maxflips` for each block in parallel.
39
78
( 0 ..num_blocks) . into_par_iter ( ) . map ( |i_block| {
40
79
let initial = i_block * block_size;
41
80
let mut count = [ 0i32 ; 16 ] ;
42
- let mut temp = [ 0i32 ; 16 ] ;
43
- let mut current = [ 0i32 ; 16 ] ;
81
+ let mut temp = [ 0u8 ; 16 ] ;
82
+ let mut current = [ 0u8 ; 16 ] ;
44
83
45
84
// Initialise `count` and the current permutation (`current`)
46
- for ( i, value) in current. iter_mut ( ) . enumerate ( ) {
47
- * value = i as i32 ;
48
- }
85
+ current. iter_mut ( ) . enumerate ( ) . for_each ( |( i, value) | * value = i as u8 ) ;
49
86
50
87
let mut permutation_index = initial as i32 ;
51
88
for i in ( 1 ..n as usize ) . rev ( ) {
@@ -56,44 +93,27 @@ fn fannkuch(n: i32) -> (i32, i32) {
56
93
57
94
temp. copy_from_slice ( & current) ;
58
95
let d = d as usize ;
59
- for j in 0 ..i + 1 {
60
- current[ j] = if j + d <= i {
61
- temp[ j + d]
62
- } else {
63
- temp[ j + d - i - 1 ]
64
- } ;
65
- }
96
+ current[ 0 ..=i - d] . copy_from_slice ( & temp[ d..=i] ) ;
97
+ current[ i - d + 1 ..=i] . copy_from_slice ( & temp[ 0 ..d] )
66
98
}
67
99
68
100
// Iterate over each permutation in the block.
101
+ let mut perm = U8x16 :: from_slice_unaligned ( & current) ;
69
102
let last_permutation_in_block = cmp:: min ( initial + block_size,
70
103
perm_max) - 1 ;
71
104
let mut permutation_index = initial;
72
105
let ( mut checksum, mut maxflips) = ( 0 , 0 ) ;
73
106
loop {
74
107
// If the first value in the current permutation is not 1 (0) then
75
108
// we will need to do at least one flip for `current`.
76
- if current [ 0 ] > 0 {
109
+ if perm . extract0 ( ) > 0 {
77
110
// Copy the current permutation to work on it.
78
- temp. copy_from_slice ( & current) ;
79
-
80
- // Flip `temp` (the copy of the current permutation) until its
81
- // first element is 1 (0).
82
- let mut flip_count = 1 ;
83
- let mut first_value = current[ 0 ] as usize ;
84
- while temp[ first_value] != 0 {
85
- let new_first_value = mem:: replace ( & mut temp[ first_value] ,
86
- first_value as i32 ) ;
87
- // If the first value is greater than 3 (2), then we are
88
- // flipping a series of four or more values so we will need
89
- // to flip additional elements in the middle of `temp`.
90
- if first_value > 2 {
91
- temp[ 1 ..first_value] . reverse ( ) ;
92
- }
93
-
94
- // Update `first_value` to the value we saved earlier and
95
- // record a flip in `flip_count`.
96
- first_value = new_first_value as usize ;
111
+ let mut flip_count = 0 ;
112
+ let mut flip = perm;
113
+ loop {
114
+ let flip_index = flip. extract0 ( ) as usize ;
115
+ if flip_index == 0 { break ; }
116
+ flip = flip. permute_dyn ( flip_masks[ flip_index] ) ;
97
117
flip_count += 1 ;
98
118
}
99
119
@@ -113,21 +133,13 @@ fn fannkuch(n: i32) -> (i32, i32) {
113
133
return ( checksum, maxflips) ;
114
134
}
115
135
permutation_index += 1 ;
116
-
136
+ perm = perm . permute_dyn ( rotate_masks [ 1 ] ) ;
117
137
// Generate the next permutation.
118
- let mut first_value = current[ 1 ] ;
119
- current[ 1 ] = current[ 0 ] ;
120
- current[ 0 ] = first_value;
121
138
let mut i = 1 ;
122
139
while count[ i] >= i as i32 {
123
140
count[ i] = 0 ;
124
141
i += 1 ;
125
- let new_first_value = current[ 1 ] ;
126
- current[ 0 ] = new_first_value;
127
- for j in 1 ..i {
128
- current[ j] = current[ j + 1 ] ;
129
- }
130
- current[ i] = mem:: replace ( & mut first_value, new_first_value) ;
142
+ perm = perm. permute_dyn ( rotate_masks[ i] ) ;
131
143
}
132
144
count[ i] += 1 ;
133
145
}
0 commit comments