@@ -10,28 +10,39 @@ extern crate rayon;
10
10
11
11
use std:: cmp;
12
12
use rayon:: prelude:: * ;
13
- use std:: arch:: x86_64:: * ;
14
13
15
- #[ cfg( target_arch = "x86_64" ) ]
16
- #[ derive( Copy , Clone ) ]
17
- pub struct U8x16 ( __m128i ) ;
18
-
19
- #[ cfg( all( target_arch = "x86_64" , target_feature = "sse2" ,
20
- target_feature = "ssse3" ) ) ]
21
- impl U8x16 {
22
- pub fn zero ( ) -> U8x16 { U8x16 ( unsafe { _mm_setzero_si128 ( ) } ) }
23
- pub fn from_slice_unaligned ( s : & [ u8 ; 16 ] ) -> U8x16 {
24
- U8x16 ( unsafe { _mm_loadu_si128 ( s. as_ptr ( ) as * const _ ) } )
25
- }
26
- pub fn write_to_slice_unaligned ( self , s : & mut [ u8 ; 16 ] ) {
27
- unsafe { _mm_storeu_si128 ( s. as_mut_ptr ( ) as * mut _ , self . 0 ) }
28
- }
29
- pub fn extract0 ( self ) -> i32 {
30
- unsafe { _mm_extract_epi16 ( self . 0 , 0i32 ) & 0xFF }
14
/// Packs a 16-element permutation into one `u64`, four bits per element.
///
/// `perm[0]` ends up in the least-significant nibble and `perm[15]` in the
/// most-significant one. Elements are added, not masked, so every value is
/// expected to fit in four bits; a larger value spills into the next nibble.
pub fn pack(perm: &[u8; 16]) -> u64 {
    let mut packed = 0u64;
    // Walk from the last element to the first so that the first element is
    // the final one shifted in and therefore occupies the lowest nibble.
    for &digit in perm.iter().rev() {
        packed = (packed << 4) + u64::from(digit);
    }
    packed
}
17
/// Counts how many prefix reversals it takes until the lowest nibble of the
/// packed permutation becomes zero.
///
/// `perm` holds sixteen 4-bit elements, element 0 in the lowest nibble. While
/// the first element `k` is non-zero, the first `k + 1` elements are reversed
/// and the counter is bumped. The final count is returned.
pub fn flips(perm: u64) -> i32 {
    const LOW_NIBBLES: u64 = 0x0f0f_0f0f_0f0f_0f0f;

    let mut state = perm;
    let mut flip_count = 0;
    while state & 0xf != 0 {
        // Bit offset of the last element taking part in this reversal.
        let shift = (state & 0xf) as usize * 4;

        // Reverse all sixteen nibbles: swap the bytes, then swap the two
        // nibbles inside every byte.
        let bytes_reversed = state.swap_bytes();
        let nibbles_reversed =
            ((bytes_reversed & LOW_NIBBLES) << 4) | ((bytes_reversed >> 4) & LOW_NIBBLES);

        // Keep the elements above the reversed prefix, then drop the reversed
        // prefix (now sitting in the top nibbles) back into the low end.
        let untouched = state & (!0xf_u64 << shift);
        state = untouched | (nibbles_reversed >> (60 - shift));

        flip_count += 1;
    }
    flip_count
}
30
+ pub fn permute ( perm : u64 , count : & mut [ u8 ; 16 ] ) -> u64 {
31
+ let mut perm = rotate ( perm, 1 ) ;
32
+ // Generate the next permutation.
33
+ let mut i = 1 ;
34
+ while count[ i] >= i as u8 {
35
+ count[ i] = 0 ;
36
+ i += 1 ;
37
+ perm = rotate ( perm, i) ;
34
38
}
39
+ count[ i] += 1 ;
40
+ perm
41
+ }
42
/// Rotates the first `n + 1` nibbles of `perm` one position towards the low
/// end: the lowest nibble moves up to position `n`, nibbles `1..=n` each move
/// down by one, and all nibbles above position `n` are left as they are.
///
/// `n` must be below 16 so that the shift amount stays under 64 bits.
fn rotate(perm: u64, n: usize) -> u64 {
    let shift = n * 4;
    // Selects every nibble strictly above position `n`.
    let keep_mask = !0xf_u64 << shift;

    let untouched = perm & keep_mask;
    let shifted_down = (perm & !keep_mask) >> 4;
    let wrapped = (perm & 0xf) << shift;

    untouched | shifted_down | wrapped
}
36
47
37
48
// This value controls the preferred maximum number of blocks the workload is
@@ -59,25 +70,10 @@ fn fannkuch(n: i32) -> (i32, i32) {
59
70
perm_max / NUM_BLOCKS )
60
71
} ;
61
72
62
- // precompute flips and rotations
63
- let mut flip_masks = [ U8x16 :: zero ( ) ; 16 ] ;
64
- let mut rotate_masks = [ U8x16 :: zero ( ) ; 16 ] ;
65
- let mut mask = [ 0u8 ; 16 ] ;
66
- for i in 0 ..16 {
67
- mask. iter_mut ( ) . enumerate ( ) . for_each ( |( j, m) | * m = j as u8 ) ;
68
- mask[ 0 ..i + 1 ] . reverse ( ) ;
69
- flip_masks[ i] = U8x16 :: from_slice_unaligned ( & mask) ;
70
- mask. iter_mut ( ) . enumerate ( ) . for_each ( |( j, m) | * m = j as u8 ) ;
71
- let c = mask[ 0 ] ;
72
- ( 0 ..i) . for_each ( |i| mask[ i] = mask[ i + 1 ] ) ;
73
- mask[ i] = c;
74
- rotate_masks[ i] = U8x16 :: from_slice_unaligned ( & mask) ;
75
- }
76
-
77
73
// Compute the `checksum` and `maxflips` for each block in parallel.
78
74
( 0 ..num_blocks) . into_par_iter ( ) . map ( |i_block| {
79
75
let initial = i_block * block_size;
80
- let mut count = [ 0i32 ; 16 ] ;
76
+ let mut count = [ 0u8 ; 16 ] ;
81
77
let mut temp = [ 0u8 ; 16 ] ;
82
78
let mut current = [ 0u8 ; 16 ] ;
83
79
@@ -89,7 +85,7 @@ fn fannkuch(n: i32) -> (i32, i32) {
89
85
let factorial = factorials[ i] as i32 ;
90
86
let d = permutation_index / factorial;
91
87
permutation_index %= factorial;
92
- count[ i] = d;
88
+ count[ i] = d as u8 ;
93
89
94
90
temp. copy_from_slice ( & current) ;
95
91
let d = d as usize ;
@@ -98,25 +94,16 @@ fn fannkuch(n: i32) -> (i32, i32) {
98
94
}
99
95
100
96
// Iterate over each permutation in the block.
101
- let mut perm = U8x16 :: from_slice_unaligned ( & current) ;
97
+ let mut perm = pack ( & current) ;
102
98
let last_permutation_in_block = cmp:: min ( initial + block_size,
103
99
perm_max) - 1 ;
104
100
let mut permutation_index = initial;
105
101
let ( mut checksum, mut maxflips) = ( 0 , 0 ) ;
106
102
loop {
107
103
// If the first value in the current permutation is not 1 (0) then
108
104
// we will need to do at least one flip for `current`.
109
- if perm. extract0 ( ) > 0 {
110
- // Copy the current permutation to work on it.
111
- let mut flip_count = 0 ;
112
- let mut flip = perm;
113
- loop {
114
- let flip_index = flip. extract0 ( ) as usize ;
115
- if flip_index == 0 { break ; }
116
- flip = flip. permute_dyn ( flip_masks[ flip_index] ) ;
117
- flip_count += 1 ;
118
- }
119
-
105
+ if perm & 0xf > 0 {
106
+ let flip_count = flips ( perm) ;
120
107
// Update the `checksum` and `maxflips` of this block.
121
108
checksum += if permutation_index % 2 == 0 {
122
109
flip_count
@@ -133,15 +120,7 @@ fn fannkuch(n: i32) -> (i32, i32) {
133
120
return ( checksum, maxflips) ;
134
121
}
135
122
permutation_index += 1 ;
136
- perm = perm. permute_dyn ( rotate_masks[ 1 ] ) ;
137
- // Generate the next permutation.
138
- let mut i = 1 ;
139
- while count[ i] >= i as i32 {
140
- count[ i] = 0 ;
141
- i += 1 ;
142
- perm = perm. permute_dyn ( rotate_masks[ i] ) ;
143
- }
144
- count[ i] += 1 ;
123
+ perm = permute ( perm, & mut count) ;
145
124
}
146
125
} ) . reduce ( || ( 0 , 0 ) ,
147
126
|( cs1, mf1) , ( cs2, mf2) | ( cs1 + cs2, cmp:: max ( mf1, mf2) ) )
0 commit comments