9
9
10
10
extern crate rayon;
11
11
use rayon:: prelude:: * ;
12
+ use std:: ops:: * ;
12
13
14
/// A two-lane `f64` "SIMD" vector emulated with plain scalar fields, so the
/// benchmark needs no nightly/packed-SIMD dependency. All operations are
/// lane-wise; the compiler is expected to auto-vectorize them.
#[derive(Clone, Copy)]
struct F64x2(f64, f64);

impl F64x2 {
    /// Builds a vector with both lanes set to `x`.
    pub fn splat(x: f64) -> F64x2 {
        F64x2(x, x)
    }

    /// Builds a vector from two independent lane values.
    pub fn new(a: f64, b: f64) -> F64x2 {
        F64x2(a, b)
    }

    /// Stores lane 0 into `slice[0]` and lane 1 into `slice[1]`.
    ///
    /// Panics if `slice` has fewer than two elements.
    pub fn write_to_slice_unaligned(self, slice: &mut [f64]) {
        slice[0] = self.0;
        slice[1] = self.1;
    }

    /// Horizontal sum of the two lanes.
    pub fn sum(self) -> f64 {
        // Add the lanes directly. The previous version bounced both lanes
        // through a temporary stack array via `write_to_slice_unaligned`,
        // which added nothing but a needless store/load round-trip.
        self.0 + self.1
    }
}

/// Lane-wise addition.
impl Add for F64x2 {
    type Output = Self;
    fn add(self, rhs: Self) -> Self {
        F64x2(self.0 + rhs.0, self.1 + rhs.1)
    }
}

/// Lane-wise multiplication.
impl Mul for F64x2 {
    type Output = Self;
    fn mul(self, rhs: Self) -> Self {
        F64x2(self.0 * rhs.0, self.1 * rhs.1)
    }
}

/// Lane-wise division.
impl Div for F64x2 {
    type Output = Self;
    fn div(self, rhs: Self) -> Self {
        F64x2(self.0 / rhs.0, self.1 / rhs.1)
    }
}
13
49
14
50
fn main ( ) {
15
51
let n = std:: env:: args ( ) . nth ( 1 )
@@ -22,9 +58,9 @@ fn main() {
22
58
fn spectralnorm ( n : usize ) -> f64 {
23
59
// Group all vectors in pairs of two for SIMD convenience.
24
60
assert ! ( n % 2 == 0 , "only even lengths are accepted" ) ;
25
- let mut u = vec ! [ [ 1.0 , 1.0 ] ; n / 2 ] ;
26
- let mut v = vec ! [ [ 0.0 , 0.0 ] ; n / 2 ] ;
27
- let mut tmp = vec ! [ [ 0.0 , 0.0 ] ; n / 2 ] ;
61
+ let mut u = vec ! [ F64x2 :: splat ( 1.0 ) ; n / 2 ] ;
62
+ let mut v = vec ! [ F64x2 :: splat ( 0.0 ) ; n / 2 ] ;
63
+ let mut tmp = vec ! [ F64x2 :: splat ( 0.0 ) ; n / 2 ] ;
28
64
29
65
for _ in 0 ..10 {
30
66
mult_at_av ( & u, & mut v, & mut tmp) ;
@@ -34,13 +70,13 @@ fn spectralnorm(n: usize) -> f64 {
34
70
( dot ( & u, & v) / dot ( & v, & v) ) . sqrt ( )
35
71
}
36
72
37
- fn mult_at_av ( v : & [ [ f64 ; 2 ] ] , out : & mut [ [ f64 ; 2 ] ] , tmp : & mut [ [ f64 ; 2 ] ] ) {
73
+ fn mult_at_av ( v : & [ F64x2 ] , out : & mut [ F64x2 ] , tmp : & mut [ F64x2 ] ) {
38
74
mult ( v, tmp, a) ;
39
75
mult ( tmp, out, |i, j| a ( j, i) ) ;
40
76
}
41
77
42
- fn mult < F > ( v : & [ [ f64 ; 2 ] ] , out : & mut [ [ f64 ; 2 ] ] , a : F )
43
- where F : Fn ( [ usize ; 2 ] , [ usize ; 2 ] ) -> [ f64 ; 2 ] + Sync {
78
+ fn mult < F > ( v : & [ F64x2 ] , out : & mut [ F64x2 ] , a : F )
79
+ where F : Fn ( [ usize ; 2 ] , [ usize ; 2 ] ) -> F64x2 + Sync {
44
80
// Parallelize along the output vector, with each pair of slots as a parallelism unit.
45
81
out. par_iter_mut ( ) . enumerate ( ) . for_each ( |( i, slot) | {
46
82
// We're computing everything in chunks of two so the indces of slot[0] and slot[1] are 2*i
@@ -50,44 +86,41 @@ fn mult<F>(v: &[[f64; 2]], out: &mut [[f64; 2]], a: F)
50
86
51
87
// Each slot in the pair gets its own sum, which is further computed in two f64 lanes (which
52
88
// are summed at the end.
53
- let ( mut sum0, mut sum1) = ( [ 0.0 ; 2 ] , [ 0.0 ; 2 ] ) ;
89
+ let ( mut sum0, mut sum1) = ( F64x2 :: splat ( 0.0 ) , F64x2 :: splat ( 0.0 ) ) ;
54
90
for ( j, x) in v. iter ( ) . enumerate ( ) {
55
91
let j = [ 2 * j, 2 * j + 1 ] ;
56
- div_and_add ( x, & a ( i0, j) , & a ( i1, j) , & mut sum0, & mut sum1) ;
92
+ div_and_add ( * x, a ( i0, j) , a ( i1, j) , & mut sum0, & mut sum1) ;
57
93
}
58
94
59
95
// Sum the two lanes for each slot.
60
- slot[ 0 ] = sum0[ 0 ] + sum0[ 1 ] ;
61
- slot[ 1 ] = sum1[ 0 ] + sum1[ 1 ] ;
96
+ * slot = F64x2 :: new ( sum0. sum ( ) , sum1. sum ( ) ) ;
62
97
} ) ;
63
98
}
64
99
65
- fn a ( i : [ usize ; 2 ] , j : [ usize ; 2 ] ) -> [ f64 ; 2 ] {
66
- [ ( ( ( i[ 0 ] + j[ 0 ] ) * ( i[ 0 ] + j[ 0 ] + 1 ) / 2 + i[ 0 ] + 1 ) as f64 ) ,
67
- ( ( ( i[ 1 ] + j[ 1 ] ) * ( i[ 1 ] + j[ 1 ] + 1 ) / 2 + i[ 1 ] + 1 ) as f64 ) ]
100
+ fn a ( i : [ usize ; 2 ] , j : [ usize ; 2 ] ) -> F64x2 {
101
+ F64x2 :: new ( ( ( i[ 0 ] + j[ 0 ] ) * ( i[ 0 ] + j[ 0 ] + 1 ) / 2 + i[ 0 ] + 1 ) as f64 ,
102
+ ( ( i[ 1 ] + j[ 1 ] ) * ( i[ 1 ] + j[ 1 ] + 1 ) / 2 + i[ 1 ] + 1 ) as f64 )
68
103
}
69
104
70
- fn dot ( v : & [ [ f64 ; 2 ] ] , u : & [ [ f64 ; 2 ] ] ) -> f64 {
105
+ fn dot ( v : & [ F64x2 ] , u : & [ F64x2 ] ) -> f64 {
71
106
// Vectorised form of dot product: (1) compute dot across two lanes.
72
107
let r = u. iter ( )
73
108
. zip ( v)
74
- . map ( |( x, y) | [ x [ 0 ] * y[ 0 ] , x [ 1 ] * y [ 1 ] ] )
75
- . fold ( [ 0.0f64 ; 2 ] , |s, x| [ s [ 0 ] + x[ 0 ] , s [ 1 ] + x [ 1 ] ] ) ;
109
+ . map ( |( & x, & y) | x * y)
110
+ . fold ( F64x2 :: splat ( 0.0 ) , |s, x| s + x) ;
76
111
77
112
// (2) sum the two lanes.
78
- r[ 0 ] + r [ 1 ]
113
+ r. sum ( )
79
114
}
80
115
81
116
// Hint that this function should not be inlined. Keep the parallelised code tight, and vectorize
82
117
// better.
83
118
#[ inline( never) ]
84
- fn div_and_add ( x : & [ f64 ; 2 ] ,
85
- a0 : & [ f64 ; 2 ] ,
86
- a1 : & [ f64 ; 2 ] ,
87
- s0 : & mut [ f64 ; 2 ] ,
88
- s1 : & mut [ f64 ; 2 ] ) {
89
- s0[ 0 ] += x[ 0 ] / a0[ 0 ] ;
90
- s0[ 1 ] += x[ 1 ] / a0[ 1 ] ;
91
- s1[ 0 ] += x[ 0 ] / a1[ 0 ] ;
92
- s1[ 1 ] += x[ 1 ] / a1[ 1 ] ;
119
+ fn div_and_add ( x : F64x2 ,
120
+ a0 : F64x2 ,
121
+ a1 : F64x2 ,
122
+ s0 : & mut F64x2 ,
123
+ s1 : & mut F64x2 ) {
124
+ * s0 = * s0 + x / a0;
125
+ * s1 = * s1 + x / a1;
93
126
}
0 commit comments