Skip to content

Commit b4a88e4

Browse files
llogiq authored and TeXitoi committed
Autovectorize Spectralnorm
This uses the same trick as n_body to better autovectorize. I get a 27% speedup on this machine.
1 parent 751b1f2 commit b4a88e4

File tree

1 file changed

+59
-26
lines changed

1 file changed

+59
-26
lines changed

src/spectralnorm.rs

Lines changed: 59 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,43 @@
99

1010
extern crate rayon;
1111
use rayon::prelude::*;
12+
use std::ops::*;
1213

14+
/// A pair of f64 lanes imitating a two-wide SIMD vector, so the lane-wise
/// arithmetic below can be autovectorized (per the commit message: same
/// trick as n_body, ~27% speedup reported on the author's machine).
#[derive(Clone, Copy)]
struct F64x2(f64, f64);
16+
17+
impl F64x2 {
18+
pub fn splat(x: f64) -> F64x2 { F64x2(x, x) }
19+
pub fn new(a: f64, b: f64) -> F64x2 { F64x2(a, b) }
20+
pub fn write_to_slice_unaligned(self, slice: &mut [f64]) {
21+
slice[0] = self.0;
22+
slice[1] = self.1;
23+
}
24+
pub fn sum(self) -> f64 {
25+
let mut s = [0f64; 2];
26+
self.write_to_slice_unaligned(&mut s);
27+
s[0] + s[1]
28+
}
29+
}
30+
31+
impl Add for F64x2 {
32+
type Output = Self;
33+
fn add(self, rhs: Self) -> Self {
34+
F64x2(self.0 + rhs.0, self.1 + rhs.1)
35+
}
36+
}
37+
impl Mul for F64x2 {
38+
type Output = Self;
39+
fn mul(self, rhs: Self) -> Self {
40+
F64x2(self.0 * rhs.0, self.1 * rhs.1)
41+
}
42+
}
43+
impl Div for F64x2 {
44+
type Output = Self;
45+
fn div(self, rhs: Self) -> Self {
46+
F64x2(self.0 / rhs.0, self.1 / rhs.1)
47+
}
48+
}
1349

1450
fn main() {
1551
let n = std::env::args().nth(1)
@@ -22,9 +58,9 @@ fn main() {
2258
fn spectralnorm(n: usize) -> f64 {
2359
// Group all vectors in pairs of two for SIMD convenience.
2460
assert!(n % 2 == 0, "only even lengths are accepted");
25-
let mut u = vec![[1.0, 1.0]; n / 2];
26-
let mut v = vec![[0.0, 0.0]; n / 2];
27-
let mut tmp = vec![[0.0, 0.0]; n / 2];
61+
let mut u = vec![F64x2::splat(1.0); n / 2];
62+
let mut v = vec![F64x2::splat(0.0); n / 2];
63+
let mut tmp = vec![F64x2::splat(0.0); n / 2];
2864

2965
for _ in 0..10 {
3066
mult_at_av(&u, &mut v, &mut tmp);
@@ -34,13 +70,13 @@ fn spectralnorm(n: usize) -> f64 {
3470
(dot(&u, &v) / dot(&v, &v)).sqrt()
3571
}
3672

37-
fn mult_at_av(v: &[[f64; 2]], out: &mut [[f64; 2]], tmp: &mut [[f64; 2]]) {
73+
fn mult_at_av(v: &[F64x2], out: &mut [F64x2], tmp: &mut [F64x2]) {
3874
mult(v, tmp, a);
3975
mult(tmp, out, |i, j| a(j, i));
4076
}
4177

42-
fn mult<F>(v: &[[f64; 2]], out: &mut [[f64; 2]], a: F)
43-
where F: Fn([usize; 2], [usize; 2]) -> [f64; 2] + Sync {
78+
fn mult<F>(v: &[F64x2], out: &mut [F64x2], a: F)
79+
where F: Fn([usize; 2], [usize; 2]) -> F64x2 + Sync {
4480
// Parallelize along the output vector, with each pair of slots as a parallelism unit.
4581
out.par_iter_mut().enumerate().for_each(|(i, slot)| {
4682
// We're computing everything in chunks of two so the indices of slot[0] and slot[1] are 2*i
@@ -50,44 +86,41 @@ fn mult<F>(v: &[[f64; 2]], out: &mut [[f64; 2]], a: F)
5086

5187
// Each slot in the pair gets its own sum, which is further computed in two f64 lanes (which
5288
// are summed at the end).
53-
let (mut sum0, mut sum1) = ([0.0; 2], [0.0; 2]);
89+
let (mut sum0, mut sum1) = (F64x2::splat(0.0), F64x2::splat(0.0));
5490
for (j, x) in v.iter().enumerate() {
5591
let j = [2 * j, 2 * j + 1];
56-
div_and_add(x, &a(i0, j), &a(i1, j), &mut sum0, &mut sum1);
92+
div_and_add(*x, a(i0, j), a(i1, j), &mut sum0, &mut sum1);
5793
}
5894

5995
// Sum the two lanes for each slot.
60-
slot[0] = sum0[0] + sum0[1];
61-
slot[1] = sum1[0] + sum1[1];
96+
*slot = F64x2::new(sum0.sum(), sum1.sum());
6297
});
6398
}
6499

65-
fn a(i: [usize; 2], j: [usize; 2]) -> [f64; 2] {
66-
[(((i[0] + j[0]) * (i[0] + j[0] + 1) / 2 + i[0] + 1) as f64),
67-
(((i[1] + j[1]) * (i[1] + j[1] + 1) / 2 + i[1] + 1) as f64)]
100+
fn a(i: [usize; 2], j: [usize; 2]) -> F64x2 {
101+
F64x2::new(((i[0] + j[0]) * (i[0] + j[0] + 1) / 2 + i[0] + 1) as f64,
102+
((i[1] + j[1]) * (i[1] + j[1] + 1) / 2 + i[1] + 1) as f64)
68103
}
69104

70-
fn dot(v: &[[f64; 2]], u: &[[f64; 2]]) -> f64 {
105+
fn dot(v: &[F64x2], u: &[F64x2]) -> f64 {
71106
// Vectorised form of dot product: (1) compute dot across two lanes.
72107
let r = u.iter()
73108
.zip(v)
74-
.map(|(x, y)| [x[0] * y[0], x[1] * y[1]])
75-
.fold([0.0f64; 2], |s, x| [s[0] + x[0], s[1] + x[1]]);
109+
.map(|(&x, &y)| x * y)
110+
.fold(F64x2::splat(0.0), |s, x| s + x);
76111

77112
// (2) sum the two lanes.
78-
r[0] + r[1]
113+
r.sum()
79114
}
80115

81116
// Hint that this function should not be inlined. Keep the parallelised code tight, and vectorize
82117
// better.
83118
#[inline(never)]
84-
fn div_and_add(x: &[f64; 2],
85-
a0: &[f64; 2],
86-
a1: &[f64; 2],
87-
s0: &mut [f64; 2],
88-
s1: &mut [f64; 2]) {
89-
s0[0] += x[0] / a0[0];
90-
s0[1] += x[1] / a0[1];
91-
s1[0] += x[0] / a1[0];
92-
s1[1] += x[1] / a1[1];
119+
fn div_and_add(x: F64x2,
120+
a0: F64x2,
121+
a1: F64x2,
122+
s0: &mut F64x2,
123+
s1: &mut F64x2) {
124+
*s0 = *s0 + x / a0;
125+
*s1 = *s1 + x / a1;
93126
}

0 commit comments

Comments
 (0)