Skip to content

Commit 590a9e4

Browse files
committed
Use FMA and rsqrt in more places
1 parent 1bedc42 commit 590a9e4

File tree

4 files changed

+34
-20
lines changed

4 files changed

+34
-20
lines changed

examples/aobench/src/geometry/vec.rs

+3 -1
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,9 @@ impl V3D {
3636
#[inline(always)]
3737
#[must_use]
3838
pub fn normalized(self) -> Self {
39-
self * (1. / self.dot(self).sqrt())
39+
let len2 = self.dot(self);
40+
let invlen = len2.sqrt().recip();
41+
invlen * self
4042
}
4143
#[inline(always)]
4244
#[must_use]

examples/aobench/src/geometry/vecxN.rs

+26 -14
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,9 @@ impl V3DxN {
2727
#[inline(always)]
2828
#[must_use]
2929
pub fn normalized(self) -> Self {
30-
(1. / self.dot(self).sqrte()) * self
30+
let len2 = self.dot(self);
31+
let invlen = len2.rsqrte();
32+
invlen * self
3133
}
3234

3335
pub fn get(&self, idx: usize) -> V3D {
@@ -148,17 +150,15 @@ impl Dot<V3DxN> for V3DxN {
148150
type Output = f32xN;
149151
#[inline(always)]
150152
fn dot(self, o: Self) -> Self::Output {
151-
self.x * o.x + self.y * o.y + self.z * o.z
153+
self.x.mul_adde(o.x, self.y.mul_adde(o.y, self.z * o.z))
152154
}
153155
}
154156

155157
impl Dot<V3D> for V3DxN {
156158
type Output = f32xN;
157159
#[inline(always)]
158160
fn dot(self, o: V3D) -> Self::Output {
159-
self.x * f32xN::splat(o.x)
160-
+ self.y * f32xN::splat(o.y)
161-
+ self.z * f32xN::splat(o.z)
161+
self.x.mul_adde(f32xN::splat(o.x), self.y.mul_adde(f32xN::splat(o.y), self.z * o.z))
162162
}
163163
}
164164

@@ -204,15 +204,27 @@ impl Mul<V3DxN> for M3x3 {
204204
#[inline(always)]
205205
fn mul(self, o: V3DxN) -> Self::Output {
206206
V3DxN {
207-
x: o.x * f32xN::splat(self[0].x)
208-
+ o.y * f32xN::splat(self[1].x)
209-
+ o.z * f32xN::splat(self[2].x),
210-
y: o.x * f32xN::splat(self[0].y)
211-
+ o.y * f32xN::splat(self[1].y)
212-
+ o.z * f32xN::splat(self[2].y),
213-
z: o.x * f32xN::splat(self[0].z)
214-
+ o.y * f32xN::splat(self[1].z)
215-
+ o.z * f32xN::splat(self[2].z),
207+
x: o.x.mul_adde(
208+
f32xN::splat(self[0].x),
209+
o.y.mul_adde(
210+
f32xN::splat(self[1].x),
211+
o.z * f32xN::splat(self[2].x),
212+
),
213+
),
214+
y: o.x.mul_adde(
215+
f32xN::splat(self[0].y),
216+
o.y.mul_adde(
217+
f32xN::splat(self[1].y),
218+
o.z * f32xN::splat(self[2].y),
219+
),
220+
),
221+
z: o.x.mul_adde(
222+
f32xN::splat(self[0].z),
223+
o.y.mul_adde(
224+
f32xN::splat(self[1].z),
225+
o.z * f32xN::splat(self[2].z),
226+
),
227+
),
216228
}
217229
}
218230
}

examples/aobench/src/intersection/ray_sphere.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ impl Intersect<Sphere> for RayxN {
4040

4141
let b = rs.dot(ray.dir);
4242
let c = rs.dot(rs) - sphere.radius * sphere.radius;
43-
let d = b * b - c;
43+
let d = b.mul_adde(b, -c);
4444

4545
let old_isect = isect;
4646

examples/aobench/src/vector.rs

+4 -4
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@ fn ao_impl<S: Scene>(scene: &mut S, nsubsamples: usize, img: &mut ::Image) {
2323
(x as f32, y as f32, h as f32, w as f32);
2424

2525
let dir = V3D {
26-
x: (x + du - (w / 2.)) / (w / 2.) * w / h,
27-
y: -(y + dv - (h / 2.)) / (h / 2.),
26+
x: (x + du - (w * 0.5)) / (w * 0.5) * w / h,
27+
y: -(y + dv - (h * 0.5)) / (h * 0.5),
2828
z: -1.,
2929
};
3030
let dir = dir.normalized();
@@ -72,7 +72,7 @@ cfg_if! {
7272
ao_impl(scene, nsubsamples, img);
7373
}
7474

75-
#[target_feature(enable = "avx2")]
75+
#[target_feature(enable = "avx2,fma")]
7676
unsafe fn ao_avx2<S: Scene>(scene: &mut S, nsubsamples: usize,
7777
img: &mut ::Image) {
7878
ao_impl(scene, nsubsamples, img);
@@ -81,7 +81,7 @@ cfg_if! {
8181
pub fn ao<S: Scene>(scene: &mut S, nsubsamples: usize,
8282
img: &mut ::Image) {
8383
unsafe {
84-
if is_x86_feature_detected!("avx2") {
84+
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
8585
ao_avx2(scene, nsubsamples, img);
8686
} else if is_x86_feature_detected!("avx") {
8787
ao_avx(scene, nsubsamples, img);

0 commit comments

Comments
 (0)