Skip to content

Commit efdec7b

Browse files
committed
Use FMA explicitly in stencil example
1 parent f2624f9 commit efdec7b

File tree

1 file changed

+25
-12
lines changed

1 file changed

+25
-12
lines changed

examples/stencil/src/simd.rs

+25-12
Original file line numberDiff line numberDiff line change
@@ -27,26 +27,38 @@ pub(crate) fn step_x8(
2727
)
2828
};
2929
}
30+
3031
let cur_0 = a_cur!(0, 0, 0);
3132
let mut div: f32x8 = *coef.get_unchecked(0) * cur_0;
3233

3334
for i in 1..4 {
34-
let i: i32 = i;
35-
div += *coef.get_unchecked(i as usize)
36-
* (a_cur!(i, 0, 0)
35+
let coef = f32x8::splat(*coef.get_unchecked(i));
36+
37+
let sum = {
38+
let i = i as i32;
39+
(a_cur!(i, 0, 0)
3740
+ a_cur!(-i, 0, 0)
3841
+ a_cur!(0, i, 0)
3942
+ a_cur!(0, -i, 0)
4043
+ a_cur!(0, 0, i)
41-
+ a_cur!(0, 0, -i));
44+
+ a_cur!(0, 0, -i))
45+
};
46+
47+
div = coef.mul_adde(sum, div);
4248
}
4349

44-
let r =
45-
2. * cur_0 - f32x8::from_slice_unaligned_unchecked(
46-
&a_out.get_unchecked(out_idx as usize..),
47-
) + f32x8::from_slice_unaligned_unchecked(
48-
&vsq.get_unchecked(index as usize..),
49-
) * div;
50+
let vsq = f32x8::from_slice_unaligned_unchecked(
51+
vsq.get_unchecked(index as usize..),
52+
);
53+
54+
let sum = cur_0.mul_adde(
55+
f32x8::splat(2.),
56+
-f32x8::from_slice_unaligned_unchecked(
57+
a_out.get_unchecked(out_idx as usize..),
58+
),
59+
);
60+
61+
let r = vsq.mul_adde(div, sum);
5062
r.write_to_slice_unaligned_unchecked(
5163
&mut a_out.get_unchecked_mut(out_idx as usize..),
5264
);
@@ -94,7 +106,7 @@ fn x8_impl(
94106
}
95107

96108
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
97-
#[target_feature(enable = "avx2")]
109+
#[target_feature(enable = "avx2,fma")]
98110
unsafe fn x8_impl_avx2(
99111
t0: i32, t1: i32, x0: i32, x1: i32, y0: i32, y1: i32, z0: i32, z1: i32,
100112
n_x: i32, n_y: i32, n_z: i32, coef: &[f32; 4], vsq: &[f32],
@@ -158,7 +170,8 @@ pub fn x8(
158170
) {
159171
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
160172
unsafe {
161-
if is_x86_feature_detected!("avx2") {
173+
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")
174+
{
162175
#[rustfmt::skip]
163176
x8_impl_avx2(t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,
164177
coef, vsq, a_even, a_odd)

0 commit comments

Comments
 (0)