@@ -27,26 +27,38 @@ pub(crate) fn step_x8(
27
27
)
28
28
} ;
29
29
}
30
+
30
31
let cur_0 = a_cur ! ( 0 , 0 , 0 ) ;
31
32
let mut div: f32x8 = * coef. get_unchecked ( 0 ) * cur_0;
32
33
33
34
for i in 1 ..4 {
34
- let i: i32 = i;
35
- div += * coef. get_unchecked ( i as usize )
36
- * ( a_cur ! ( i, 0 , 0 )
35
+ let coef = f32x8:: splat ( * coef. get_unchecked ( i) ) ;
36
+
37
+ let sum = {
38
+ let i = i as i32 ;
39
+ ( a_cur ! ( i, 0 , 0 )
37
40
+ a_cur ! ( -i, 0 , 0 )
38
41
+ a_cur ! ( 0 , i, 0 )
39
42
+ a_cur ! ( 0 , -i, 0 )
40
43
+ a_cur ! ( 0 , 0 , i)
41
- + a_cur ! ( 0 , 0 , -i) ) ;
44
+ + a_cur ! ( 0 , 0 , -i) )
45
+ } ;
46
+
47
+ div = coef. mul_adde ( sum, div) ;
42
48
}
43
49
44
- let r =
45
- 2. * cur_0 - f32x8:: from_slice_unaligned_unchecked (
46
- & a_out. get_unchecked ( out_idx as usize ..) ,
47
- ) + f32x8:: from_slice_unaligned_unchecked (
48
- & vsq. get_unchecked ( index as usize ..) ,
49
- ) * div;
50
+ let vsq = f32x8:: from_slice_unaligned_unchecked (
51
+ vsq. get_unchecked ( index as usize ..) ,
52
+ ) ;
53
+
54
+ let sum = cur_0. mul_adde (
55
+ f32x8:: splat ( 2. ) ,
56
+ -f32x8:: from_slice_unaligned_unchecked (
57
+ a_out. get_unchecked ( out_idx as usize ..) ,
58
+ ) ,
59
+ ) ;
60
+
61
+ let r = vsq. mul_adde ( div, sum) ;
50
62
r. write_to_slice_unaligned_unchecked (
51
63
& mut a_out. get_unchecked_mut ( out_idx as usize ..) ,
52
64
) ;
@@ -94,7 +106,7 @@ fn x8_impl(
94
106
}
95
107
96
108
#[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
97
- #[ target_feature( enable = "avx2" ) ]
109
+ #[ target_feature( enable = "avx2,fma" ) ]
98
110
unsafe fn x8_impl_avx2 (
99
111
t0 : i32 , t1 : i32 , x0 : i32 , x1 : i32 , y0 : i32 , y1 : i32 , z0 : i32 , z1 : i32 ,
100
112
n_x : i32 , n_y : i32 , n_z : i32 , coef : & [ f32 ; 4 ] , vsq : & [ f32 ] ,
@@ -158,7 +170,8 @@ pub fn x8(
158
170
) {
159
171
#[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
160
172
unsafe {
161
- if is_x86_feature_detected ! ( "avx2" ) {
173
+ if is_x86_feature_detected ! ( "avx2" ) && is_x86_feature_detected ! ( "fma" )
174
+ {
162
175
#[ rustfmt:: skip]
163
176
x8_impl_avx2 ( t0, t1, x0, x1, y0, y1, z0, z1, n_x, n_y, n_z,
164
177
coef, vsq, a_even, a_odd)
0 commit comments