@@ -299,13 +299,106 @@ pub fn sigmoid(x: f32) -> f32 {
     1.0 / (1.0 + (-x).exp())
 }
 
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse")]
+#[target_feature(enable = "fma")]
+unsafe fn dot_sse_fma(xs: &[f32], ys: &[f32]) -> f32 {
+    use core::arch::x86_64::_mm_add_ps;
+    use core::arch::x86_64::_mm_add_ss;
+    use core::arch::x86_64::_mm_cvtss_f32;
+    use core::arch::x86_64::_mm_fmadd_ps;
+    use core::arch::x86_64::_mm_loadu_ps;
+    use core::arch::x86_64::_mm_movehdup_ps;
+    use core::arch::x86_64::_mm_movehl_ps;
+    use core::arch::x86_64::_mm_setzero_ps;
+
+    debug_assert_eq!(xs.len(), ys.len());
+
+    let xc = xs.chunks_exact(2 * 4);
+    let yc = ys.chunks_exact(2 * 4);
+
+    // Handle the remainder (fewer than 8 elements) with a scalar dot product.
+    let sum_all = xc
+        .remainder()
+        .iter()
+        .zip(yc.remainder().iter())
+        .map(|(x, y)| x * y)
+        .sum::<f32>();
+    let mut sum = _mm_setzero_ps();
+
+    // Two 4-wide SSE fused multiply-adds per iteration (8 f32s).
+    for (x, y) in xc.zip(yc) {
+        let xptr = x.as_ptr();
+        let yptr = y.as_ptr();
+
+        let xv = _mm_loadu_ps(xptr);
+        let yv = _mm_loadu_ps(yptr);
+        sum = _mm_fmadd_ps(xv, yv, sum);
+
+        let xv = _mm_loadu_ps(xptr.add(4));
+        let yv = _mm_loadu_ps(yptr.add(4));
+        sum = _mm_fmadd_ps(xv, yv, sum);
+    }
+
+    // Horizontal sum of the four lanes, using the shuffle tricks from
+    // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
+    let mut shuf = _mm_movehdup_ps(sum);
+    let mut sums = _mm_add_ps(sum, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    sum_all + _mm_cvtss_f32(sums)
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx")]
+#[target_feature(enable = "fma")]
+unsafe fn dot_avx_fma(xs: &[f32], ys: &[f32]) -> f32 {
+    use core::arch::x86_64::{
+        _mm256_castps256_ps128, _mm256_extractf128_ps, _mm256_fmadd_ps, _mm256_loadu_ps,
+        _mm256_setzero_ps, _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehdup_ps, _mm_movehl_ps,
+    };
+    debug_assert_eq!(xs.len(), ys.len());
+
+    let xc = xs.chunks_exact(8);
+    let yc = ys.chunks_exact(8);
+
+    // Handle the remainder (fewer than 8 elements) with a scalar dot product.
+    let sum_all = xc
+        .remainder()
+        .iter()
+        .zip(yc.remainder().iter())
+        .map(|(x, y)| x * y)
+        .sum::<f32>();
+    let mut sum = _mm256_setzero_ps();
+
+    // One 8-wide AVX fused multiply-add per iteration.
+    for (x, y) in xc.zip(yc) {
+        let xptr = x.as_ptr();
+        let yptr = y.as_ptr();
+
+        let xv = _mm256_loadu_ps(xptr);
+        let yv = _mm256_loadu_ps(yptr);
+        sum = _mm256_fmadd_ps(xv, yv, sum);
+    }
+
+    // Add the upper and lower 128-bit halves, then horizontally sum the four
+    // remaining lanes, using the shuffle tricks from
+    // https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
+    let mut lo = _mm256_castps256_ps128(sum);
+    let hi = _mm256_extractf128_ps::<1>(sum);
+    lo = _mm_add_ps(lo, hi);
+
+    let mut shuf = _mm_movehdup_ps(lo);
+    let mut sums = _mm_add_ps(lo, shuf);
+    shuf = _mm_movehl_ps(shuf, sums);
+    sums = _mm_add_ss(sums, shuf);
+    sum_all + _mm_cvtss_f32(sums)
+}
+
 /// Compute the dot product.
 ///
 /// `xs` and `ys` must be the same length
 ///
 /// (From ndarray 0.15.6)
 fn unrolled_dot(xs: &[f32], ys: &[f32]) -> f32 {
     debug_assert_eq!(xs.len(), ys.len());
+    // Use the AVX+FMA code path when the running CPU supports both features.
+    #[cfg(target_arch = "x86_64")]
+    {
+        if std::is_x86_feature_detected!("avx") && std::is_x86_feature_detected!("fma") {
+            unsafe {
+                return dot_avx_fma(xs, ys);
+            }
+        }
+    }
     // eightfold unrolled so that floating point can be vectorized
     // (even with strict floating point accuracy semantics)
     let mut p = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
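
For reference, here is a minimal sketch of how the new AVX+FMA path could be sanity-checked against a plain scalar dot product. The test module, test name, input data, and tolerance are illustrative assumptions and are not part of this commit; FMA and the different summation order mean the SIMD result only matches the scalar one approximately.

#[cfg(all(test, target_arch = "x86_64"))]
mod simd_dot_tests {
    use super::dot_avx_fma;

    #[test]
    fn dot_avx_fma_matches_scalar_reference() {
        // Hypothetical test (see note above); skip quietly on CPUs without AVX/FMA.
        if !std::is_x86_feature_detected!("avx") || !std::is_x86_feature_detected!("fma") {
            return;
        }

        // Lengths chosen to exercise both the 8-wide loop and the scalar remainder.
        for &len in &[0usize, 1, 7, 8, 9, 31, 64, 100] {
            let xs: Vec<f32> = (0..len).map(|i| i as f32 * 0.25 - 3.0).collect();
            let ys: Vec<f32> = (0..len).map(|i| 1.0 - i as f32 * 0.125).collect();

            // Plain scalar dot product as the reference value.
            let expected: f32 = xs.iter().zip(ys.iter()).map(|(x, y)| x * y).sum();
            let got = unsafe { dot_avx_fma(&xs, &ys) };

            // Compare with a small relative tolerance rather than exact equality.
            assert!(
                (got - expected).abs() <= 1e-4 * (1.0 + expected.abs()),
                "len {}: {} vs {}",
                len,
                got,
                expected
            );
        }
    }
}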