polyval: implement Karatsuba multiplication for arm64 (#181)

ericlagergren · web-flow · commit 973fe29a2666 · 2023-06-23T12:51:16.000-06:00
Improves performance by ~200 MB/s on a 2020 M1.

Signed-off-by: Eric Lagergren <eric@ericlagergren.com>
diff --git a/polyval/src/backend/pmull.rs b/polyval/src/backend/pmull.rs
@@ -67,10 +67,6 @@ impl Reset for Polyval {
 }
 
 impl Polyval {
-    /// Mask value used when performing reduction.
-    /// This corresponds to POLYVAL's polynomial with the highest bit unset.
-    const MASK: u128 = 1 << 127 | 1 << 126 | 1 << 121 | 1;
-
     /// Get POLYVAL output.
     pub(crate) fn finalize(self) -> Tag {
         unsafe { mem::transmute(self.y) }
@@ -81,42 +77,109 @@ impl Polyval {
     #[inline]
     #[target_feature(enable = "neon")]
     unsafe fn mul(&mut self, x: &Block) {
-        let h = self.h;
         let y = veorq_u8(self.y, vld1q_u8(x.as_ptr()));
-
-        // polynomial multiply
-        let z = vdupq_n_u8(0);
-        let r0 = pmull::<0, 0>(h, y);
-        let r1 = pmull::<1, 1>(h, y);
-        let t0 = pmull::<0, 1>(h, y);
-        let t1 = pmull::<1, 0>(h, y);
-        let t0 = veorq_u8(t0, t1);
-        let t1 = vextq_u8(z, t0, 8);
-        let r0 = veorq_u8(r0, t1);
-        let t1 = vextq_u8(t0, z, 8);
-        let r1 = veorq_u8(r1, t1);
-
-        // polynomial reduction
-        let p = mem::transmute(Self::MASK);
-        let t0 = pmull::<0, 1>(r0, p);
-        let t1 = vextq_u8(t0, t0, 8);
-        let r0 = veorq_u8(r0, t1);
-        let t1 = pmull::<1, 1>(r0, p);
-        let r0 = veorq_u8(r0, t1);
-
-        self.y = veorq_u8(r0, r1);
+        let (h, m, l) = karatsuba1(self.h, y);
+        let (h, l) = karatsuba2(h, m, l);
+        self.y = mont_reduce(h, l);
     }
 }
 
-/// Wrapper for the ARM64 `PMULL` instruction.
-#[inline(always)]
-unsafe fn pmull<const A_LANE: i32, const B_LANE: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+/// Karatsuba decomposition for `x*y`.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn karatsuba1(x: uint8x16_t, y: uint8x16_t) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
+    // First Karatsuba step: decompose x and y.
+    //
+    // (x1*y0 + x0*y1) = (x1+x0) * (y1+y0) + (x1*y1) + (x0*y0)
+    //        M                                 H         L
+    //
+    // m = (x.hi^x.lo) * (y.hi^y.lo)
+    let m = pmull(
+        veorq_u8(x, vextq_u8(x, x, 8)), // x.hi^x.lo
+        veorq_u8(y, vextq_u8(y, y, 8)), // y.hi^y.lo
+    );
+    let h = pmull2(x, y); // h = x.hi * y.hi
+    let l = pmull(x, y); // l = x.lo * y.lo
+    (h, m, l)
+}
+
+/// Karatsuba combine.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn karatsuba2(h: uint8x16_t, m: uint8x16_t, l: uint8x16_t) -> (uint8x16_t, uint8x16_t) {
+    // Second Karatsuba step: combine into a 2n-bit product.
+    //
+    // m0 ^= l0 ^ h0 // = m0^(l0^h0)
+    // m1 ^= l1 ^ h1 // = m1^(l1^h1)
+    // l1 ^= m0      // = l1^(m0^l0^h0)
+    // h0 ^= m1      // = h0^(m1^l1^h1)
+    // h1            // unchanged
+    let t = {
+        //   {m0, m1} ^ {l1, h0}
+        // = {m0^l1, m1^h0}
+        let t0 = veorq_u8(m, vextq_u8(l, h, 8));
+
+        //   {h0, h1} ^ {l0, l1}
+        // = {h0^l0, h1^l1}
+        let t1 = veorq_u8(h, l);
+
+        //   {m0^l1, m1^h0} ^ {h0^l0, h1^l1}
+        // = {m0^l1^h0^l0, m1^h0^h1^l1}
+        veorq_u8(t0, t1)
+    };
+
+    // {l0, m0^l1^h0^l0}
+    let x01 = vextq_u8(
+        vextq_u8(l, l, 8), // {l1, l0}
+        t,
+        8,
+    );
+
+    // {m1^h0^h1^l1, h1}
+    let x23 = vextq_u8(
+        t,
+        vextq_u8(h, h, 8), // {h1, h0}
+        8,
+    );
+
+    (x23, x01)
+}
+
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn mont_reduce(x23: uint8x16_t, x01: uint8x16_t) -> uint8x16_t {
+    // Perform the Montgomery reduction over the 256-bit X.
+    //    [A1:A0] = X0 • poly
+    //    [B1:B0] = [X0 ⊕ A1 : X1 ⊕ A0]
+    //    [C1:C0] = B0 • poly
+    //    [D1:D0] = [B0 ⊕ C1 : B1 ⊕ C0]
+    // Output: [D1 ⊕ X3 : D0 ⊕ X2]
+    let poly = vreinterpretq_u8_p128(1 << 127 | 1 << 126 | 1 << 121 | 1 << 63 | 1 << 62 | 1 << 57);
+    let a = pmull(x01, poly);
+    let b = veorq_u8(x01, vextq_u8(a, a, 8));
+    let c = pmull2(b, poly);
+    veorq_u8(x23, veorq_u8(c, b))
+}
+
+/// Carry-less (polynomial) multiplies the low 64 bits of `a` and `b`.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn pmull(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
     mem::transmute(vmull_p64(
-        vgetq_lane_u64(vreinterpretq_u64_u8(a), A_LANE),
-        vgetq_lane_u64(vreinterpretq_u64_u8(b), B_LANE),
+        vgetq_lane_u64(vreinterpretq_u64_u8(a), 0),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b), 0),
     ))
 }
 
+/// Carry-less (polynomial) multiplies the high 64 bits of `a` and `b`.
+#[inline]
+#[target_feature(enable = "neon")]
+unsafe fn pmull2(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    mem::transmute(vmull_p64(
+        vgetq_lane_u64(vreinterpretq_u64_u8(a), 1),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b), 1),
+    ))
+}
 // TODO(tarcieri): zeroize support
 // #[cfg(feature = "zeroize")]
 // impl Drop for Polyval {