Merge pull request #159 from Marwes/refactor_idct

HeroicKatora · web-flow · commit 2439a90856b6 · 2020-06-28T00:09:51.000+02:00
refactor: Extract and clarify the 8x8 idct implementation
diff --git a/src/idct.rs b/src/idct.rs
@@ -2,7 +2,10 @@
 // One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
 // That's why wrapping operators are needed.
 use crate::parser::Dimensions;
-use std::num::Wrapping;
+use std::{
+    convert::TryFrom,
+    num::Wrapping,
+};
 
 pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
     fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 }
@@ -28,7 +31,7 @@ fn test_choose_idct_size() {
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 685, height: 999}), 2);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1000, height: 1000}), 2);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1400, height: 1400}), 4);
-    
+
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 5472, height: 3648}), 8);
     assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 16384, height: 16384}), 8);
     assert_eq!(choose_idct_size(Dimensions{width: 1, height: 1}, Dimensions{width: 65535, height: 65535}), 8);
@@ -45,79 +48,74 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan
     }
 }
 
-// This is based on stb_image's 'stbi__idct_block'.
-fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
+pub fn dequantize_and_idct_block_8x8(
+    coefficients: &[i16],
+    quantization_table: &[u16; 64],
+    output_linestride: usize,
+    output: &mut [u8]
+) {
     debug_assert_eq!(coefficients.len(), 64);
+    let output = output
+        .chunks_mut(output_linestride);
+    dequantize_and_idct_block_8x8_inner(coefficients, quantization_table, output)
+}
+
+// This is based on stb_image's 'stbi__idct_block'.
+fn dequantize_and_idct_block_8x8_inner<'a, I>(
+    coefficients: &[i16],
+    quantization_table: &[u16; 64],
+    output: I,
+) where
+    I: IntoIterator<Item = &'a mut [u8]>,
+    I::IntoIter: ExactSizeIterator<Item = &'a mut [u8]>,
+{
+    let output = output.into_iter();
+    debug_assert!(
+        output.len() >= 8,
+        "Output iterator has the wrong length: {}",
+        output.len()
+    );
 
-    let mut temp = [Wrapping(0i32); 64];
+    let mut temp = [Wrapping(0); 64];
 
     // columns
-    for i in 0 .. 8 {
-        // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 &&
-                coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 &&
-                coefficients[i + 56] == 0 {
-            let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2;
-            temp[i]      = dcterm;
-            temp[i + 8]  = dcterm;
+    for i in 0..8 {
+        if coefficients[i + 8] == 0
+            && coefficients[i + 16] == 0
+            && coefficients[i + 24] == 0
+            && coefficients[i + 32] == 0
+            && coefficients[i + 40] == 0
+            && coefficients[i + 48] == 0
+            && coefficients[i + 56] == 0
+        {
+            let dcterm = dequantize(coefficients[i], quantization_table[i]) << 2;
+            temp[i] = dcterm;
+            temp[i + 8] = dcterm;
             temp[i + 16] = dcterm;
             temp[i + 24] = dcterm;
             temp[i + 32] = dcterm;
             temp[i + 40] = dcterm;
             temp[i + 48] = dcterm;
             temp[i + 56] = dcterm;
-        }
-        else {
-            let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32);
-            let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32);
-            let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32);
-            let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32);
-            let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32);
-            let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32);
-            let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32);
-            let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32);
-
-            let p2 = s2;
-            let p3 = s6;
-            let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-            let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-            let t3 = p1 + p2 * stbi_f2f(0.765366865);
-            let p2 = s0;
-            let p3 = s4;
-            let t0 = stbi_fsh(p2 + p3);
-            let t1 = stbi_fsh(p2 - p3);
-            let x0 = t0 + t3;
-            let x3 = t0 - t3;
-            let x1 = t1 + t2;
-            let x2 = t1 - t2;
-            let t0 = s7;
-            let t1 = s5;
-            let t2 = s3;
-            let t3 = s1;
-            let p3 = t0 + t2;
-            let p4 = t1 + t3;
-            let p1 = t0 + t3;
-            let p2 = t1 + t2;
-            let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-            let t0 = t0 * stbi_f2f(0.298631336);
-            let t1 = t1 * stbi_f2f(2.053119869);
-            let t2 = t2 * stbi_f2f(3.072711026);
-            let t3 = t3 * stbi_f2f(1.501321110);
-            let p1 = p5 + (p1 * stbi_f2f(-0.899976223));
-            let p2 = p5 + (p2 * stbi_f2f(-2.562915447));
-            let p3 = p3 * stbi_f2f(-1.961570560);
-            let p4 = p4 * stbi_f2f(-0.390180644);
-            let t3 = t3 + p1 + p4;
-            let t2 = t2 + p2 + p3;
-            let t1 = t1 + p2 + p4;
-            let t0 = t0 + p1 + p3;
-
-            // constants scaled things up by 1<<12; let's bring them back
-            // down, but keep 2 extra bits of precision
-            let x0 = x0 + Wrapping(512);
-            let x1 = x1 + Wrapping(512);
-            let x2 = x2 + Wrapping(512);
-            let x3 = x3 + Wrapping(512);
+        } else {
+            let s0 = dequantize(coefficients[i], quantization_table[i]);
+            let s1 = dequantize(coefficients[i + 8], quantization_table[i + 8]);
+            let s2 = dequantize(coefficients[i + 16], quantization_table[i + 16]);
+            let s3 = dequantize(coefficients[i + 24], quantization_table[i + 24]);
+            let s4 = dequantize(coefficients[i + 32], quantization_table[i + 32]);
+            let s5 = dequantize(coefficients[i + 40], quantization_table[i + 40]);
+            let s6 = dequantize(coefficients[i + 48], quantization_table[i + 48]);
+            let s7 = dequantize(coefficients[i + 56], quantization_table[i + 56]);
+
+            let Kernel {
+                xs: [x0, x1, x2, x3],
+                ts: [t0, t1, t2, t3],
+            } = kernel(
+                [s0, s1, s2, s3, s4, s5, s6, s7],
+                // constants scaled things up by 1<<12; let's bring them back
+                // down, but keep 2 extra bits of precision
+                512,
+            );
 
             temp[i] = (x0 + t3) >> 10;
             temp[i + 56] = (x0 - t3) >> 10;
@@ -130,72 +128,128 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
         }
     }
 
-    for i in 0 .. 8 {
-        // no fast case since the first 1D IDCT spread components out
-        let s0 = temp[i * 8];
-        let s1 = temp[i * 8 + 1];
-        let s2 = temp[i * 8 + 2];
-        let s3 = temp[i * 8 + 3];
-        let s4 = temp[i * 8 + 4];
-        let s5 = temp[i * 8 + 5];
-        let s6 = temp[i * 8 + 6];
-        let s7 = temp[i * 8 + 7];
-
-        let p2 = s2;
-        let p3 = s6;
-        let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-        let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-        let t3 = p1 + p2 * stbi_f2f(0.765366865);
-        let p2 = s0;
-        let p3 = s4;
-        let t0 = stbi_fsh(p2 + p3);
-        let t1 = stbi_fsh(p2 - p3);
-        let x0 = t0 + t3;
-        let x3 = t0 - t3;
-        let x1 = t1 + t2;
-        let x2 = t1 - t2;
-        let t0 = s7;
-        let t1 = s5;
-        let t2 = s3;
-        let t3 = s1;
-        let p3 = t0 + t2;
-        let p4 = t1 + t3;
-        let p1 = t0 + t3;
-        let p2 = t1 + t2;
-        let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-        let t0 = t0 * stbi_f2f(0.298631336);
-        let t1 = t1 * stbi_f2f(2.053119869);
-        let t2 = t2 * stbi_f2f(3.072711026);
-        let t3 = t3 * stbi_f2f(1.501321110);
-        let p1 = p5 + p1 * stbi_f2f(-0.899976223);
-        let p2 = p5 + p2 * stbi_f2f(-2.562915447);
-        let p3 = p3 * stbi_f2f(-1.961570560);
-        let p4 = p4 * stbi_f2f(-0.390180644);
-        let t3 = t3 + p1 + p4;
-        let t2 = t2 + p2 + p3;
-        let t1 = t1 + p2 + p4;
-        let t0 = t0 + p1 + p3;
+    for (chunk, output_chunk) in temp.chunks_exact(8).zip(output) {
+        let chunk = <&[_; 8]>::try_from(chunk).unwrap();
 
         // constants scaled things up by 1<<12, plus we had 1<<2 from first
         // loop, plus horizontal and vertical each scale by sqrt(8) so together
         // we've got an extra 1<<3, so 1<<17 total we need to remove.
         // so we want to round that, which means adding 0.5 * 1<<17,
         // aka 65536. Also, we'll end up with -128 to 127 that we want
         // to encode as 0..255 by adding 128, so we'll add that before the shift
-        let x0 = x0 + Wrapping(65536 + (128 << 17));
-        let x1 = x1 + Wrapping(65536 + (128 << 17));
-        let x2 = x2 + Wrapping(65536 + (128 << 17));
-        let x3 = x3 + Wrapping(65536 + (128 << 17));
-
-        output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17);
-        output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17);
-        output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17);
-        output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17);
-        output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17);
-        output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17);
-        output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17);
-        output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17);
+        const X_SCALE: i32 = 65536 + (128 << 17);
+
+        // TODO When the minimum rust version supports it
+        // let [s0, rest @ ..] = chunk;
+        let (s0, rest) = chunk.split_first().unwrap();
+        if *rest == [Wrapping(0); 7] {
+            let dcterm = stbi_clamp((stbi_fsh(*s0) + Wrapping(X_SCALE)) >> 17);
+            output_chunk[0] = dcterm;
+            output_chunk[1] = dcterm;
+            output_chunk[2] = dcterm;
+            output_chunk[3] = dcterm;
+            output_chunk[4] = dcterm;
+            output_chunk[5] = dcterm;
+            output_chunk[6] = dcterm;
+            output_chunk[7] = dcterm;
+        } else {
+            let Kernel {
+                xs: [x0, x1, x2, x3],
+                ts: [t0, t1, t2, t3],
+            } = kernel(*chunk, X_SCALE);
+
+            output_chunk[0] = stbi_clamp((x0 + t3) >> 17);
+            output_chunk[7] = stbi_clamp((x0 - t3) >> 17);
+            output_chunk[1] = stbi_clamp((x1 + t2) >> 17);
+            output_chunk[6] = stbi_clamp((x1 - t2) >> 17);
+            output_chunk[2] = stbi_clamp((x2 + t1) >> 17);
+            output_chunk[5] = stbi_clamp((x2 - t1) >> 17);
+            output_chunk[3] = stbi_clamp((x3 + t0) >> 17);
+            output_chunk[4] = stbi_clamp((x3 - t0) >> 17);
+        }
+    }
+}
+
+struct Kernel {
+    xs: [Wrapping<i32>; 4],
+    ts: [Wrapping<i32>; 4],
+}
+
+#[inline]
+fn kernel_x([s0, s2, s4, s6]: [Wrapping<i32>; 4], x_scale: i32) -> [Wrapping<i32>; 4] {
+    // Even `chunk` indices
+    let (t2, t3);
+    {
+        let p2 = s2;
+        let p3 = s6;
+
+        let p1 = (p2 + p3) * stbi_f2f(0.5411961);
+        t2 = p1 + p3 * stbi_f2f(-1.847759065);
+        t3 = p1 + p2 * stbi_f2f(0.765366865);
+    }
+
+    let (t0, t1);
+    {
+        let p2 = s0;
+        let p3 = s4;
+
+        t0 = stbi_fsh(p2 + p3);
+        t1 = stbi_fsh(p2 - p3);
     }
+
+    let x0 = t0 + t3;
+    let x3 = t0 - t3;
+    let x1 = t1 + t2;
+    let x2 = t1 - t2;
+
+    let x_scale = Wrapping(x_scale);
+
+    [x0 + x_scale, x1 + x_scale, x2 + x_scale, x3 + x_scale]
+}
+
+#[inline]
+fn kernel_t([s1, s3, s5, s7]: [Wrapping<i32>; 4]) -> [Wrapping<i32>; 4] {
+    // Odd `chunk` indices
+    let mut t0 = s7;
+    let mut t1 = s5;
+    let mut t2 = s3;
+    let mut t3 = s1;
+
+    let p3 = t0 + t2;
+    let p4 = t1 + t3;
+    let p1 = t0 + t3;
+    let p2 = t1 + t2;
+    let p5 = (p3 + p4) * stbi_f2f(1.175875602);
+
+    t0 *= stbi_f2f(0.298631336);
+    t1 *= stbi_f2f(2.053119869);
+    t2 *= stbi_f2f(3.072711026);
+    t3 *= stbi_f2f(1.501321110);
+
+    let p1 = p5 + p1 * stbi_f2f(-0.899976223);
+    let p2 = p5 + p2 * stbi_f2f(-2.562915447);
+    let p3 = p3 * stbi_f2f(-1.961570560);
+    let p4 = p4 * stbi_f2f(-0.390180644);
+
+    t3 += p1 + p4;
+    t2 += p2 + p3;
+    t1 += p2 + p4;
+    t0 += p1 + p3;
+
+    [t0, t1, t2, t3]
+}
+
+#[inline]
+fn kernel([s0, s1, s2, s3, s4, s5, s6, s7]: [Wrapping<i32>; 8], x_scale: i32) -> Kernel {
+    Kernel {
+        xs: kernel_x([s0, s2, s4, s6], x_scale),
+        ts: kernel_t([s1, s3, s5, s7]),
+    }
+}
+
+#[inline(always)]
+fn dequantize(c: i16, q: u16) -> Wrapping<i32> {
+    Wrapping(i32::from(c) * i32::from(q))
 }
 
 // 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).