Skip to content

Commit 2439a90

Browse files
author
HeroicKatora
authored
Merge pull request #159 from Marwes/refactor_idct
refactor: Extract and clarify the 8x8 idct implementation
2 parents 073e21e + bc4a9b9 commit 2439a90

File tree

1 file changed

+177
-123
lines changed

1 file changed

+177
-123
lines changed

src/idct.rs

Lines changed: 177 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
// One example is tests/crashtest/images/imagetestsuite/b0b8914cc5f7a6eff409f16d8cc236c5.jpg
33
// That's why wrapping operators are needed.
44
use crate::parser::Dimensions;
5-
use std::num::Wrapping;
5+
use std::{
6+
convert::TryFrom,
7+
num::Wrapping,
8+
};
69

710
pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
811
fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 }
@@ -28,7 +31,7 @@ fn test_choose_idct_size() {
2831
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 685, height: 999}), 2);
2932
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1000, height: 1000}), 2);
3033
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 1400, height: 1400}), 4);
31-
34+
3235
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 5472, height: 3648}), 8);
3336
assert_eq!(choose_idct_size(Dimensions{width: 5472, height: 3648}, Dimensions{width: 16384, height: 16384}), 8);
3437
assert_eq!(choose_idct_size(Dimensions{width: 1, height: 1}, Dimensions{width: 65535, height: 65535}), 8);
@@ -45,79 +48,74 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan
4548
}
4649
}
4750

48-
// This is based on stb_image's 'stbi__idct_block'.
49-
fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
51+
pub fn dequantize_and_idct_block_8x8(
52+
coefficients: &[i16],
53+
quantization_table: &[u16; 64],
54+
output_linestride: usize,
55+
output: &mut [u8]
56+
) {
5057
debug_assert_eq!(coefficients.len(), 64);
58+
let output = output
59+
.chunks_mut(output_linestride);
60+
dequantize_and_idct_block_8x8_inner(coefficients, quantization_table, output)
61+
}
62+
63+
// This is based on stb_image's 'stbi__idct_block'.
64+
fn dequantize_and_idct_block_8x8_inner<'a, I>(
65+
coefficients: &[i16],
66+
quantization_table: &[u16; 64],
67+
output: I,
68+
) where
69+
I: IntoIterator<Item = &'a mut [u8]>,
70+
I::IntoIter: ExactSizeIterator<Item = &'a mut [u8]>,
71+
{
72+
let output = output.into_iter();
73+
debug_assert!(
74+
output.len() >= 8,
75+
"Output iterator has the wrong length: {}",
76+
output.len()
77+
);
5178

52-
let mut temp = [Wrapping(0i32); 64];
79+
let mut temp = [Wrapping(0); 64];
5380

5481
// columns
55-
for i in 0 .. 8 {
56-
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
57-
if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 &&
58-
coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 &&
59-
coefficients[i + 56] == 0 {
60-
let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2;
61-
temp[i] = dcterm;
62-
temp[i + 8] = dcterm;
82+
for i in 0..8 {
83+
if coefficients[i + 8] == 0
84+
&& coefficients[i + 16] == 0
85+
&& coefficients[i + 24] == 0
86+
&& coefficients[i + 32] == 0
87+
&& coefficients[i + 40] == 0
88+
&& coefficients[i + 48] == 0
89+
&& coefficients[i + 56] == 0
90+
{
91+
let dcterm = dequantize(coefficients[i], quantization_table[i]) << 2;
92+
temp[i] = dcterm;
93+
temp[i + 8] = dcterm;
6394
temp[i + 16] = dcterm;
6495
temp[i + 24] = dcterm;
6596
temp[i + 32] = dcterm;
6697
temp[i + 40] = dcterm;
6798
temp[i + 48] = dcterm;
6899
temp[i + 56] = dcterm;
69-
}
70-
else {
71-
let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32);
72-
let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32);
73-
let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32);
74-
let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32);
75-
let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32);
76-
let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32);
77-
let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32);
78-
let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32);
79-
80-
let p2 = s2;
81-
let p3 = s6;
82-
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
83-
let t2 = p1 + p3 * stbi_f2f(-1.847759065);
84-
let t3 = p1 + p2 * stbi_f2f(0.765366865);
85-
let p2 = s0;
86-
let p3 = s4;
87-
let t0 = stbi_fsh(p2 + p3);
88-
let t1 = stbi_fsh(p2 - p3);
89-
let x0 = t0 + t3;
90-
let x3 = t0 - t3;
91-
let x1 = t1 + t2;
92-
let x2 = t1 - t2;
93-
let t0 = s7;
94-
let t1 = s5;
95-
let t2 = s3;
96-
let t3 = s1;
97-
let p3 = t0 + t2;
98-
let p4 = t1 + t3;
99-
let p1 = t0 + t3;
100-
let p2 = t1 + t2;
101-
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
102-
let t0 = t0 * stbi_f2f(0.298631336);
103-
let t1 = t1 * stbi_f2f(2.053119869);
104-
let t2 = t2 * stbi_f2f(3.072711026);
105-
let t3 = t3 * stbi_f2f(1.501321110);
106-
let p1 = p5 + (p1 * stbi_f2f(-0.899976223));
107-
let p2 = p5 + (p2 * stbi_f2f(-2.562915447));
108-
let p3 = p3 * stbi_f2f(-1.961570560);
109-
let p4 = p4 * stbi_f2f(-0.390180644);
110-
let t3 = t3 + p1 + p4;
111-
let t2 = t2 + p2 + p3;
112-
let t1 = t1 + p2 + p4;
113-
let t0 = t0 + p1 + p3;
114-
115-
// constants scaled things up by 1<<12; let's bring them back
116-
// down, but keep 2 extra bits of precision
117-
let x0 = x0 + Wrapping(512);
118-
let x1 = x1 + Wrapping(512);
119-
let x2 = x2 + Wrapping(512);
120-
let x3 = x3 + Wrapping(512);
100+
} else {
101+
let s0 = dequantize(coefficients[i], quantization_table[i]);
102+
let s1 = dequantize(coefficients[i + 8], quantization_table[i + 8]);
103+
let s2 = dequantize(coefficients[i + 16], quantization_table[i + 16]);
104+
let s3 = dequantize(coefficients[i + 24], quantization_table[i + 24]);
105+
let s4 = dequantize(coefficients[i + 32], quantization_table[i + 32]);
106+
let s5 = dequantize(coefficients[i + 40], quantization_table[i + 40]);
107+
let s6 = dequantize(coefficients[i + 48], quantization_table[i + 48]);
108+
let s7 = dequantize(coefficients[i + 56], quantization_table[i + 56]);
109+
110+
let Kernel {
111+
xs: [x0, x1, x2, x3],
112+
ts: [t0, t1, t2, t3],
113+
} = kernel(
114+
[s0, s1, s2, s3, s4, s5, s6, s7],
115+
// constants scaled things up by 1<<12; let's bring them back
116+
// down, but keep 2 extra bits of precision
117+
512,
118+
);
121119

122120
temp[i] = (x0 + t3) >> 10;
123121
temp[i + 56] = (x0 - t3) >> 10;
@@ -130,72 +128,128 @@ fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16
130128
}
131129
}
132130

133-
for i in 0 .. 8 {
134-
// no fast case since the first 1D IDCT spread components out
135-
let s0 = temp[i * 8];
136-
let s1 = temp[i * 8 + 1];
137-
let s2 = temp[i * 8 + 2];
138-
let s3 = temp[i * 8 + 3];
139-
let s4 = temp[i * 8 + 4];
140-
let s5 = temp[i * 8 + 5];
141-
let s6 = temp[i * 8 + 6];
142-
let s7 = temp[i * 8 + 7];
143-
144-
let p2 = s2;
145-
let p3 = s6;
146-
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
147-
let t2 = p1 + p3 * stbi_f2f(-1.847759065);
148-
let t3 = p1 + p2 * stbi_f2f(0.765366865);
149-
let p2 = s0;
150-
let p3 = s4;
151-
let t0 = stbi_fsh(p2 + p3);
152-
let t1 = stbi_fsh(p2 - p3);
153-
let x0 = t0 + t3;
154-
let x3 = t0 - t3;
155-
let x1 = t1 + t2;
156-
let x2 = t1 - t2;
157-
let t0 = s7;
158-
let t1 = s5;
159-
let t2 = s3;
160-
let t3 = s1;
161-
let p3 = t0 + t2;
162-
let p4 = t1 + t3;
163-
let p1 = t0 + t3;
164-
let p2 = t1 + t2;
165-
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
166-
let t0 = t0 * stbi_f2f(0.298631336);
167-
let t1 = t1 * stbi_f2f(2.053119869);
168-
let t2 = t2 * stbi_f2f(3.072711026);
169-
let t3 = t3 * stbi_f2f(1.501321110);
170-
let p1 = p5 + p1 * stbi_f2f(-0.899976223);
171-
let p2 = p5 + p2 * stbi_f2f(-2.562915447);
172-
let p3 = p3 * stbi_f2f(-1.961570560);
173-
let p4 = p4 * stbi_f2f(-0.390180644);
174-
let t3 = t3 + p1 + p4;
175-
let t2 = t2 + p2 + p3;
176-
let t1 = t1 + p2 + p4;
177-
let t0 = t0 + p1 + p3;
131+
for (chunk, output_chunk) in temp.chunks_exact(8).zip(output) {
132+
let chunk = <&[_; 8]>::try_from(chunk).unwrap();
178133

179134
// constants scaled things up by 1<<12, plus we had 1<<2 from first
180135
// loop, plus horizontal and vertical each scale by sqrt(8) so together
181136
// we've got an extra 1<<3, so 1<<17 total we need to remove.
182137
// so we want to round that, which means adding 0.5 * 1<<17,
183138
// aka 65536. Also, we'll end up with -128 to 127 that we want
184139
// to encode as 0..255 by adding 128, so we'll add that before the shift
185-
let x0 = x0 + Wrapping(65536 + (128 << 17));
186-
let x1 = x1 + Wrapping(65536 + (128 << 17));
187-
let x2 = x2 + Wrapping(65536 + (128 << 17));
188-
let x3 = x3 + Wrapping(65536 + (128 << 17));
189-
190-
output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17);
191-
output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17);
192-
output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17);
193-
output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17);
194-
output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17);
195-
output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17);
196-
output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17);
197-
output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17);
140+
const X_SCALE: i32 = 65536 + (128 << 17);
141+
142+
// TODO When the minimum rust version supports it
143+
// let [s0, rest @ ..] = chunk;
144+
let (s0, rest) = chunk.split_first().unwrap();
145+
if *rest == [Wrapping(0); 7] {
146+
let dcterm = stbi_clamp((stbi_fsh(*s0) + Wrapping(X_SCALE)) >> 17);
147+
output_chunk[0] = dcterm;
148+
output_chunk[1] = dcterm;
149+
output_chunk[2] = dcterm;
150+
output_chunk[3] = dcterm;
151+
output_chunk[4] = dcterm;
152+
output_chunk[5] = dcterm;
153+
output_chunk[6] = dcterm;
154+
output_chunk[7] = dcterm;
155+
} else {
156+
let Kernel {
157+
xs: [x0, x1, x2, x3],
158+
ts: [t0, t1, t2, t3],
159+
} = kernel(*chunk, X_SCALE);
160+
161+
output_chunk[0] = stbi_clamp((x0 + t3) >> 17);
162+
output_chunk[7] = stbi_clamp((x0 - t3) >> 17);
163+
output_chunk[1] = stbi_clamp((x1 + t2) >> 17);
164+
output_chunk[6] = stbi_clamp((x1 - t2) >> 17);
165+
output_chunk[2] = stbi_clamp((x2 + t1) >> 17);
166+
output_chunk[5] = stbi_clamp((x2 - t1) >> 17);
167+
output_chunk[3] = stbi_clamp((x3 + t0) >> 17);
168+
output_chunk[4] = stbi_clamp((x3 - t0) >> 17);
169+
}
170+
}
171+
}
172+
173+
struct Kernel {
174+
xs: [Wrapping<i32>; 4],
175+
ts: [Wrapping<i32>; 4],
176+
}
177+
178+
#[inline]
179+
fn kernel_x([s0, s2, s4, s6]: [Wrapping<i32>; 4], x_scale: i32) -> [Wrapping<i32>; 4] {
180+
// Even `chunk` indicies
181+
let (t2, t3);
182+
{
183+
let p2 = s2;
184+
let p3 = s6;
185+
186+
let p1 = (p2 + p3) * stbi_f2f(0.5411961);
187+
t2 = p1 + p3 * stbi_f2f(-1.847759065);
188+
t3 = p1 + p2 * stbi_f2f(0.765366865);
189+
}
190+
191+
let (t0, t1);
192+
{
193+
let p2 = s0;
194+
let p3 = s4;
195+
196+
t0 = stbi_fsh(p2 + p3);
197+
t1 = stbi_fsh(p2 - p3);
198198
}
199+
200+
let x0 = t0 + t3;
201+
let x3 = t0 - t3;
202+
let x1 = t1 + t2;
203+
let x2 = t1 - t2;
204+
205+
let x_scale = Wrapping(x_scale);
206+
207+
[x0 + x_scale, x1 + x_scale, x2 + x_scale, x3 + x_scale]
208+
}
209+
210+
#[inline]
211+
fn kernel_t([s1, s3, s5, s7]: [Wrapping<i32>; 4]) -> [Wrapping<i32>; 4] {
212+
// Odd `chunk` indicies
213+
let mut t0 = s7;
214+
let mut t1 = s5;
215+
let mut t2 = s3;
216+
let mut t3 = s1;
217+
218+
let p3 = t0 + t2;
219+
let p4 = t1 + t3;
220+
let p1 = t0 + t3;
221+
let p2 = t1 + t2;
222+
let p5 = (p3 + p4) * stbi_f2f(1.175875602);
223+
224+
t0 *= stbi_f2f(0.298631336);
225+
t1 *= stbi_f2f(2.053119869);
226+
t2 *= stbi_f2f(3.072711026);
227+
t3 *= stbi_f2f(1.501321110);
228+
229+
let p1 = p5 + p1 * stbi_f2f(-0.899976223);
230+
let p2 = p5 + p2 * stbi_f2f(-2.562915447);
231+
let p3 = p3 * stbi_f2f(-1.961570560);
232+
let p4 = p4 * stbi_f2f(-0.390180644);
233+
234+
t3 += p1 + p4;
235+
t2 += p2 + p3;
236+
t1 += p2 + p4;
237+
t0 += p1 + p3;
238+
239+
[t0, t1, t2, t3]
240+
}
241+
242+
#[inline]
243+
fn kernel([s0, s1, s2, s3, s4, s5, s6, s7]: [Wrapping<i32>; 8], x_scale: i32) -> Kernel {
244+
Kernel {
245+
xs: kernel_x([s0, s2, s4, s6], x_scale),
246+
ts: kernel_t([s1, s3, s5, s7]),
247+
}
248+
}
249+
250+
#[inline(always)]
251+
fn dequantize(c: i16, q: u16) -> Wrapping<i32> {
252+
Wrapping(i32::from(c) * i32::from(q))
199253
}
200254

201255
// 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).

0 commit comments

Comments
 (0)