Skip to content

Commit

Permalink
Merge pull request #16 from awxkee/dev
Browse files Browse the repository at this point in the history
AVX StackBlur column pass
  • Loading branch information
awxkee authored Feb 5, 2025
2 parents 9530dcb + 458bb73 commit 7d19b67
Show file tree
Hide file tree
Showing 27 changed files with 1,385 additions and 256 deletions.
19 changes: 15 additions & 4 deletions .github/workflows/build_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,7 @@ jobs:

fuzz_stack_blur:
name: Fuzzing Stack Blur
strategy:
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
Expand All @@ -102,6 +99,20 @@ jobs:
- run: cargo fuzz run stack_blur_u16 -- -max_total_time=45
- run: cargo fuzz run stack_blur_f32 -- -max_total_time=45

fuzz_stack_blur_x86:
name: Fuzzing Stack Blur
strategy:
matrix:
feature: [ sse, avx ]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
- run: cargo install cargo-fuzz
- run: cargo fuzz run stack_blur --features ${{ matrix.feature }} -- -max_total_time=45
- run: cargo fuzz run stack_blur_u16 --features ${{ matrix.feature }} -- -max_total_time=45
- run: cargo fuzz run stack_blur_f32 --features ${{ matrix.feature }} -- -max_total_time=45

fuzz_fast_gaussian_blur:
name: Fuzzing Fast Gaussian Blur
strategy:
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ path = "src/main.rs"
colorutils-rs = "0.7.0"
half = "2.4.1"
image = "0.25.5"
libblur = {path = "src/lib", features = ["image", "fft", "sse"], default-features = false}
libblur = {path = "src/lib", features = ["image", "fft", "sse", "avx"], default-features = false}
rayon = "1.10.0"

[dev-dependencies]
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,14 @@ Example comparison time for blurring image 3000x4000 RGB 8-bit in multithreaded

| | time(NEON) | time(SSE) |
|---------|:----------:|:---------:|
| libblur | 7.71ms | 7.27ms |
| libblur | 5.77ms | 5.65ms |
| OpenCV | 8.43ms | 10.36ms |

Example comparison time for blurring image 2828x4242 RGBA 8-bit in multithreaded mode with 77 radius.

| | time(NEON) | time(SSE) |
|---------|:----------:|:---------:|
| libblur | 7.41ms | 8.20ms |
| libblur | 5.58ms | 5.75ms |
| OpenCV | 8.00ms | 8.55ms |

### Fast gaussian
Expand Down Expand Up @@ -228,14 +228,14 @@ Example comparison time for blurring image 3000x4000 RGB 8-bit in multithreaded

| | time(NEON) | time(SSE) |
|---------|:----------:|:---------:|
| libblur | 10.13ms | 12.57ms |
| libblur | 10.13ms | 11.57ms |
| OpenCV | 16.81ms | 47.62ms |

Example comparison time for blurring image 2828x4242 RGBA 8-bit in multithreaded mode with 77 radius.

| | Time(NEON) | Time(SSE) |
|---------|:----------:|:---------:|
| libblur | 10.68ms | 10.18ms |
| libblur | 10.68ms | 9.41ms |
| OpenCV | 16.33ms | 31.29ms |

### Fast bilateral blur
Expand Down
6 changes: 5 additions & 1 deletion benches/gauss/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use libblur::{
filter_1d_rgb_exact, get_gaussian_kernel_1d, get_sigma_size, EdgeMode, FastBlurChannels,
GaussianPreciseLevel, ImageSize, Scalar, ThreadingPolicy,
};
use opencv::core::{find_file, split, Mat, Size, Vector, BORDER_DEFAULT};
use opencv::core::{find_file, split, AlgorithmHint, Mat, Size, Vector, BORDER_DEFAULT};
use opencv::imgcodecs::{imread, IMREAD_COLOR};

pub(crate) fn split_channels_3<T: Copy>(
Expand Down Expand Up @@ -95,6 +95,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
0.,
0.,
BORDER_DEFAULT,
AlgorithmHint::ALGO_HINT_ACCURATE,
)
.unwrap();
})
Expand Down Expand Up @@ -146,6 +147,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
(77f64 * 2f64 + 1f64) / 6f64,
(77f64 * 2f64 + 1f64) / 6f64,
BORDER_DEFAULT,
AlgorithmHint::ALGO_HINT_ACCURATE,
)
.unwrap();
})
Expand Down Expand Up @@ -266,6 +268,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
(77f64 * 2f64 + 1f64) / 6f64,
(77f64 * 2f64 + 1f64) / 6f64,
BORDER_DEFAULT,
AlgorithmHint::ALGO_HINT_ACCURATE,
)
.unwrap();
})
Expand Down Expand Up @@ -348,6 +351,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
(77f64 * 2f64 + 1f64) / 6f64,
(77f64 * 2f64 + 1f64) / 6f64,
BORDER_DEFAULT,
AlgorithmHint::ALGO_HINT_ACCURATE,
)
.unwrap();
})
Expand Down
2 changes: 1 addition & 1 deletion fuzz/stack_blur_u16/stack_blur_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

#![no_main]

use libblur::{stack_blur, stack_blur_u16, FastBlurChannels, ThreadingPolicy};
use libblur::{stack_blur_u16, FastBlurChannels, ThreadingPolicy};
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: (u8, u8, u8)| {
Expand Down
37 changes: 19 additions & 18 deletions src/lib/filter1d/avx/filter_rgb_row_symm_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
let y = filter_region.start;
let local_src = src;

let mut _cx = 0usize;
let mut cx = 0usize;

while _cx + 32 < width {
while cx + 32 < width {
let coeff = _mm256_set1_epi16(scanned_kernel.get_unchecked(half_len).weight as i16);

let shifted_src = local_src.get_unchecked((_cx * N)..);
let shifted_src = local_src.get_unchecked((cx * N)..);

let source =
_mm256_load_deinterleave_rgb(shifted_src.get_unchecked((half_len * N)..).as_ptr());
Expand All @@ -118,7 +118,7 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
k2 = _mm256_mul_add_symm_epi8_by_epi16_x4(k2, v_source0.2, v_source1.2, coeff);
}

let dst_offset = y * dst_stride + _cx * N;
let dst_offset = y * dst_stride + cx * N;
let dst_ptr0 = (dst.slice.as_ptr() as *mut u8).add(dst_offset);
_mm256_store_interleave_rgb(
dst_ptr0,
Expand All @@ -128,13 +128,13 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
_mm256_pack_epi32_x4_epi8(k2),
),
);
_cx += 32;
cx += 32;
}

while _cx + 16 < width {
while cx + 16 < width {
let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(half_len).weight as i16);

let shifted_src = local_src.get_unchecked((_cx * N)..);
let shifted_src = local_src.get_unchecked((cx * N)..);

let source =
_mm_load_deinterleave_rgb(shifted_src.get_unchecked((half_len * N)..).as_ptr());
Expand All @@ -154,7 +154,7 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
k2 = _mm_mul_add_symm_epi8_by_epi16_x4(k2, v_source0.2, v_source1.2, coeff);
}

let dst_offset = y * dst_stride + _cx * N;
let dst_offset = y * dst_stride + cx * N;
let dst_ptr0 = (dst.slice.as_ptr() as *mut u8).add(dst_offset);
_mm_store_interleave_rgb(
dst_ptr0,
Expand All @@ -164,16 +164,17 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
_mm_pack_epi32_x2_epi8(k2),
),
);
_cx += 16;
cx += 16;
}

while _cx + 8 < width {
while cx + 8 < width {
let coeff = _mm_set1_epi16(scanned_kernel.get_unchecked(half_len).weight as i16);

let shifted_src = local_src.get_unchecked((_cx * N)..);
let shifted_src = local_src.get_unchecked((cx * N)..);

let source =
_mm_load_deinterleave_rgb_half(shifted_src.get_unchecked((half_len * N)..).as_ptr());

let mut k0 = _mm_mul_epi8_by_epi16_x2(source.0, coeff);
let mut k1 = _mm_mul_epi8_by_epi16_x2(source.1, coeff);
let mut k2 = _mm_mul_epi8_by_epi16_x2(source.2, coeff);
Expand All @@ -191,7 +192,7 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
k2 = _mm_mul_add_symm_epi8_by_epi16_x2(k2, v_source0.2, v_source1.2, coeff);
}

let dst_offset = y * dst_stride + _cx * N;
let dst_offset = y * dst_stride + cx * N;
let dst_ptr0 = (dst.slice.as_ptr() as *mut u8).add(dst_offset);
_mm_store_interleave_rgb_half(
dst_ptr0,
Expand All @@ -201,13 +202,13 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
_mm_pack_epi32_epi8(k2),
),
);
_cx += 8;
cx += 8;
}

while _cx + 4 < width {
while cx + 4 < width {
let coeff = *scanned_kernel.get_unchecked(half_len);

let shifted_src = local_src.get_unchecked((_cx * N)..);
let shifted_src = local_src.get_unchecked((cx * N)..);

let mut k0 = ColorGroup::<N, i32>::from_slice(shifted_src, half_len * N).mul(coeff.weight);
let mut k1 =
Expand Down Expand Up @@ -247,16 +248,16 @@ unsafe fn filter_rgb_row_avx_symm_u8_i32_approx_impl(
.add(k3);
}

let dst_offset = y * dst_stride + _cx * N;
let dst_offset = y * dst_stride + cx * N;

k0.to_approx_store(dst, dst_offset);
k1.to_approx_store(dst, dst_offset + N);
k2.to_approx_store(dst, dst_offset + N * 2);
k3.to_approx_store(dst, dst_offset + N * 3);
_cx += 4;
cx += 4;
}

for x in _cx..width {
for x in cx..width {
let coeff = *scanned_kernel.get_unchecked(half_len);
let shifted_src = local_src.get_unchecked((x * N)..);
let mut k0 = ColorGroup::<N, i32>::from_slice(shifted_src, half_len * N).mul(coeff.weight);
Expand Down
13 changes: 9 additions & 4 deletions src/lib/filter1d/avx/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,11 @@ pub(crate) unsafe fn _mm256_mul_add_epi8_by_epi16_x4(
let lo_16 = _mm256_slli_epi16::<6>(j0);
let hi_16 = _mm256_slli_epi16::<6>(j1);

let vj0 = _mm256_mulhrs_epi16(lo_16, weight);
let vj1 = _mm256_mulhrs_epi16(hi_16, weight);
(
_mm256_add_epi16(accumulator.0, _mm256_mulhrs_epi16(lo_16, weight)),
_mm256_add_epi16(accumulator.1, _mm256_mulhrs_epi16(hi_16, weight)),
_mm256_add_epi16(accumulator.0, vj0),
_mm256_add_epi16(accumulator.1, vj1),
)
}

Expand All @@ -107,9 +109,12 @@ pub(crate) unsafe fn _mm256_mul_add_symm_epi8_by_epi16_x4(
let lo_16 = _mm256_slli_epi16::<6>(_mm256_add_epi16(a0, a1));
let hi_16 = _mm256_slli_epi16::<6>(_mm256_add_epi16(a2, a3));

let vj0 = _mm256_mulhrs_epi16(lo_16, weight);
let vj1 = _mm256_mulhrs_epi16(hi_16, weight);

(
_mm256_add_epi16(accumulator.0, _mm256_mulhrs_epi16(lo_16, weight)),
_mm256_add_epi16(accumulator.1, _mm256_mulhrs_epi16(hi_16, weight)),
_mm256_add_epi16(accumulator.0, vj0),
_mm256_add_epi16(accumulator.1, vj1),
)
}

Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_column_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{
filter_column_avx_f32_f32, filter_column_avx_symm_u8_f32, filter_column_avx_u8_f32,
};
Expand Down Expand Up @@ -88,6 +88,7 @@ impl Filter1DColumnHandler<u8, f32> for u8 {
fn get_column_handler(
is_symmetric_kernel: bool,
) -> fn(Arena, &[&[u8]], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_symmetric_kernel {
return filter_column_avx_symm_u8_f32;
Expand Down Expand Up @@ -178,6 +179,7 @@ impl Filter1DColumnHandler<f32, f32> for f32 {
fn get_column_handler(
is_symmetric_kernel: bool,
) -> fn(Arena, &[&[f32]], &UnsafeSlice<f32>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
return filter_column_avx_f32_f32;
}
Expand Down
3 changes: 2 additions & 1 deletion src/lib/filter1d/filter_1d_column_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{filter_column_avx_symm_u8_i32_app, filter_column_avx_u8_i32_app};
use crate::filter1d::filter_column_approx::filter_column_approx;
use crate::filter1d::filter_column_approx_symmetric::filter_column_symmetric_approx;
Expand Down Expand Up @@ -117,6 +117,7 @@ impl Filter1DColumnHandlerApprox<u8, i32> for u8 {
fn get_column_handler(
is_kernel_symmetric: bool,
) -> fn(Arena, &[&[u8]], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_kernel_symmetric {
return filter_column_avx_symm_u8_i32_app;
Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_rgb_row_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{
filter_rgb_row_avx_f32_f32, filter_rgb_row_avx_symm_u8_f32, filter_rgb_row_avx_u8_f32,
};
Expand Down Expand Up @@ -116,6 +116,7 @@ impl Filter1DRgbRowHandler<u8, f32> for u8 {
fn get_rgb_row_handler(
is_symmetric_kernel: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_symmetric_kernel {
return filter_rgb_row_avx_symm_u8_f32;
Expand Down Expand Up @@ -166,6 +167,7 @@ impl Filter1DRgbRowHandler<f32, f32> for f32 {
fn get_rgb_row_handler(
is_symmetric_kernel: bool,
) -> fn(Arena, &[f32], &UnsafeSlice<f32>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
return filter_rgb_row_avx_f32_f32;
}
Expand Down
3 changes: 2 additions & 1 deletion src/lib/filter1d/filter_1d_rgb_row_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{filter_rgb_row_avx_symm_u8_i32_approx, filter_rgb_row_avx_u8_i32_app};
use crate::filter1d::filter_row_cg_approx::filter_color_group_row_approx;
use crate::filter1d::filter_row_cg_approx_symmetric::filter_color_group_row_symmetric_approx;
Expand Down Expand Up @@ -114,6 +114,7 @@ impl Filter1DRgbRowHandlerApprox<u8, i32> for u8 {
fn get_rgb_row_handler_apr(
is_symmetrical_kernel: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_symmetrical_kernel {
return filter_rgb_row_avx_symm_u8_i32_approx;
Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_rgba_row_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{
filter_rgba_row_avx_f32_f32, filter_rgba_row_avx_symm_u8_f32, filter_rgba_row_avx_u8_f32,
};
Expand Down Expand Up @@ -114,6 +114,7 @@ impl Filter1DRgbaRowHandler<u8, f32> for u8 {
fn get_rgba_row_handler(
is_kernel_symmetric: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_kernel_symmetric {
return filter_rgba_row_avx_symm_u8_f32;
Expand Down Expand Up @@ -164,6 +165,7 @@ impl Filter1DRgbaRowHandler<f32, f32> for f32 {
fn get_rgba_row_handler(
is_kernel_symmetric: bool,
) -> fn(Arena, &[f32], &UnsafeSlice<f32>, ImageSize, FilterRegion, &[ScanPoint1d<f32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
return filter_rgba_row_avx_f32_f32;
}
Expand Down
3 changes: 2 additions & 1 deletion src/lib/filter1d/filter_1d_rgba_row_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "avx"))]
use crate::filter1d::avx::{filter_rgba_row_avx_symm_u8_i32_app, filter_rgba_row_avx_u8_i32_app};
use crate::filter1d::filter_row_cg_approx::filter_color_group_row_approx;
use crate::filter1d::filter_row_cg_approx_symmetric::filter_color_group_row_symmetric_approx;
Expand Down Expand Up @@ -114,6 +114,7 @@ impl Filter1DRgbaRowHandlerApprox<u8, i32> for u8 {
fn get_rgba_row_handler(
is_kernel_symmetric: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
#[cfg(feature = "avx")]
if std::arch::is_x86_feature_detected!("avx2") {
if is_kernel_symmetric {
return filter_rgba_row_avx_symm_u8_i32_app;
Expand Down
Loading

0 comments on commit 7d19b67

Please sign in to comment.