Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Box blur +10% #20

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions .github/workflows/build_push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,9 @@ jobs:
- run: cargo install cargo-fuzz
- run: cargo fuzz run box --features ${{ matrix.feature }} -- -max_total_time=45

fuzz_gauss:
fuzz_gauss_arm:
name: Fuzzing Gauss
strategy:
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
Expand All @@ -88,6 +85,21 @@ jobs:
- run: cargo fuzz run gauss_u16 -- -max_total_time=45
- run: cargo fuzz run gauss_f32 -- -max_total_time=45

fuzz_gauss_x86:
name: Fuzzing Gauss x86
strategy:
matrix:
feature: [ sse, avx ]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
- run: cargo install cargo-fuzz
- run: cargo fuzz run gauss --features ${{ matrix.feature }} -- -max_total_time=45
- run: cargo fuzz run gauss --features ${{ matrix.feature }} -- -max_total_time=45
- run: cargo fuzz run gauss_u16 --features ${{ matrix.feature }} -- -max_total_time=45
- run: cargo fuzz run gauss_f32 --features ${{ matrix.feature }} -- -max_total_time=45

fuzz_stack_blur:
name: Fuzzing Stack Blur
runs-on: macos-latest
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ path = "src/main.rs"
colorutils-rs = "0.7.0"
half = "2.4.1"
image = "0.25.5"
libblur = {path = "src/lib", features = ["image", "fft", "sse", "avx"], default-features = false}
libblur = {path = "src/lib", features = ["image", "fft", "sse"], default-features = false}
rayon = "1.10.0"

[dev-dependencies]
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,15 +228,15 @@ Example comparison time for blurring image 3000x4000 RGB 8-bit in multithreaded

| | time(NEON) | time(SSE) |
|---------|:----------:|:---------:|
| libblur | 10.13ms | 11.57ms |
| OpenCV | 16.81ms | 47.62ms |
| libblur | 9.01ms | 11.24ms |
| OpenCV | 15.73ms | 43.59ms |

Example comparison time for blurring image 2828x4242 RGBA 8-bit in multithreaded mode with 77 radius.

| | Time(NEON) | Time(SSE) |
|---------|:----------:|:---------:|
| libblur | 10.68ms | 9.41ms |
| OpenCV | 16.33ms | 31.29ms |
| libblur | 8.28ms | 9.38ms |
| OpenCV | 15.77ms | 31.29ms |

### Fast bilateral blur

Expand Down
215 changes: 215 additions & 0 deletions src/lib/box/box_blur_avx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,221 @@ unsafe fn box_blur_vertical_pass_avx2_impl<T, const CHANNELS: usize>(
let v_edge_count = unsafe { _mm256_set1_epi32(edge_count as i32) };
let v_weight = unsafe { _mm256_set1_ps(1f32 / (radius * 2) as f32) };

while cx + 64 < end_x {
let px = cx as usize;

let mut store_0: __m256i;
let mut store_1: __m256i;
let mut store_2: __m256i;
let mut store_3: __m256i;
let mut store_4: __m256i;
let mut store_5: __m256i;
let mut store_6: __m256i;
let mut store_7: __m256i;

unsafe {
let s_ptr = src.as_ptr().add(px);
let edge0 = _mm256_loadu_si256(s_ptr as *const _);
let edge1 = _mm256_loadu_si256(s_ptr.add(32) as *const _);
let lo0 = _mm256_unpacklo_epi8(edge0, _mm256_setzero_si256());
let hi0 = _mm256_unpackhi_epi8(edge0, _mm256_setzero_si256());

let lo1 = _mm256_unpacklo_epi8(edge1, _mm256_setzero_si256());
let hi1 = _mm256_unpackhi_epi8(edge1, _mm256_setzero_si256());

let i16_l0 = _mm256_unpacklo_epi16(lo0, _mm256_setzero_si256());
let i16_h0 = _mm256_unpackhi_epi16(lo0, _mm256_setzero_si256());
let i16_l1 = _mm256_unpacklo_epi16(hi0, _mm256_setzero_si256());
let i16_h1 = _mm256_unpackhi_epi16(hi0, _mm256_setzero_si256());

let i16_l01 = _mm256_unpacklo_epi16(lo1, _mm256_setzero_si256());
let i16_h01 = _mm256_unpackhi_epi16(lo1, _mm256_setzero_si256());
let i16_l11 = _mm256_unpacklo_epi16(hi1, _mm256_setzero_si256());
let i16_h11 = _mm256_unpackhi_epi16(hi1, _mm256_setzero_si256());

store_0 = _mm256_madd_epi16(i16_l0, v_edge_count);
store_1 = _mm256_madd_epi16(i16_h0, v_edge_count);

store_2 = _mm256_madd_epi16(i16_l1, v_edge_count);
store_3 = _mm256_madd_epi16(i16_h1, v_edge_count);

store_4 = _mm256_madd_epi16(i16_l01, v_edge_count);
store_5 = _mm256_madd_epi16(i16_h01, v_edge_count);

store_6 = _mm256_madd_epi16(i16_l11, v_edge_count);
store_7 = _mm256_madd_epi16(i16_h11, v_edge_count);
}

unsafe {
for y in 1..std::cmp::min(half_kernel, height) {
let y_src_shift = y as usize * src_stride as usize;
let s_ptr = src.as_ptr().add(y_src_shift + px);

let edge0 = _mm256_loadu_si256(s_ptr as *const _);
let edge1 = _mm256_loadu_si256(s_ptr.add(32) as *const _);
let lo0 = _mm256_unpacklo_epi8(edge0, _mm256_setzero_si256());
let hi0 = _mm256_unpackhi_epi8(edge0, _mm256_setzero_si256());

let lo1 = _mm256_unpacklo_epi8(edge1, _mm256_setzero_si256());
let hi1 = _mm256_unpackhi_epi8(edge1, _mm256_setzero_si256());

let i16_l0 = _mm256_unpacklo_epi16(lo0, _mm256_setzero_si256());
let i16_h0 = _mm256_unpackhi_epi16(lo0, _mm256_setzero_si256());
let i16_l1 = _mm256_unpacklo_epi16(hi0, _mm256_setzero_si256());
let i16_h1 = _mm256_unpackhi_epi16(hi0, _mm256_setzero_si256());

let i16_l01 = _mm256_unpacklo_epi16(lo1, _mm256_setzero_si256());
let i16_h01 = _mm256_unpackhi_epi16(lo1, _mm256_setzero_si256());
let i16_l11 = _mm256_unpacklo_epi16(hi1, _mm256_setzero_si256());
let i16_h11 = _mm256_unpackhi_epi16(hi1, _mm256_setzero_si256());

store_0 = _mm256_add_epi32(store_0, i16_l0);
store_1 = _mm256_add_epi32(store_1, i16_h0);

store_2 = _mm256_add_epi32(store_2, i16_l1);
store_3 = _mm256_add_epi32(store_3, i16_h1);

store_4 = _mm256_add_epi32(store_4, i16_l01);
store_5 = _mm256_add_epi32(store_5, i16_h01);

store_6 = _mm256_add_epi32(store_6, i16_l11);
store_7 = _mm256_add_epi32(store_7, i16_h11);
}
}

unsafe {
for y in 0..height {
// preload edge pixels
let next =
std::cmp::min(y + half_kernel, height - 1) as usize * src_stride as usize;
let previous =
std::cmp::max(y as i64 - half_kernel as i64, 0) as usize * src_stride as usize;
let y_dst_shift = dst_stride as usize * y as usize;

// subtract previous
{
let s_ptr = src.as_ptr().add(previous + px);
let edge0 = _mm256_loadu_si256(s_ptr as *const _);
let edge1 = _mm256_loadu_si256(s_ptr.add(32) as *const _);
let lo0 = _mm256_unpacklo_epi8(edge0, _mm256_setzero_si256());
let hi0 = _mm256_unpackhi_epi8(edge0, _mm256_setzero_si256());

let lo1 = _mm256_unpacklo_epi8(edge1, _mm256_setzero_si256());
let hi1 = _mm256_unpackhi_epi8(edge1, _mm256_setzero_si256());

let i16_l0 = _mm256_unpacklo_epi16(lo0, _mm256_setzero_si256());
let i16_h0 = _mm256_unpackhi_epi16(lo0, _mm256_setzero_si256());
let i16_l1 = _mm256_unpacklo_epi16(hi0, _mm256_setzero_si256());
let i16_h1 = _mm256_unpackhi_epi16(hi0, _mm256_setzero_si256());

let i16_l01 = _mm256_unpacklo_epi16(lo1, _mm256_setzero_si256());
let i16_h01 = _mm256_unpackhi_epi16(lo1, _mm256_setzero_si256());
let i16_l11 = _mm256_unpacklo_epi16(hi1, _mm256_setzero_si256());
let i16_h11 = _mm256_unpackhi_epi16(hi1, _mm256_setzero_si256());

store_0 = _mm256_sub_epi32(store_0, i16_l0);
store_1 = _mm256_sub_epi32(store_1, i16_h0);

store_2 = _mm256_sub_epi32(store_2, i16_l1);
store_3 = _mm256_sub_epi32(store_3, i16_h1);

store_4 = _mm256_sub_epi32(store_4, i16_l01);
store_5 = _mm256_sub_epi32(store_5, i16_h01);

store_6 = _mm256_sub_epi32(store_6, i16_l11);
store_7 = _mm256_sub_epi32(store_7, i16_h11);
}

// add next
{
let s_ptr = src.as_ptr().add(next + px);
let edge0 = _mm256_loadu_si256(s_ptr as *const _);
let edge1 = _mm256_loadu_si256(s_ptr.add(32) as *const _);
let lo0 = _mm256_unpacklo_epi8(edge0, _mm256_setzero_si256());
let hi0 = _mm256_unpackhi_epi8(edge0, _mm256_setzero_si256());

let lo1 = _mm256_unpacklo_epi8(edge1, _mm256_setzero_si256());
let hi1 = _mm256_unpackhi_epi8(edge1, _mm256_setzero_si256());

let i16_l0 = _mm256_unpacklo_epi16(lo0, _mm256_setzero_si256());
let i16_h0 = _mm256_unpackhi_epi16(lo0, _mm256_setzero_si256());
let i16_l1 = _mm256_unpacklo_epi16(hi0, _mm256_setzero_si256());
let i16_h1 = _mm256_unpackhi_epi16(hi0, _mm256_setzero_si256());

let i16_l01 = _mm256_unpacklo_epi16(lo1, _mm256_setzero_si256());
let i16_h01 = _mm256_unpackhi_epi16(lo1, _mm256_setzero_si256());
let i16_l11 = _mm256_unpacklo_epi16(hi1, _mm256_setzero_si256());
let i16_h11 = _mm256_unpackhi_epi16(hi1, _mm256_setzero_si256());

store_0 = _mm256_add_epi32(store_0, i16_l0);
store_1 = _mm256_add_epi32(store_1, i16_h0);

store_2 = _mm256_add_epi32(store_2, i16_l1);
store_3 = _mm256_add_epi32(store_3, i16_h1);

store_4 = _mm256_add_epi32(store_4, i16_l01);
store_5 = _mm256_add_epi32(store_5, i16_h01);

store_6 = _mm256_add_epi32(store_6, i16_l11);
store_7 = _mm256_add_epi32(store_7, i16_h11);
}

let px = cx as usize;

let store0_ps = _mm256_cvtepi32_ps(store_0);
let store1_ps = _mm256_cvtepi32_ps(store_1);
let store2_ps = _mm256_cvtepi32_ps(store_2);
let store3_ps = _mm256_cvtepi32_ps(store_3);
let store4_ps = _mm256_cvtepi32_ps(store_4);
let store5_ps = _mm256_cvtepi32_ps(store_5);
let store6_ps = _mm256_cvtepi32_ps(store_6);
let store7_ps = _mm256_cvtepi32_ps(store_7);

let scale_store_0_ps = _mm256_mul_ps(store0_ps, v_weight);
let scale_store_1_ps = _mm256_mul_ps(store1_ps, v_weight);
let scale_store_2_ps = _mm256_mul_ps(store2_ps, v_weight);
let scale_store_3_ps = _mm256_mul_ps(store3_ps, v_weight);
let scale_store_4_ps = _mm256_mul_ps(store4_ps, v_weight);
let scale_store_5_ps = _mm256_mul_ps(store5_ps, v_weight);
let scale_store_6_ps = _mm256_mul_ps(store6_ps, v_weight);
let scale_store_7_ps = _mm256_mul_ps(store7_ps, v_weight);

let scale_store_0_rnd = _mm256_round_ps::<0x00>(scale_store_0_ps);
let scale_store_1_rnd = _mm256_round_ps::<0x00>(scale_store_1_ps);
let scale_store_2_rnd = _mm256_round_ps::<0x00>(scale_store_2_ps);
let scale_store_3_rnd = _mm256_round_ps::<0x00>(scale_store_3_ps);
let scale_store_4_rnd = _mm256_round_ps::<0x00>(scale_store_4_ps);
let scale_store_5_rnd = _mm256_round_ps::<0x00>(scale_store_5_ps);
let scale_store_6_rnd = _mm256_round_ps::<0x00>(scale_store_6_ps);
let scale_store_7_rnd = _mm256_round_ps::<0x00>(scale_store_7_ps);

let scale_store_0 = _mm256_cvtps_epi32(scale_store_0_rnd);
let scale_store_1 = _mm256_cvtps_epi32(scale_store_1_rnd);
let scale_store_2 = _mm256_cvtps_epi32(scale_store_2_rnd);
let scale_store_3 = _mm256_cvtps_epi32(scale_store_3_rnd);
let scale_store_4 = _mm256_cvtps_epi32(scale_store_4_rnd);
let scale_store_5 = _mm256_cvtps_epi32(scale_store_5_rnd);
let scale_store_6 = _mm256_cvtps_epi32(scale_store_6_rnd);
let scale_store_7 = _mm256_cvtps_epi32(scale_store_7_rnd);

let offset = y_dst_shift + px;
let ptr = unsafe_dst.slice.get_unchecked(offset).get();

let set0 = _mm256_packus_epi32(scale_store_0, scale_store_1);
let set1 = _mm256_packus_epi32(scale_store_2, scale_store_3);
let set2 = _mm256_packus_epi32(scale_store_4, scale_store_5);
let set3 = _mm256_packus_epi32(scale_store_6, scale_store_7);

let full_set0 = _mm256_packus_epi16(set0, set1);
let full_set1 = _mm256_packus_epi16(set2, set3);
_mm256_storeu_si256(ptr as *mut _, full_set0);
_mm256_storeu_si256(ptr.add(32) as *mut _, full_set1);
}
}

cx += 64;
}

while cx + 32 < end_x {
let px = cx as usize;

Expand Down
Loading