Skip to content

Commit

Permalink
RDM feature darwin additional check
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Nov 18, 2024
1 parent daf1847 commit 4a0d6c9
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 93 deletions.
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ rayon = "1.10.0"

[dev-dependencies]
criterion = "0.5.1"
opencv = { version = "0.93.1", default-features = false, features = ["imgcodecs", "imgproc"] }
opencv = { version = "0.93.1", default-features = false, features = ["imgcodecs", "imgproc", "clang-runtime"] }

[[bench]]
name = "gauss"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Example comparison time for blurring image 3000x4000 single plane 8-bit in multi
| | Time(NEON) | Time(SSE/AVX) |
|-------------------|:----------:|:-------------:|
| libblur(Exact) | 24.19ms | 28.73ms |
| libblur(Integral) | 13.70ms | 18.97ms |
| libblur(Integral) | 11.49ms | 18.97ms |
| OpenCV | 74.73ms | 64.20ms |

### Stack blur
Expand Down
138 changes: 69 additions & 69 deletions benches/gauss/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,80 +43,80 @@ pub fn criterion_benchmark(c: &mut Criterion) {
let stride = dimensions.0 as usize * components;
let src_bytes = img.as_bytes();

// c.bench_function("RGBA gauss blur kernel clip exact: 13", |b| {
// b.iter(|| {
// let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
// libblur::gaussian_blur(
// &src_bytes,
// &mut dst_bytes,
// dimensions.0,
// dimensions.1,
// 13,
// 0.,
// FastBlurChannels::Channels4,
// EdgeMode::Clamp,
// ThreadingPolicy::Adaptive,
// GaussianPreciseLevel::EXACT,
// );
// })
// });
//
// c.bench_function("RGBA gauss blur clamp approx: 13", |b| {
// b.iter(|| {
// let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
// libblur::gaussian_blur(
// &src_bytes,
// &mut dst_bytes,
// dimensions.0,
// dimensions.1,
// 13,
// 0.,
// FastBlurChannels::Channels4,
// EdgeMode::Clamp,
// ThreadingPolicy::Adaptive,
// GaussianPreciseLevel::INTEGRAL,
// );
// })
// });
//
c.bench_function("RGBA gauss blur kernel clip exact: 13", |b| {
b.iter(|| {
let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
libblur::gaussian_blur(
&src_bytes,
&mut dst_bytes,
dimensions.0,
dimensions.1,
13,
0.,
FastBlurChannels::Channels4,
EdgeMode::Clamp,
ThreadingPolicy::Adaptive,
GaussianPreciseLevel::EXACT,
);
})
});

c.bench_function("RGBA gauss blur clamp approx: 13", |b| {
b.iter(|| {
let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
libblur::gaussian_blur(
&src_bytes,
&mut dst_bytes,
dimensions.0,
dimensions.1,
13,
0.,
FastBlurChannels::Channels4,
EdgeMode::Clamp,
ThreadingPolicy::Adaptive,
GaussianPreciseLevel::INTEGRAL,
);
})
});

let src = imread(
&find_file(&"assets/test_image_4.png", false, false).unwrap(),
IMREAD_COLOR,
)
.unwrap();
//
// c.bench_function("OpenCV RGBA Gaussian: 13", |b| {
// b.iter(|| {
// let mut dst = Mat::default();
// opencv::imgproc::gaussian_blur(
// &src,
// &mut dst,
// Size::new(13, 13),
// 0.,
// 0.,
// BORDER_DEFAULT,
// )
// .unwrap();
// })
// });
//
// c.bench_function("RGBA gauss blur edge clamp: rad 151", |b| {
// b.iter(|| {
// let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
// libblur::gaussian_blur(
// &src_bytes,
// &mut dst_bytes,
// dimensions.0,
// dimensions.1,
// 77 * 2 + 1,
// (77f32 * 2f32 + 1f32) / 6f32,
// FastBlurChannels::Channels4,
// EdgeMode::Clamp,
// ThreadingPolicy::Adaptive,
// GaussianPreciseLevel::EXACT,
// );
// })
// });

c.bench_function("OpenCV RGBA Gaussian: 13", |b| {
b.iter(|| {
let mut dst = Mat::default();
opencv::imgproc::gaussian_blur(
&src,
&mut dst,
Size::new(13, 13),
0.,
0.,
BORDER_DEFAULT,
)
.unwrap();
})
});

c.bench_function("RGBA gauss blur edge clamp: rad 151", |b| {
b.iter(|| {
let mut dst_bytes: Vec<u8> = vec![0u8; dimensions.1 as usize * stride];
libblur::gaussian_blur(
&src_bytes,
&mut dst_bytes,
dimensions.0,
dimensions.1,
77 * 2 + 1,
(77f32 * 2f32 + 1f32) / 6f32,
FastBlurChannels::Channels4,
EdgeMode::Clamp,
ThreadingPolicy::Adaptive,
GaussianPreciseLevel::EXACT,
);
})
});

c.bench_function("RGBA gauss blur edge clamp approx: rad 151", |b| {
b.iter(|| {
Expand Down
45 changes: 30 additions & 15 deletions src/lib/cpu_features.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
all(target_arch = "aarch64", target_feature = "neon"),
any(target_arch = "x86", target_arch = "x86_64")
))]
#[cfg(any(target_os = "macos", target_os = "ios"))]
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
#[allow(dead_code)]
fn apple_has_cpu_feature(feature_name: &str) -> bool {
use libc::{c_int, sysctlbyname};
Expand Down Expand Up @@ -61,11 +61,11 @@ fn apple_has_cpu_feature(feature_name: &str) -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(dead_code)]
pub fn is_x86_avx512dq_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios"))]
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
{
apple_has_cpu_feature("hw.optional.avx512dq")
}
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
{
std::arch::is_x86_feature_detected!("avx512dq")
}
Expand All @@ -74,11 +74,11 @@ pub fn is_x86_avx512dq_supported() -> bool {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(dead_code)]
pub fn is_x86_avx512vl_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios"))]
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
{
apple_has_cpu_feature("hw.optional.avx512vl")
}
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
{
std::arch::is_x86_feature_detected!("avx512vl")
}
Expand All @@ -88,7 +88,7 @@ pub fn is_x86_avx512vl_supported() -> bool {
all(target_arch = "aarch64", target_feature = "neon"),
any(target_arch = "x86", target_arch = "x86_64")
)))]
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
#[allow(dead_code)]
fn apple_has_cpu_feature(_feature_name: &str) -> bool {
false
Expand All @@ -98,12 +98,12 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool {
/// on *Apple* platforms [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) is used
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[allow(dead_code)]
pub fn is_aarch_f16_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios"))]
pub(crate) fn is_aarch_f16_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
{
apple_has_cpu_feature("hw.optional.arm.FEAT_FP16")
}
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
{
std::arch::is_aarch64_feature_detected!("fp16")
}
Expand All @@ -114,12 +114,12 @@ pub fn is_aarch_f16_supported() -> bool {
/// otherwise consider it is always available
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[allow(dead_code)]
pub fn is_aarch_f16c_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios"))]
pub(crate) fn is_aarch_f16c_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
{
apple_has_cpu_feature("hw.optional.AdvSIMD_HPFPCvt")
}
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
{
true
}
Expand All @@ -130,13 +130,28 @@ pub fn is_aarch_f16c_supported() -> bool {
/// otherwise consider it is always available
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[allow(dead_code)]
pub fn is_aarch_fhm_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios"))]
pub(crate) fn is_aarch_fhm_supported() -> bool {
#[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
{
apple_has_cpu_feature("hw.optional.arm.FEAT_FHM")
}
#[cfg(not(any(target_os = "macos", target_os = "ios")))]
#[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
{
std::arch::is_aarch64_feature_detected!("fhm")
}
}

/// Reports at runtime whether the aarch64 CPU supports the *rdm* feature.
///
/// On macOS, iOS and tvOS targets the answer comes from `apple_has_cpu_feature`
/// queried with the `hw.optional.arm.FEAT_RDM` key (see Apple's
/// [sysctlbyname notes](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics));
/// on every other target `std::arch::is_aarch64_feature_detected!("rdm")` is used.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[allow(dead_code)]
pub(crate) fn is_aarch_rdm_supported() -> bool {
    // Exactly one of the two bindings below survives cfg evaluation.
    #[cfg(any(target_os = "macos", target_os = "ios", target_os = "tvos"))]
    let rdm_available = apple_has_cpu_feature("hw.optional.arm.FEAT_RDM");

    #[cfg(not(any(target_os = "macos", target_os = "ios", target_os = "tvos")))]
    let rdm_available = std::arch::is_aarch64_feature_detected!("rdm");

    rdm_available
}
6 changes: 4 additions & 2 deletions src/lib/filter1d/filter_1d_column_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::filter1d::avx::{filter_column_avx_symm_u8_i32_app, filter_column_avx_u8_i32_app};
Expand Down Expand Up @@ -99,12 +101,12 @@ impl Filter1DColumnHandlerApprox<u8, i32> for u8 {
is_kernel_symmetric: bool,
) -> fn(Arena, &[&[u8]], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
if is_kernel_symmetric {
if std::arch::is_aarch64_feature_detected!("rdm") {
if is_aarch_rdm_supported() {
return filter_column_symm_neon_u8_i32_rdm;
}
filter_column_symm_neon_u8_i32_app
} else {
if std::arch::is_aarch64_feature_detected!("rdm") {
if is_aarch_rdm_supported() {
return filter_column_neon_u8_i32_i16_qrdm_app;
}
filter_column_neon_u8_i32_app
Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_rgb_row_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::filter1d::avx::{filter_rgb_row_avx_symm_u8_i32_approx, filter_rgb_row_avx_u8_i32_app};
Expand Down Expand Up @@ -99,7 +101,7 @@ impl Filter1DRgbRowHandlerApprox<u8, i32> for u8 {
is_symmetrical_kernel: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
if is_symmetrical_kernel {
if std::arch::is_aarch64_feature_detected!("rdm") {
if is_aarch_rdm_supported() {
return filter_rgb_row_symm_neon_u8_i32_rdm;
}
filter_rgb_row_symm_neon_u8_i32
Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_rgba_row_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::filter1d::avx::{filter_rgba_row_avx_symm_u8_i32_app, filter_rgba_row_avx_u8_i32_app};
Expand Down Expand Up @@ -99,7 +101,7 @@ impl Filter1DRgbaRowHandlerApprox<u8, i32> for u8 {
is_kernel_symmetric: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
if is_kernel_symmetric {
if std::arch::is_aarch64_feature_detected!("rdm") {
if is_aarch_rdm_supported() {
return filter_rgba_row_symm_neon_u8_i32_rdm;
}
filter_rgba_row_symm_neon_u8_i32
Expand Down
4 changes: 3 additions & 1 deletion src/lib/filter1d/filter_1d_row_handler_approx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
use crate::cpu_features::is_aarch_rdm_supported;
use crate::filter1d::arena::Arena;
use crate::filter1d::filter_row_cg_approx::filter_color_group_row_approx;
use crate::filter1d::filter_row_cg_approx_symmetric::filter_color_group_row_symmetric_approx;
Expand Down Expand Up @@ -94,7 +96,7 @@ impl Filter1DRowHandlerApprox<u8, i32> for u8 {
fn get_row_handler_apr(
_: bool,
) -> fn(Arena, &[u8], &UnsafeSlice<u8>, ImageSize, FilterRegion, &[ScanPoint1d<i32>]) {
if std::arch::is_aarch64_feature_detected!("rdm") {
if is_aarch_rdm_supported() {
return filter_row_neon_u8_i32_rdm;
}
filter_row_neon_u8_i32_app
Expand Down
4 changes: 2 additions & 2 deletions src/lib/filter1d/neon/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ mod filter_rgba_row_symm_f32;
mod filter_rgba_symm_row;
mod filter_row;
mod filter_row_approx;
mod filter_row_approx_rdm;
mod filter_row_f32;
pub mod utils;
mod filter_row_approx_rdm;

pub use filter_column::filter_column_neon_u8_f32;
pub use filter_column_approx::filter_column_neon_u8_i32_app;
Expand Down Expand Up @@ -86,5 +86,5 @@ pub use filter_rgba_row_symm_f32::filter_rgba_row_neon_symm_f32_f32;
pub use filter_rgba_symm_row::filter_rgba_row_symm_neon_u8_f32;
pub use filter_row::filter_row_neon_u8_f32;
pub use filter_row_approx::filter_row_neon_u8_i32_app;
pub(crate) use filter_row_approx_rdm::filter_row_neon_u8_i32_rdm;
pub use filter_row_f32::filter_row_neon_f32_f32;
pub(crate) use filter_row_approx_rdm::filter_row_neon_u8_i32_rdm;

0 comments on commit 4a0d6c9

Please sign in to comment.