Skip to content

Commit e794d16

Browse files
committed
Fix support for arm/aarch64 SIMD
Use sse2neon.h for NEON support.
1 parent 677a401 commit e794d16

File tree

2 files changed

+38
-1
lines changed

2 files changed

+38
-1
lines changed

CMakeLists.txt

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU")
7272
target_compile_options(${LIBRARY_NAME} PRIVATE -O3 -funroll-loops)
7373
endif()
7474

75-
# Only apply SIMD flags if we are on a capable architecture (x86/x86_64).
75+
# Only apply SIMD flags if we are on a capable architecture (x86/x86_64/arm/arm64).
7676
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i.86")
7777
message(STATUS "x86/x86_64 architecture detected. Configuring SIMD instruction sets.")
7878

@@ -102,6 +102,27 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i.86")
102102
elseif(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU")
103103
set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/flash3kyuu_deband_impl_avx512.cpp" PROPERTIES COMPILE_OPTIONS "-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-mavx512cd;-mfma")
104104
endif()
105+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*|armv7.*|armv8.*|aarch64|ARM|ARM64")
106+
message(STATUS "arm/arm64 architecture detected. Configuring SIMD instruction sets.")
107+
108+
target_sources(${LIBRARY_NAME} PRIVATE
109+
"${CMAKE_CURRENT_SOURCE_DIR}/src/flash3kyuu_deband_impl_sse4.cpp"
110+
"${CMAKE_CURRENT_SOURCE_DIR}/src/flash3kyuu_deband_sse_base.h"
111+
)
112+
113+
# Work around the missing x86 intrinsics header for the Vector Class Library by generating an empty x86intrin.h stub on the include path.
114+
set(GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}/include")
115+
set(X86INTRIN_H_STUB "x86intrin.h")
116+
file(MAKE_DIRECTORY ${GENERATED_INCLUDE_DIR})
117+
file(TOUCH "${GENERATED_INCLUDE_DIR}/${X86INTRIN_H_STUB}")
118+
119+
target_include_directories(${LIBRARY_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/VCL2" "${GENERATED_INCLUDE_DIR}")
120+
121+
if(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
122+
set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/flash3kyuu_deband_impl_sse4.cpp" PROPERTIES COMPILE_OPTIONS "/DINSTRSET=6")
123+
elseif(CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU")
124+
set_source_files_properties("${CMAKE_CURRENT_SOURCE_DIR}/src/flash3kyuu_deband_impl_sse4.cpp" PROPERTIES COMPILE_OPTIONS "-DINSTRSET=6;-Wno-narrowing")
125+
endif()
105126
else()
106127
message(STATUS "Non-x86 architecture detected (${CMAKE_SYSTEM_PROCESSOR}). Skipping SIMD-specific source files.")
107128
endif()

src/impl_dispatch.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ const process_plane_impl_t* process_plane_impl_high_precision_no_dithering[] = {
1212
process_plane_impl_sse4_high_no_dithering,
1313
process_plane_impl_avx2_high_no_dithering,
1414
process_plane_impl_avx512_high_no_dithering,
15+
#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
16+
process_plane_impl_sse4_high_no_dithering,
17+
nullptr,
18+
nullptr,
1519
#else
1620
nullptr,
1721
nullptr,
@@ -27,6 +31,10 @@ const process_plane_impl_t* process_plane_impl_high_precision_ordered_dithering[
2731
process_plane_impl_sse4_high_ordered_dithering,
2832
process_plane_impl_avx2_high_ordered_dithering,
2933
process_plane_impl_avx512_high_ordered_dithering,
34+
#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
35+
process_plane_impl_sse4_high_ordered_dithering,
36+
nullptr,
37+
nullptr,
3038
#else
3139
nullptr,
3240
nullptr,
@@ -42,6 +50,10 @@ const process_plane_impl_t* process_plane_impl_high_precision_floyd_steinberg_di
4250
process_plane_impl_sse4_high_floyd_steinberg_dithering,
4351
process_plane_impl_avx2_high_floyd_steinberg_dithering,
4452
process_plane_impl_avx512_high_floyd_steinberg_dithering,
53+
#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
54+
process_plane_impl_sse4_high_floyd_steinberg_dithering,
55+
nullptr,
56+
nullptr,
4557
#else
4658
nullptr,
4759
nullptr,
@@ -57,6 +69,10 @@ const process_plane_impl_t* process_plane_impl_16bit_interleaved[] = {
5769
process_plane_impl_sse4_16bit_interleaved,
5870
process_plane_impl_avx2_16bit_interleaved,
5971
process_plane_impl_avx512_16bit_interleaved,
72+
#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
73+
process_plane_impl_sse4_16bit_interleaved,
74+
nullptr,
75+
nullptr,
6076
#else
6177
nullptr,
6278
nullptr,

0 commit comments

Comments
 (0)