Skip to content

Commit a48b4bc

Browse files
anthony-linarosherholz-intel
authored and committed
[sse2neon+compile] Re-apply sse2neon changes, and add support for Windows ARM64
1 parent 606518e commit a48b4bc

File tree

4 files changed

+31
-14
lines changed

4 files changed

+31
-14
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ SET(OPENPGL_ARM OFF)
3535
IF (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64"))
3636
MESSAGE(STATUS "Building for Apple silicon")
3737
SET(OPENPGL_ARM ON)
38-
ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
38+
ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
3939
MESSAGE(STATUS "Building for AArch64")
4040
SET(OPENPGL_ARM ON)
4141
ENDIF()

openpgl/CMakeLists.txt

+10-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ message(STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}")
7878
message(STATUS "Arch: ${CMAKE_SYSTEM_PROCESSOR}")
7979

8080
if(WIN32)
81-
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
81+
# Here we check for MSVC, or Clang pretending to be MSVC via Clang-CL
82+
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC"))
8283
set(OPENPGL_RELEASE_OPTIONS /Ox /Oi)
8384
set(OPENPGL_COMMON_OPTIONS /fp:precise)
8485
#set(OPENPGL_RELEASE_OPTIONS ${OPENPGL_RELEASE_OPTIONS} -ftree-vectorize -mfpmath=sse -funsafe-math-optimizations -fno-rounding-math -fno-signaling-nans -fno-math-errno -fomit-frame-pointer )
@@ -95,6 +96,14 @@ if(WIN32)
9596
if(OPENPGL_ISA_AVX512)
9697
set_source_files_properties(api/deviceCPU16.cpp PROPERTIES COMPILE_FLAGS "/D__SSE__ /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__SSE4_2__ /arch:AVX /arch:AVX2 /arch:AVX512")
9798
endif()
99+
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
100+
if(OPENPGL_ISA_NEON)
101+
set_source_files_properties(api/deviceCPU4.cpp PROPERTIES COMPILE_FLAGS "/D__SSE4_2__ /D__SSE4_1__")
102+
endif()
103+
if(OPENPGL_ISA_NEON2X)
104+
set_source_files_properties(api/deviceCPU8.cpp PROPERTIES COMPILE_FLAGS "/D__AVX2__ /D__AVX__ /D__SSE4_2__ /D__SSE4_1__ /D__BMI__ /D__BMI2__ /D__LZCNT__")
105+
endif()
106+
endif()
98107
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_CXX_COMPILER_ID STREQUAL "dpcpp")
99108
set(OPENPGL_RELEASE_OPTIONS -O3)
100109
set(OPENPGL_COMMON_OPTIONS -Wall)

third-party/embreeSrc/common/simd/arm/sse2neon.h

+18-10
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,14 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
336336
* argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
337337
* fp0 is the same for fp0 of result.
338338
*/
339+
#if defined(__aarch64__)
340+
#define _MN_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
341+
2), (((fp2)*4)+3), (((fp1)*4)+0), (((fp1)*4)+1), (((fp1)*4)+2), (((fp1)*4)+3), (((fp0)*4)+0), (((fp0)*4)+1), (((fp0)*4)+2), (((fp0)*4)+3) } )
342+
#define _MF_SHUFFLE(fp3,fp2,fp1,fp0) ( (uint8x16_t){ (((fp3)*4)+0), (((fp3)*4)+1), (((fp3)*4)+2), (((fp3)*4)+3), (((fp2)*4)+0), (((fp2)*4)+1), (((fp2)*4)+\
343+
2), (((fp2)*4)+3), (((fp1)*4)+16+0), (((fp1)*4)+16+1), (((fp1)*4)+16+2), (((fp1)*4)+16+3), (((fp0)*4)+16+0), (((fp0)*4)+16+1), (((fp0)*4)+16+2), (((fp0)*\
344+
4)+16+3) } )
345+
#endif
346+
339347
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
340348
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
341349

@@ -2822,7 +2830,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
28222830
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
28232831
{
28242832
#if __has_builtin(__builtin_nontemporal_store)
2825-
__builtin_nontemporal_store(a, (float32x4_t *) p);
2833+
__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);
28262834
#else
28272835
vst1q_f32(p, vreinterpretq_f32_m128(a));
28282836
#endif
@@ -5660,7 +5668,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
56605668
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
56615669
{
56625670
#if __has_builtin(__builtin_nontemporal_store)
5663-
__builtin_nontemporal_store(a, (__m128d *) p);
5671+
__builtin_nontemporal_store(reinterpret_cast<float32x4_t>(a), (float32x4_t *) p);
56645672
#elif defined(__aarch64__) || defined(_M_ARM64)
56655673
vst1q_f64(p, vreinterpretq_f64_m128d(a));
56665674
#else
@@ -6809,14 +6817,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
68096817
_sse2neon_define2( \
68106818
__m128i, a, b, \
68116819
const uint16_t _mask[8] = \
6812-
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6813-
((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6814-
((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6815-
((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6816-
((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6817-
((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6818-
((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6819-
((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6820+
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6821+
((imm) & (1 << 1)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6822+
((imm) & (1 << 2)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6823+
((imm) & (1 << 3)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6824+
((imm) & (1 << 4)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6825+
((imm) & (1 << 5)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6826+
((imm) & (1 << 6)) ? (uint16_t)0xffff : (uint16_t)0x0000, \
6827+
((imm) & (1 << 7)) ? (uint16_t)0xffff : (uint16_t)0x0000); \
68206828
uint16x8_t _mask_vec = vld1q_u16(_mask); \
68216829
uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
68226830
uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \

third-party/embreeSrc/common/sys/intrinsics.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ namespace embree
9191

9292
#if defined(__X86_64__) || defined (__aarch64__)
9393
__forceinline size_t bsf(size_t v) {
94-
#if defined(__AVX2__)
94+
#if defined(__AVX2__) && !defined(__aarch64__)
9595
return _tzcnt_u64(v);
9696
#else
9797
unsigned long r = 0; _BitScanForward64(&r,v); return r;
@@ -140,7 +140,7 @@ namespace embree
140140

141141
#if defined(__X86_64__) || defined (__aarch64__)
142142
__forceinline size_t bsr(size_t v) {
143-
#if defined(__AVX2__)
143+
#if defined(__AVX2__) && !defined(__aarch64__)
144144
return 63 -_lzcnt_u64(v);
145145
#else
146146
unsigned long r = 0; _BitScanReverse64(&r, v); return r;

0 commit comments

Comments
 (0)