@@ -336,6 +336,14 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
336
336
* argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
337
337
* fp0 is the same for fp0 of result.
338
338
*/
339
#if defined(__aarch64__)
/*
 * Byte-index vectors built from four 32-bit lane selectors.
 *
 * Each selector fpN expands to four consecutive byte indices
 * (fpN * 4 + 0 .. fpN * 4 + 3), i.e. the bytes of 32-bit lane fpN.
 * The groups are emitted in the order fp3, fp2, fp1, fp0, so fp3 fills
 * bytes 0-3 of the resulting uint8x16_t and fp0 fills bytes 12-15.
 *
 * _MN_SHUFFLE: all sixteen indices are fpN * 4 + k.
 * _MF_SHUFFLE: the fp3/fp2 groups are fpN * 4 + k, while the fp1/fp0
 *              groups are additionally offset by 16.
 *
 * NOTE(review): presumably intended as index operands for NEON table
 * lookups (one-register and two-register forms respectively) -- confirm
 * against the call sites. Arguments are not range-checked or masked.
 *
 * There must be no whitespace between the macro name and '(' or the
 * preprocessor treats the definition as an object-like macro.
 */
#define _MN_SHUFFLE(fp3, fp2, fp1, fp0)                                        \
    ((uint8x16_t){                                                             \
        (((fp3) * 4) + 0), (((fp3) * 4) + 1),                                  \
        (((fp3) * 4) + 2), (((fp3) * 4) + 3),                                  \
        (((fp2) * 4) + 0), (((fp2) * 4) + 1),                                  \
        (((fp2) * 4) + 2), (((fp2) * 4) + 3),                                  \
        (((fp1) * 4) + 0), (((fp1) * 4) + 1),                                  \
        (((fp1) * 4) + 2), (((fp1) * 4) + 3),                                  \
        (((fp0) * 4) + 0), (((fp0) * 4) + 1),                                  \
        (((fp0) * 4) + 2), (((fp0) * 4) + 3)})
#define _MF_SHUFFLE(fp3, fp2, fp1, fp0)                                        \
    ((uint8x16_t){                                                             \
        (((fp3) * 4) + 0), (((fp3) * 4) + 1),                                  \
        (((fp3) * 4) + 2), (((fp3) * 4) + 3),                                  \
        (((fp2) * 4) + 0), (((fp2) * 4) + 1),                                  \
        (((fp2) * 4) + 2), (((fp2) * 4) + 3),                                  \
        (((fp1) * 4) + 16 + 0), (((fp1) * 4) + 16 + 1),                        \
        (((fp1) * 4) + 16 + 2), (((fp1) * 4) + 16 + 3),                        \
        (((fp0) * 4) + 16 + 0), (((fp0) * 4) + 16 + 1),                        \
        (((fp0) * 4) + 16 + 2), (((fp0) * 4) + 16 + 3)})
#endif
346
+
339
347
/*
 * Pack four lane selectors into a single shuffle immediate:
 * fp3 is shifted into bits 7:6, fp2 into bits 5:4, fp1 into bits 3:2 and
 * fp0 occupies bits 1:0. The arguments are not masked, so each one is
 * expected to be in the range 0..3.
 *
 * No whitespace is allowed between the macro name and '(' -- otherwise the
 * preprocessor defines an object-like macro instead of a function-like one.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
341
349
@@ -2822,7 +2830,7 @@ FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2822
2830
FORCE_INLINE void _mm_stream_ps (float * p , __m128 a )
2823
2831
{
2824
2832
#if __has_builtin (__builtin_nontemporal_store )
2825
- __builtin_nontemporal_store (a , (float32x4_t * ) p );
2833
+ __builtin_nontemporal_store (reinterpret_cast < float32x4_t > ( a ) , (float32x4_t * ) p );
2826
2834
#else
2827
2835
vst1q_f32 (p , vreinterpretq_f32_m128 (a ));
2828
2836
#endif
@@ -5660,7 +5668,7 @@ FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5660
5668
FORCE_INLINE void _mm_stream_pd (double * p , __m128d a )
5661
5669
{
5662
5670
#if __has_builtin (__builtin_nontemporal_store )
5663
- __builtin_nontemporal_store (a , (__m128d * ) p );
5671
+ __builtin_nontemporal_store (reinterpret_cast < float32x4_t > ( a ) , (float32x4_t * ) p );
5664
5672
#elif defined(__aarch64__ ) || defined(_M_ARM64 )
5665
5673
vst1q_f64 (p , vreinterpretq_f64_m128d (a ));
5666
5674
#else
@@ -6809,14 +6817,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6809
6817
_sse2neon_define2( \
6810
6818
__m128i, a, b, \
6811
6819
const uint16_t _mask[8] = \
6812
- _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0 , \
6813
- ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0 , \
6814
- ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0 , \
6815
- ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0 , \
6816
- ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0 , \
6817
- ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0 , \
6818
- ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0 , \
6819
- ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0 ); \
6820
+ _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6821
+ ((imm) & (1 << 1)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6822
+ ((imm) & (1 << 2)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6823
+ ((imm) & (1 << 3)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6824
+ ((imm) & (1 << 4)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6825
+ ((imm) & (1 << 5)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6826
+ ((imm) & (1 << 6)) ? (uint16_t)0xffff : (uint16_t)0x0000 , \
6827
+ ((imm) & (1 << 7)) ? (uint16_t)0xffff : (uint16_t)0x0000 ); \
6820
6828
uint16x8_t _mask_vec = vld1q_u16(_mask); \
6821
6829
uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6822
6830
uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
0 commit comments