@@ -143,48 +143,48 @@ constexpr auto avx2_compressstore_lut64_perm
143
143
constexpr auto avx2_compressstore_lut64_left
144
144
= avx2_compressstore_lut64_gen.second;
145
145
146
- X86_SIMD_SORT_INLINE
146
+ X86_SIMD_SORT_FORCE_INLINE
147
147
__m256i convert_int_to_avx2_mask (int32_t m)
148
148
{
149
149
return _mm256_loadu_si256 (
150
150
(const __m256i *)avx2_mask_helper_lut32[m].data ());
151
151
}
152
152
153
- X86_SIMD_SORT_INLINE
153
+ X86_SIMD_SORT_FORCE_INLINE
154
154
int32_t convert_avx2_mask_to_int (__m256i m)
155
155
{
156
156
return _mm256_movemask_ps (_mm256_castsi256_ps (m));
157
157
}
158
158
159
- X86_SIMD_SORT_INLINE
159
+ X86_SIMD_SORT_FORCE_INLINE
160
160
__m256i convert_int_to_avx2_mask_64bit (int32_t m)
161
161
{
162
162
return _mm256_loadu_si256 (
163
163
(const __m256i *)avx2_mask_helper_lut64[m].data ());
164
164
}
165
165
166
- X86_SIMD_SORT_INLINE
166
+ X86_SIMD_SORT_FORCE_INLINE
167
167
int32_t convert_avx2_mask_to_int_64bit (__m256i m)
168
168
{
169
169
return _mm256_movemask_pd (_mm256_castsi256_pd (m));
170
170
}
171
171
172
- X86_SIMD_SORT_INLINE
172
+ X86_SIMD_SORT_FORCE_INLINE
173
173
__m128i convert_int_to_avx2_mask_half (int32_t m)
174
174
{
175
175
return _mm_loadu_si128 (
176
176
(const __m128i *)avx2_mask_helper_lut32_half[m].data ());
177
177
}
178
178
179
- X86_SIMD_SORT_INLINE
179
+ X86_SIMD_SORT_FORCE_INLINE
180
180
int32_t convert_avx2_mask_to_int_half (__m128i m)
181
181
{
182
182
return _mm_movemask_ps (_mm_castsi128_ps (m));
183
183
}
184
184
185
185
// Emulators for intrinsics missing from AVX2 compared to AVX512
186
186
template <typename T>
187
- T avx2_emu_reduce_max32 (typename avx2_vector<T>::reg_t x)
187
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32 (typename avx2_vector<T>::reg_t x)
188
188
{
189
189
using vtype = avx2_vector<T>;
190
190
using reg_t = typename vtype::reg_t ;
@@ -199,7 +199,7 @@ T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
199
199
}
200
200
201
201
template <typename T>
202
- T avx2_emu_reduce_max32_half (typename avx2_half_vector<T>::reg_t x)
202
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32_half (typename avx2_half_vector<T>::reg_t x)
203
203
{
204
204
using vtype = avx2_half_vector<T>;
205
205
using reg_t = typename vtype::reg_t ;
@@ -212,7 +212,7 @@ T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
212
212
}
213
213
214
214
template <typename T>
215
- T avx2_emu_reduce_min32 (typename avx2_vector<T>::reg_t x)
215
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32 (typename avx2_vector<T>::reg_t x)
216
216
{
217
217
using vtype = avx2_vector<T>;
218
218
using reg_t = typename vtype::reg_t ;
@@ -227,7 +227,7 @@ T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
227
227
}
228
228
229
229
template <typename T>
230
- T avx2_emu_reduce_min32_half (typename avx2_half_vector<T>::reg_t x)
230
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32_half (typename avx2_half_vector<T>::reg_t x)
231
231
{
232
232
using vtype = avx2_half_vector<T>;
233
233
using reg_t = typename vtype::reg_t ;
@@ -240,7 +240,7 @@ T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
240
240
}
241
241
242
242
template <typename T>
243
- T avx2_emu_reduce_max64 (typename avx2_vector<T>::reg_t x)
243
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max64 (typename avx2_vector<T>::reg_t x)
244
244
{
245
245
using vtype = avx2_vector<T>;
246
246
typename vtype::reg_t inter1 = vtype::max (
@@ -251,7 +251,7 @@ T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
251
251
}
252
252
253
253
template <typename T>
254
- T avx2_emu_reduce_min64 (typename avx2_vector<T>::reg_t x)
254
+ X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min64 (typename avx2_vector<T>::reg_t x)
255
255
{
256
256
using vtype = avx2_vector<T>;
257
257
typename vtype::reg_t inter1 = vtype::min (
@@ -262,7 +262,7 @@ T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
262
262
}
263
263
264
264
template <typename T>
265
- void avx2_emu_mask_compressstoreu32 (void *base_addr,
265
+ X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32 (void *base_addr,
266
266
typename avx2_vector<T>::opmask_t k,
267
267
typename avx2_vector<T>::reg_t reg)
268
268
{
@@ -282,7 +282,7 @@ void avx2_emu_mask_compressstoreu32(void *base_addr,
282
282
}
283
283
284
284
template <typename T>
285
- void avx2_emu_mask_compressstoreu32_half (
285
+ X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32_half (
286
286
void *base_addr,
287
287
typename avx2_half_vector<T>::opmask_t k,
288
288
typename avx2_half_vector<T>::reg_t reg)
@@ -305,7 +305,7 @@ void avx2_emu_mask_compressstoreu32_half(
305
305
}
306
306
307
307
template <typename T>
308
- void avx2_emu_mask_compressstoreu64 (void *base_addr,
308
+ X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu64 (void *base_addr,
309
309
typename avx2_vector<T>::opmask_t k,
310
310
typename avx2_vector<T>::reg_t reg)
311
311
{
@@ -326,7 +326,7 @@ void avx2_emu_mask_compressstoreu64(void *base_addr,
326
326
}
327
327
328
328
template <typename T>
329
- int avx2_double_compressstore32 (void *left_addr,
329
+ X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32 (void *left_addr,
330
330
void *right_addr,
331
331
typename avx2_vector<T>::opmask_t k,
332
332
typename avx2_vector<T>::reg_t reg)
@@ -349,7 +349,7 @@ int avx2_double_compressstore32(void *left_addr,
349
349
}
350
350
351
351
template <typename T>
352
- int avx2_double_compressstore32_half (void *left_addr,
352
+ X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32_half (void *left_addr,
353
353
void *right_addr,
354
354
typename avx2_half_vector<T>::opmask_t k,
355
355
typename avx2_half_vector<T>::reg_t reg)
@@ -373,7 +373,7 @@ int avx2_double_compressstore32_half(void *left_addr,
373
373
}
374
374
375
375
template <typename T>
376
- int32_t avx2_double_compressstore64 (void *left_addr,
376
+ X86_SIMD_SORT_FORCE_INLINE int32_t avx2_double_compressstore64 (void *left_addr,
377
377
void *right_addr,
378
378
typename avx2_vector<T>::opmask_t k,
379
379
typename avx2_vector<T>::reg_t reg)
@@ -397,7 +397,7 @@ int32_t avx2_double_compressstore64(void *left_addr,
397
397
}
398
398
399
399
template <typename T>
400
- typename avx2_vector<T>::reg_t avx2_emu_max (typename avx2_vector<T>::reg_t x,
400
+ X86_SIMD_SORT_FORCE_INLINE typename avx2_vector<T>::reg_t avx2_emu_max (typename avx2_vector<T>::reg_t x,
401
401
typename avx2_vector<T>::reg_t y)
402
402
{
403
403
using vtype = avx2_vector<T>;
@@ -408,7 +408,7 @@ typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
408
408
}
409
409
410
410
template <typename T>
411
- typename avx2_vector<T>::reg_t avx2_emu_min (typename avx2_vector<T>::reg_t x,
411
+ X86_SIMD_SORT_FORCE_INLINE typename avx2_vector<T>::reg_t avx2_emu_min (typename avx2_vector<T>::reg_t x,
412
412
typename avx2_vector<T>::reg_t y)
413
413
{
414
414
using vtype = avx2_vector<T>;
0 commit comments