Skip to content

Commit f580fdd

Browse files
committed
Explicitly request forced inlining for small functions that take SIMD arguments, to prevent registers from being spilled to the stack under the Windows ABI
Signed-off-by: Fabio Cannizzo <[email protected]>
1 parent 24d18d9 commit f580fdd

17 files changed

+762
-760
lines changed

src/avx2-32bit-half.hpp

Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

src/avx2-32bit-qsort.hpp

Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

src/avx2-64bit-qsort.hpp

Lines changed: 105 additions & 105 deletions
Large diffs are not rendered by default.

src/avx2-emu-funcs.hpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -143,48 +143,48 @@ constexpr auto avx2_compressstore_lut64_perm
143143
constexpr auto avx2_compressstore_lut64_left
144144
= avx2_compressstore_lut64_gen.second;
145145

146-
X86_SIMD_SORT_INLINE
146+
X86_SIMD_SORT_FORCE_INLINE
147147
__m256i convert_int_to_avx2_mask(int32_t m)
148148
{
149149
return _mm256_loadu_si256(
150150
(const __m256i *)avx2_mask_helper_lut32[m].data());
151151
}
152152

153-
X86_SIMD_SORT_INLINE
153+
X86_SIMD_SORT_FORCE_INLINE
154154
int32_t convert_avx2_mask_to_int(__m256i m)
155155
{
156156
return _mm256_movemask_ps(_mm256_castsi256_ps(m));
157157
}
158158

159-
X86_SIMD_SORT_INLINE
159+
X86_SIMD_SORT_FORCE_INLINE
160160
__m256i convert_int_to_avx2_mask_64bit(int32_t m)
161161
{
162162
return _mm256_loadu_si256(
163163
(const __m256i *)avx2_mask_helper_lut64[m].data());
164164
}
165165

166-
X86_SIMD_SORT_INLINE
166+
X86_SIMD_SORT_FORCE_INLINE
167167
int32_t convert_avx2_mask_to_int_64bit(__m256i m)
168168
{
169169
return _mm256_movemask_pd(_mm256_castsi256_pd(m));
170170
}
171171

172-
X86_SIMD_SORT_INLINE
172+
X86_SIMD_SORT_FORCE_INLINE
173173
__m128i convert_int_to_avx2_mask_half(int32_t m)
174174
{
175175
return _mm_loadu_si128(
176176
(const __m128i *)avx2_mask_helper_lut32_half[m].data());
177177
}
178178

179-
X86_SIMD_SORT_INLINE
179+
X86_SIMD_SORT_FORCE_INLINE
180180
int32_t convert_avx2_mask_to_int_half(__m128i m)
181181
{
182182
return _mm_movemask_ps(_mm_castsi128_ps(m));
183183
}
184184

185185
// Emulators for intrinsics missing from AVX2 compared to AVX512
186186
template <typename T>
187-
T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
187+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
188188
{
189189
using vtype = avx2_vector<T>;
190190
using reg_t = typename vtype::reg_t;
@@ -199,7 +199,7 @@ T avx2_emu_reduce_max32(typename avx2_vector<T>::reg_t x)
199199
}
200200

201201
template <typename T>
202-
T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
202+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
203203
{
204204
using vtype = avx2_half_vector<T>;
205205
using reg_t = typename vtype::reg_t;
@@ -212,7 +212,7 @@ T avx2_emu_reduce_max32_half(typename avx2_half_vector<T>::reg_t x)
212212
}
213213

214214
template <typename T>
215-
T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
215+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
216216
{
217217
using vtype = avx2_vector<T>;
218218
using reg_t = typename vtype::reg_t;
@@ -227,7 +227,7 @@ T avx2_emu_reduce_min32(typename avx2_vector<T>::reg_t x)
227227
}
228228

229229
template <typename T>
230-
T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
230+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
231231
{
232232
using vtype = avx2_half_vector<T>;
233233
using reg_t = typename vtype::reg_t;
@@ -240,7 +240,7 @@ T avx2_emu_reduce_min32_half(typename avx2_half_vector<T>::reg_t x)
240240
}
241241

242242
template <typename T>
243-
T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
243+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
244244
{
245245
using vtype = avx2_vector<T>;
246246
typename vtype::reg_t inter1 = vtype::max(
@@ -251,7 +251,7 @@ T avx2_emu_reduce_max64(typename avx2_vector<T>::reg_t x)
251251
}
252252

253253
template <typename T>
254-
T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
254+
X86_SIMD_SORT_FORCE_INLINE T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
255255
{
256256
using vtype = avx2_vector<T>;
257257
typename vtype::reg_t inter1 = vtype::min(
@@ -262,7 +262,7 @@ T avx2_emu_reduce_min64(typename avx2_vector<T>::reg_t x)
262262
}
263263

264264
template <typename T>
265-
void avx2_emu_mask_compressstoreu32(void *base_addr,
265+
X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32(void *base_addr,
266266
typename avx2_vector<T>::opmask_t k,
267267
typename avx2_vector<T>::reg_t reg)
268268
{
@@ -282,7 +282,7 @@ void avx2_emu_mask_compressstoreu32(void *base_addr,
282282
}
283283

284284
template <typename T>
285-
void avx2_emu_mask_compressstoreu32_half(
285+
X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu32_half(
286286
void *base_addr,
287287
typename avx2_half_vector<T>::opmask_t k,
288288
typename avx2_half_vector<T>::reg_t reg)
@@ -305,7 +305,7 @@ void avx2_emu_mask_compressstoreu32_half(
305305
}
306306

307307
template <typename T>
308-
void avx2_emu_mask_compressstoreu64(void *base_addr,
308+
X86_SIMD_SORT_FORCE_INLINE void avx2_emu_mask_compressstoreu64(void *base_addr,
309309
typename avx2_vector<T>::opmask_t k,
310310
typename avx2_vector<T>::reg_t reg)
311311
{
@@ -326,7 +326,7 @@ void avx2_emu_mask_compressstoreu64(void *base_addr,
326326
}
327327

328328
template <typename T>
329-
int avx2_double_compressstore32(void *left_addr,
329+
X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32(void *left_addr,
330330
void *right_addr,
331331
typename avx2_vector<T>::opmask_t k,
332332
typename avx2_vector<T>::reg_t reg)
@@ -349,7 +349,7 @@ int avx2_double_compressstore32(void *left_addr,
349349
}
350350

351351
template <typename T>
352-
int avx2_double_compressstore32_half(void *left_addr,
352+
X86_SIMD_SORT_FORCE_INLINE int avx2_double_compressstore32_half(void *left_addr,
353353
void *right_addr,
354354
typename avx2_half_vector<T>::opmask_t k,
355355
typename avx2_half_vector<T>::reg_t reg)
@@ -373,7 +373,7 @@ int avx2_double_compressstore32_half(void *left_addr,
373373
}
374374

375375
template <typename T>
376-
int32_t avx2_double_compressstore64(void *left_addr,
376+
X86_SIMD_SORT_FORCE_INLINE int32_t avx2_double_compressstore64(void *left_addr,
377377
void *right_addr,
378378
typename avx2_vector<T>::opmask_t k,
379379
typename avx2_vector<T>::reg_t reg)
@@ -397,7 +397,7 @@ int32_t avx2_double_compressstore64(void *left_addr,
397397
}
398398

399399
template <typename T>
400-
typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
400+
X86_SIMD_SORT_FORCE_INLINE typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
401401
typename avx2_vector<T>::reg_t y)
402402
{
403403
using vtype = avx2_vector<T>;
@@ -408,7 +408,7 @@ typename avx2_vector<T>::reg_t avx2_emu_max(typename avx2_vector<T>::reg_t x,
408408
}
409409

410410
template <typename T>
411-
typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
411+
X86_SIMD_SORT_FORCE_INLINE typename avx2_vector<T>::reg_t avx2_emu_min(typename avx2_vector<T>::reg_t x,
412412
typename avx2_vector<T>::reg_t y)
413413
{
414414
using vtype = avx2_vector<T>;

src/avx512-16bit-common.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
struct avx512_16bit_swizzle_ops {
1111
template <typename vtype, int scale>
12-
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
12+
static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
1313
{
1414
__m512i v = vtype::cast_to(reg);
1515

@@ -41,7 +41,7 @@ struct avx512_16bit_swizzle_ops {
4141
}
4242

4343
template <typename vtype, int scale>
44-
X86_SIMD_SORT_INLINE typename vtype::reg_t
44+
static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t
4545
reverse_n(typename vtype::reg_t reg)
4646
{
4747
__m512i v = vtype::cast_to(reg);
@@ -82,7 +82,7 @@ struct avx512_16bit_swizzle_ops {
8282
}
8383

8484
template <typename vtype, int scale>
85-
X86_SIMD_SORT_INLINE typename vtype::reg_t
85+
static X86_SIMD_SORT_FORCE_INLINE typename vtype::reg_t
8686
merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
8787
{
8888
__m512i v1 = vtype::cast_to(reg);

0 commit comments

Comments
 (0)