Merge pull request #4 from r-devulap/cygwin-bug

Raghuveer Devulapalli · web-flow · commit 7d7591cf5927 · 2023-01-31T12:48:16.000-08:00
Force inline on cygwin only
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
@@ -374,7 +374,7 @@ struct zmm_vector<uint16_t> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 {
     // Level 1
     zmm = cmp_merge<vtype>(
@@ -434,7 +434,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 {
     // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
     zmm = cmp_merge<vtype>(
@@ -460,7 +460,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2);
@@ -474,7 +474,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]);
@@ -495,7 +495,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_16bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF;
     typename vtype::zmm_t zmm
@@ -504,7 +504,7 @@ X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_16bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_16bit<vtype>(arr, N);
@@ -523,7 +523,7 @@ X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_16bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_16bit<vtype>(arr, N);
@@ -556,9 +556,9 @@ X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_16bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 32
     int64_t size = (right - left) / 32;
@@ -657,8 +657,8 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_16bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
-                                                   int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(uint16_t *arr,
+                                                  int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -676,7 +676,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -336,7 +336,7 @@ struct zmm_vector<float> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 {
     zmm = cmp_merge<vtype>(
             zmm,
@@ -383,7 +383,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 {
     // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
     zmm = cmp_merge<vtype>(
@@ -410,7 +410,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
@@ -424,7 +424,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
     zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
@@ -482,7 +482,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
     typename vtype::zmm_t zmm
@@ -491,7 +491,7 @@ X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_32bit<vtype>(arr, N);
@@ -509,7 +509,7 @@ X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_32bit<vtype>(arr, N);
@@ -540,7 +540,7 @@ X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_32bit<vtype>(arr, N);
@@ -592,9 +592,9 @@ X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_32bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 16
     int64_t size = (right - left) / 16;
@@ -656,7 +656,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -672,7 +672,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp
@@ -330,7 +330,7 @@ struct zmm_vector<double> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm = cmp_merge<vtype>(
@@ -353,7 +353,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 {
 
     // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7
@@ -374,7 +374,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
@@ -389,7 +389,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network
@@ -411,7 +411,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]);
@@ -519,7 +519,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x01 << N) - 0x01;
     typename vtype::zmm_t zmm
@@ -528,7 +528,7 @@ X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N)
 {
     if (N <= 8) {
         sort_8_64bit<vtype>(arr, N);
@@ -546,7 +546,7 @@ X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_64bit<vtype>(arr, N);
@@ -577,7 +577,7 @@ X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_64bit<vtype>(arr, N);
@@ -628,7 +628,7 @@ X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_64bit<vtype>(arr, N);
@@ -718,9 +718,9 @@ X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_64bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 8
     int64_t size = (right - left) / 8;
@@ -769,7 +769,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask8 loadmask = 0xFF;
@@ -785,7 +785,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
@@ -64,10 +64,20 @@
 #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 
 #ifdef _MSC_VER
+#define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __forceinline
+#elif defined(__CYGWIN__)
+/*
+ * Force inline in cygwin to work around a compiler bug. See
+ * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
+ */
+#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #elif defined(__GNUC__)
-#define X86_SIMD_SORT_FINLINE static inline //__attribute__((always_inline))
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #else
+#define X86_SIMD_SORT_INLINE static
 #define X86_SIMD_SORT_FINLINE static
 #endif
 

Original file line number	Diff line number	Diff line change
`@@ -374,7 +374,7 @@ struct zmm_vector<uint16_t> {`
`374`	`374`	`* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`375`	`375`	`*/`
`376`	`376`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`377`		`-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)`
	`377`	`+X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm)`
`378`	`378`	`{`
`379`	`379`	`// Level 1`
`380`	`380`	`zmm = cmp_merge<vtype>(`
`@@ -434,7 +434,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)`
`434`	`434`
`435`	`435`	`// Assumes zmm is bitonic and performs a recursive half cleaner`
`436`	`436`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`437`		`-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)`
	`437`	`+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)`
`438`	`438`	`{`
`439`	`439`	`// 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..`
`440`	`440`	`zmm = cmp_merge<vtype>(`
`@@ -460,7 +460,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)`
`460`	`460`
`461`	`461`	`// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner`
`462`	`462`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`463`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)`
	`463`	`+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)`
`464`	`464`	`{`
`465`	`465`	`// 1) First step of a merging network: coex of zmm1 and zmm2 reversed`
`466`	`466`	`zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2);`
`@@ -474,7 +474,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)`
`474`	`474`	`// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive`
`475`	`475`	`// half cleaner`
`476`	`476`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`477`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)`
	`477`	`+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)`
`478`	`478`	`{`
`479`	`479`	`zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]);`
`480`	`480`	`zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]);`
`@@ -495,7 +495,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)`
`495`	`495`	`}`
`496`	`496`
`497`	`497`	`template <typename vtype, typename type_t>`
`498`		`-X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)`
	`498`	`+X86_SIMD_SORT_INLINE void sort_32_16bit(type_t *arr, int32_t N)`
`499`	`499`	`{`
`500`	`500`	`typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF;`
`501`	`501`	`typename vtype::zmm_t zmm`
`@@ -504,7 +504,7 @@ X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)`
`504`	`504`	`}`
`505`	`505`
`506`	`506`	`template <typename vtype, typename type_t>`
`507`		`-X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)`
	`507`	`+X86_SIMD_SORT_INLINE void sort_64_16bit(type_t *arr, int32_t N)`
`508`	`508`	`{`
`509`	`509`	`if (N <= 32) {`
`510`	`510`	`sort_32_16bit<vtype>(arr, N);`
`@@ -523,7 +523,7 @@ X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)`
`523`	`523`	`}`
`524`	`524`
`525`	`525`	`template <typename vtype, typename type_t>`
`526`		`-X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)`
	`526`	`+X86_SIMD_SORT_INLINE void sort_128_16bit(type_t *arr, int32_t N)`
`527`	`527`	`{`
`528`	`528`	`if (N <= 64) {`
`529`	`529`	`sort_64_16bit<vtype>(arr, N);`
`@@ -556,9 +556,9 @@ X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)`
`556`	`556`	`}`
`557`	`557`
`558`	`558`	`template <typename vtype, typename type_t>`
`559`		`-X86_SIMD_SORT_FINLINE type_t get_pivot_16bit(type_t *arr,`
`560`		`- const int64_t left,`
`561`		`- const int64_t right)`
	`559`	`+X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr,`
	`560`	`+ const int64_t left,`
	`561`	`+ const int64_t right)`
`562`	`562`	`{`
`563`	`563`	`// median of 32`
`564`	`564`	`int64_t size = (right - left) / 32;`
`@@ -657,8 +657,8 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)`
`657`	`657`	`qsort_16bit_<vtype>(arr, pivot_index, right, max_iters - 1);`
`658`	`658`	`}`
`659`	`659`
`660`		`-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,`
`661`		`- int64_t arrsize)`
	`660`	`+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(uint16_t *arr,`
	`661`	`+ int64_t arrsize)`
`662`	`662`	`{`
`663`	`663`	`int64_t nan_count = 0;`
`664`	`664`	`__mmask16 loadmask = 0xFFFF;`
`@@ -676,7 +676,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,`
`676`	`676`	`return nan_count;`
`677`	`677`	`}`
`678`	`678`
`679`		`-X86_SIMD_SORT_FINLINE void`
	`679`	`+X86_SIMD_SORT_INLINE void`
`680`	`680`	`replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)`
`681`	`681`	`{`
`682`	`682`	`for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {`
Original file line number	Diff line number	Diff line change
`@@ -336,7 +336,7 @@ struct zmm_vector<float> {`
`336`	`336`	`* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`337`	`337`	`*/`
`338`	`338`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`339`		`-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)`
	`339`	`+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm)`
`340`	`340`	`{`
`341`	`341`	`zmm = cmp_merge<vtype>(`
`342`	`342`	`zmm,`
`@@ -383,7 +383,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)`
`383`	`383`
`384`	`384`	`// Assumes zmm is bitonic and performs a recursive half cleaner`
`385`	`385`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`386`		`-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)`
	`386`	`+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)`
`387`	`387`	`{`
`388`	`388`	`// 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..`
`389`	`389`	`zmm = cmp_merge<vtype>(`
`@@ -410,7 +410,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)`
`410`	`410`
`411`	`411`	`// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner`
`412`	`412`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`413`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)`
	`413`	`+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)`
`414`	`414`	`{`
`415`	`415`	`// 1) First step of a merging network: coex of zmm1 and zmm2 reversed`
`416`	`416`	`*zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);`
`@@ -424,7 +424,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)`
`424`	`424`	`// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive`
`425`	`425`	`// half cleaner`
`426`	`426`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`427`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)`
	`427`	`+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)`
`428`	`428`	`{`
`429`	`429`	`zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);`
`430`	`430`	`zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);`
`@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)`
`445`	`445`	`}`
`446`	`446`
`447`	`447`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`448`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)`
	`448`	`+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)`
`449`	`449`	`{`
`450`	`450`	`zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);`
`451`	`451`	`zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);`
`@@ -482,7 +482,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)`
`482`	`482`	`}`
`483`	`483`
`484`	`484`	`template <typename vtype, typename type_t>`
`485`		`-X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)`
	`485`	`+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N)`
`486`	`486`	`{`
`487`	`487`	`typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;`
`488`	`488`	`typename vtype::zmm_t zmm`
`@@ -491,7 +491,7 @@ X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)`
`491`	`491`	`}`
`492`	`492`
`493`	`493`	`template <typename vtype, typename type_t>`
`494`		`-X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)`
	`494`	`+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N)`
`495`	`495`	`{`
`496`	`496`	`if (N <= 16) {`
`497`	`497`	`sort_16_32bit<vtype>(arr, N);`
`@@ -509,7 +509,7 @@ X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)`
`509`	`509`	`}`
`510`	`510`
`511`	`511`	`template <typename vtype, typename type_t>`
`512`		`-X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)`
	`512`	`+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N)`
`513`	`513`	`{`
`514`	`514`	`if (N <= 32) {`
`515`	`515`	`sort_32_32bit<vtype>(arr, N);`
`@@ -540,7 +540,7 @@ X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)`
`540`	`540`	`}`
`541`	`541`
`542`	`542`	`template <typename vtype, typename type_t>`
`543`		`-X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)`
	`543`	`+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N)`
`544`	`544`	`{`
`545`	`545`	`if (N <= 64) {`
`546`	`546`	`sort_64_32bit<vtype>(arr, N);`
`@@ -592,9 +592,9 @@ X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)`
`592`	`592`	`}`
`593`	`593`
`594`	`594`	`template <typename vtype, typename type_t>`
`595`		`-X86_SIMD_SORT_FINLINE type_t get_pivot_32bit(type_t *arr,`
`596`		`- const int64_t left,`
`597`		`- const int64_t right)`
	`595`	`+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr,`
	`596`	`+ const int64_t left,`
	`597`	`+ const int64_t right)`
`598`	`598`	`{`
`599`	`599`	`// median of 16`
`600`	`600`	`int64_t size = (right - left) / 16;`
`@@ -656,7 +656,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)`
`656`	`656`	`qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);`
`657`	`657`	`}`
`658`	`658`
`659`		`-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)`
	`659`	`+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)`
`660`	`660`	`{`
`661`	`661`	`int64_t nan_count = 0;`
`662`	`662`	`__mmask16 loadmask = 0xFFFF;`
`@@ -672,7 +672,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)`
`672`	`672`	`return nan_count;`
`673`	`673`	`}`
`674`	`674`
`675`		`-X86_SIMD_SORT_FINLINE void`
	`675`	`+X86_SIMD_SORT_INLINE void`
`676`	`676`	`replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)`
`677`	`677`	`{`
`678`	`678`	`for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {`
Original file line number	Diff line number	Diff line change
`@@ -330,7 +330,7 @@ struct zmm_vector<double> {`
`330`	`330`	`* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`331`	`331`	`*/`
`332`	`332`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`333`		`-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)`
	`333`	`+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)`
`334`	`334`	`{`
`335`	`335`	`const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);`
`336`	`336`	`zmm = cmp_merge<vtype>(`
`@@ -353,7 +353,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)`
`353`	`353`
`354`	`354`	`// Assumes zmm is bitonic and performs a recursive half cleaner`
`355`	`355`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`356`		`-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)`
	`356`	`+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)`
`357`	`357`	`{`
`358`	`358`
`359`	`359`	`// 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7`
`@@ -374,7 +374,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)`
`374`	`374`
`375`	`375`	`// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner`
`376`	`376`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`377`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)`
	`377`	`+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)`
`378`	`378`	`{`
`379`	`379`	`const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);`
`380`	`380`	`// 1) First step of a merging network: coex of zmm1 and zmm2 reversed`
`@@ -389,7 +389,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)`
`389`	`389`	`// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive`
`390`	`390`	`// half cleaner`
`391`	`391`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`392`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)`
	`392`	`+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)`
`393`	`393`	`{`
`394`	`394`	`const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);`
`395`	`395`	`// 1) First step of a merging network`
`@@ -411,7 +411,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)`
`411`	`411`	`}`
`412`	`412`
`413`	`413`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`414`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)`
	`414`	`+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)`
`415`	`415`	`{`
`416`	`416`	`const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);`
`417`	`417`	`zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]);`
`@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)`
`445`	`445`	`}`
`446`	`446`
`447`	`447`	`template <typename vtype, typename zmm_t = typename vtype::zmm_t>`
`448`		`-X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)`
	`448`	`+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)`
`449`	`449`	`{`
`450`	`450`	`const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);`
`451`	`451`	`zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]);`
`@@ -519,7 +519,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)`
`519`	`519`	`}`
`520`	`520`
`521`	`521`	`template <typename vtype, typename type_t>`
`522`		`-X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)`
	`522`	`+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N)`
`523`	`523`	`{`
`524`	`524`	`typename vtype::opmask_t load_mask = (0x01 << N) - 0x01;`
`525`	`525`	`typename vtype::zmm_t zmm`
`@@ -528,7 +528,7 @@ X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)`
`528`	`528`	`}`
`529`	`529`
`530`	`530`	`template <typename vtype, typename type_t>`
`531`		`-X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)`
	`531`	`+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N)`
`532`	`532`	`{`
`533`	`533`	`if (N <= 8) {`
`534`	`534`	`sort_8_64bit<vtype>(arr, N);`
`@@ -546,7 +546,7 @@ X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)`
`546`	`546`	`}`
`547`	`547`
`548`	`548`	`template <typename vtype, typename type_t>`
`549`		`-X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)`
	`549`	`+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N)`
`550`	`550`	`{`
`551`	`551`	`if (N <= 16) {`
`552`	`552`	`sort_16_64bit<vtype>(arr, N);`
`@@ -577,7 +577,7 @@ X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)`
`577`	`577`	`}`
`578`	`578`
`579`	`579`	`template <typename vtype, typename type_t>`
`580`		`-X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)`
	`580`	`+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N)`
`581`	`581`	`{`
`582`	`582`	`if (N <= 32) {`
`583`	`583`	`sort_32_64bit<vtype>(arr, N);`
`@@ -628,7 +628,7 @@ X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)`
`628`	`628`	`}`
`629`	`629`
`630`	`630`	`template <typename vtype, typename type_t>`
`631`		`-X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)`
	`631`	`+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N)`
`632`	`632`	`{`
`633`	`633`	`if (N <= 64) {`
`634`	`634`	`sort_64_64bit<vtype>(arr, N);`
`@@ -718,9 +718,9 @@ X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)`
`718`	`718`	`}`
`719`	`719`
`720`	`720`	`template <typename vtype, typename type_t>`
`721`		`-X86_SIMD_SORT_FINLINE type_t get_pivot_64bit(type_t *arr,`
`722`		`- const int64_t left,`
`723`		`- const int64_t right)`
	`721`	`+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,`
	`722`	`+ const int64_t left,`
	`723`	`+ const int64_t right)`
`724`	`724`	`{`
`725`	`725`	`// median of 8`
`726`	`726`	`int64_t size = (right - left) / 8;`
`@@ -769,7 +769,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)`
`769`	`769`	`qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);`
`770`	`770`	`}`
`771`	`771`
`772`		`-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)`
	`772`	`+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)`
`773`	`773`	`{`
`774`	`774`	`int64_t nan_count = 0;`
`775`	`775`	`__mmask8 loadmask = 0xFF;`
`@@ -785,7 +785,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)`
`785`	`785`	`return nan_count;`
`786`	`786`	`}`
`787`	`787`
`788`		`-X86_SIMD_SORT_FINLINE void`
	`788`	`+X86_SIMD_SORT_INLINE void`
`789`	`789`	`replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count)`
`790`	`790`	`{`
`791`	`791`	`for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {`