New pivot selection to improve performance in many special cases

sterrettm2 · Raghuveer Devulapalli · commit 0041c05d070b · 2024-02-21T13:37:36.000-08:00
diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp
@@ -86,6 +86,11 @@ struct avx2_vector<int32_t> {
     {
         return _mm256_set1_epi32(type_max());
     } // TODO: this should broadcast bits as is?
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -204,6 +209,9 @@ struct avx2_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){ // true when no 32-bit lane of k has its top bit set
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; // movemask_ps collects the sign bit of each 32-bit lane
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -242,6 +250,11 @@ struct avx2_vector<uint32_t> {
     {
         return _mm256_set1_epi32(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -349,6 +362,9 @@ struct avx2_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){ // true when no 32-bit lane of k has its top bit set
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; // movemask_ps collects the sign bit of each 32-bit lane
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -387,7 +403,11 @@ struct avx2_vector<float> {
     {
         return _mm256_set1_ps(type_max());
     }
-
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static ymmi_t
     seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
     {
@@ -514,6 +534,9 @@ struct avx2_vector<float> {
     {
         return _mm256_castps_si256(v);
     }
+    static bool all_false(opmask_t k){ // true when no 32-bit lane of k has its top bit set
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; // movemask_ps collects the sign bit of each 32-bit lane
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp
@@ -68,12 +68,17 @@ struct avx2_vector<int64_t> {
     {
         return _mm256_set1_epi64x(type_max());
     } // TODO: this should broadcast bits as is?
+    static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask
+    {
+        auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear
+        return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -209,6 +214,9 @@ struct avx2_vector<int64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){ // true when no 64-bit lane of k has its top bit set
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; // movemask_pd collects bit 63 of each 64-bit lane
+    }
 };
 template <>
 struct avx2_vector<uint64_t> {
@@ -239,12 +247,17 @@ struct avx2_vector<uint64_t> {
     {
         return _mm256_set1_epi64x(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask
+    {
+        auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear
+        return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -378,6 +391,9 @@ struct avx2_vector<uint64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){ // true when no 64-bit lane of k has its top bit set
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; // movemask_pd collects bit 63 of each 64-bit lane
+    }
 };
 
 /*
@@ -421,6 +437,11 @@ struct avx2_vector<double> {
     {
         return _mm256_set1_pd(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask
+    {
+        auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear
+        return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -440,7 +461,7 @@ struct avx2_vector<double> {
             static_assert(type == (0x01 | 0x80), "should not reach here");
         }
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -571,6 +592,9 @@ struct avx2_vector<double> {
     {
         return _mm256_castpd_si256(v);
     }
+    static bool all_false(opmask_t k){ // true when no 64-bit lane of k has its top bit set
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; // movemask_pd collects bit 63 of each 64-bit lane
+    }
 };
 
 struct avx2_64bit_swizzle_ops {
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
@@ -81,6 +81,10 @@ struct zmm_vector<float16> {
                           exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);
         return _kxor_mask32(mask_ge, neg);
     }
+    static opmask_t eq(reg_t x, reg_t y) // lane-wise equality mask over 16-bit elements
+    {
+        return _mm512_cmpeq_epu16_mask(x, y); // integer compare of the raw 16-bit patterns; NOTE(review): +0/-0 and NaN presumably follow bit equality here - confirm callers expect that
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -186,6 +190,9 @@ struct zmm_vector<float16> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -238,6 +245,10 @@ struct zmm_vector<int16_t> {
     {
         return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epi16_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -323,6 +334,9 @@ struct zmm_vector<int16_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -374,6 +388,10 @@ struct zmm_vector<uint16_t> {
     {
         return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epu16_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -457,6 +475,9 @@ struct zmm_vector<uint16_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
@@ -198,6 +198,9 @@ struct zmm_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -377,6 +380,9 @@ struct zmm_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -570,6 +576,9 @@ struct zmm_vector<float> {
     {
         return _mm512_castps_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
@@ -732,6 +732,9 @@ struct zmm_vector<int64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -903,6 +906,9 @@ struct zmm_vector<uint64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -1093,6 +1099,9 @@ struct zmm_vector<double> {
     {
         return _mm512_castpd_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
@@ -150,6 +150,9 @@ struct zmm_vector<_Float16> {
     {
         return _mm512_castph_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
@@ -498,14 +498,24 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
                 arr + left, (int32_t)(right + 1 - left));
         return;
     }
-
-    type_t pivot = get_pivot_blocks<vtype, type_t>(arr, left, right);
+    
+    auto pivot_result = get_pivot_smart<vtype, type_t>(arr, left, right);
+    type_t pivot = pivot_result.pivot;
+    
+    if (pivot_result.alreadySorted){
+        return;
+    }
+    
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
 
     arrsize_t pivot_index
             = partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
                     arr, left, right + 1, pivot, &smallest, &biggest);
+    
+    if (pivot_result.only2Values){
+        return;
+    }
 
     if (pivot != smallest)
         qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp
@@ -4,6 +4,9 @@
 #include "xss-optimal-networks.hpp"
 #include "xss-common-qsort.h"
 
+template <typename vtype, typename mm_t>
+X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+
 template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
 {
@@ -140,6 +143,17 @@ X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs)
     }
 }
 
+template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
+X86_SIMD_SORT_FINLINE void sort_vectors(reg_t * vecs){
+    /* Run the initial sorting network to sort the columns of the [numVecs x
+     * num_lanes] matrix
+     */
+    bitonic_sort_n_vec<vtype, numVecs>(vecs);
+
+    // Merge the vectors using bitonic merging networks
+    merge_n_vec<vtype, numVecs>(vecs);
+}
+
 template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
 {
@@ -174,14 +188,8 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
         vecs[i] = vtype::mask_loadu(
                 vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
     }
-
-    /* Run the initial sorting network to sort the columns of the [numVecs x
-     * num_lanes] matrix
-     */
-    bitonic_sort_n_vec<vtype, numVecs>(vecs);
-
-    // Merge the vectors using bitonic merging networks
-    merge_n_vec<vtype, numVecs>(vecs);
+    
+    sort_vectors<vtype, numVecs>(vecs);
 
     // Unmasked part of the store
     X86_SIMD_SORT_UNROLL_LOOP(64)
diff --git a/src/xss-optimal-networks.hpp b/src/xss-optimal-networks.hpp
@@ -1,6 +1,9 @@
 // All of these sources files are generated from the optimal networks described in
 // https://bertdobbelaere.github.io/sorting_networks.html
 
+template <typename vtype, typename mm_t>
+X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+
 template <typename vtype, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs)
 {
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp

Original file line number	Diff line number	Diff line change
`@@ -86,6 +86,11 @@ struct avx2_vector<int32_t> {`
`86`	`86`	`{`
`87`	`87`	`return _mm256_set1_epi32(type_max());`
`88`	`88`	`} // TODO: this should broadcast bits as is?`
	`89`	`+ static opmask_t knot_opmask(opmask_t x)`
	`90`	`+ {`
	`91`	`+ auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);`
	`92`	`+ return _mm256_xor_si256(x, allOnes);`
	`93`	`+ }`
`89`	`94`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`90`	`95`	`{`
`91`	`96`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`@@ -204,6 +209,9 @@ struct avx2_vector<int32_t> {`
`204`	`209`	`{`
`205`	`210`	`return v;`
`206`	`211`	`}`
	`212`	`+ static bool all_false(opmask_t k){`
	`213`	`+ return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;`
	`214`	`+ }`
`207`	`215`	`static int double_compressstore(type_t *left_addr,`
`208`	`216`	`type_t *right_addr,`
`209`	`217`	`opmask_t k,`
`@@ -242,6 +250,11 @@ struct avx2_vector<uint32_t> {`
`242`	`250`	`{`
`243`	`251`	`return _mm256_set1_epi32(type_max());`
`244`	`252`	`}`
	`253`	`+ static opmask_t knot_opmask(opmask_t x)`
	`254`	`+ {`
	`255`	`+ auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);`
	`256`	`+ return _mm256_xor_si256(x, allOnes);`
	`257`	`+ }`
`245`	`258`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`246`	`259`	`{`
`247`	`260`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`@@ -349,6 +362,9 @@ struct avx2_vector<uint32_t> {`
`349`	`362`	`{`
`350`	`363`	`return v;`
`351`	`364`	`}`
	`365`	`+ static bool all_false(opmask_t k){`
	`366`	`+ return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;`
	`367`	`+ }`
`352`	`368`	`static int double_compressstore(type_t *left_addr,`
`353`	`369`	`type_t *right_addr,`
`354`	`370`	`opmask_t k,`
`@@ -387,7 +403,11 @@ struct avx2_vector<float> {`
`387`	`403`	`{`
`388`	`404`	`return _mm256_set1_ps(type_max());`
`389`	`405`	`}`
`390`		`-`
	`406`	`+ static opmask_t knot_opmask(opmask_t x)`
	`407`	`+ {`
	`408`	`+ auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);`
	`409`	`+ return _mm256_xor_si256(x, allOnes);`
	`410`	`+ }`
`391`	`411`	`static ymmi_t`
`392`	`412`	`seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)`
`393`	`413`	`{`
`@@ -514,6 +534,9 @@ struct avx2_vector<float> {`
`514`	`534`	`{`
`515`	`535`	`return _mm256_castps_si256(v);`
`516`	`536`	`}`
	`537`	`+ static bool all_false(opmask_t k){`
	`538`	`+ return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;`
	`539`	`+ }`
`517`	`540`	`static int double_compressstore(type_t *left_addr,`
`518`	`541`	`type_t *right_addr,`
`519`	`542`	`opmask_t k,`
Original file line number	Diff line number	Diff line change
`@@ -68,12 +68,17 @@ struct avx2_vector<int64_t> {`
`68`	`68`	`{`
`69`	`69`	`return _mm256_set1_epi64x(type_max());`
`70`	`70`	`} // TODO: this should broadcast bits as is?`
	`71`	`+ static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask`
	`72`	`+ {`
	`73`	`+ auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear`
	`74`	`+ return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads`
	`75`	`+ }`
`71`	`76`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`72`	`77`	`{`
`73`	`78`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`74`	`79`	`return convert_int_to_avx2_mask_64bit(mask);`
`75`	`80`	`}`
`76`		`- static ymmi_t seti(int v1, int v2, int v3, int v4)`
	`81`	`+ static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)`
`77`	`82`	`{`
`78`	`83`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`79`	`84`	`}`
`@@ -209,6 +214,9 @@ struct avx2_vector<int64_t> {`
`209`	`214`	`{`
`210`	`215`	`return v;`
`211`	`216`	`}`
	`217`	`+ static bool all_false(opmask_t k){`
	`218`	`+ return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;`
	`219`	`+ }`
`212`	`220`	`};`
`213`	`221`	`template <>`
`214`	`222`	`struct avx2_vector<uint64_t> {`
`@@ -239,12 +247,17 @@ struct avx2_vector<uint64_t> {`
`239`	`247`	`{`
`240`	`248`	`return _mm256_set1_epi64x(type_max());`
`241`	`249`	`}`
	`250`	`+ static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask`
	`251`	`+ {`
	`252`	`+ auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear`
	`253`	`+ return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads`
	`254`	`+ }`
`242`	`255`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`243`	`256`	`{`
`244`	`257`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`245`	`258`	`return convert_int_to_avx2_mask_64bit(mask);`
`246`	`259`	`}`
`247`		`- static ymmi_t seti(int v1, int v2, int v3, int v4)`
	`260`	`+ static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)`
`248`	`261`	`{`
`249`	`262`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`250`	`263`	`}`
`@@ -378,6 +391,9 @@ struct avx2_vector<uint64_t> {`
`378`	`391`	`{`
`379`	`392`	`return v;`
`380`	`393`	`}`
	`394`	`+ static bool all_false(opmask_t k){`
	`395`	`+ return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;`
	`396`	`+ }`
`381`	`397`	`};`
`382`	`398`
`383`	`399`	`/*`
`@@ -421,6 +437,11 @@ struct avx2_vector<double> {`
`421`	`437`	`{`
`422`	`438`	`return _mm256_set1_pd(type_max());`
`423`	`439`	`}`
	`440`	`+ static opmask_t knot_opmask(opmask_t x) // logical NOT of a 4 x 64-bit lane mask`
	`441`	`+ {`
	`442`	`+ auto allTrue = _mm256_set1_epi64x(-1); // all 64 bits set per lane; 0xFFFF'FFFF would leave bits 32..63 clear`
	`443`	`+ return _mm256_xor_si256(x, allTrue); // flips every bit, including bit 63 which all_false() reads`
	`444`	`+ }`
`424`	`445`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`425`	`446`	`{`
`426`	`447`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`@@ -440,7 +461,7 @@ struct avx2_vector<double> {`
`440`	`461`	`static_assert(type == (0x01 \| 0x80), "should not reach here");`
`441`	`462`	`}`
`442`	`463`	`}`
`443`		`- static ymmi_t seti(int v1, int v2, int v3, int v4)`
	`464`	`+ static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)`
`444`	`465`	`{`
`445`	`466`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`446`	`467`	`}`
`@@ -571,6 +592,9 @@ struct avx2_vector<double> {`
`571`	`592`	`{`
`572`	`593`	`return _mm256_castpd_si256(v);`
`573`	`594`	`}`
	`595`	`+ static bool all_false(opmask_t k){`
	`596`	`+ return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;`
	`597`	`+ }`
`574`	`598`	`};`
`575`	`599`
`576`	`600`	`struct avx2_64bit_swizzle_ops {`
Original file line number	Diff line number	Diff line change
`@@ -81,6 +81,10 @@ struct zmm_vector<float16> {`
`81`	`81`	`exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);`
`82`	`82`	`return _kxor_mask32(mask_ge, neg);`
`83`	`83`	`}`
	`84`	`+ static opmask_t eq(reg_t x, reg_t y)`
	`85`	`+ {`
	`86`	`+ return _mm512_cmpeq_epu16_mask(x, y);`
	`87`	`+ }`
`84`	`88`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`85`	`89`	`{`
`86`	`90`	`return ((0x1ull << num_to_read) - 0x1ull);`
`@@ -186,6 +190,9 @@ struct zmm_vector<float16> {`
`186`	`190`	`{`
`187`	`191`	`return v;`
`188`	`192`	`}`
	`193`	`+ static bool all_false(opmask_t k){`
	`194`	`+ return k == 0;`
	`195`	`+ }`
`189`	`196`	`static int double_compressstore(type_t *left_addr,`
`190`	`197`	`type_t *right_addr,`
`191`	`198`	`opmask_t k,`
`@@ -238,6 +245,10 @@ struct zmm_vector<int16_t> {`
`238`	`245`	`{`
`239`	`246`	`return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);`
`240`	`247`	`}`
	`248`	`+ static opmask_t eq(reg_t x, reg_t y)`
	`249`	`+ {`
	`250`	`+ return _mm512_cmpeq_epi16_mask(x, y);`
	`251`	`+ }`
`241`	`252`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`242`	`253`	`{`
`243`	`254`	`return ((0x1ull << num_to_read) - 0x1ull);`
`@@ -323,6 +334,9 @@ struct zmm_vector<int16_t> {`
`323`	`334`	`{`
`324`	`335`	`return v;`
`325`	`336`	`}`
	`337`	`+ static bool all_false(opmask_t k){`
	`338`	`+ return k == 0;`
	`339`	`+ }`
`326`	`340`	`static int double_compressstore(type_t *left_addr,`
`327`	`341`	`type_t *right_addr,`
`328`	`342`	`opmask_t k,`
`@@ -374,6 +388,10 @@ struct zmm_vector<uint16_t> {`
`374`	`388`	`{`
`375`	`389`	`return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);`
`376`	`390`	`}`
	`391`	`+ static opmask_t eq(reg_t x, reg_t y)`
	`392`	`+ {`
	`393`	`+ return _mm512_cmpeq_epu16_mask(x, y);`
	`394`	`+ }`
`377`	`395`	`static opmask_t get_partial_loadmask(uint64_t num_to_read)`
`378`	`396`	`{`
`379`	`397`	`return ((0x1ull << num_to_read) - 0x1ull);`
`@@ -457,6 +475,9 @@ struct zmm_vector<uint16_t> {`
`457`	`475`	`{`
`458`	`476`	`return v;`
`459`	`477`	`}`
	`478`	`+ static bool all_false(opmask_t k){`
	`479`	`+ return k == 0;`
	`480`	`+ }`
`460`	`481`	`static int double_compressstore(type_t *left_addr,`
`461`	`482`	`type_t *right_addr,`
`462`	`483`	`opmask_t k,`
Original file line number	Diff line number	Diff line change
`@@ -198,6 +198,9 @@ struct zmm_vector<int32_t> {`
`198`	`198`	`{`
`199`	`199`	`return v;`
`200`	`200`	`}`
	`201`	`+ static bool all_false(opmask_t k){`
	`202`	`+ return k == 0;`
	`203`	`+ }`
`201`	`204`	`static int double_compressstore(type_t *left_addr,`
`202`	`205`	`type_t *right_addr,`
`203`	`206`	`opmask_t k,`
`@@ -377,6 +380,9 @@ struct zmm_vector<uint32_t> {`
`377`	`380`	`{`
`378`	`381`	`return v;`
`379`	`382`	`}`
	`383`	`+ static bool all_false(opmask_t k){`
	`384`	`+ return k == 0;`
	`385`	`+ }`
`380`	`386`	`static int double_compressstore(type_t *left_addr,`
`381`	`387`	`type_t *right_addr,`
`382`	`388`	`opmask_t k,`
`@@ -570,6 +576,9 @@ struct zmm_vector<float> {`
`570`	`576`	`{`
`571`	`577`	`return _mm512_castps_si512(v);`
`572`	`578`	`}`
	`579`	`+ static bool all_false(opmask_t k){`
	`580`	`+ return k == 0;`
	`581`	`+ }`
`573`	`582`	`static int double_compressstore(type_t *left_addr,`
`574`	`583`	`type_t *right_addr,`
`575`	`584`	`opmask_t k,`
Original file line number	Diff line number	Diff line change
`@@ -732,6 +732,9 @@ struct zmm_vector<int64_t> {`
`732`	`732`	`{`
`733`	`733`	`return v;`
`734`	`734`	`}`
	`735`	`+ static bool all_false(opmask_t k){`
	`736`	`+ return k == 0;`
	`737`	`+ }`
`735`	`738`	`static int double_compressstore(type_t *left_addr,`
`736`	`739`	`type_t *right_addr,`
`737`	`740`	`opmask_t k,`
`@@ -903,6 +906,9 @@ struct zmm_vector<uint64_t> {`
`903`	`906`	`{`
`904`	`907`	`return v;`
`905`	`908`	`}`
	`909`	`+ static bool all_false(opmask_t k){`
	`910`	`+ return k == 0;`
	`911`	`+ }`
`906`	`912`	`static int double_compressstore(type_t *left_addr,`
`907`	`913`	`type_t *right_addr,`
`908`	`914`	`opmask_t k,`
`@@ -1093,6 +1099,9 @@ struct zmm_vector<double> {`
`1093`	`1099`	`{`
`1094`	`1100`	`return _mm512_castpd_si512(v);`
`1095`	`1101`	`}`
	`1102`	`+ static bool all_false(opmask_t k){`
	`1103`	`+ return k == 0;`
	`1104`	`+ }`
`1096`	`1105`	`static int double_compressstore(type_t *left_addr,`
`1097`	`1106`	`type_t *right_addr,`
`1098`	`1107`	`opmask_t k,`
Original file line number	Diff line number	Diff line change
`@@ -150,6 +150,9 @@ struct zmm_vector<_Float16> {`
`150`	`150`	`{`
`151`	`151`	`return _mm512_castph_si512(v);`
`152`	`152`	`}`
	`153`	`+ static bool all_false(opmask_t k){`
	`154`	`+ return k == 0;`
	`155`	`+ }`
`153`	`156`	`static int double_compressstore(type_t *left_addr,`
`154`	`157`	`type_t *right_addr,`
`155`	`158`	`opmask_t k,`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,9 @@`
`1`	`1`	`// All of these sources files are generated from the optimal networks described in`
`2`	`2`	`// https://bertdobbelaere.github.io/sorting_networks.html`
`3`	`3`
	`4`	`+template <typename vtype, typename mm_t>`
	`5`	`+X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);`
	`6`	`+`
`4`	`7`	`template <typename vtype, typename reg_t = typename vtype::reg_t>`
`5`	`8`	`X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs)`
`6`	`9`	`{`