Skip to content

Commit 860494d

Browse files
authored
avx512bw (#1014)
1 parent ecc7a73 commit 860494d

File tree

2 files changed

+223
-9
lines changed

2 files changed

+223
-9
lines changed

crates/core_arch/avx512bw.md

+9-9
Original file line numberDiff line numberDiff line change
@@ -296,9 +296,9 @@
296296
* [x] [`_mm512_cvtepi16_epi8`]
297297
* [x] [`_mm512_mask_cvtepi16_epi8`]
298298
* [x] [`_mm512_maskz_cvtepi16_epi8`]
299-
* [_] [`_mm512_mask_cvtepi16_storeu_epi8`]
300-
* [_] [`_mm_mask_cvtepi16_storeu_epi8`]
301-
* [_] [`_mm256_mask_cvtepi16_storeu_epi8`]
299+
* [x] [`_mm512_mask_cvtepi16_storeu_epi8`]
300+
* [x] [`_mm_mask_cvtepi16_storeu_epi8`]
301+
* [x] [`_mm256_mask_cvtepi16_storeu_epi8`]
302302
* [x] [`_mm_cvtepi16_epi8`]
303303
* [x] [`_mm_mask_cvtepi16_epi8`]
304304
* [x] [`_mm_maskz_cvtepi16_epi8`]
@@ -315,15 +315,15 @@
315315
* [x] [`_mm512_cvtsepi16_epi8`]
316316
* [x] [`_mm512_mask_cvtsepi16_epi8`]
317317
* [x] [`_mm512_maskz_cvtsepi16_epi8`]
318-
* [_] [`_mm512_mask_cvtsepi16_storeu_epi8`]
319318
* [x] [`_mm_cvtsepi16_epi8`]
320319
* [x] [`_mm_mask_cvtsepi16_epi8`]
321320
* [x] [`_mm_maskz_cvtsepi16_epi8`]
322321
* [x] [`_mm256_cvtsepi16_epi8`]
323322
* [x] [`_mm256_mask_cvtsepi16_epi8`]
324323
* [x] [`_mm256_maskz_cvtsepi16_epi8`]
325-
* [_] [`_mm_mask_cvtsepi16_storeu_epi8`]
326-
* [_] [`_mm256_mask_cvtsepi16_storeu_epi8`]
324+
* [x] [`_mm512_mask_cvtsepi16_storeu_epi8`]
325+
* [x] [`_mm_mask_cvtsepi16_storeu_epi8`]
326+
* [x] [`_mm256_mask_cvtsepi16_storeu_epi8`]
327327
* [x] [`_mm512_cvtepu8_epi16`]
328328
* [x] [`_mm512_mask_cvtepu8_epi16`]
329329
* [x] [`_mm512_maskz_cvtepu8_epi16`]
@@ -338,15 +338,15 @@
338338
* [x] [`_mm512_cvtusepi16_epi8`]
339339
* [x] [`_mm512_mask_cvtusepi16_epi8`]
340340
* [x] [`_mm512_maskz_cvtusepi16_epi8`]
341-
* [_] [`_mm512_mask_cvtusepi16_storeu_epi8`]
342341
* [x] [`_mm_cvtusepi16_epi8`]
343342
* [x] [`_mm_mask_cvtusepi16_epi8`]
344343
* [x] [`_mm_maskz_cvtusepi16_epi8`]
345344
* [x] [`_mm256_cvtusepi16_epi8`]
346345
* [x] [`_mm256_mask_cvtusepi16_epi8`]
347346
* [x] [`_mm256_maskz_cvtusepi16_epi8`]
348-
* [_] [`_mm_mask_cvtusepi16_storeu_epi8`]
349-
* [_] [`_mm256_mask_cvtusepi16_storeu_epi8`]
347+
* [x] [`_mm512_mask_cvtusepi16_storeu_epi8`]
348+
* [x] [`_mm_mask_cvtusepi16_storeu_epi8`]
349+
* [x] [`_mm256_mask_cvtusepi16_storeu_epi8`]
350350
* [x] [`_mm512_dbsad_epu8`]
351351
* [x] [`_mm512_mask_dbsad_epu8`]
352352
* [x] [`_mm512_maskz_dbsad_epu8`]

crates/core_arch/src/x86/avx512bw.rs

+214
Original file line numberDiff line numberDiff line change
@@ -9388,6 +9388,96 @@ pub unsafe fn _mm_maskz_alignr_epi8(k: __mmask16, a: __m128i, b: __m128i, imm8:
93889388
transmute(simd_select_bitmask(k, r.as_i8x16(), zero))
93899389
}
93909390

9391+
/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9392+
///
9393+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi16_storeu_epi8&expand=1812)
9394+
#[inline]
9395+
#[target_feature(enable = "avx512bw")]
9396+
#[cfg_attr(test, assert_instr(vpmovswb))]
9397+
pub unsafe fn _mm512_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
9398+
vpmovswbmem(mem_addr as *mut i8, a.as_i16x32(), k);
9399+
}
9400+
9401+
/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9402+
///
9403+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi16_storeu_epi8&expand=1811)
9404+
#[inline]
9405+
#[target_feature(enable = "avx512bw,avx512vl")]
9406+
#[cfg_attr(test, assert_instr(vpmovswb))]
9407+
pub unsafe fn _mm256_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
9408+
vpmovswbmem256(mem_addr as *mut i8, a.as_i16x16(), k);
9409+
}
9410+
9411+
/// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9412+
///
9413+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi16_storeu_epi8&expand=1810)
9414+
#[inline]
9415+
#[target_feature(enable = "avx512bw,avx512vl")]
9416+
#[cfg_attr(test, assert_instr(vpmovswb))]
9417+
pub unsafe fn _mm_mask_cvtsepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
9418+
vpmovswbmem128(mem_addr as *mut i8, a.as_i16x8(), k);
9419+
}
9420+
9421+
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9422+
///
9423+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_storeu_epi8&expand=1412)
9424+
#[inline]
9425+
#[target_feature(enable = "avx512bw")]
9426+
#[cfg_attr(test, assert_instr(vpmovwb))]
9427+
pub unsafe fn _mm512_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
9428+
vpmovwbmem(mem_addr as *mut i8, a.as_i16x32(), k);
9429+
}
9430+
9431+
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9432+
///
9433+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_storeu_epi8&expand=1411)
9434+
#[inline]
9435+
#[target_feature(enable = "avx512bw,avx512vl")]
9436+
#[cfg_attr(test, assert_instr(vpmovwb))]
9437+
pub unsafe fn _mm256_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
9438+
vpmovwbmem256(mem_addr as *mut i8, a.as_i16x16(), k);
9439+
}
9440+
9441+
/// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9442+
///
9443+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_storeu_epi8&expand=1410)
9444+
#[inline]
9445+
#[target_feature(enable = "avx512bw,avx512vl")]
9446+
#[cfg_attr(test, assert_instr(vpmovwb))]
9447+
pub unsafe fn _mm_mask_cvtepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
9448+
vpmovwbmem128(mem_addr as *mut i8, a.as_i16x8(), k);
9449+
}
9450+
9451+
/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9452+
///
9453+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi16_storeu_epi8&expand=2047)
9454+
#[inline]
9455+
#[target_feature(enable = "avx512bw")]
9456+
#[cfg_attr(test, assert_instr(vpmovuswb))]
9457+
pub unsafe fn _mm512_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask32, a: __m512i) {
9458+
vpmovuswbmem(mem_addr as *mut i8, a.as_i16x32(), k);
9459+
}
9460+
9461+
/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9462+
///
9463+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi16_storeu_epi8&expand=2046)
9464+
#[inline]
9465+
#[target_feature(enable = "avx512bw,avx512vl")]
9466+
#[cfg_attr(test, assert_instr(vpmovuswb))]
9467+
pub unsafe fn _mm256_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m256i) {
9468+
vpmovuswbmem256(mem_addr as *mut i8, a.as_i16x16(), k);
9469+
}
9470+
9471+
/// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
9472+
///
9473+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi16_storeu_epi8&expand=2045)
9474+
#[inline]
9475+
#[target_feature(enable = "avx512bw,avx512vl")]
9476+
#[cfg_attr(test, assert_instr(vpmovuswb))]
9477+
pub unsafe fn _mm_mask_cvtusepi16_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
9478+
vpmovuswbmem128(mem_addr as *mut i8, a.as_i16x8(), k);
9479+
}
9480+
93919481
#[allow(improper_ctypes)]
93929482
extern "C" {
93939483
#[link_name = "llvm.x86.avx512.mask.paddus.w.512"]
@@ -9594,6 +9684,27 @@ extern "C" {
95949684
fn vpmovuswb256(a: u16x16, src: u8x16, mask: u16) -> u8x16;
95959685
#[link_name = "llvm.x86.avx512.mask.pmovus.wb.128"]
95969686
fn vpmovuswb128(a: u16x8, src: u8x16, mask: u8) -> u8x16;
9687+
9688+
#[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.512"]
9689+
fn vpmovswbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
9690+
#[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.256"]
9691+
fn vpmovswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
9692+
#[link_name = "llvm.x86.avx512.mask.pmovs.wb.mem.128"]
9693+
fn vpmovswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
9694+
9695+
#[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.512"]
9696+
fn vpmovwbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
9697+
#[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.256"]
9698+
fn vpmovwbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
9699+
#[link_name = "llvm.x86.avx512.mask.pmov.wb.mem.128"]
9700+
fn vpmovwbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
9701+
9702+
#[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.512"]
9703+
fn vpmovuswbmem(mem_addr: *mut i8, a: i16x32, mask: u32);
9704+
#[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.256"]
9705+
fn vpmovuswbmem256(mem_addr: *mut i8, a: i16x16, mask: u16);
9706+
#[link_name = "llvm.x86.avx512.mask.pmovus.wb.mem.128"]
9707+
fn vpmovuswbmem128(mem_addr: *mut i8, a: i16x8, mask: u8);
95979708
}
95989709

95999710
#[cfg(test)]
@@ -17905,4 +18016,107 @@ mod tests {
1790518016
let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1);
1790618017
assert_eq_m128i(r, e);
1790718018
}
18019+
18020+
#[simd_test(enable = "avx512bw")]
18021+
unsafe fn test_mm512_mask_cvtsepi16_storeu_epi8() {
18022+
let a = _mm512_set1_epi16(i16::MAX);
18023+
let mut r = _mm256_undefined_si256();
18024+
_mm512_mask_cvtsepi16_storeu_epi8(
18025+
&mut r as *mut _ as *mut i8,
18026+
0b11111111_11111111_11111111_11111111,
18027+
a,
18028+
);
18029+
let e = _mm256_set1_epi8(i8::MAX);
18030+
assert_eq_m256i(r, e);
18031+
}
18032+
18033+
#[simd_test(enable = "avx512bw,avx512vl")]
18034+
unsafe fn test_mm256_mask_cvtsepi16_storeu_epi8() {
18035+
let a = _mm256_set1_epi16(i16::MAX);
18036+
let mut r = _mm_undefined_si128();
18037+
_mm256_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
18038+
let e = _mm_set1_epi8(i8::MAX);
18039+
assert_eq_m128i(r, e);
18040+
}
18041+
18042+
#[simd_test(enable = "avx512bw,avx512vl")]
18043+
unsafe fn test_mm_mask_cvtsepi16_storeu_epi8() {
18044+
let a = _mm_set1_epi16(i16::MAX);
18045+
let mut r = _mm_set1_epi8(0);
18046+
_mm_mask_cvtsepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
18047+
#[rustfmt::skip]
18048+
let e = _mm_set_epi8(
18049+
0, 0, 0, 0, 0, 0, 0, 0,
18050+
i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX, i8::MAX,
18051+
);
18052+
assert_eq_m128i(r, e);
18053+
}
18054+
18055+
#[simd_test(enable = "avx512bw")]
18056+
unsafe fn test_mm512_mask_cvtepi16_storeu_epi8() {
18057+
let a = _mm512_set1_epi16(8);
18058+
let mut r = _mm256_undefined_si256();
18059+
_mm512_mask_cvtepi16_storeu_epi8(
18060+
&mut r as *mut _ as *mut i8,
18061+
0b11111111_11111111_11111111_11111111,
18062+
a,
18063+
);
18064+
let e = _mm256_set1_epi8(8);
18065+
assert_eq_m256i(r, e);
18066+
}
18067+
18068+
#[simd_test(enable = "avx512bw,avx512vl")]
18069+
unsafe fn test_mm256_mask_cvtepi16_storeu_epi8() {
18070+
let a = _mm256_set1_epi16(8);
18071+
let mut r = _mm_undefined_si128();
18072+
_mm256_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
18073+
let e = _mm_set1_epi8(8);
18074+
assert_eq_m128i(r, e);
18075+
}
18076+
18077+
#[simd_test(enable = "avx512bw,avx512vl")]
18078+
unsafe fn test_mm_mask_cvtepi16_storeu_epi8() {
18079+
let a = _mm_set1_epi16(8);
18080+
let mut r = _mm_set1_epi8(0);
18081+
_mm_mask_cvtepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
18082+
let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
18083+
assert_eq_m128i(r, e);
18084+
}
18085+
18086+
#[simd_test(enable = "avx512bw")]
18087+
unsafe fn test_mm512_mask_cvtusepi16_storeu_epi8() {
18088+
let a = _mm512_set1_epi16(i16::MAX);
18089+
let mut r = _mm256_undefined_si256();
18090+
_mm512_mask_cvtusepi16_storeu_epi8(
18091+
&mut r as *mut _ as *mut i8,
18092+
0b11111111_11111111_11111111_11111111,
18093+
a,
18094+
);
18095+
let e = _mm256_set1_epi8(u8::MAX as i8);
18096+
assert_eq_m256i(r, e);
18097+
}
18098+
18099+
#[simd_test(enable = "avx512bw,avx512vl")]
18100+
unsafe fn test_mm256_mask_cvtusepi16_storeu_epi8() {
18101+
let a = _mm256_set1_epi16(i16::MAX);
18102+
let mut r = _mm_undefined_si128();
18103+
_mm256_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111_11111111, a);
18104+
let e = _mm_set1_epi8(u8::MAX as i8);
18105+
assert_eq_m128i(r, e);
18106+
}
18107+
18108+
#[simd_test(enable = "avx512bw,avx512vl")]
18109+
unsafe fn test_mm_mask_cvtusepi16_storeu_epi8() {
18110+
let a = _mm_set1_epi16(i16::MAX);
18111+
let mut r = _mm_set1_epi8(0);
18112+
_mm_mask_cvtusepi16_storeu_epi8(&mut r as *mut _ as *mut i8, 0b11111111, a);
18113+
#[rustfmt::skip]
18114+
let e = _mm_set_epi8(
18115+
0, 0, 0, 0,
18116+
0, 0, 0, 0,
18117+
u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
18118+
u8::MAX as i8, u8::MAX as i8, u8::MAX as i8, u8::MAX as i8,
18119+
);
18120+
assert_eq_m128i(r, e);
18121+
}
1790818122
}

0 commit comments

Comments
 (0)