Commit 947d960

Rollup merge of #121953 - jhorstmann:assembly-tests-for-masked-simd-instructions, r=workingjubilee
Add tests for the generated assembly of mask-related SIMD instructions.

The tests show that code generation currently uses the least significant bits of <iX x N> vector masks when converting to <i1 x N>. This leads to an additional left-shift instruction in the x86 assembly, since mask operations on x86 operate on the most significant bit. The exception is simd_bitmask, which already uses the most significant bit. The additional instruction would be removed by the changes in #104693, which make all mask operations consistently use the most significant bits.

By using the "C" calling convention, the tests should be stable with respect to changes in register allocation, but future LLVM updates may require updating some of the checks.
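To make the effect concrete, here is a minimal sketch (not part of this commit) of a mask reduction written against the unstable portable-SIMD API; the function name and the use of std::simd are assumptions for illustration only, while the committed tests below use bare repr(simd) types and intrinsics. Compiled for x86-64 with optimizations, the generated assembly should show the same psllw/pmovmskb pattern that the mask_reduce_all test below checks for, where the shift exists only to move each lane's least significant bit into the sign-bit position that pmovmskb reads:

// Illustrative sketch only; assumes a nightly toolchain with `portable_simd`.
// Build with: rustc --crate-type=lib -O --emit=asm example.rs
#![feature(portable_simd)]
use std::simd::mask8x16;

#[no_mangle]
pub fn mask_all(m: mask8x16) -> bool {
    // Reduces the 16 mask lanes to a single bool; compare with mask_reduce_all below.
    m.all()
}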
2 parents 7a45c72 + e91f937 commit 947d960

7 files changed (+597 -0 lines)

tests/assembly/simd-bitmask.rs

@@ -0,0 +1,149 @@
//@ revisions: x86 x86-avx2 x86-avx512 aarch64
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86] needs-llvm-components: x86
//@ [x86-avx2] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx2] compile-flags: -C target-feature=+avx2
//@ [x86-avx2] needs-llvm-components: x86
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq
//@ [x86-avx512] needs-llvm-components: x86
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu
//@ [aarch64] needs-llvm-components: aarch64
//@ [aarch64] min-llvm-version: 18.0
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -O

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

// Because we don't have core yet.
#[lang = "sized"]
pub trait Sized {}

#[lang = "copy"]
trait Copy {}

#[repr(simd)]
pub struct m8x16([i8; 16]);

#[repr(simd)]
pub struct m8x64([i8; 64]);

#[repr(simd)]
pub struct m32x4([i32; 4]);

#[repr(simd)]
pub struct m64x2([i64; 2]);

#[repr(simd)]
pub struct m64x4([i64; 4]);

extern "rust-intrinsic" {
    fn simd_bitmask<V, B>(mask: V) -> B;
}

// CHECK-LABEL: bitmask_m8x16
#[no_mangle]
pub unsafe extern "C" fn bitmask_m8x16(mask: m8x16) -> u16 {
    // The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary.
    // Note that x86 has no byte shift; llvm uses a word shift to move the least significant bit
    // of each byte into the right position.
    //
    // x86-NOT: psllw
    // x86: movmskb eax, xmm0
    //
    // x86-avx2-NOT: vpsllw
    // x86-avx2: vpmovmskb eax, xmm0
    //
    // x86-avx512-NOT: vpsllw xmm0
    // x86-avx512: vpmovmskb eax, xmm0
    //
    // aarch64: adrp
    // aarch64-NEXT: cmlt
    // aarch64-NEXT: ldr
    // aarch64-NEXT: and
    // aarch64-NEXT: ext
    // aarch64-NEXT: zip1
    // aarch64-NEXT: addv
    // aarch64-NEXT: fmov
    simd_bitmask(mask)
}

// CHECK-LABEL: bitmask_m8x64
#[no_mangle]
pub unsafe extern "C" fn bitmask_m8x64(mask: m8x64) -> u64 {
    // The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary.
    // Note that x86 has no byte shift; llvm uses a word shift to move the least significant bit
    // of each byte into the right position.
    //
    // The parameter is a 512-bit vector, which in the C ABI is only valid for avx512 targets.
    //
    // x86-avx512-NOT: vpsllw
    // x86-avx512: vpmovb2m k0, zmm0
    // x86-avx512: kmovq rax, k0
    simd_bitmask(mask)
}

// CHECK-LABEL: bitmask_m32x4
#[no_mangle]
pub unsafe extern "C" fn bitmask_m32x4(mask: m32x4) -> u8 {
    // The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary.
    //
    // x86-NOT: psllq
    // x86: movmskps eax, xmm0
    //
    // x86-avx2-NOT: vpsllq
    // x86-avx2: vmovmskps eax, xmm0
    //
    // x86-avx512-NOT: vpsllq
    // x86-avx512: vmovmskps eax, xmm0
    //
    // aarch64: adrp
    // aarch64-NEXT: cmlt
    // aarch64-NEXT: ldr
    // aarch64-NEXT: and
    // aarch64-NEXT: addv
    // aarch64-NEXT: fmov
    // aarch64-NEXT: and
    simd_bitmask(mask)
}

// CHECK-LABEL: bitmask_m64x2
#[no_mangle]
pub unsafe extern "C" fn bitmask_m64x2(mask: m64x2) -> u8 {
    // The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary.
    //
    // x86-NOT: psllq
    // x86: movmskpd eax, xmm0
    //
    // x86-avx2-NOT: vpsllq
    // x86-avx2: vmovmskpd eax, xmm0
    //
    // x86-avx512-NOT: vpsllq
    // x86-avx512: vmovmskpd eax, xmm0
    //
    // aarch64: adrp
    // aarch64-NEXT: cmlt
    // aarch64-NEXT: ldr
    // aarch64-NEXT: and
    // aarch64-NEXT: addp
    // aarch64-NEXT: fmov
    // aarch64-NEXT: and
    simd_bitmask(mask)
}

// CHECK-LABEL: bitmask_m64x4
#[no_mangle]
pub unsafe extern "C" fn bitmask_m64x4(mask: m64x4) -> u8 {
    // The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary.
    //
    // The parameter is a 256-bit vector, which in the C ABI is only valid for avx/avx512 targets.
    //
    // x86-avx2-NOT: vpsllq
    // x86-avx2: vmovmskpd eax, ymm0
    //
    // x86-avx512-NOT: vpsllq
    // x86-avx512: vmovmskpd eax, ymm0
    simd_bitmask(mask)
}

@@ -0,0 +1,44 @@
//@ revisions: x86-avx512
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq
//@ [x86-avx512] needs-llvm-components: x86
//@ [x86-avx512] min-llvm-version: 18.0
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -O

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

// Because we don't have core yet.
#[lang = "sized"]
pub trait Sized {}

#[lang = "copy"]
trait Copy {}

#[repr(simd)]
pub struct f64x4([f64; 4]);

#[repr(simd)]
pub struct m64x4([i64; 4]);

#[repr(simd)]
pub struct pf64x4([*const f64; 4]);

extern "rust-intrinsic" {
    fn simd_gather<V, M, P>(values: V, mask: M, pointer: P) -> V;
}

// CHECK-LABEL: gather_f64x4
#[no_mangle]
pub unsafe extern "C" fn gather_f64x4(mask: m64x4, ptrs: pf64x4) -> f64x4 {
    // FIXME: This should also get checked to generate a gather instruction for avx2.
    // Currently llvm scalarizes this code, see https://github.com/llvm/llvm-project/issues/59789
    //
    // x86-avx512: vpsllq ymm0, ymm0, 63
    // x86-avx512-NEXT: vpmovq2m k1, ymm0
    // x86-avx512-NEXT: vpxor xmm0, xmm0, xmm0
    // x86-avx512-NEXT: vgatherqpd ymm0 {k1}, ymmword ptr [1*ymm1]
    simd_gather(f64x4([0_f64, 0_f64, 0_f64, 0_f64]), ptrs, mask)
}

@@ -0,0 +1,88 @@
//@ revisions: x86-avx2 x86-avx512
//@ [x86-avx2] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx2] compile-flags: -C target-feature=+avx2
//@ [x86-avx2] needs-llvm-components: x86
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq
//@ [x86-avx512] needs-llvm-components: x86
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -O

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

// Because we don't have core yet.
#[lang = "sized"]
pub trait Sized {}

#[lang = "copy"]
trait Copy {}

#[repr(simd)]
pub struct i8x16([i8; 16]);

#[repr(simd)]
pub struct m8x16([i8; 16]);

#[repr(simd)]
pub struct f32x8([f32; 8]);

#[repr(simd)]
pub struct m32x8([i32; 8]);

#[repr(simd)]
pub struct f64x4([f64; 4]);

#[repr(simd)]
pub struct m64x4([i64; 4]);

extern "rust-intrinsic" {
    fn simd_masked_load<M, P, T>(mask: M, pointer: P, values: T) -> T;
}

// CHECK-LABEL: load_i8x16
#[no_mangle]
pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 {
    // Since avx2 supports no masked loads for bytes, the code tests each individual bit
    // and jumps to code that inserts individual bytes.
    // x86-avx2: vpsllw xmm0, xmm0, 7
    // x86-avx2-NEXT: vpmovmskb eax, xmm0
    // x86-avx2-NEXT: vpxor xmm0, xmm0
    // x86-avx2-NEXT: test al, 1
    // x86-avx2-NEXT: jne
    // x86-avx2-NEXT: test al, 2
    // x86-avx2-NEXT: jne
    // x86-avx2-DAG: movzx [[REG:[a-z]+]], byte ptr [rdi]
    // x86-avx2-NEXT: vmovd xmm0, [[REG]]
    // x86-avx2-DAG: vpinsrb xmm0, xmm0, byte ptr [rdi + 1], 1
    //
    // x86-avx512: vpsllw xmm0, xmm0, 7
    // x86-avx512-NEXT: vpmovb2m k1, xmm0
    // x86-avx512-NEXT: vmovdqu8 xmm0 {k1} {z}, xmmword ptr [rdi]
    simd_masked_load(mask, pointer, i8x16([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
}

// CHECK-LABEL: load_f32x8
#[no_mangle]
pub unsafe extern "C" fn load_f32x8(mask: m32x8, pointer: *const f32) -> f32x8 {
    // x86-avx2: vpslld ymm0, ymm0, 31
    // x86-avx2-NEXT: vmaskmovps ymm0, ymm0, ymmword ptr [rdi]
    //
    // x86-avx512: vpslld ymm0, ymm0, 31
    // x86-avx512-NEXT: vpmovd2m k1, ymm0
    // x86-avx512-NEXT: vmovups ymm0 {k1} {z}, ymmword ptr [rdi]
    simd_masked_load(mask, pointer, f32x8([0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32]))
}

// CHECK-LABEL: load_f64x4
#[no_mangle]
pub unsafe extern "C" fn load_f64x4(mask: m64x4, pointer: *const f64) -> f64x4 {
    // x86-avx2: vpsllq ymm0, ymm0, 63
    // x86-avx2-NEXT: vmaskmovpd ymm0, ymm0, ymmword ptr [rdi]
    //
    // x86-avx512: vpsllq ymm0, ymm0, 63
    // x86-avx512-NEXT: vpmovq2m k1, ymm0
    // x86-avx512-NEXT: vmovupd ymm0 {k1} {z}, ymmword ptr [rdi]
    simd_masked_load(mask, pointer, f64x4([0_f64, 0_f64, 0_f64, 0_f64]))
}

@@ -0,0 +1,60 @@
// verify that simd mask reductions do not introduce additional bit shift operations
//@ revisions: x86 aarch64
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel
//@ [x86] needs-llvm-components: x86
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu
//@ [aarch64] needs-llvm-components: aarch64
//@ [aarch64] min-llvm-version: 18.0
//@ assembly-output: emit-asm
//@ compile-flags: --crate-type=lib -O

#![feature(no_core, lang_items, repr_simd, intrinsics)]
#![no_core]
#![allow(non_camel_case_types)]

// Because we don't have core yet.
#[lang = "sized"]
pub trait Sized {}

#[lang = "copy"]
trait Copy {}

#[repr(simd)]
pub struct mask8x16([i8; 16]);

extern "rust-intrinsic" {
    fn simd_reduce_all<T>(x: T) -> bool;
    fn simd_reduce_any<T>(x: T) -> bool;
}

// CHECK-LABEL: mask_reduce_all:
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool {
    // x86: psllw xmm0, 7
    // x86-NEXT: pmovmskb eax, xmm0
    // x86-NEXT: {{cmp ax, -1|xor eax, 65535}}
    // x86-NEXT: sete al
    //
    // aarch64: shl v0.16b, v0.16b, #7
    // aarch64-NEXT: cmlt v0.16b, v0.16b, #0
    // aarch64-NEXT: uminv b0, v0.16b
    // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0
    // aarch64-NEXT: and w0, [[REG]], #0x1
    simd_reduce_all(m)
}

// CHECK-LABEL: mask_reduce_any:
#[no_mangle]
pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool {
    // x86: psllw xmm0, 7
    // x86-NEXT: pmovmskb
    // x86-NEXT: test eax, eax
    // x86-NEXT: setne al
    //
    // aarch64: shl v0.16b, v0.16b, #7
    // aarch64-NEXT: cmlt v0.16b, v0.16b, #0
    // aarch64-NEXT: umaxv b0, v0.16b
    // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0
    // aarch64-NEXT: and w0, [[REG]], #0x1
    simd_reduce_any(m)
}
