Skip to content

Commit 134792b

Browse files
committed
Verify Intel intrinsics against upstream definitions
This commit adds a new crate for testing that the intrinsics listed in this crate do indeed match the upstream definition of each intrinsic. A pre-downloaded XML description of all Intel intrinsics is checked in which is then parsed in the `stdsimd-verify` crate to verify that everything we write down is matched against the upstream definitions. Currently the checks are pretty loose to get this compiling but a few intrinsics were fixed as a result of this. For example: * `_mm256_extract_epi8` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi16` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi32` - AVX2 intrinsic erroneously listed under AVX * `_mm256_extract_epi64` - AVX2 intrinsic erroneously listed under AVX * `_mm_tzcnt_32` - erroneously had `u32` in the name * `_mm_tzcnt_64` - erroneously had `u64` in the name * `_mm_cvtsi64_si128` - erroneously available on 32-bit platforms * `_mm_cvtsi64x_si128` - erroneously available on 32-bit platforms * `_mm_cvtsi128_si64` - erroneously available on 32-bit platforms * `_mm_cvtsi128_si64x` - erroneously available on 32-bit platforms * `_mm_extract_epi64` - erroneously available on 32-bit platforms * `_mm_insert_epi64` - erroneously available on 32-bit platforms * `_mm256_extract_epi16` - erroneously returned i32 instead of i16 * `_mm256_extract_epi8` - erroneously returned i32 instead of i8 * `_mm_shuffle_ps` - the mask argument was erroneously i32 instead of u32 * `_popcnt32` - the signededness of the argument and return were flipped * `_popcnt64` - the signededness of the argument was flipped and the argument was too large bit-wise * `_mm_tzcnt_32` - the return value's sign was flipped * `_mm_tzcnt_64` - the return value's sign was flipped * A good number of intrinsics used `imm8: i8` or `imm8: u8` instead of `imm8: i32` which Intel was using. (we were also internally inconsistent) * A number of intrinsics working with `__m64` were instead working with i64/u64, so they're now corrected to operate with the vector types instead. Currently the verifications performed are: * Each name in Rust is defined in the XML document * The arguments/return values all agree. * The CPUID features listed in the XML document are all enabled in Rust as well. The type matching right now is pretty loose and has a lot of questionable changes. Future commits will touch these up to be more strict and require closer adherence with Intel's own types. Otherwise types like `i32x8` (or any integers with 256 bits) all match up to `__m256i` right now, althoguh this may want to change in the future. Finally we're also not testing the instruction listed in the XML right now. There's a huge number of discrepancies between the instruction listed in the XML and the instruction listed in `assert_instr`, and those'll need to be taken care of in a future commit. Closes #240
1 parent 2c4d880 commit 134792b

28 files changed

+135852
-349
lines changed

.travis.yml

+4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ matrix:
2424
- env: DOCUMENTATION
2525
install: true
2626
script: ci/dox.sh
27+
- script: cargo test --manifest-path stdsimd-verify/Cargo.toml
28+
install: true
2729
- env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
2830
script: |
2931
cargo install rustfmt-nightly --force
@@ -40,6 +42,8 @@ install:
4042

4143
script:
4244
- cargo generate-lockfile
45+
# FIXME (travis-ci/travis-ci#8920) shouldn't be necessary...
46+
- python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
4347
- ci/run-docker.sh $TARGET $FEATURES
4448

4549
notifications:

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ categories = ["hardware-support"]
1212
license = "MIT/Apache-2.0"
1313

1414
[workspace]
15+
members = ["stdsimd-verify"]
1516

1617
[badges]
1718
travis-ci = { repository = "BurntSushi/stdsimd" }

ci/run.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ echo "FEATURES=${FEATURES}"
2222
echo "OBJDUMP=${OBJDUMP}"
2323

2424
cargo_test() {
25-
cmd="cargo test --all --target=$TARGET --features $FEATURES --verbose $1 -- --nocapture $2"
25+
cmd="cargo test --target=$TARGET --features $FEATURES $1"
26+
cmd="$cmd -p coresimd -p stdsimd"
27+
cmd="$cmd -- $2"
2628
$cmd
2729
}
2830

coresimd/src/x86/i586/abm.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,16 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 {
4444
#[inline(always)]
4545
#[target_feature = "+popcnt"]
4646
#[cfg_attr(test, assert_instr(popcnt))]
47-
pub unsafe fn _popcnt32(x: u32) -> u32 {
48-
x.count_ones()
47+
pub unsafe fn _popcnt32(x: i32) -> i32 {
48+
x.count_ones() as i32
4949
}
5050

5151
/// Counts the bits that are set.
5252
#[inline(always)]
5353
#[target_feature = "+popcnt"]
5454
#[cfg_attr(test, assert_instr(popcnt))]
55-
pub unsafe fn _popcnt64(x: u64) -> u64 {
56-
x.count_ones() as u64
55+
pub unsafe fn _popcnt64(x: i64) -> i32 {
56+
x.count_ones() as i32
5757
}
5858

5959
#[cfg(test)]
@@ -64,21 +64,21 @@ mod tests {
6464

6565
#[simd_test = "lzcnt"]
6666
unsafe fn _lzcnt_u32() {
67-
assert_eq!(abm::_lzcnt_u32(0b0101_1010u32), 25u32);
67+
assert_eq!(abm::_lzcnt_u32(0b0101_1010), 25);
6868
}
6969

7070
#[simd_test = "lzcnt"]
7171
unsafe fn _lzcnt_u64() {
72-
assert_eq!(abm::_lzcnt_u64(0b0101_1010u64), 57u64);
72+
assert_eq!(abm::_lzcnt_u64(0b0101_1010), 57);
7373
}
7474

7575
#[simd_test = "popcnt"]
7676
unsafe fn _popcnt32() {
77-
assert_eq!(abm::_popcnt32(0b0101_1010u32), 4);
77+
assert_eq!(abm::_popcnt32(0b0101_1010), 4);
7878
}
7979

8080
#[simd_test = "popcnt"]
8181
unsafe fn _popcnt64() {
82-
assert_eq!(abm::_popcnt64(0b0101_1010u64), 4);
82+
assert_eq!(abm::_popcnt64(0b0101_1010), 4);
8383
}
8484
}

coresimd/src/x86/i586/avx.rs

+41-124
Original file line numberDiff line numberDiff line change
@@ -607,77 +607,77 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
607607
}
608608

609609
/// Equal (ordered, non-signaling)
610-
pub const _CMP_EQ_OQ: u8 = 0x00;
610+
pub const _CMP_EQ_OQ: i32 = 0x00;
611611
/// Less-than (ordered, signaling)
612-
pub const _CMP_LT_OS: u8 = 0x01;
612+
pub const _CMP_LT_OS: i32 = 0x01;
613613
/// Less-than-or-equal (ordered, signaling)
614-
pub const _CMP_LE_OS: u8 = 0x02;
614+
pub const _CMP_LE_OS: i32 = 0x02;
615615
/// Unordered (non-signaling)
616-
pub const _CMP_UNORD_Q: u8 = 0x03;
616+
pub const _CMP_UNORD_Q: i32 = 0x03;
617617
/// Not-equal (unordered, non-signaling)
618-
pub const _CMP_NEQ_UQ: u8 = 0x04;
618+
pub const _CMP_NEQ_UQ: i32 = 0x04;
619619
/// Not-less-than (unordered, signaling)
620-
pub const _CMP_NLT_US: u8 = 0x05;
620+
pub const _CMP_NLT_US: i32 = 0x05;
621621
/// Not-less-than-or-equal (unordered, signaling)
622-
pub const _CMP_NLE_US: u8 = 0x06;
622+
pub const _CMP_NLE_US: i32 = 0x06;
623623
/// Ordered (non-signaling)
624-
pub const _CMP_ORD_Q: u8 = 0x07;
624+
pub const _CMP_ORD_Q: i32 = 0x07;
625625
/// Equal (unordered, non-signaling)
626-
pub const _CMP_EQ_UQ: u8 = 0x08;
626+
pub const _CMP_EQ_UQ: i32 = 0x08;
627627
/// Not-greater-than-or-equal (unordered, signaling)
628-
pub const _CMP_NGE_US: u8 = 0x09;
628+
pub const _CMP_NGE_US: i32 = 0x09;
629629
/// Not-greater-than (unordered, signaling)
630-
pub const _CMP_NGT_US: u8 = 0x0a;
630+
pub const _CMP_NGT_US: i32 = 0x0a;
631631
/// False (ordered, non-signaling)
632-
pub const _CMP_FALSE_OQ: u8 = 0x0b;
632+
pub const _CMP_FALSE_OQ: i32 = 0x0b;
633633
/// Not-equal (ordered, non-signaling)
634-
pub const _CMP_NEQ_OQ: u8 = 0x0c;
634+
pub const _CMP_NEQ_OQ: i32 = 0x0c;
635635
/// Greater-than-or-equal (ordered, signaling)
636-
pub const _CMP_GE_OS: u8 = 0x0d;
636+
pub const _CMP_GE_OS: i32 = 0x0d;
637637
/// Greater-than (ordered, signaling)
638-
pub const _CMP_GT_OS: u8 = 0x0e;
638+
pub const _CMP_GT_OS: i32 = 0x0e;
639639
/// True (unordered, non-signaling)
640-
pub const _CMP_TRUE_UQ: u8 = 0x0f;
640+
pub const _CMP_TRUE_UQ: i32 = 0x0f;
641641
/// Equal (ordered, signaling)
642-
pub const _CMP_EQ_OS: u8 = 0x10;
642+
pub const _CMP_EQ_OS: i32 = 0x10;
643643
/// Less-than (ordered, non-signaling)
644-
pub const _CMP_LT_OQ: u8 = 0x11;
644+
pub const _CMP_LT_OQ: i32 = 0x11;
645645
/// Less-than-or-equal (ordered, non-signaling)
646-
pub const _CMP_LE_OQ: u8 = 0x12;
646+
pub const _CMP_LE_OQ: i32 = 0x12;
647647
/// Unordered (signaling)
648-
pub const _CMP_UNORD_S: u8 = 0x13;
648+
pub const _CMP_UNORD_S: i32 = 0x13;
649649
/// Not-equal (unordered, signaling)
650-
pub const _CMP_NEQ_US: u8 = 0x14;
650+
pub const _CMP_NEQ_US: i32 = 0x14;
651651
/// Not-less-than (unordered, non-signaling)
652-
pub const _CMP_NLT_UQ: u8 = 0x15;
652+
pub const _CMP_NLT_UQ: i32 = 0x15;
653653
/// Not-less-than-or-equal (unordered, non-signaling)
654-
pub const _CMP_NLE_UQ: u8 = 0x16;
654+
pub const _CMP_NLE_UQ: i32 = 0x16;
655655
/// Ordered (signaling)
656-
pub const _CMP_ORD_S: u8 = 0x17;
656+
pub const _CMP_ORD_S: i32 = 0x17;
657657
/// Equal (unordered, signaling)
658-
pub const _CMP_EQ_US: u8 = 0x18;
658+
pub const _CMP_EQ_US: i32 = 0x18;
659659
/// Not-greater-than-or-equal (unordered, non-signaling)
660-
pub const _CMP_NGE_UQ: u8 = 0x19;
660+
pub const _CMP_NGE_UQ: i32 = 0x19;
661661
/// Not-greater-than (unordered, non-signaling)
662-
pub const _CMP_NGT_UQ: u8 = 0x1a;
662+
pub const _CMP_NGT_UQ: i32 = 0x1a;
663663
/// False (ordered, signaling)
664-
pub const _CMP_FALSE_OS: u8 = 0x1b;
664+
pub const _CMP_FALSE_OS: i32 = 0x1b;
665665
/// Not-equal (ordered, signaling)
666-
pub const _CMP_NEQ_OS: u8 = 0x1c;
666+
pub const _CMP_NEQ_OS: i32 = 0x1c;
667667
/// Greater-than-or-equal (ordered, non-signaling)
668-
pub const _CMP_GE_OQ: u8 = 0x1d;
668+
pub const _CMP_GE_OQ: i32 = 0x1d;
669669
/// Greater-than (ordered, non-signaling)
670-
pub const _CMP_GT_OQ: u8 = 0x1e;
670+
pub const _CMP_GT_OQ: i32 = 0x1e;
671671
/// True (unordered, signaling)
672-
pub const _CMP_TRUE_US: u8 = 0x1f;
672+
pub const _CMP_TRUE_US: i32 = 0x1f;
673673

674674
/// Compare packed double-precision (64-bit) floating-point
675675
/// elements in `a` and `b` based on the comparison operand
676676
/// specified by `imm8`.
677677
#[inline(always)]
678678
#[target_feature = "+avx,+sse2"]
679679
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
680-
pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
680+
pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
681681
macro_rules! call {
682682
($imm8:expr) => { vcmppd(a, b, $imm8) }
683683
}
@@ -690,7 +690,7 @@ pub unsafe fn _mm_cmp_pd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
690690
#[inline(always)]
691691
#[target_feature = "+avx"]
692692
#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd
693-
pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
693+
pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
694694
macro_rules! call {
695695
($imm8:expr) => { vcmppd256(a, b, $imm8) }
696696
}
@@ -703,7 +703,7 @@ pub unsafe fn _mm256_cmp_pd(a: f64x4, b: f64x4, imm8: u8) -> f64x4 {
703703
#[inline(always)]
704704
#[target_feature = "+avx,+sse"]
705705
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
706-
pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
706+
pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
707707
macro_rules! call {
708708
($imm8:expr) => { vcmpps(a, b, $imm8) }
709709
}
@@ -716,7 +716,7 @@ pub unsafe fn _mm_cmp_ps(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
716716
#[inline(always)]
717717
#[target_feature = "+avx"]
718718
#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps
719-
pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
719+
pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
720720
macro_rules! call {
721721
($imm8:expr) => { vcmpps256(a, b, $imm8) }
722722
}
@@ -731,7 +731,7 @@ pub unsafe fn _mm256_cmp_ps(a: f32x8, b: f32x8, imm8: u8) -> f32x8 {
731731
#[inline(always)]
732732
#[target_feature = "+avx,+sse2"]
733733
#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd
734-
pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
734+
pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: i32) -> f64x2 {
735735
macro_rules! call {
736736
($imm8:expr) => { vcmpsd(a, b, $imm8) }
737737
}
@@ -746,7 +746,7 @@ pub unsafe fn _mm_cmp_sd(a: f64x2, b: f64x2, imm8: u8) -> f64x2 {
746746
#[inline(always)]
747747
#[target_feature = "+avx,+sse"]
748748
#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss
749-
pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: u8) -> f32x4 {
749+
pub unsafe fn _mm_cmp_ss(a: f32x4, b: f32x4, imm8: i32) -> f32x4 {
750750
macro_rules! call {
751751
($imm8:expr) => { vcmpss(a, b, $imm8) }
752752
}
@@ -862,48 +862,6 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i {
862862
__m128i::from(dst)
863863
}
864864

865-
/// Extract an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
866-
/// integer containing the zero-extended integer data.
867-
///
868-
/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
869-
#[inline(always)]
870-
#[target_feature = "+avx"]
871-
// This intrinsic has no corresponding instruction.
872-
pub unsafe fn _mm256_extract_epi8(a: i8x32, imm8: i32) -> i32 {
873-
let imm8 = (imm8 & 31) as u32;
874-
(a.extract_unchecked(imm8) as i32) & 0xFF
875-
}
876-
877-
/// Extract a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
878-
/// integer containing the zero-extended integer data.
879-
///
880-
/// See [LLVM commit D20468][https://reviews.llvm.org/D20468].
881-
#[inline(always)]
882-
#[target_feature = "+avx"]
883-
// This intrinsic has no corresponding instruction.
884-
pub unsafe fn _mm256_extract_epi16(a: i16x16, imm8: i32) -> i32 {
885-
let imm8 = (imm8 & 15) as u32;
886-
(a.extract_unchecked(imm8) as i32) & 0xFFFF
887-
}
888-
889-
/// Extract a 32-bit integer from `a`, selected with `imm8`.
890-
#[inline(always)]
891-
#[target_feature = "+avx"]
892-
// This intrinsic has no corresponding instruction.
893-
pub unsafe fn _mm256_extract_epi32(a: i32x8, imm8: i32) -> i32 {
894-
let imm8 = (imm8 & 7) as u32;
895-
a.extract_unchecked(imm8)
896-
}
897-
898-
/// Extract a 64-bit integer from `a`, selected with `imm8`.
899-
#[inline(always)]
900-
#[target_feature = "+avx"]
901-
// This intrinsic has no corresponding instruction.
902-
pub unsafe fn _mm256_extract_epi64(a: i64x4, imm8: i32) -> i64 {
903-
let imm8 = (imm8 & 3) as u32;
904-
a.extract_unchecked(imm8)
905-
}
906-
907865
/// Zero the contents of all XMM or YMM registers.
908866
#[inline(always)]
909867
#[target_feature = "+avx"]
@@ -1138,7 +1096,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
11381096
#[inline(always)]
11391097
#[target_feature = "+avx"]
11401098
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))]
1141-
pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
1099+
pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
11421100
macro_rules! call {
11431101
($imm8:expr) => { vperm2f128ps256(a, b, $imm8) }
11441102
}
@@ -1150,7 +1108,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: f32x8, b: f32x8, imm8: i8) -> f32x8 {
11501108
#[inline(always)]
11511109
#[target_feature = "+avx"]
11521110
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
1153-
pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
1111+
pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
11541112
macro_rules! call {
11551113
($imm8:expr) => { vperm2f128pd256(a, b, $imm8) }
11561114
}
@@ -1163,7 +1121,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: f64x4, b: f64x4, imm8: i8) -> f64x4 {
11631121
#[target_feature = "+avx"]
11641122
#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))]
11651123
pub unsafe fn _mm256_permute2f128_si256(
1166-
a: i32x8, b: i32x8, imm8: i8
1124+
a: i32x8, b: i32x8, imm8: i32
11671125
) -> i32x8 {
11681126
macro_rules! call {
11691127
($imm8:expr) => { vperm2f128si256(a, b, $imm8) }
@@ -3146,47 +3104,6 @@ mod tests {
31463104
assert_eq!(r, __m128i::from(e));
31473105
}
31483106

3149-
#[simd_test = "avx"]
3150-
unsafe fn _mm256_extract_epi8() {
3151-
#[cfg_attr(rustfmt, rustfmt_skip)]
3152-
let a = i8x32::new(
3153-
-1, 1, 2, 3, 4, 5, 6, 7,
3154-
8, 9, 10, 11, 12, 13, 14, 15,
3155-
16, 17, 18, 19, 20, 21, 22, 23,
3156-
24, 25, 26, 27, 28, 29, 30, 31
3157-
);
3158-
let r1 = avx::_mm256_extract_epi8(a, 0);
3159-
let r2 = avx::_mm256_extract_epi8(a, 35);
3160-
assert_eq!(r1, 0xFF);
3161-
assert_eq!(r2, 3);
3162-
}
3163-
3164-
#[simd_test = "avx"]
3165-
unsafe fn _mm256_extract_epi16() {
3166-
let a =
3167-
i16x16::new(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
3168-
let r1 = avx::_mm256_extract_epi16(a, 0);
3169-
let r2 = avx::_mm256_extract_epi16(a, 19);
3170-
assert_eq!(r1, 0xFFFF);
3171-
assert_eq!(r2, 3);
3172-
}
3173-
3174-
#[simd_test = "avx"]
3175-
unsafe fn _mm256_extract_epi32() {
3176-
let a = i32x8::new(-1, 1, 2, 3, 4, 5, 6, 7);
3177-
let r1 = avx::_mm256_extract_epi32(a, 0);
3178-
let r2 = avx::_mm256_extract_epi32(a, 11);
3179-
assert_eq!(r1, -1);
3180-
assert_eq!(r2, 3);
3181-
}
3182-
3183-
#[simd_test = "avx"]
3184-
unsafe fn _mm256_extract_epi64() {
3185-
let a = i64x4::new(0, 1, 2, 3);
3186-
let r = avx::_mm256_extract_epi64(a, 3);
3187-
assert_eq!(r, 3);
3188-
}
3189-
31903107
#[simd_test = "avx"]
31913108
unsafe fn _mm256_zeroall() {
31923109
avx::_mm256_zeroall();

0 commit comments

Comments
 (0)