Skip to content

Commit da7ca5f

Browse files
gnzlbg authored and alexcrichton committed
[sse4a] implement non-immediate-mode intrinsics (#249)
1 parent d2cb7ed commit da7ca5f

File tree

4 files changed

+165
-5
lines changed

4 files changed

+165
-5
lines changed

coresimd/src/runtime/x86.rs

+4-5
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,7 @@ pub fn detect_features() -> usize {
302302
// Contains information about bmi,bmi2, and avx2 support.
303303
let (extended_features_ebx, extended_features_ecx) = if max_basic_leaf >= 7
304304
{
305-
let CpuidResult { ebx, ecx, .. } =
306-
unsafe { __cpuid(0x0000_0007_u32) };
305+
let CpuidResult { ebx, ecx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
307306
(ebx, ecx)
308307
} else {
309308
(0, 0) // CPUID does not support "Extended Features"
@@ -320,8 +319,7 @@ pub fn detect_features() -> usize {
320319
// EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature
321320
// Bits"
322321
let extended_proc_info_ecx = if extended_max_basic_leaf >= 1 {
323-
let CpuidResult { ecx, .. } =
324-
unsafe { __cpuid(0x8000_0001_u32) };
322+
let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
325323
ecx
326324
} else {
327325
0
@@ -457,6 +455,7 @@ mod tests {
457455
println!("ssse3: {:?}", cfg_feature_enabled!("ssse3"));
458456
println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1"));
459457
println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2"));
458+
println!("sse4a: {:?}", cfg_feature_enabled!("sse4a"));
460459
println!("avx: {:?}", cfg_feature_enabled!("avx"));
461460
println!("avx2: {:?}", cfg_feature_enabled!("avx2"));
462461
println!("avx512f {:?}", cfg_feature_enabled!("avx512f"));
@@ -495,6 +494,7 @@ mod tests {
495494
assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3());
496495
assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1());
497496
assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2());
497+
assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a());
498498
assert_eq!(cfg_feature_enabled!("avx"), information.avx());
499499
assert_eq!(cfg_feature_enabled!("avx2"), information.avx2());
500500
assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f());
@@ -520,7 +520,6 @@ mod tests {
520520
assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1());
521521
assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2());
522522
assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt());
523-
assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a());
524523
assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt());
525524
assert_eq!(cfg_feature_enabled!("tbm"), information.tbm());
526525
assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt());

coresimd/src/x86/i586/tbm.rs

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#[cfg(test)]
1414
use stdsimd_test::assert_instr;
1515

16+
// FIXME(blocked on #248)
1617
// TODO: LLVM-CODEGEN ERROR: LLVM ERROR: Cannot select:
1718
// intrinsic %llvm.x86.tbm.bextri.u32
1819
/*

coresimd/src/x86/i686/mod.rs

+5
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,8 @@ pub use self::sse41::*;
1717

1818
mod sse42;
1919
pub use self::sse42::*;
20+
21+
// SSE4a support is compiled out when the `intel_sde` feature is enabled.
// NOTE(review): presumably because Intel's SDE emulator does not implement
// the SSE4a instructions — confirm against the CI configuration.
#[cfg(not(feature = "intel_sde"))]
22+
mod sse4a;
23+
#[cfg(not(feature = "intel_sde"))]
24+
pub use self::sse4a::*;

coresimd/src/x86/i686/sse4a.rs

+155
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
//! `i686`'s Streaming SIMD Extensions 4a (SSE4a)
2+
3+
use core::mem;
4+
use v128::*;
5+
6+
#[cfg(test)]
7+
use stdsimd_test::assert_instr;
8+
9+
// Bindings to the LLVM SSE4a intrinsics; each `link_name` selects the LLVM
// intrinsic that the declared function resolves to.
//
// NOTE(review): `improper_ctypes` is silenced presumably because the SIMD
// vector types in these signatures are not considered FFI-safe by the lint —
// confirm.
#[allow(improper_ctypes)]
10+
extern "C" {
11+
// EXTRQ (register form): the control operand `y` is typed as `i8x16`.
#[link_name = "llvm.x86.sse4a.extrq"]
12+
fn extrq(x: i64x2, y: i8x16) -> i64x2;
13+
// INSERTQ (register form): both operands are `i64x2`.
#[link_name = "llvm.x86.sse4a.insertq"]
14+
fn insertq(x: i64x2, y: i64x2) -> i64x2;
15+
// MOVNTSD: store from `y` to the `f64` pointed to by `x`.
#[link_name = "llvm.x86.sse4a.movnt.sd"]
16+
fn movntsd(x: *mut f64, y: f64x2);
17+
// MOVNTSS: store from `y` to the `f32` pointed to by `x`.
#[link_name = "llvm.x86.sse4a.movnt.ss"]
18+
fn movntss(x: *mut f32, y: f32x4);
19+
}
20+
21+
// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
22+
// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ
23+
24+
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
25+
///
26+
/// The [13:8] bits of `y` specify the index of the bit-range to extract. The
27+
/// [5:0] bits of `y` specify the length of the bit-range to extract. All other
28+
/// bits are ignored.
29+
///
30+
/// If the length is zero, it is interpreted as `64`. If the length and index
31+
/// are zero, the lower 64 bits of `x` are extracted.
32+
///
33+
/// If `length == 0 && index > 0` or `length + index > 64` the result is
34+
/// undefined.
35+
#[inline(always)]
36+
#[target_feature = "+sse4a"]
37+
#[cfg_attr(test, assert_instr(extrq))]
38+
pub unsafe fn _mm_extract_si64(x: i64x2, y: i64x2) -> i64x2 {
39+
// The `extrq` intrinsic is declared with an `i8x16` control operand, so the
// bits of `y` are reinterpreted as that type without changing them.
extrq(x, mem::transmute(y))
40+
}
41+
42+
/// Inserts the `[length-1:0]` bits of `y` into `x` at `index`.
43+
///
44+
/// The bits of `y`:
45+
///
46+
/// - `[69:64]` specify the `length`,
47+
/// - `[77:72]` specify the index.
48+
///
49+
/// If the `length` is zero it is interpreted as `64`. If `index + length > 64`
50+
/// or `index > 0 && length == 0` the result is undefined.
51+
#[inline(always)]
52+
#[target_feature = "+sse4a"]
53+
#[cfg_attr(test, assert_instr(insertq))]
54+
pub unsafe fn _mm_insert_si64(x: i64x2, y: i64x2) -> i64x2 {
55+
// `insertq` is declared to take `y` as an `i64x2` already, so it is passed
// through directly; no bit reinterpretation is required.
insertq(x, y)
56+
}
57+
58+
/// Non-temporal store of `a.0` into `p`.
///
/// Only the lowest `f64` lane of `a` is written (the `movntsd` instruction
/// stores the low 64 bits of its source register); a single `f64` is written
/// at `p`.
59+
#[inline(always)]
60+
#[target_feature = "+sse4a"]
61+
#[cfg_attr(test, assert_instr(movntsd))]
62+
pub unsafe fn _mm_stream_sd(p: *mut f64, a: f64x2) {
63+
movntsd(p, a);
64+
}
65+
66+
/// Non-temporal store of `a.0` into `p`.
///
/// Only the lowest `f32` lane of `a` is written (the `movntss` instruction
/// stores the low 32 bits of its source register); a single `f32` is written
/// at `p`.
67+
#[inline(always)]
68+
#[target_feature = "+sse4a"]
69+
#[cfg_attr(test, assert_instr(movntss))]
70+
pub unsafe fn _mm_stream_ss(p: *mut f32, a: f32x4) {
71+
movntss(p, a);
72+
}
73+
74+
// Unit tests for the SSE4a intrinsics; each test is declared with
// `#[simd_test = "sse4a"]`.
#[cfg(test)]
75+
mod tests {
76+
use stdsimd_test::simd_test;
77+
use x86::i686::sse4a;
78+
use v128::*;
79+
80+
#[simd_test = "sse4a"]
81+
unsafe fn _mm_extract_si64() {
82+
let b = 0b0110_0000_0000_i64;
83+
// ^^^^ bit range extracted
84+
let x = i64x2::new(b, 0);
85+
let v = 0b001000___00___000100_i64;
86+
// ^idx: 2^3 = 8 ^length = 2^2 = 4
87+
let y = i64x2::new(v, 0);
88+
let e = i64x2::new(0b0110_i64, 0);
89+
let r = sse4a::_mm_extract_si64(x, y);
90+
assert_eq!(r, e);
91+
}
92+
93+
#[simd_test = "sse4a"]
94+
unsafe fn _mm_insert_si64() {
95+
let i = 0b0110_i64;
96+
// ^^^^ bit range inserted
97+
let z = 0b1010_1010_1010i64;
98+
// ^^^^ bit range replaced
99+
let e = 0b0110_1010_1010i64;
100+
// ^^^^ replaced 1010 with 0110
101+
let x = i64x2::new(z, 0);
102+
let expected = i64x2::new(e, 0);
103+
let v = 0b001000___00___000100_i64;
104+
// ^idx: 2^3 = 8 ^length = 2^2 = 4
105+
let y = i64x2::new(i, v);
106+
let r = sse4a::_mm_insert_si64(x, y);
107+
assert_eq!(r, expected);
108+
}
109+
110+
#[repr(align(16))]
111+
struct MemoryF64 {
112+
data: [f64; 2],
113+
}
114+
115+
#[simd_test = "sse4a"]
116+
unsafe fn _mm_stream_sd() {
117+
let mut mem = MemoryF64 {
118+
data: [1.0_f64, 2.0],
119+
};
120+
{
121+
let vals = &mut mem.data;
122+
let d = vals.as_mut_ptr();
123+
124+
let x = f64x2::new(3.0, 4.0);
125+
126+
sse4a::_mm_stream_sd(d, x);
127+
}
128+
// MOVNTSD stores the low lane of `x` (3.0); the neighbouring element must
// be left untouched.
assert_eq!(mem.data[0], 3.0);
129+
assert_eq!(mem.data[1], 2.0);
130+
}
131+
132+
#[repr(align(16))]
133+
struct MemoryF32 {
134+
data: [f32; 4],
135+
}
136+
137+
#[simd_test = "sse4a"]
138+
unsafe fn _mm_stream_ss() {
139+
let mut mem = MemoryF32 {
140+
data: [1.0_f32, 2.0, 3.0, 4.0],
141+
};
142+
{
143+
let vals = &mut mem.data;
144+
let d = vals.as_mut_ptr();
145+
146+
let x = f32x4::new(5.0, 6.0, 7.0, 8.0);
147+
148+
sse4a::_mm_stream_ss(d, x);
149+
}
150+
// MOVNTSS stores the low lane of `x` (5.0); the remaining elements must be
// left untouched.
assert_eq!(mem.data[0], 5.0);
151+
assert_eq!(mem.data[1], 2.0);
152+
assert_eq!(mem.data[2], 3.0);
153+
assert_eq!(mem.data[3], 4.0);
154+
}
155+
}

0 commit comments

Comments
 (0)