|
| 1 | +//! `i686`'s Streaming SIMD Extensions 4a (SSE4a) |
| 2 | +
|
| 3 | +use core::mem; |
| 4 | +use v128::*; |
| 5 | + |
| 6 | +#[cfg(test)] |
| 7 | +use stdsimd_test::assert_instr; |
| 8 | + |
| 9 | +#[allow(improper_ctypes)] |
| 10 | +extern "C" { |
| 11 | + #[link_name = "llvm.x86.sse4a.extrq"] |
| 12 | + fn extrq(x: i64x2, y: i8x16) -> i64x2; |
| 13 | + #[link_name = "llvm.x86.sse4a.insertq"] |
| 14 | + fn insertq(x: i64x2, y: i64x2) -> i64x2; |
| 15 | + #[link_name = "llvm.x86.sse4a.movnt.sd"] |
| 16 | + fn movntsd(x: *mut f64, y: f64x2); |
| 17 | + #[link_name = "llvm.x86.sse4a.movnt.ss"] |
| 18 | + fn movntss(x: *mut f32, y: f32x4); |
| 19 | +} |
| 20 | + |
| 21 | +// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ |
| 22 | +// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ |
| 23 | + |
| 24 | +/// Extracts the bit range specified by `y` from the lower 64 bits of `x`. |
| 25 | +/// |
| 26 | +/// The [13:8] bits of `y` specify the index of the bit-range to extract. The |
| 27 | +/// [5:0] bits of `y` specify the length of the bit-range to extract. All other |
| 28 | +/// bits are ignored. |
| 29 | +/// |
| 30 | +/// If the length is zero, it is interpreted as `64`. If the length and index |
| 31 | +/// are zero, the lower 64 bits of `x` are extracted. |
| 32 | +/// |
| 33 | +/// If `length == 0 && index > 0` or `lenght + index > 64` the result is |
| 34 | +/// undefined. |
| 35 | +#[inline(always)] |
| 36 | +#[target_feature = "+sse4a"] |
| 37 | +#[cfg_attr(test, assert_instr(extrq))] |
| 38 | +pub unsafe fn _mm_extract_si64(x: i64x2, y: i64x2) -> i64x2 { |
| 39 | + extrq(x, mem::transmute(y)) |
| 40 | +} |
| 41 | + |
| 42 | +/// Inserts the `[length:0]` bits of `y` into `x` at `index`. |
| 43 | +/// |
| 44 | +/// The bits of `y`: |
| 45 | +/// |
| 46 | +/// - `[69:64]` specify the `length`, |
| 47 | +/// - `[77:72]` specify the index. |
| 48 | +/// |
| 49 | +/// If the `length` is zero it is interpreted as `64`. If `index + length > 64` |
| 50 | +/// or `index > 0 && length == 0` the result is undefined. |
| 51 | +#[inline(always)] |
| 52 | +#[target_feature = "+sse4a"] |
| 53 | +#[cfg_attr(test, assert_instr(insertq))] |
| 54 | +pub unsafe fn _mm_insert_si64(x: i64x2, y: i64x2) -> i64x2 { |
| 55 | + insertq(x, mem::transmute(y)) |
| 56 | +} |
| 57 | + |
| 58 | +/// Non-temporal store of `a.1` into `p`. |
| 59 | +#[inline(always)] |
| 60 | +#[target_feature = "+sse4a"] |
| 61 | +#[cfg_attr(test, assert_instr(movntsd))] |
| 62 | +pub unsafe fn _mm_stream_sd(p: *mut f64, a: f64x2) { |
| 63 | + movntsd(p, a); |
| 64 | +} |
| 65 | + |
| 66 | +/// Non-temporal store of `a.3` into `p`. |
| 67 | +#[inline(always)] |
| 68 | +#[target_feature = "+sse4a"] |
| 69 | +#[cfg_attr(test, assert_instr(movntss))] |
| 70 | +pub unsafe fn _mm_stream_ss(p: *mut f32, a: f32x4) { |
| 71 | + movntss(p, a); |
| 72 | +} |
| 73 | + |
#[cfg(test)]
mod tests {
    use stdsimd_test::simd_test;
    use x86::i686::sse4a;
    use v128::*;

    #[simd_test = "sse4a"]
    unsafe fn _mm_extract_si64() {
        // Bits [11:8] of the source hold the bit range to extract (0b0110).
        let b = 0b0110_0000_0000_i64;
        let x = i64x2::new(b, 0);
        // Control word: index = 8 in bits [13:8], length = 4 in bits [5:0].
        let v = 0b001000___00___000100_i64;
        let y = i64x2::new(v, 0);
        let e = i64x2::new(0b0110_i64, 0);
        let r = sse4a::_mm_extract_si64(x, y);
        assert_eq!(r, e);
    }

    #[simd_test = "sse4a"]
    unsafe fn _mm_insert_si64() {
        // The four bits to insert.
        let i = 0b0110_i64;
        // Destination: bits [11:8] (0b1010) are the range being replaced.
        let z = 0b1010_1010_1010i64;
        // Expected: bits [11:8] now hold 0b0110, everything else is unchanged.
        let e = 0b0110_1010_1010i64;
        let x = i64x2::new(z, 0);
        let expected = i64x2::new(e, 0);
        // Descriptor (upper half of `y`): index = 8 in bits [13:8] of that
        // half, length = 4 in its bits [5:0].
        let v = 0b001000___00___000100_i64;
        let y = i64x2::new(i, v);
        let r = sse4a::_mm_insert_si64(x, y);
        assert_eq!(r, expected);
    }

    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test = "sse4a"]
    unsafe fn _mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = f64x2::new(3.0, 4.0);

            sse4a::_mm_stream_sd(d, x);
        }
        // Only the lowest element of `x` (3.0) is stored; the neighbouring
        // slot must be left untouched.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test = "sse4a"]
    unsafe fn _mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = f32x4::new(5.0, 6.0, 7.0, 8.0);

            sse4a::_mm_stream_ss(d, x);
        }
        // Only the lowest element of `x` (5.0) is stored; the remaining
        // slots must be left untouched.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
0 commit comments