Skip to content

Commit 128decd

Browse files
authored
pulley: Initial scaffold of SIMD support (#9820)
* pulley: Initial scaffold of SIMD support. This commit fills out some of the initial infrastructure necessary for supporting the SIMD proposal to WebAssembly in the Pulley interpreter, namely 128-bit SIMD. The `VRegVal` union has been filled out with various types, endianness questions are settled, and initial implementations of a suite of opcodes are added to get a basic set of tests working throughout the backend. cc #9783 * Avoid dealing with big-endian vectors * Change wasm `global`s to store `v128` in little-endian format. * Change pulley stack loads/stores to work with vectors in little-endian format.
1 parent 9fd2b3a commit 128decd

File tree

16 files changed

+518
-77
lines changed

16 files changed

+518
-77
lines changed

cranelift/codegen/meta/src/pulley.rs

+44-16
Original file line numberDiff line numberDiff line change
@@ -27,19 +27,47 @@ const OPS: &[Inst<'_>] = pulley_interpreter::for_each_op!(define);
2727
const EXTENDED_OPS: &[Inst<'_>] = pulley_interpreter::for_each_extended_op!(define);
2828

2929
enum Operand<'a> {
30-
Normal { name: &'a str, ty: &'a str },
31-
Writable { name: &'a str, ty: &'a str },
32-
TrapCode { name: &'a str, ty: &'a str },
33-
Binop { reg: &'a str },
30+
Normal {
31+
name: &'a str,
32+
ty: &'a str,
33+
},
34+
Writable {
35+
name: &'a str,
36+
ty: &'a str,
37+
},
38+
TrapCode {
39+
name: &'a str,
40+
ty: &'a str,
41+
},
42+
Binop {
43+
dst: &'a str,
44+
src1: &'a str,
45+
src2: &'a str,
46+
},
3447
}
3548

3649
impl Inst<'_> {
3750
fn operands(&self) -> impl Iterator<Item = Operand<'_>> {
3851
self.fields
3952
.iter()
4053
.map(|(name, ty)| match (*name, *ty) {
41-
("operands", "BinaryOperands < XReg >") => Operand::Binop { reg: "XReg" },
42-
("operands", "BinaryOperands < FReg >") => Operand::Binop { reg: "FReg" },
54+
("operands", binop) => {
55+
// Parse "BinaryOperands < A >" as A/A/A
56+
// Parse "BinaryOperands < A, B >" as A/B/A
57+
// Parse "BinaryOperands < A, B, C >" as A/B/C
58+
let mut parts = binop
59+
.strip_prefix("BinaryOperands <")
60+
.unwrap()
61+
.strip_suffix(">")
62+
.unwrap()
63+
.trim()
64+
.split(',')
65+
.map(|x| x.trim());
66+
let dst = parts.next().unwrap();
67+
let src1 = parts.next().unwrap_or(dst);
68+
let src2 = parts.next().unwrap_or(dst);
69+
Operand::Binop { dst, src1, src2 }
70+
}
4371
("dst", ty) => Operand::Writable { name, ty },
4472
(name, ty) => Operand::Normal { name, ty },
4573
})
@@ -109,7 +137,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
109137
pat.push_str(",");
110138
format_string.push_str(&format!(" // trap={{{name}:?}}"));
111139
}
112-
Operand::Binop { reg: _ } => {
140+
Operand::Binop { .. } => {
113141
pat.push_str("dst, src1, src2,");
114142
format_string.push_str(" {dst}, {src1}, {src2}");
115143
locals.push_str(&format!("let dst = reg_name(*dst.to_reg());\n"));
@@ -161,7 +189,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
161189
}
162190
}
163191
Operand::TrapCode { .. } => {}
164-
Operand::Binop { reg: _ } => {
192+
Operand::Binop { .. } => {
165193
pat.push_str("dst, src1, src2,");
166194
uses.push("src1");
167195
uses.push("src2");
@@ -221,7 +249,7 @@ pub fn generate_rust(filename: &str, out_dir: &Path) -> Result<(), Error> {
221249
pat.push_str(",");
222250
trap.push_str(&format!("sink.add_trap({name});\n"));
223251
}
224-
Operand::Binop { reg: _ } => {
252+
Operand::Binop { .. } => {
225253
pat.push_str("dst, src1, src2,");
226254
args.push_str(
227255
"pulley_interpreter::regs::BinaryOperands::new(dst, src1, src2),",
@@ -265,10 +293,10 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
265293
Operand::Writable { name, ty } => {
266294
isle.push_str(&format!("\n ({name} Writable{ty})"));
267295
}
268-
Operand::Binop { reg } => {
269-
isle.push_str(&format!("\n (dst Writable{reg})"));
270-
isle.push_str(&format!("\n (src1 {reg})"));
271-
isle.push_str(&format!("\n (src2 {reg})"));
296+
Operand::Binop { dst, src1, src2 } => {
297+
isle.push_str(&format!("\n (dst Writable{dst})"));
298+
isle.push_str(&format!("\n (src1 {src1})"));
299+
isle.push_str(&format!("\n (src2 {src2})"));
272300
}
273301
}
274302
}
@@ -303,13 +331,13 @@ pub fn generate_isle(filename: &str, out_dir: &Path) -> Result<(), Error> {
303331
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
304332
result = Some(ty);
305333
}
306-
Operand::Binop { reg } => {
307-
isle.push_str(&format!("{reg} {reg}"));
334+
Operand::Binop { dst, src1, src2 } => {
335+
isle.push_str(&format!("{src1} {src2}"));
308336
rule.push_str("src1 src2");
309337
ops.push("src1");
310338
ops.push("src2");
311339
assert!(result.is_none(), "{} has >1 result", inst.snake_name);
312-
result = Some(reg);
340+
result = Some(dst);
313341
}
314342
}
315343
isle.push_str(" ");

cranelift/codegen/src/isa/pulley_shared/abi.rs

+25-12
Original file line numberDiff line numberDiff line change
@@ -160,11 +160,23 @@ where
160160
}
161161

162162
fn gen_load_stack(mem: StackAMode, into_reg: Writable<Reg>, ty: Type) -> Self::I {
163-
Inst::gen_load(into_reg, mem.into(), ty, MemFlags::trusted()).into()
163+
let mut flags = MemFlags::trusted();
164+
// Stack loads/stores of vectors always use little-endianness to avoid
165+
// implementing a byte-swap of vectors on big-endian platforms.
166+
if ty.is_vector() {
167+
flags.set_endianness(ir::Endianness::Little);
168+
}
169+
Inst::gen_load(into_reg, mem.into(), ty, flags).into()
164170
}
165171

166172
fn gen_store_stack(mem: StackAMode, from_reg: Reg, ty: Type) -> Self::I {
167-
Inst::gen_store(mem.into(), from_reg, ty, MemFlags::trusted()).into()
173+
let mut flags = MemFlags::trusted();
174+
// Stack loads/stores of vectors always use little-endianness to avoid
175+
// implementing a byte-swap of vectors on big-endian platforms.
176+
if ty.is_vector() {
177+
flags.set_endianness(ir::Endianness::Little);
178+
}
179+
Inst::gen_store(mem.into(), from_reg, ty, flags).into()
168180
}
169181

170182
fn gen_move(to_reg: Writable<Reg>, from_reg: Reg, ty: Type) -> Self::I {
@@ -510,17 +522,18 @@ where
510522
_target_vector_bytes: u32,
511523
_isa_flags: &PulleyFlags,
512524
) -> u32 {
525+
// Spill slots are the size of a "word" or a pointer, but Pulley
526+
// registers are 8-byte for integers/floats regardless of pointer size.
527+
// Calculate the number of slots necessary to store 8 bytes.
528+
let slots_for_8bytes = match P::pointer_width() {
529+
PointerWidth::PointerWidth32 => 2,
530+
PointerWidth::PointerWidth64 => 1,
531+
};
513532
match rc {
514-
// Spilling an integer or float register requires spilling 8 bytes,
515-
// and spill slots are defined in terms of "word bytes" or the size
516-
// of a pointer. That means on 32-bit pulley we need to take up two
517-
// spill slots where on 64-bit pulley we need to only take up one
518-
// spill slot for integers.
519-
RegClass::Int | RegClass::Float => match P::pointer_width() {
520-
PointerWidth::PointerWidth32 => 2,
521-
PointerWidth::PointerWidth64 => 1,
522-
},
523-
RegClass::Vector => unreachable!(),
533+
// Int/float registers are 8-bytes
534+
RegClass::Int | RegClass::Float => slots_for_8bytes,
535+
// Vector registers are 16 bytes
536+
RegClass::Vector => 2 * slots_for_8bytes,
524537
}
525538
}
526539

cranelift/codegen/src/isa/pulley_shared/inst.isle

+10
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,16 @@
414414
(rule (pulley_fstore amode src ty flags)
415415
(SideEffectNoResult.Inst (MInst.FStore amode src ty flags)))
416416

417+
(decl pulley_vload (Amode Type MemFlags) VReg)
418+
(rule (pulley_vload amode ty flags)
419+
(let ((dst WritableVReg (temp_writable_vreg))
420+
(_ Unit (emit (MInst.VLoad dst amode ty flags))))
421+
dst))
422+
423+
(decl pulley_vstore (Amode VReg Type MemFlags) SideEffectNoResult)
424+
(rule (pulley_vstore amode src ty flags)
425+
(SideEffectNoResult.Inst (MInst.VStore amode src ty flags)))
426+
417427
(decl gen_br_table (XReg MachLabel BoxVecMachLabel) Unit)
418428
(rule (gen_br_table idx default labels)
419429
(emit (MInst.BrTable idx default labels)))

cranelift/codegen/src/isa/pulley_shared/inst/mod.rs

+2-12
Original file line numberDiff line numberDiff line change
@@ -453,18 +453,8 @@ where
453453
}
454454

455455
fn worst_case_size() -> CodeOffset {
456-
// `BrIfXeq32 { a, b, taken, not_taken }` expands to `br_if_xeq32 a, b, taken; jump not_taken`.
457-
//
458-
// The first instruction is seven bytes long:
459-
// * 1 byte opcode
460-
// * 1 byte `a` register encoding
461-
// * 1 byte `b` register encoding
462-
// * 4 byte `taken` displacement
463-
//
464-
// And the second instruction is five bytes long:
465-
// * 1 byte opcode
466-
// * 4 byte `not_taken` displacement
467-
12
456+
// `Vconst128 { dst, imm }` is 18 bytes (opcode + dst + 16-byte imm)
457+
18
468458
}
469459

470460
fn ref_type_regclass(_settings: &settings::Flags) -> RegClass {

cranelift/codegen/src/isa/pulley_shared/lower.isle

+35
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,11 @@
143143
(rule (lower (has_type $I64 (iadd a b)))
144144
(pulley_xadd64 a b))
145145

146+
(rule (lower (has_type $I8X16 (iadd a b))) (pulley_vaddi8x16 a b))
147+
(rule (lower (has_type $I16X8 (iadd a b))) (pulley_vaddi16x8 a b))
148+
(rule (lower (has_type $I32X4 (iadd a b))) (pulley_vaddi32x4 a b))
149+
(rule (lower (has_type $I64X2 (iadd a b))) (pulley_vaddi64x2 a b))
150+
146151
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
147152

148153
(rule (lower (has_type $I8 (isub a b)))
@@ -192,6 +197,11 @@
192197
(rule (lower (has_type $I64 (ishl a b)))
193198
(pulley_xshl64 a b))
194199

200+
(rule (lower (has_type $I8X16 (ishl a b))) (pulley_vshli8x16 a b))
201+
(rule (lower (has_type $I16X8 (ishl a b))) (pulley_vshli16x8 a b))
202+
(rule (lower (has_type $I32X4 (ishl a b))) (pulley_vshli32x4 a b))
203+
(rule (lower (has_type $I64X2 (ishl a b))) (pulley_vshli64x2 a b))
204+
195205
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
196206

197207
(rule (lower (has_type $I32 (ushr a b)))
@@ -200,6 +210,11 @@
200210
(rule (lower (has_type $I64 (ushr a b)))
201211
(pulley_xshr64_u a b))
202212

213+
(rule (lower (has_type $I8X16 (ushr a b))) (pulley_vshri8x16_u a b))
214+
(rule (lower (has_type $I16X8 (ushr a b))) (pulley_vshri16x8_u a b))
215+
(rule (lower (has_type $I32X4 (ushr a b))) (pulley_vshri32x4_u a b))
216+
(rule (lower (has_type $I64X2 (ushr a b))) (pulley_vshri64x2_u a b))
217+
203218
;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
204219

205220
(rule (lower (has_type $I32 (sshr a b)))
@@ -208,6 +223,11 @@
208223
(rule (lower (has_type $I64 (sshr a b)))
209224
(pulley_xshr64_s a b))
210225

226+
(rule (lower (has_type $I8X16 (sshr a b))) (pulley_vshri8x16_s a b))
227+
(rule (lower (has_type $I16X8 (sshr a b))) (pulley_vshri16x8_s a b))
228+
(rule (lower (has_type $I32X4 (sshr a b))) (pulley_vshri32x4_s a b))
229+
(rule (lower (has_type $I64X2 (sshr a b))) (pulley_vshri64x2_s a b))
230+
211231
;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
212232

213233
(rule 0 (lower (has_type (fits_in_32 _) (band a b)))
@@ -414,6 +434,9 @@
414434
(rule 1 (lower (has_type $I64 (sload32 flags addr offset)))
415435
(pulley_xload (amode addr offset) $I32 flags (ExtKind.Sign64)))
416436

437+
(rule 2 (lower (has_type (ty_vec128 ty) (load flags addr offset)))
438+
(pulley_vload (amode addr offset) ty flags))
439+
417440
;;;; Rules for `store` and friends ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
418441

419442
(rule (lower (store flags src @ (value_type (ty_int ty)) addr offset))
@@ -431,6 +454,9 @@
431454
(rule (lower (istore32 flags src addr offset))
432455
(side_effect (pulley_xstore (amode addr offset) src $I32 flags)))
433456

457+
(rule 2 (lower (store flags src @ (value_type (ty_vec128 ty)) addr offset))
458+
(side_effect (pulley_vstore (amode addr offset) src ty flags)))
459+
434460
;;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
435461

436462
(rule (lower (stack_addr stack_slot offset))
@@ -522,6 +548,9 @@
522548
(rule (lower (has_type $I64 (bitcast _flags val @ (value_type $F64))))
523549
(pulley_bitcast_int_from_float_64 val))
524550

551+
(rule 1 (lower (has_type (ty_vec128 _) (bitcast _flags val @ (value_type (ty_vec128 _)))))
552+
val)
553+
525554
;;;; Rules for `fcvt_to_{u,s}int` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
526555

527556
(rule (lower (has_type $I32 (fcvt_to_uint val @ (value_type $F32))))
@@ -622,6 +651,8 @@
622651

623652
(rule (lower (has_type $F32 (fadd a b))) (pulley_fadd32 a b))
624653
(rule (lower (has_type $F64 (fadd a b))) (pulley_fadd64 a b))
654+
(rule (lower (has_type $F32X4 (fadd a b))) (pulley_vaddf32x4 a b))
655+
(rule (lower (has_type $F64X2 (fadd a b))) (pulley_vaddf64x2 a b))
625656

626657
;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
627658

@@ -687,3 +718,7 @@
687718

688719
(rule (lower (has_type $F32 (fabs a))) (pulley_fabs32 a))
689720
(rule (lower (has_type $F64 (fabs a))) (pulley_fabs64 a))
721+
722+
;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
723+
724+
(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant a)))) (pulley_vconst128 a))

crates/cranelift/src/translate/code_translator.rs

+10
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,12 @@ pub fn translate_operator(
177177
GlobalVariable::Memory { gv, offset, ty } => {
178178
let addr = builder.ins().global_value(environ.pointer_type(), gv);
179179
let mut flags = ir::MemFlags::trusted();
180+
// Store vector globals in little-endian format to avoid
181+
// byte swaps on big-endian platforms since at-rest vectors
182+
// should already be in little-endian format anyway.
183+
if ty.is_vector() {
184+
flags.set_endianness(ir::Endianness::Little);
185+
}
180186
// Put globals in the "table" abstract heap category as well.
181187
flags.set_alias_region(Some(ir::AliasRegion::Table));
182188
builder.ins().load(ty, flags, addr, offset)
@@ -191,6 +197,10 @@ pub fn translate_operator(
191197
GlobalVariable::Memory { gv, offset, ty } => {
192198
let addr = builder.ins().global_value(environ.pointer_type(), gv);
193199
let mut flags = ir::MemFlags::trusted();
200+
// Like `global.get`, vector globals are stored in little-endian format.
201+
if ty.is_vector() {
202+
flags.set_endianness(ir::Endianness::Little);
203+
}
194204
// Put globals in the "table" abstract heap category as well.
195205
flags.set_alias_region(Some(ir::AliasRegion::Table));
196206
let mut val = state.pop1();

crates/wasmtime/src/runtime/externals/global.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ impl Global {
114114
ValType::I64 => Val::from(*definition.as_i64()),
115115
ValType::F32 => Val::F32(*definition.as_u32()),
116116
ValType::F64 => Val::F64(*definition.as_u64()),
117-
ValType::V128 => Val::V128((*definition.as_u128()).into()),
117+
ValType::V128 => Val::V128(definition.get_u128().into()),
118118
ValType::Ref(ref_ty) => {
119119
let reference: Ref = match ref_ty.heap_type() {
120120
HeapType::Func | HeapType::ConcreteFunc(_) => {
@@ -187,7 +187,7 @@ impl Global {
187187
Val::I64(i) => *definition.as_i64_mut() = i,
188188
Val::F32(f) => *definition.as_u32_mut() = f,
189189
Val::F64(f) => *definition.as_u64_mut() = f,
190-
Val::V128(i) => *definition.as_u128_mut() = i.into(),
190+
Val::V128(i) => definition.set_u128(i.into()),
191191
Val::FuncRef(f) => {
192192
*definition.as_func_ref_mut() = f.map_or(ptr::null_mut(), |f| {
193193
f.vm_func_ref(&mut store).as_ptr().cast()

crates/wasmtime/src/runtime/trampoline/global.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ pub fn generate_global_export(
3434
Val::I64(x) => *global.as_i64_mut() = x,
3535
Val::F32(x) => *global.as_f32_bits_mut() = x,
3636
Val::F64(x) => *global.as_f64_bits_mut() = x,
37-
Val::V128(x) => *global.as_u128_mut() = x.into(),
37+
Val::V128(x) => global.set_u128(x.into()),
3838
Val::FuncRef(f) => {
3939
*global.as_func_ref_mut() =
4040
f.map_or(ptr::null_mut(), |f| f.vm_func_ref(&mut store).as_ptr());

crates/wasmtime/src/runtime/vm/vmcontext.rs

+14-8
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ impl VMGlobalDefinition {
447447
WasmValType::I64 => *global.as_i64_mut() = raw.get_i64(),
448448
WasmValType::F32 => *global.as_f32_bits_mut() = raw.get_f32(),
449449
WasmValType::F64 => *global.as_f64_bits_mut() = raw.get_f64(),
450-
WasmValType::V128 => *global.as_u128_mut() = raw.get_v128(),
450+
WasmValType::V128 => global.set_u128(raw.get_v128()),
451451
WasmValType::Ref(r) => match r.heap_type.top() {
452452
WasmHeapTopType::Extern => {
453453
let r = VMGcRef::from_raw_u32(raw.get_externref());
@@ -478,7 +478,7 @@ impl VMGlobalDefinition {
478478
WasmValType::I64 => ValRaw::i64(*self.as_i64()),
479479
WasmValType::F32 => ValRaw::f32(*self.as_f32_bits()),
480480
WasmValType::F64 => ValRaw::f64(*self.as_f64_bits()),
481-
WasmValType::V128 => ValRaw::v128(*self.as_u128()),
481+
WasmValType::V128 => ValRaw::v128(self.get_u128()),
482482
WasmValType::Ref(r) => match r.heap_type.top() {
483483
WasmHeapTopType::Extern => ValRaw::externref(match self.as_gc_ref() {
484484
Some(r) => store.gc_store_mut()?.clone_gc_ref(r).as_raw_u32(),
@@ -575,14 +575,20 @@ impl VMGlobalDefinition {
575575
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u64>())
576576
}
577577

578-
/// Return a reference to the value as an u128.
579-
pub unsafe fn as_u128(&self) -> &u128 {
580-
&*(self.storage.as_ref().as_ptr().cast::<u128>())
578+
/// Gets the underlying 128-bit vector value.
579+
//
580+
// Note that vectors are stored in little-endian format while other types
581+
// are stored in native-endian format.
582+
pub unsafe fn get_u128(&self) -> u128 {
583+
u128::from_le(*(self.storage.as_ref().as_ptr().cast::<u128>()))
581584
}
582585

583-
/// Return a mutable reference to the value as an u128.
584-
pub unsafe fn as_u128_mut(&mut self) -> &mut u128 {
585-
&mut *(self.storage.as_mut().as_mut_ptr().cast::<u128>())
586+
/// Sets the underlying 128-bit vector value.
587+
//
588+
// Note that vectors are stored in little-endian format while other types
589+
// are stored in native-endian format.
590+
pub unsafe fn set_u128(&mut self, val: u128) {
591+
*self.storage.as_mut().as_mut_ptr().cast::<u128>() = val.to_le();
586592
}
587593

588594
/// Return a reference to the value as u128 bits.

0 commit comments

Comments
 (0)