From b5f1ab7780bb6534566ab2bd9b2f0a658cbb6822 Mon Sep 17 00:00:00 2001 From: Trevor Elliott Date: Tue, 23 Aug 2022 11:22:49 -0700 Subject: [PATCH] x64: Lower stack_addr, udiv, sdiv, urem, srem, umulhi, smulhi in ISLE (#4741) Lower stack_addr, udiv, sdiv, urem, srem, umulhi, and smulhi in ISLE. For udiv, sdiv, urem, and srem I opted to move the original lowering into an extern constructor, as the interactions with rax and rdx for the div instruction didn't seem meaningful to implement in ISLE. However, I'm happy to revisit this choice and move more of the embedding into ISLE. --- cranelift/codegen/src/isa/x64/inst.isle | 21 ++- cranelift/codegen/src/isa/x64/inst/args.rs | 2 +- .../codegen/src/isa/x64/inst/emit_tests.rs | 17 ++ cranelift/codegen/src/isa/x64/inst/mod.rs | 17 -- cranelift/codegen/src/isa/x64/lower.isle | 59 +++++++ cranelift/codegen/src/isa/x64/lower.rs | 148 +----------------- cranelift/codegen/src/isa/x64/lower/isle.rs | 102 ++++++++++++ .../filetests/filetests/isa/x64/sdiv.clif | 67 ++++++++ .../filetests/filetests/isa/x64/smulhi.clif | 51 ++++++ .../filetests/filetests/isa/x64/srem.clif | 71 +++++++++ .../filetests/filetests/isa/x64/udiv.clif | 67 ++++++++ .../filetests/filetests/isa/x64/umulhi.clif | 51 ++++++ .../filetests/filetests/isa/x64/urem.clif | 71 +++++++++ 13 files changed, 585 insertions(+), 159 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/sdiv.clif create mode 100644 cranelift/filetests/filetests/isa/x64/smulhi.clif create mode 100644 cranelift/filetests/filetests/isa/x64/srem.clif create mode 100644 cranelift/filetests/filetests/isa/x64/udiv.clif create mode 100644 cranelift/filetests/filetests/isa/x64/umulhi.clif create mode 100644 cranelift/filetests/filetests/isa/x64/urem.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index cabbffc34348..405caffac681 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -2738,7 +2738,7 @@ (rule (mul_hi ty signed src1 src2) (let ((dst_lo WritableGpr (temp_writable_gpr)) (dst_hi WritableGpr (temp_writable_gpr)) - (size OperandSize (operand_size_of_type_32_64 ty)) + (size OperandSize (raw_operand_size_of_type ty)) (_ Unit (emit (MInst.MulHi size signed src1 @@ -3587,6 +3587,25 @@ (rule (bitcast_gpr_to_xmm $I64 src) (gpr_to_xmm (SseOpcode.Movq) src (OperandSize.Size64))) +;;;; Stack Addresses ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl stack_addr_impl (StackSlot Offset32) Gpr) +(rule (stack_addr_impl stack_slot offset) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit (abi_stackslot_addr dst stack_slot offset)))) + dst)) + +;;;; Division/Remainders ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl emit_div_or_rem (DivOrRemKind Type WritableGpr Gpr Gpr) Unit) +(extern constructor emit_div_or_rem emit_div_or_rem) + +(decl div_or_rem (DivOrRemKind Value Value) Gpr) +(rule (div_or_rem kind a @ (value_type ty) b) + (let ((dst WritableGpr (temp_writable_gpr)) + (_ Unit (emit_div_or_rem kind ty dst a b))) + dst)) + ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (convert Gpr InstOutput output_gpr) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index eddfc6cd948b..f364747a917e 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1575,7 +1575,7 @@ impl fmt::Display for ShiftKind { } /// What kind of division or remainer 
instruction this is? -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub enum DivOrRemKind { SignedDiv, UnsignedDiv, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index b9a3a94ffbcd..ee01f41a1346 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -49,6 +49,23 @@ impl Inst { dst: WritableXmm::from_writable_reg(dst).unwrap(), } } + + fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst { + debug_assert!(size.is_one_of(&[ + OperandSize::Size16, + OperandSize::Size32, + OperandSize::Size64 + ])); + rhs.assert_regclass_is(RegClass::Int); + Inst::MulHi { + size, + signed, + src1: Gpr::new(regs::rax()).unwrap(), + src2: GprMem::new(rhs).unwrap(), + dst_lo: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()), + dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()), + } + } } #[test] diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index a7f221c0261c..83cca60a32f6 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -208,23 +208,6 @@ impl Inst { } } - pub(crate) fn mul_hi(size: OperandSize, signed: bool, rhs: RegMem) -> Inst { - debug_assert!(size.is_one_of(&[ - OperandSize::Size16, - OperandSize::Size32, - OperandSize::Size64 - ])); - rhs.assert_regclass_is(RegClass::Int); - Inst::MulHi { - size, - signed, - src1: Gpr::new(regs::rax()).unwrap(), - src2: GprMem::new(rhs).unwrap(), - dst_lo: WritableGpr::from_reg(Gpr::new(regs::rax()).unwrap()), - dst_hi: WritableGpr::from_reg(Gpr::new(regs::rdx()).unwrap()), - } - } - pub(crate) fn checked_div_or_rem_seq( kind: DivOrRemKind, size: OperandSize, diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index 10cdd7fd861a..98fffdb53ff2 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3426,3 +3426,62 @@ (rule (lower (has_type (use_sse41) (trunc a @ (value_type $F64X2)))) (x64_roundpd a (RoundImm.RoundZero))) + +;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (stack_addr stack_slot offset)) + (stack_addr_impl stack_slot offset)) + +;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (udiv a @ (value_type ty) b)) + (div_or_rem (DivOrRemKind.UnsignedDiv) a b)) + +;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (sdiv a @ (value_type ty) b)) + (div_or_rem (DivOrRemKind.SignedDiv) a b)) + +;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (urem a @ (value_type ty) b)) + (div_or_rem (DivOrRemKind.UnsignedRem) a b)) + +;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (srem a @ (value_type ty) b)) + (div_or_rem (DivOrRemKind.SignedRem) a b)) + +;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (umulhi a @ (value_type $I16) b)) + (let ((res ValueRegs (mul_hi $I16 $false a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) + +(rule (lower (umulhi a @ (value_type $I32) b)) + (let ((res ValueRegs (mul_hi $I32 $false a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) + +(rule (lower (umulhi a @ (value_type $I64) b)) + (let ((res ValueRegs (mul_hi $I64 $false a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) + +;; Rules for `smulhi` 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (smulhi a @ (value_type $I16) b)) + (let ((res ValueRegs (mul_hi $I16 $true a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) + +(rule (lower (smulhi a @ (value_type $I32) b)) + (let ((res ValueRegs (mul_hi $I32 $true a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) + +(rule (lower (smulhi a @ (value_type $I64) b)) + (let ((res ValueRegs (mul_hi $I64 $true a b)) + (hi Gpr (value_regs_get_gpr res 1))) + hi)) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 3fab7062a491..c074521c48cd 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -14,7 +14,6 @@ use crate::machinst::*; use crate::result::CodegenResult; use crate::settings::{Flags, TlsModel}; use smallvec::SmallVec; -use std::convert::TryFrom; use target_lexicon::Triple; //============================================================================= @@ -574,150 +573,19 @@ fn lower_insn_to_regs( | Opcode::Ceil | Opcode::Floor | Opcode::Nearest - | Opcode::Trunc => { + | Opcode::Trunc + | Opcode::StackAddr + | Opcode::Udiv + | Opcode::Urem + | Opcode::Sdiv + | Opcode::Srem + | Opcode::Umulhi + | Opcode::Smulhi => { implemented_in_isle(ctx); } Opcode::DynamicStackAddr => unimplemented!("DynamicStackAddr"), - Opcode::StackAddr => { - let (stack_slot, offset) = match *ctx.data(insn) { - InstructionData::StackLoad { - opcode: Opcode::StackAddr, - stack_slot, - offset, - } => (stack_slot, offset), - _ => unreachable!(), - }; - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let offset: i32 = offset.into(); - let inst = - ctx.abi() - .sized_stackslot_addr(stack_slot, u32::try_from(offset).unwrap(), dst); - ctx.emit(inst); - } - - Opcode::Udiv | Opcode::Urem | Opcode::Sdiv | Opcode::Srem => { - let kind = match op { - Opcode::Udiv => DivOrRemKind::UnsignedDiv, - Opcode::Sdiv => DivOrRemKind::SignedDiv, - Opcode::Urem => DivOrRemKind::UnsignedRem, - Opcode::Srem => DivOrRemKind::SignedRem, - _ => unreachable!(), - }; - let is_div = kind.is_div(); - - let input_ty = ctx.input_ty(insn, 0); - let size = OperandSize::from_ty(input_ty); - - let dividend = put_input_in_reg(ctx, inputs[0]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - ctx.emit(Inst::gen_move( - Writable::from_reg(regs::rax()), - dividend, - input_ty, - )); - - // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly. - if flags.avoid_div_traps() || op == Opcode::Srem { - // A vcode meta-instruction is used to lower the inline checks, since they embed - // pc-relative offsets that must not change, thus requiring regalloc to not - // interfere by introducing spills and reloads. - // - // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that - // regalloc is aware of the coalescing opportunity between rax/rdx and the - // destination register. 
- let divisor = put_input_in_reg(ctx, inputs[1]); - - let divisor_copy = ctx.alloc_tmp(types::I64).only_reg().unwrap(); - ctx.emit(Inst::gen_move(divisor_copy, divisor, types::I64)); - - let tmp = if op == Opcode::Sdiv && size == OperandSize::Size64 { - Some(ctx.alloc_tmp(types::I64).only_reg().unwrap()) - } else { - None - }; - // TODO use xor - ctx.emit(Inst::imm( - OperandSize::Size32, - 0, - Writable::from_reg(regs::rdx()), - )); - ctx.emit(Inst::checked_div_or_rem_seq(kind, size, divisor_copy, tmp)); - } else { - // We don't want more than one trap record for a single instruction, - // so let's not allow the "mem" case (load-op merging) here; force - // divisor into a register instead. - let divisor = RegMem::reg(put_input_in_reg(ctx, inputs[1])); - - // Fill in the high parts: - if kind.is_signed() { - // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for - // signed opcodes. - ctx.emit(Inst::sign_extend_data(size)); - } else if input_ty == types::I8 { - ctx.emit(Inst::movzx_rm_r( - ExtMode::BL, - RegMem::reg(regs::rax()), - Writable::from_reg(regs::rax()), - )); - } else { - // zero for unsigned opcodes. - ctx.emit(Inst::imm( - OperandSize::Size64, - 0, - Writable::from_reg(regs::rdx()), - )); - } - - // Emit the actual idiv. - ctx.emit(Inst::div(size, kind.is_signed(), divisor)); - } - - // Move the result back into the destination reg. - if is_div { - // The quotient is in rax. - ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); - } else { - if size == OperandSize::Size8 { - // The remainder is in AH. Right-shift by 8 bits then move from rax. - ctx.emit(Inst::shift_r( - OperandSize::Size64, - ShiftKind::ShiftRightLogical, - Some(8), - Writable::from_reg(regs::rax()), - )); - ctx.emit(Inst::gen_move(dst, regs::rax(), input_ty)); - } else { - // The remainder is in rdx. - ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); - } - } - } - - Opcode::Umulhi | Opcode::Smulhi => { - let input_ty = ctx.input_ty(insn, 0); - - let lhs = put_input_in_reg(ctx, inputs[0]); - let rhs = input_to_reg_mem(ctx, inputs[1]); - let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - // Move lhs in %rax. - ctx.emit(Inst::gen_move( - Writable::from_reg(regs::rax()), - lhs, - input_ty, - )); - - // Emit the actual mul or imul. - let signed = op == Opcode::Smulhi; - ctx.emit(Inst::mul_hi(OperandSize::from_ty(input_ty), signed, rhs)); - - // Read the result from the high part (stored in %rdx). - ctx.emit(Inst::gen_move(dst, regs::rdx(), input_ty)); - } - Opcode::GetPinnedReg => { let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); ctx.emit(Inst::gen_move(dst, regs::pinned_reg(), types::I64)); diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs index 4d0dfea4a4fd..504fd18bf411 100644 --- a/cranelift/codegen/src/isa/x64/lower/isle.rs +++ b/cranelift/codegen/src/isa/x64/lower/isle.rs @@ -848,6 +848,108 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { self.lower_ctx .use_constant(VCodeConstantData::WellKnown(&UMAX_MASK)) } + + fn emit_div_or_rem( + &mut self, + kind: &DivOrRemKind, + ty: Type, + dst: WritableGpr, + dividend: Gpr, + divisor: Gpr, + ) { + let is_div = kind.is_div(); + let size = OperandSize::from_ty(ty); + + self.lower_ctx.emit(MInst::gen_move( + Writable::from_reg(regs::rax()), + dividend.to_reg(), + ty, + )); + + // Always do explicit checks for `srem`: otherwise, INT_MIN % -1 is not handled properly. 
+ if self.flags.avoid_div_traps() || *kind == DivOrRemKind::SignedRem { + // A vcode meta-instruction is used to lower the inline checks, since they embed + // pc-relative offsets that must not change, thus requiring regalloc to not + // interfere by introducing spills and reloads. + // + // Note it keeps the result in $rax (for divide) or $rdx (for rem), so that + // regalloc is aware of the coalescing opportunity between rax/rdx and the + // destination register. + let divisor_copy = self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap(); + self.lower_ctx + .emit(MInst::gen_move(divisor_copy, divisor.to_reg(), types::I64)); + + let tmp = if *kind == DivOrRemKind::SignedDiv && size == OperandSize::Size64 { + Some(self.lower_ctx.alloc_tmp(types::I64).only_reg().unwrap()) + } else { + None + }; + // TODO use xor + self.lower_ctx.emit(MInst::imm( + OperandSize::Size32, + 0, + Writable::from_reg(regs::rdx()), + )); + self.lower_ctx.emit(MInst::checked_div_or_rem_seq( + kind.clone(), + size, + divisor_copy, + tmp, + )); + } else { + // We don't want more than one trap record for a single instruction, + // so let's not allow the "mem" case (load-op merging) here; force + // divisor into a register instead. + let divisor = RegMem::reg(divisor.to_reg()); + + // Fill in the high parts: + if kind.is_signed() { + // sign-extend the sign-bit of al into ah for size 1, or rax into rdx, for + // signed opcodes. + self.lower_ctx.emit(MInst::sign_extend_data(size)); + } else if ty == types::I8 { + self.lower_ctx.emit(MInst::movzx_rm_r( + ExtMode::BL, + RegMem::reg(regs::rax()), + Writable::from_reg(regs::rax()), + )); + } else { + // zero for unsigned opcodes. + self.lower_ctx.emit(MInst::imm( + OperandSize::Size64, + 0, + Writable::from_reg(regs::rdx()), + )); + } + + // Emit the actual idiv. + self.lower_ctx + .emit(MInst::div(size, kind.is_signed(), divisor)); + } + + // Move the result back into the destination reg. + if is_div { + // The quotient is in rax. + self.lower_ctx + .emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty)); + } else { + if size == OperandSize::Size8 { + // The remainder is in AH. Right-shift by 8 bits then move from rax. + self.lower_ctx.emit(MInst::shift_r( + OperandSize::Size64, + ShiftKind::ShiftRightLogical, + Some(8), + Writable::from_reg(regs::rax()), + )); + self.lower_ctx + .emit(MInst::gen_move(dst.to_writable_reg(), regs::rax(), ty)); + } else { + // The remainder is in rdx. 
+ self.lower_ctx + .emit(MInst::gen_move(dst.to_writable_reg(), regs::rdx(), ty)); + } + } + } } impl IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> { diff --git a/cranelift/filetests/filetests/isa/x64/sdiv.clif b/cranelift/filetests/filetests/isa/x64/sdiv.clif new file mode 100644 index 000000000000..c0f486c71f1c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/sdiv.clif @@ -0,0 +1,67 @@ +test compile precise-output +target x86_64 + +function %f1(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = sdiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; cbw %al, %dl +; idiv %al, (none), %sil, %al, %dl +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = sdiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; cwd %ax, %dx +; idiv %ax, %dx, %si, %ax, %dx +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = sdiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; cdq %eax, %edx +; idiv %eax, %edx, %esi, %eax, %edx +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sdiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; cqo %rax, %rdx +; idiv %rax, %rdx, %rsi, %rax, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/smulhi.clif b/cranelift/filetests/filetests/isa/x64/smulhi.clif new file mode 100644 index 000000000000..0958ce301b62 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/smulhi.clif @@ -0,0 +1,51 @@ +test compile precise-output +target x86_64 + +function %f1(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = smulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imul %ax, %si, %ax, %dx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = smulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imul %eax, %esi, %eax, %edx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = smulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; imul %rax, %rsi, %rax, %rdx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/srem.clif b/cranelift/filetests/filetests/isa/x64/srem.clif new file mode 100644 index 000000000000..99b137d56665 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/srem.clif @@ -0,0 +1,71 @@ +test compile precise-output +target x86_64 + +function %f1(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = srem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; srem_seq %al, %dl, %sil, %al, %dl, tmp=(none) +; shrq $8, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = srem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; srem_seq %ax, %dx, %si, %ax, %dx, tmp=(none) +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = srem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; 
srem_seq %eax, %edx, %esi, %eax, %edx, tmp=(none) +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = srem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; srem_seq %rax, %rdx, %rsi, %rax, %rdx, tmp=(none) +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/udiv.clif b/cranelift/filetests/filetests/isa/x64/udiv.clif new file mode 100644 index 000000000000..a49b5a027ef5 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/udiv.clif @@ -0,0 +1,67 @@ +test compile precise-output +target x86_64 + +function %f1(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = udiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movzbl %al, %eax +; div %al, (none), %sil, %al, %dl +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = udiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %ax, %dx, %si, %ax, %dx +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = udiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %eax, %edx, %esi, %eax, %edx +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = udiv v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %rax, %rdx, %rsi, %rax, %rdx +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/umulhi.clif b/cranelift/filetests/filetests/isa/x64/umulhi.clif new file mode 100644 index 000000000000..c5b0b73a26ac --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/umulhi.clif @@ -0,0 +1,51 @@ +test compile precise-output +target x86_64 + +function %f1(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = umulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mul %ax, %si, %ax, %dx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = umulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mul %eax, %esi, %eax, %edx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f3(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = umulhi v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; mul %rax, %rsi, %rax, %rdx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/urem.clif b/cranelift/filetests/filetests/isa/x64/urem.clif new file mode 100644 index 000000000000..5f4e80251f76 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/urem.clif @@ -0,0 +1,71 @@ +test compile precise-output +target x86_64 + +function %f1(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = urem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movzbl %al, %eax +; div %al, (none), %sil, %al, %dl +; shrq $8, %rax, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = urem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %ax, %dx, %si, %ax, %dx +; movq %rdx, %rax +; movq 
%rbp, %rsp +; popq %rbp +; ret + +function %f3(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = urem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %eax, %edx, %esi, %eax, %edx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = urem v0, v1 + return v2 +} + +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq %rdi, %rax +; movl $0, %edx +; div %rax, %rdx, %rsi, %rax, %rdx +; movq %rdx, %rax +; movq %rbp, %rsp +; popq %rbp +; ret +
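
Editor's note (not part of the patch): the lowering above always takes the checked
`srem` path because the hardware `idiv` instruction faults when the quotient
overflows, i.e. for INT_MIN / -1, even though the remainder for that input is
well defined (it is 0, which is what CLIF's `srem` is expected to produce).
The standalone Rust sketch below illustrates that distinction using standard
library integer ops only; it is an illustration of the motivation, not code
from this change.

    // Why INT_MIN % -1 needs an explicit check before `idiv`:
    fn main() {
        // `checked_rem` reports the case `idiv` would trap on (quotient overflow).
        assert_eq!(i32::MIN.checked_rem(-1), None);
        // The mathematically expected remainder for that input is 0,
        // which is what the explicit-check lowering path materializes.
        assert_eq!(i32::MIN.wrapping_rem(-1), 0);
        // Ordinary inputs behave identically on either path.
        assert_eq!(7i32.wrapping_rem(3), 1);
    }

The new .clif files above are precise-output filetests; they can typically be
exercised with Cranelift's filetest runner (for example via `clif-util test
<file>.clif`), though the exact invocation depends on the checkout.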