bytecodealliance · theotherjimmy · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
@@ -1189,11 +1189,13 @@
     (Add32x4)
     (Add64x2)
     (Add128)
+    (Add128Cout)
     (Sub8x16)
     (Sub16x8)
     (Sub32x4)
     (Sub64x2)
     (Sub128)
+    (Sub128Cout)
     ;; Multiplication
     (Mul8x16)
     (Mul16x8)
@@ -3842,6 +3844,10 @@
 (decl add_reg (Type Reg Reg) Reg)
 (rule (add_reg ty x y) (alu_rrr ty (aluop_add ty) x y))
 
+;; Flag-producing counterpart of `add_reg`: a three-register add built with
+;; `alu_rrr_with_flags_paired`, returned as a `ProducesFlags`.
+(decl add_reg_with_flags_paired (Type Reg Reg) ProducesFlags)
+(rule (add_reg_with_flags_paired ty lhs rhs)
+      (alu_rrr_with_flags_paired ty (aluop_add ty) lhs rhs))
+
 (decl add_reg_sext32 (Type Reg Reg) Reg)
 (rule (add_reg_sext32 ty x y) (alu_rr ty (aluop_add_sext32 ty) x y))
 
@@ -3915,6 +3921,11 @@
 (rule (add_logical_mem_zext32_with_flags_paired ty x y)
       (alu_rx_with_flags_paired ty (aluop_add_logical_zext32 ty) x y))
 
+;; Select the vector opcode that computes the carry-out of an add. Only
+;; `$I128` has a rule.
+(decl vecop_add_logical_cout (Type) VecBinaryOp)
+(rule (vecop_add_logical_cout $I128) (VecBinaryOp.Add128Cout))
+
+;; Emit the carry-out computation for `lhs + rhs` as a `vec_rrr` instruction.
+(decl vec_add_logical_cout (Type Reg Reg) Reg)
+(rule (vec_add_logical_cout ty lhs rhs)
+      (vec_rrr ty (vecop_add_logical_cout ty) lhs rhs))
 
 ;; Helpers for generating `sub` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -3935,6 +3946,10 @@
 (decl sub_reg (Type Reg Reg) Reg)
 (rule (sub_reg ty x y) (alu_rrr ty (aluop_sub ty) x y))
 
+;; Flag-producing counterpart of `sub_reg`: a three-register subtract built
+;; with `alu_rrr_with_flags_paired`, returned as a `ProducesFlags`.
+(decl sub_reg_with_flags_paired (Type Reg Reg) ProducesFlags)
+(rule (sub_reg_with_flags_paired ty lhs rhs)
+      (alu_rrr_with_flags_paired ty (aluop_sub ty) lhs rhs))
+
 (decl sub_reg_sext32 (Type Reg Reg) Reg)
 (rule (sub_reg_sext32 ty x y) (alu_rr ty (aluop_sub_sext32 ty) x y))
 
@@ -3970,6 +3985,10 @@
 (decl sub_logical_reg (Type Reg Reg) Reg)
 (rule (sub_logical_reg ty x y) (alu_rrr ty (aluop_sub_logical ty) x y))
 
+;; Flag-producing counterpart of `sub_logical_reg`: a three-register logical
+;; subtract built with `alu_rrr_with_flags_paired`, returned as a
+;; `ProducesFlags`.
+(decl sub_logical_reg_with_flags_paired (Type Reg Reg) ProducesFlags)
+(rule (sub_logical_reg_with_flags_paired ty lhs rhs)
+      (alu_rrr_with_flags_paired ty (aluop_sub_logical ty) lhs rhs))
+
 (decl sub_logical_reg_zext32 (Type Reg Reg) Reg)
 (rule (sub_logical_reg_zext32 ty x y) (alu_rr ty (aluop_sub_logical_zext32 ty) x y))
 
@@ -3982,6 +4001,11 @@
 (decl sub_logical_mem_zext32 (Type Reg MemArg) Reg)
 (rule (sub_logical_mem_zext32 ty x y) (alu_rx ty (aluop_sub_logical ty) x y))
 
+;; Select the vector opcode that computes the borrow indication of a
+;; subtract. Only `$I128` has a rule.
+(decl vecop_sub_logical_cout (Type) VecBinaryOp)
+(rule (vecop_sub_logical_cout $I128) (VecBinaryOp.Sub128Cout))
+
+;; Emit the borrow-indication computation for `lhs - rhs` as a `vec_rrr`
+;; instruction.
+(decl vec_sub_logical_cout (Type Reg Reg) Reg)
+(rule (vec_sub_logical_cout ty lhs rhs)
+      (vec_rrr ty (vecop_sub_logical_cout ty) lhs rhs))
 
 ;; Helpers for generating `mul` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -4002,6 +4026,10 @@
 (decl mul_reg (Type Reg Reg) Reg)
 (rule (mul_reg ty x y) (alu_rrr ty (aluop_mul ty) x y))
 
+;; Flag-producing counterpart of `mul_reg`: a three-register multiply built
+;; with `alu_rrr_with_flags_paired`, returned as a `ProducesFlags`.
+(decl mul_reg_with_flags_paired (Type Reg Reg) ProducesFlags)
+(rule (mul_reg_with_flags_paired ty lhs rhs)
+      (alu_rrr_with_flags_paired ty (aluop_mul ty) lhs rhs))
+
 (decl mul_reg_sext32 (Type Reg Reg) Reg)
 (rule (mul_reg_sext32 ty x y) (alu_rr ty (aluop_mul_sext32 ty) x y))
 

@@ -1515,8 +1515,8 @@ impl Inst {
                     ALUOp::Sub64 => (0xb9e9, true),        // SGRK
                     ALUOp::SubLogical32 => (0xb9fb, true), // SLRK
                     ALUOp::SubLogical64 => (0xb9eb, true), // SLGRK
-                    ALUOp::Mul32 => (0xb9fd, true),        // MSRKC
-                    ALUOp::Mul64 => (0xb9ed, true),        // MSGRKC
+                    ALUOp::Mul32 => (0xb9fd, false),       // MSRKC
+                    ALUOp::Mul64 => (0xb9ed, false),       // MSGRKC
                     ALUOp::And32 => (0xb9f4, true),        // NRK
                     ALUOp::And64 => (0xb9e4, true),        // NGRK
                     ALUOp::Orr32 => (0xb9f6, true),        // ORK
@@ -2719,11 +2719,13 @@ impl Inst {
                     VecBinaryOp::Add32x4 => (0xe7f3, 2),       // VAF
                     VecBinaryOp::Add64x2 => (0xe7f3, 3),       // VAG
                     VecBinaryOp::Add128 => (0xe7f3, 4),        // VAQ
+                    VecBinaryOp::Add128Cout => (0xe7f1, 4),    // VACCQ
                     VecBinaryOp::Sub8x16 => (0xe7f7, 0),       // VSB
                     VecBinaryOp::Sub16x8 => (0xe7f7, 1),       // VSH
                     VecBinaryOp::Sub32x4 => (0xe7f7, 2),       // VSF
                     VecBinaryOp::Sub64x2 => (0xe7f7, 3),       // VSG
                     VecBinaryOp::Sub128 => (0xe7f7, 4),        // VSQ
+                    VecBinaryOp::Sub128Cout => (0xe7f5, 4),    // VSCBIQ
                     VecBinaryOp::Mul8x16 => (0xe7a2, 0),       // VMLB
                     VecBinaryOp::Mul16x8 => (0xe7a2, 1),       // VMLHW
                     VecBinaryOp::Mul32x4 => (0xe7a2, 2),       // VMLF

@@ -1295,8 +1295,8 @@ impl Inst {
                     ALUOp::Sub64 => ("sgrk", true),
                     ALUOp::SubLogical32 => ("slrk", true),
                     ALUOp::SubLogical64 => ("slgrk", true),
-                    ALUOp::Mul32 => ("msrkc", true),
-                    ALUOp::Mul64 => ("msgrkc", true),
+                    ALUOp::Mul32 => ("msrkc", false),
+                    ALUOp::Mul64 => ("msgrkc", false),
                     ALUOp::And32 => ("nrk", true),
                     ALUOp::And64 => ("ngrk", true),
                     ALUOp::Orr32 => ("ork", true),
@@ -2511,11 +2511,13 @@ impl Inst {
                     VecBinaryOp::Add32x4 => "vaf",
                     VecBinaryOp::Add64x2 => "vag",
                     VecBinaryOp::Add128 => "vaq",
+                    VecBinaryOp::Add128Cout => "vaccq",
                     VecBinaryOp::Sub8x16 => "vsb",
                     VecBinaryOp::Sub16x8 => "vsh",
                     VecBinaryOp::Sub32x4 => "vsf",
                     VecBinaryOp::Sub64x2 => "vsg",
                     VecBinaryOp::Sub128 => "vsq",
+                    VecBinaryOp::Sub128Cout => "vscbiq",
                     VecBinaryOp::Mul8x16 => "vmlb",
                     VecBinaryOp::Mul16x8 => "vmlhw",
                     VecBinaryOp::Mul32x4 => "vmlf",

@@ -4311,12 +4311,143 @@
 
 ;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; For 8 & 16 bit widths, there is no instruction set support, so we have to do a
+;; wider add and split manually
+
+;; Left-shift amount that moves a narrow value into the most significant bits
+;; of a 32-bit register, i.e. 32 minus the type's width. Only `$I8` and `$I16`
+;; have rules.
+(decl type_shift_up (Type) u8)
+(rule (type_shift_up $I8) 24)
+(rule (type_shift_up $I16) 16)
+
+;; For integers that fit in 16 bits: shift both operands into the most
+;; significant bits of their 32-bit registers, apply `op`, read the overflow
+;; from the resulting condition code using `cond`, and shift the result back
+;; down to the least significant position.
+(decl overflow_and_result_from_shifted (Type ALUOp Reg Reg Cond) InstOutput)
+(rule (overflow_and_result_from_shifted (fits_in_16 ty) op x y cond)
+      (let ((lhs_up Reg (lshl_imm $I32 x (type_shift_up ty)))
+            (rhs_up Reg (lshl_imm $I32 y (type_shift_up ty)))
+            (flags ProducesFlags (alu_rrr_with_flags_paired ty op lhs_up rhs_up))
+            (of Reg (lower_bool $I8 (bool (produces_flags_ignore flags) cond)))
+            (res Reg (lshr_imm $I32 (produces_flags_get_reg flags) (type_shift_up ty))))
+        (output_pair res of)))
+
+;; 8- and 16-bit `uadd_overflow`: 32-bit logical add on the up-shifted
+;; operands, with `(mask_as_cond 3)` as the overflow condition.
+(rule 1 (lower (has_type (fits_in_16 ty) (uadd_overflow x y)))
+    (overflow_and_result_from_shifted ty (aluop_add_logical $I32) x y (mask_as_cond 3)))
+
+;; Build the (result, overflow) output pair from a flag-producing
+;; instruction: the result is the producer's register, and the overflow byte
+;; is `cond` evaluated on the flags it sets.
+(decl overflow_and_result_from_producer (ProducesFlags Cond) InstOutput)
+(rule (overflow_and_result_from_producer flags cond)
+      (let ((res Reg (produces_flags_get_reg flags))
+            (of Reg (lower_bool $I8 (bool (produces_flags_ignore flags) cond))))
+        (output_pair res of)))
+
+;; For 32 & 64 bit widths, we can convert condition codes to the overflow out byte
 (rule 0 (lower (has_type (ty_32_or_64 ty) (uadd_overflow x y)))
-      (let ((sum Reg (add_reg ty x y))
-            (overflow Reg
-              (lower_bool $I8
-                          (bool (icmpu_reg ty sum x) (intcc_as_cond (IntCC.UnsignedLessThan))))))
-        (output_pair sum overflow)))
+      (overflow_and_result_from_producer (add_logical_reg_with_flags_paired ty x y) (mask_as_cond 3)))
+
+;; 128-bit `uadd_overflow`: the sum comes from `vec_add`; the overflow value
+;; is lane 1 of the `$I64X2` view of the separately computed carry-out vector.
+(rule 2 (lower (has_type $I128 (uadd_overflow x y)))
+      (let ((sum Reg (vec_add $I128 x y))
+            (carry Reg (vec_extract_lane $I64X2 (vec_add_logical_cout $I128 x y) 1 (zero_reg))))
+        (output_pair sum carry)))
+
+;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Note: s390x stores and computes the borrow bit as 0 when an overflow is present,
+;; so all of the conditions and computed borrows are inverted.
+
+;; 8- and 16-bit `usub_overflow`: 32-bit logical subtract on the up-shifted
+;; operands; the overflow condition is the inverse of `(mask_as_cond 3)`.
+(rule 1 (lower (has_type (fits_in_16 ty) (usub_overflow x y)))
+      (overflow_and_result_from_shifted ty (aluop_sub_logical $I32) x y
+        (invert_cond (mask_as_cond 3))))
+
+;; 32- and 64-bit `usub_overflow`: logical subtract that sets flags; the
+;; overflow condition is the inverse of `(mask_as_cond 3)`.
+(rule 0 (lower (has_type (ty_32_or_64 ty) (usub_overflow x y)))
+      (overflow_and_result_from_producer
+        (sub_logical_reg_with_flags_paired ty x y)
+        (invert_cond (mask_as_cond 3))))
+
+;; 128-bit `usub_overflow`: the difference comes from `vec_sub`; the overflow
+;; value is lane 1 of the `$I64X2` view of the borrow-indication vector,
+;; XORed with 1 to invert it.
+(rule 2 (lower (has_type $I128 (usub_overflow x y)))
+      (let ((diff Reg (vec_sub $I128 x y))
+            (no_borrow Reg (vec_extract_lane $I64X2 (vec_sub_logical_cout $I128 x y) 1 (zero_reg)))
+            (borrow Reg (xor_uimm32shifted $I8 no_borrow (uimm32shifted 1 0))))
+        (output_pair diff borrow)))
+
+;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; 8- and 16-bit `sadd_overflow`: 32-bit add on the up-shifted operands, with
+;; `(mask_as_cond 1)` as the overflow condition.
+(rule 1 (lower (has_type (fits_in_16 ty) (sadd_overflow x y)))
+      (overflow_and_result_from_shifted ty (aluop_add $I32) x y (mask_as_cond 1)))
+
+;; 32- and 64-bit `sadd_overflow`: add that sets flags, with
+;; `(mask_as_cond 1)` as the overflow condition.
+(rule 0 (lower (has_type (ty_32_or_64 ty) (sadd_overflow x y)))
+      (overflow_and_result_from_producer (add_reg_with_flags_paired ty x y) (mask_as_cond 1)))
+
+;; 128-bit `sadd_overflow`: compute the sum with `vec_add`, then derive the
+;; overflow from lane 0 of each value (the `_hi` halves below). Bit 63 of
+;; `(x_hi ^ sum_hi) & (y_hi ^ sum_hi)` is set exactly when the sum's sign
+;; differs from the sign of both operands; it is shifted down to bit 0.
+(rule 2 (lower (has_type $I128 (sadd_overflow x y)))
+      (let ((sum Reg (vec_add $I128 x y))
+            (sum_hi Reg (vec_extract_lane $I64X2 sum 0 (zero_reg)))
+            (lhs_hi Reg (vec_extract_lane $I64X2 x 0 (zero_reg)))
+            (rhs_hi Reg (vec_extract_lane $I64X2 y 0 (zero_reg)))
+            (sign_flip Reg
+               (and_reg $I64
+                 (xor_reg $I64 lhs_hi sum_hi)
+                 (xor_reg $I64 rhs_hi sum_hi))))
+        (output_pair sum (lshr_imm $I64 sign_flip 63))))
+
+;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; 8- and 16-bit `ssub_overflow`: 32-bit subtract on the up-shifted operands,
+;; with `(mask_as_cond 1)` as the overflow condition.
+(rule 1 (lower (has_type (fits_in_16 ty) (ssub_overflow x y)))
+      (overflow_and_result_from_shifted ty (aluop_sub $I32) x y (mask_as_cond 1)))
+
+;; Use flags generated by the subtract instruction to handle overflow
+(rule 0 (lower (has_type (ty_32_or_64 ty) (ssub_overflow x y)))
+      (overflow_and_result_from_producer (sub_reg_with_flags_paired ty x y) (mask_as_cond 1)))
+
+;; 128-bit `ssub_overflow`: compute the difference with `vec_sub`, then derive
+;; the overflow from lane 0 of each value (the `_hi` halves below). Bit 63 of
+;; `(x_hi ^ diff_hi) & (x_hi ^ y_hi)` is set exactly when the operands have
+;; different signs and the difference's sign differs from `x`'s; it is
+;; shifted down to bit 0.
+(rule 2 (lower (has_type $I128 (ssub_overflow x y)))
+      (let ((diff Reg (vec_sub $I128 x y))
+            (diff_hi Reg (vec_extract_lane $I64X2 diff 0 (zero_reg)))
+            (lhs_hi Reg (vec_extract_lane $I64X2 x 0 (zero_reg)))
+            (rhs_hi Reg (vec_extract_lane $I64X2 y 0 (zero_reg)))
+            (sign_flip Reg
+               (and_reg $I64
+                 (xor_reg $I64 lhs_hi diff_hi)
+                 (xor_reg $I64 lhs_hi rhs_hi))))
+        (output_pair diff (lshr_imm $I64 sign_flip 63))))
+
+;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; `umul_overflow` for types of at most 32 bits: zero-extend both operands to
+;; 64 bits and multiply there. Overflow is reported when the product shifted
+;; right by the type's width is not zero.
+(rule 1 (lower (has_type (fits_in_32 ty) (umul_overflow x y)))
+      (let ((product Reg (mul_reg $I64 (zext64_reg ty y) (zext64_reg ty x)))
+            (upper Reg (lshr_imm $I64 product (ty_bits ty)))
+            (overflow Reg (lower_bool $I8 (invert_bool
+              (bool (icmps_simm16 $I32 upper 0) (intcc_as_cond (IntCC.Equal)))))))
+        (output_pair product overflow)))
+
+;; 64-bit `umul_overflow`: perform a widening multiply into a register pair.
+;; The low half is the result; overflow is reported when the high half is not
+;; zero.
+(rule 0 (lower (has_type ty @ $I64 (umul_overflow x y)))
+      (let ((wide RegPair (umul_wide y x))
+            (low Reg (regpair_lo wide))
+            (high Reg (regpair_hi wide))
+            (overflow Reg (lower_bool $I8 (invert_bool
+              (bool (icmps_simm16 ty high 0) (intcc_as_cond (IntCC.Equal)))))))
+        (output_pair low overflow)))
+
+;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; For integers that fit in 16 bits, shift only the first operand (`x`) into
+;; the most significant bits of its 32-bit register and sign-extend the
+;; second (`y`) to 32 bits. Multiply, use the condition codes for the
+;; overflow, and shift the product back down to the expected
+;; least-significant position to generate the result.
+(rule 1 (lower (has_type (fits_in_16 ty) (smul_overflow x y)))
+      (let ((rhs_ext Reg (sext32_reg ty y))
+            (lhs_up Reg (lshl_imm $I32 x (type_shift_up ty)))
+            (flags ProducesFlags (mul_reg_with_flags_paired $I32 lhs_up rhs_ext))
+            (of Reg (lower_bool $I8 (bool (produces_flags_ignore flags) (mask_as_cond 1))))
+            (res Reg (lshr_imm $I32 (produces_flags_get_reg flags) (type_shift_up ty))))
+        (output_pair res of)))
+
+;; Use flags generated by the multiply instruction to handle overflow
+(rule 0 (lower (has_type (ty_32_or_64 ty) (smul_overflow x y)))
+      (overflow_and_result_from_producer (mul_reg_with_flags_paired ty x y) (mask_as_cond 1)))
 
 ;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 

@@ -956,8 +956,8 @@ block0(v0: i128, v1: i128):
 ;   vlgvg %r12, %v3, 1
 ;   lgr %r3, %r5
 ;   mlgr %r2, %r12
-;   msgr %r5, %r10
-;   msgr %r4, %r12
+;   msgrkc %r5, %r5, %r10
+;   msgrkc %r4, %r4, %r12
 ;   agrk %r2, %r5, %r2
 ;   agrk %r2, %r4, %r2
 ;   vlvgp %v25, %r2, %r3
@@ -979,8 +979,8 @@ block0(v0: i128, v1: i128):
 ;   vlgvg %r12, %v3, 1
 ;   lgr %r3, %r5
 ;   mlgr %r2, %r12
-;   msgr %r5, %r10
-;   msgr %r4, %r12
+;   msgrkc %r5, %r5, %r10
+;   msgrkc %r4, %r4, %r12
 ;   agrk %r2, %r5, %r2
 ;   agrk %r2, %r4, %r2
 ;   vlvgp %v25, %r2, %r3
@@ -997,12 +997,12 @@ block0(v0: i64, v1: i64):
 
 ; VCode:
 ; block0:
-;   msgr %r2, %r3
+;   msgrkc %r2, %r2, %r3
 ;   br %r14
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   msgr %r2, %r3
+;   msgrkc %r2, %r2, %r3
 ;   br %r14
 
 function %imul_i64_imm16(i64) -> i64 {
@@ -1098,12 +1098,12 @@ block0(v0: i32, v1: i32):
 
 ; VCode:
 ; block0:
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 
 function %imul_i32_imm16(i32) -> i32 {
@@ -1216,12 +1216,12 @@ block0(v0: i16, v1: i16):
 
 ; VCode:
 ; block0:
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 
 function %imul_i16_imm(i16) -> i16 {
@@ -1266,12 +1266,12 @@ block0(v0: i8, v1: i8):
 
 ; VCode:
 ; block0:
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
-;   msr %r2, %r3
+;   msrkc %r2, %r2, %r3
 ;   br %r14
 
 function %imul_i8_imm(i8) -> i8 {
@@ -1301,13 +1301,13 @@ block0(v0: i8, v1: i64):
 ; VCode:
 ; block0:
 ;   llc %r4, 0(%r3)
-;   msr %r2, %r4
+;   msrkc %r2, %r2, %r4
 ;   br %r14
 ;
 ; Disassembled:
 ; block0: ; offset 0x0
 ;   llc %r4, 0(%r3) ; trap: heap_oob
-;   msr %r2, %r4
+;   msrkc %r2, %r2, %r4
 ;   br %r14
 
 function %umulhi_i64(i64, i64) -> i64 {