fix: dynamic apply for x86_64 on macOS/linux

bitwalker · bitwalker · commit 7a7b02a949ce · 2023-02-27T04:38:51.000-05:00
diff --git a/library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s b/library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s
@@ -10,41 +10,42 @@ __firefly_dynamic_apply:
     # At this point, the following registers are bound:
     #
     #   rdi <- callee
-    #   rsi <- argv
-    #   rdx <- argc
+    #   rsi <- process
+    #   rdx <- argv
+    #   rcx <- argc
     #
     # Save the parent base pointer for when control returns to this call frame.
-    # CFA directives will inform the unwinder to expect %rbp at the bottom of the
+    # CFA directives will inform the unwinder to expect rbp at the bottom of the
     # stack for this frame, so this should be the last value on the stack in the caller
     push rbp
     .cfi_def_cfa_offset 16
     .cfi_offset rbp, -16
     mov  rbp, rsp
     .cfi_def_cfa_register rbp
 
-    # Save our callee and argv pointers, and argc
-    mov    r10, rdi
-    mov    r11, rsi
-    mov    rax, rdx
+    # Pin callee pointer to r10
+    mov  r10, rdi
+    # Pin the argv pointer to r11
+    mov  r11, rdx
+    # The process pointer needs to be in rdi
+    mov  rdi, rsi
 
     # Determine if spills are needed
     # In the common case in which they are not, we perform a tail call
-    cmp  rdx, 7
+    cmp  rcx, 6  # NOTE(review): jump table only has entries 0-5 (rdi is taken by process); bound should likely be 5 - confirm
     ja .L_dyn_call_spill
     
 .L_dyn_call_no_spill:
     # We only reach this block if we had no arguments to spill, so
     # we are not certain about which registers we need to assign. We
     # simply check for each register whether this a corresponding argument,
     # and if so, we assign it.
-    #
-    # Sure would be nice if we had the equivalent of LDM from ARM
 
     # Calculate offset in jump table to block which handles the specific
     # number of registers we have arguments for, then jump to that block
-    lea    rcx, [rip + .L_dyn_call_jt]
-    mov    rcx, [rcx + rdx * 8]
-    jmp    rcx
+    lea  rax, [rip + .L_dyn_call_jt]  # rax = table base; rcx is left intact because it still holds argc
+    mov  rax, [rax + 8*rcx]           # rax = absolute handler address stored in table slot argc
+    jmp  rax
 
     # All of these basic blocks perform a tail call. As such,
     # the unwinder will skip over this frame should the callee
@@ -54,58 +55,58 @@ __firefly_dynamic_apply:
     jmp r10
 
 .L_dyn_call_regs1:
-    mov rdi, [r11]
+    mov rsi, [r11]
     pop rbp
     jmp r10
 
 .L_dyn_call_regs2:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    pop rbp
-    jmp r10
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs3:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    pop rbp
-    jmp r10
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs4:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    mov rcx, [r11 + 24]
-    pop rbp
-    jmp r10
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8, [r11 + 24]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_regs5:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    mov rcx, [r11 + 24]
-    mov r8,  [r11 + 32]
-    pop rbp
-    jmp r10
-
-.L_dyn_call_regs6:
-    mov rdi, [r11]
-    mov rsi, [r11 + 8]
-    mov rdx, [r11 + 16]
-    mov rcx, [r11 + 24]
-    mov r8, [r11 + 32]
-    mov r9, [r11 + 40]
-    pop rbp
-    jmp r10
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8, [r11 + 24]
+    mov  r9, [r11 + 32]
+    pop  rbp
+    jmp  r10
 
 .L_dyn_call_spill:
     # If we hit this block, we have identified that there are
     # arguments to spill. We perform some setup for the actual
     # spilling, which is a loop built on `rep movsq`
+    #
+    # At this point, the following registers are occupied/hold these values:
+    #
+    #  r10 <- callee
+    #  rdi <- process
+    #  r11 <- argv
+    #  rcx <- argc
+
+    # `rep movsq` uses rcx, rdi, and rsi: stash argc (rcx) in r8 and process (rdi) in r9; rsi is not saved
+    mov  r8, rcx
+    mov  r9, rdi
 
-    # Calculate spill count for later (rep uses rcx for the iteration count,
+    # Calculate spill count for later (rep uses rcx for the iteration count `i`,
     # which in this case is the number of quadwords to copy)
-    mov  rcx, rdx
     sub  rcx, 6
 
     # Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -116,21 +117,22 @@ __firefly_dynamic_apply:
     sub rsp, rax
 
     # load source pointer (last item of argv)
-    lea rsi, [r11 + rdx * 8 + -8]
+    lea rsi, [r11 + r8 * 8 - 8]
     # load destination pointer (top of spill region)
-    lea rdi, [rsp + rcx * 8 + -8]
-    # copy rcx quadwords from rsi to rdi, in reverse
+    lea rdi, [rsp + rcx * 8 - 8]
+    # copy `i` quadwords from source to destination, in reverse
     std
     rep movsq
     cld
 
-    # We've spilled arguments, so we have at least 6 args
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8,  [r11 + 32]
-    mov  r9,  [r11 + 40]
+    # We've spilled arguments, so we have at least 6 args, move them into their
+    # final destination registers in preparation for the call
+    mov  rdi, r9
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8,  [r11 + 24]
+    mov  r9,  [r11 + 32]
 
 .L_dyn_call_exec:
     # If we spill arguments to the stack, we can't perform
@@ -167,7 +169,6 @@ __firefly_dynamic_apply:
     .quad .L_dyn_call_regs3
     .quad .L_dyn_call_regs4
     .quad .L_dyn_call_regs5
-    .quad .L_dyn_call_regs6
     .size .L_dyn_call_jt, 48
 
     # The following is the LSDA metadata for exception handling
diff --git a/library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s b/library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s
@@ -8,8 +8,9 @@ L_dyn_call_begin:
     # At this point, the following registers are bound:
     #
     #   rdi <- callee
-    #   rsi <- argv
-    #   rdx <- argc
+    #   rsi <- process
+    #   rdx <- argv
+    #   rcx <- argc
     #
     # Save the parent base pointer for when control returns to this call frame.
     # CFA directives will inform the unwinder to expect rbp at the bottom of the
@@ -20,14 +21,16 @@ L_dyn_call_begin:
     mov  rbp, rsp
     .cfi_def_cfa_register rbp
 
-    # Save our callee and argv pointers, and argc
+    # Pin callee pointer to r10
     mov  r10, rdi
-    mov  r11, rsi
-    mov  rax, rdx
+    # Pin the argv pointer to r11
+    mov  r11, rdx
+    # The process pointer needs to be in rdi
+    mov  rdi, rsi
 
-    # Determine if spills are needed
+    # Determine if spills are needed (no spill when argc <= 6; NOTE(review): the jump table only has entries 0-5 - confirm this bound)
     # In the common case in which they are not, we perform a tail call
-    cmp  rdx, 7
+    cmp  rcx, 6
     ja L_dyn_call_spill
 
 L_dyn_call_no_spill:
@@ -39,70 +42,70 @@ L_dyn_call_no_spill:
     # Calculate offset in jump table to block which handles the specific
     # number of registers we have arguments for, then jump to that block
-    lea  rcx, [rip + L_dyn_call_jt]
-    mov  rax, [rcx + rax * 4]
+    lea  rax, [rip + L_dyn_call_jt]       # rax = table base; argc is still live in rcx
+    movsxd  rcx, dword ptr [rax + 4*rcx]  # rcx = signed 32-bit offset of the handler, relative to the table base
     add  rax, rcx
-    jmp  [rax]
+    jmp  rax
 
     # All of these basic blocks perform a tail call. As such,
     # the unwinder will skip over this frame should the callee
     # throw an exception
 L_dyn_call_regs0:
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs1:
-    mov  rdi, [r11]
+    mov  rsi, [r11]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs2:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs3:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs4:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8, [r11 + 24]
     pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_regs5:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8, [r11 + 24]
+    mov  r9, [r11 + 32]
     pop  rbp
-    jmp  [r10]
-
-L_dyn_call_regs6:
-    mov  rdi, [r11]
-    mov  rsi, [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
-    mov  r9, [r11 + 40]
-    pop  rbp
-    jmp  [r10]
+    jmp  r10
 
 L_dyn_call_spill:
     # If we hit this block, we have identified that there are
     # arguments to spill. We perform some setup for the actual
     # spilling, which is a loop built on `rep movsq`
+    #
+    # At this point, the following registers are occupied/hold these values:
+    #
+    #  r10 <- callee
+    #  rdi <- process
+    #  r11 <- argv
+    #  rcx <- argc
+
+    # `rep movsq` uses rcx, rdi, and rsi: stash argc (rcx) in r8 and process (rdi) in r9; rsi is not saved
+    mov  r8, rcx
+    mov  r9, rdi
 
-    # Calculate spill count for later (rep uses rcx for the iteration count,
+    # Calculate spill count for later (rep uses rcx for the iteration count `i`,
     # which in this case is the number of quadwords to copy)
-    mov  rcx, rdx
     sub  rcx, 6
 
     # Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -113,21 +116,22 @@ L_dyn_call_spill:
     sub  rsp, rax
 
     # load source pointer (last item of argv)
-    lea  rsi, [r11 + rdx * 8 - 8]
+    lea  rsi, [r11 + r8 * 8 - 8]
     # load destination pointer (top of spill region)
-    lea rdi, [rsp + rcx * 8 - 8]
-    # copy rcx quadwords from rsi to rdi, in reverse
+    lea  rdi, [rsp + rcx * 8 - 8]
+    # copy `i` quadwords from source to destination, in reverse
     std
     rep  movsq
     cld
 
-    # We've spilled arguments, so we have at least 6 args
-    mov  rdi,   [r11]
-    mov  rsi,  [r11 + 8]
-    mov  rdx, [r11 + 16]
-    mov  rcx, [r11 + 24]
-    mov  r8, [r11 + 32]
-    mov  r9, [r11 + 40]
+    # We've spilled arguments, so we have at least 6 args, move them into their
+    # final destination registers in preparation for the call
+    mov  rdi, r9
+    mov  rsi, [r11]
+    mov  rdx, [r11 + 8]
+    mov  rcx, [r11 + 16]
+    mov  r8,  [r11 + 24]
+    mov  r9,  [r11 + 32]
 
 L_dyn_call_exec:
     # If we spill arguments to the stack, we can't perform
@@ -141,7 +145,7 @@ L_dyn_call_exec:
     # This instruction will push the return address and jump,
     # and we can expect rbp to be the same as we left it upon
     # return.
-    call  [r10]
+    call  r10
 
 L_dyn_call_ret:
     # Non-tail call completed successfully
@@ -156,21 +160,19 @@ L_dyn_call_end:
     # a variable number of register-based arguments
     .p2align 2
     .data_region jt32
-    .set L_dyn_call_jt_entry0, L_dyn_call_exec-L_dyn_call_jt
+    .set L_dyn_call_jt_entry0, L_dyn_call_regs0-L_dyn_call_jt
     .set L_dyn_call_jt_entry1, L_dyn_call_regs1-L_dyn_call_jt
     .set L_dyn_call_jt_entry2, L_dyn_call_regs2-L_dyn_call_jt
     .set L_dyn_call_jt_entry3, L_dyn_call_regs3-L_dyn_call_jt
     .set L_dyn_call_jt_entry4, L_dyn_call_regs4-L_dyn_call_jt
     .set L_dyn_call_jt_entry5, L_dyn_call_regs5-L_dyn_call_jt
-    .set L_dyn_call_jt_entry6, L_dyn_call_regs6-L_dyn_call_jt
 L_dyn_call_jt:
     .long L_dyn_call_jt_entry0
     .long L_dyn_call_jt_entry1
     .long L_dyn_call_jt_entry2
     .long L_dyn_call_jt_entry3
     .long L_dyn_call_jt_entry4
     .long L_dyn_call_jt_entry5
-    .long L_dyn_call_jt_entry6
     .end_data_region
 
     # The following is the LSDA metadata for exception handling