Skip to content
This repository was archived by the owner on Jun 10, 2024. It is now read-only.

Commit 7a7b02a

Browse files
committed
fix: dynamic apply for x86_64 on macOS/linux
1 parent 53bda3c commit 7a7b02a

File tree

2 files changed

+118
-115
lines changed

2 files changed

+118
-115
lines changed

library/rt/src/function/apply/dynamic/asm/dynamic_apply_linux.s

Lines changed: 60 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -10,41 +10,42 @@ __firefly_dynamic_apply:
1010
# At this point, the following registers are bound:
1111
#
1212
# rdi <- callee
13-
# rsi <- argv
14-
# rdx <- argc
13+
# rsi <- process
14+
# rdx <- argv
15+
# rcx <- argc
1516
#
1617
# Save the parent base pointer for when control returns to this call frame.
17-
# CFA directives will inform the unwinder to expect %rbp at the bottom of the
18+
# CFA directives will inform the unwinder to expect rbp at the bottom of the
1819
# stack for this frame, so this should be the last value on the stack in the caller
1920
push rbp
2021
.cfi_def_cfa_offset 16
2122
.cfi_offset rbp, -16
2223
mov rbp, rsp
2324
.cfi_def_cfa_register rbp
2425

25-
# Save our callee and argv pointers, and argc
26-
mov r10, rdi
27-
mov r11, rsi
28-
mov rax, rdx
26+
# Pin callee pointer to r10
27+
mov r10, rdi
28+
# Pin the argv pointer to r11
29+
mov r11, rdx
30+
# The process pointer needs to be in rdi
31+
mov rdi, rsi
2932

3033
# Determine if spills are needed
3134
# In the common case in which they are not, we perform a tail call
32-
cmp rdx, 7
35+
cmp rcx, 6
3336
ja .L_dyn_call_spill
3437
3538
.L_dyn_call_no_spill:
3639
# We only reach this block if we had no arguments to spill, so
3740
# we are not certain about which registers we need to assign. We
3841
# simply check for each register whether there is a corresponding argument,
3942
# and if so, we assign it.
40-
#
41-
# Sure would be nice if we had the equivalent of LDM from ARM
4243

4344
# Calculate offset in jump table to block which handles the specific
4445
# number of registers we have arguments for, then jump to that block
45-
lea rcx, [rip + .L_dyn_call_jt]
46-
mov rcx, [rcx + rdx * 8]
47-
jmp rcx
46+
lea rcx, [rip + .L_dyn_call_jt]
47+
mov rax, [rcx + 8*rax]
48+
jmp rax
4849

4950
# All of these basic blocks perform a tail call. As such,
5051
# the unwinder will skip over this frame should the callee
@@ -54,58 +55,58 @@ __firefly_dynamic_apply:
5455
jmp r10
5556

5657
.L_dyn_call_regs1:
57-
mov rdi, [r11]
58+
mov rsi, [r11]
5859
pop rbp
5960
jmp r10
6061

6162
.L_dyn_call_regs2:
62-
mov rdi, [r11]
63-
mov rsi, [r11 + 8]
64-
pop rbp
65-
jmp r10
63+
mov rsi, [r11]
64+
mov rdx, [r11 + 8]
65+
pop rbp
66+
jmp r10
6667

6768
.L_dyn_call_regs3:
68-
mov rdi, [r11]
69-
mov rsi, [r11 + 8]
70-
mov rdx, [r11 + 16]
71-
pop rbp
72-
jmp r10
69+
mov rsi, [r11]
70+
mov rdx, [r11 + 8]
71+
mov rcx, [r11 + 16]
72+
pop rbp
73+
jmp r10
7374

7475
.L_dyn_call_regs4:
75-
mov rdi, [r11]
76-
mov rsi, [r11 + 8]
77-
mov rdx, [r11 + 16]
78-
mov rcx, [r11 + 24]
79-
pop rbp
80-
jmp r10
76+
mov rsi, [r11]
77+
mov rdx, [r11 + 8]
78+
mov rcx, [r11 + 16]
79+
mov r8, [r11 + 24]
80+
pop rbp
81+
jmp r10
8182

8283
.L_dyn_call_regs5:
83-
mov rdi, [r11]
84-
mov rsi, [r11 + 8]
85-
mov rdx, [r11 + 16]
86-
mov rcx, [r11 + 24]
87-
mov r8, [r11 + 32]
88-
pop rbp
89-
jmp r10
90-
91-
.L_dyn_call_regs6:
92-
mov rdi, [r11]
93-
mov rsi, [r11 + 8]
94-
mov rdx, [r11 + 16]
95-
mov rcx, [r11 + 24]
96-
mov r8, [r11 + 32]
97-
mov r9, [r11 + 40]
98-
pop rbp
99-
jmp r10
84+
mov rsi, [r11]
85+
mov rdx, [r11 + 8]
86+
mov rcx, [r11 + 16]
87+
mov r8, [r11 + 24]
88+
mov r9, [r11 + 32]
89+
pop rbp
90+
jmp r10
10091

10192
.L_dyn_call_spill:
10293
# If we hit this block, we have identified that there are
10394
# arguments to spill. We perform some setup for the actual
10495
# spilling, which is a loop built on `rep movsq`
96+
#
97+
# At this point, the following registers are occupied/hold these values:
98+
#
99+
# r10 <- callee
100+
# rdi <- process
101+
# r11 <- argv
102+
# rcx <- argc
103+
104+
# `rep movsq` uses rcx, rdi, and rsi, so stash argc (rcx) in r8 and the process pointer (rdi) in r9 temporarily
105+
mov r8, rcx
106+
mov r9, rdi
105107

106-
# Calculate spill count for later (rep uses rcx for the iteration count,
108+
# Calculate spill count for later (rep uses rcx for the iteration count `i`,
107109
# which in this case is the number of quadwords to copy)
108-
mov rcx, rdx
109110
sub rcx, 6
110111

111112
# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -116,21 +117,22 @@ __firefly_dynamic_apply:
116117
sub rsp, rax
117118

118119
# load source pointer (last item of argv)
119-
lea rsi, [r11 + rdx * 8 + -8]
120+
lea rsi, [r11 + r8 * 8 - 8]
120121
# load destination pointer (top of spill region)
121-
lea rdi, [rsp + rcx * 8 + -8]
122-
# copy rcx quadwords from rsi to rdi, in reverse
122+
lea rdi, [rsp + rcx * 8 - 8]
123+
# copy `i` quadwords from source to destination, in reverse
123124
std
124125
rep movsq
125126
cld
126127

127-
# We've spilled arguments, so we have at least 6 args
128-
mov rdi, [r11]
129-
mov rsi, [r11 + 8]
130-
mov rdx, [r11 + 16]
131-
mov rcx, [r11 + 24]
132-
mov r8, [r11 + 32]
133-
mov r9, [r11 + 40]
128+
# We've spilled arguments, so every argument register is in use; move the process pointer and the first five argv entries into their
129+
# final destination registers in preparation for the call
130+
mov rdi, r9
131+
mov rsi, [r11]
132+
mov rdx, [r11 + 8]
133+
mov rcx, [r11 + 16]
134+
mov r8, [r11 + 24]
135+
mov r9, [r11 + 32]
134136

135137
.L_dyn_call_exec:
136138
# If we spill arguments to the stack, we can't perform
@@ -167,7 +169,6 @@ __firefly_dynamic_apply:
167169
.quad .L_dyn_call_regs3
168170
.quad .L_dyn_call_regs4
169171
.quad .L_dyn_call_regs5
170-
.quad .L_dyn_call_regs6
171172
.size .L_dyn_call_jt, 48
172173

173174
# The following is the LSDA metadata for exception handling

library/rt/src/function/apply/dynamic/asm/dynamic_apply_macos.s

Lines changed: 58 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ L_dyn_call_begin:
88
# At this point, the following registers are bound:
99
#
1010
# rdi <- callee
11-
# rsi <- argv
12-
# rdx <- argc
11+
# rsi <- process
12+
# rdx <- argv
13+
# rcx <- argc
1314
#
1415
# Save the parent base pointer for when control returns to this call frame.
1516
# CFA directives will inform the unwinder to expect rbp at the bottom of the
@@ -20,14 +21,16 @@ L_dyn_call_begin:
2021
mov rbp, rsp
2122
.cfi_def_cfa_register rbp
2223

23-
# Save our callee and argv pointers, and argc
24+
# Pin callee pointer to r10
2425
mov r10, rdi
25-
mov r11, rsi
26-
mov rax, rdx
26+
# Pin the argv pointer to r11
27+
mov r11, rdx
28+
# The process pointer needs to be in rdi
29+
mov rdi, rsi
2730

28-
# Determine if spills are needed
31+
# Determine if spills are needed (the process pointer plus argc arguments must fit in the six integer argument registers rdi, rsi, rdx, rcx, r8, r9)
2932
# In the common case in which they are not, we perform a tail call
30-
cmp rdx, 7
33+
cmp rcx, 6
3134
ja L_dyn_call_spill
3235

3336
L_dyn_call_no_spill:
@@ -39,70 +42,70 @@ L_dyn_call_no_spill:
3942
# Calculate offset in jump table to block which handles the specific
4043
# number of registers we have arguments for, then jump to that block
4144
lea rcx, [rip + L_dyn_call_jt]
42-
mov rax, [rcx + rax * 4]
45+
movsxd rax, dword ptr [rcx + 4*rax]
4346
add rax, rcx
44-
jmp [rax]
47+
jmp rax
4548

4649
# All of these basic blocks perform a tail call. As such,
4750
# the unwinder will skip over this frame should the callee
4851
# throw an exception
4952
L_dyn_call_regs0:
5053
pop rbp
51-
jmp [r10]
54+
jmp r10
5255

5356
L_dyn_call_regs1:
54-
mov rdi, [r11]
57+
mov rsi, [r11]
5558
pop rbp
56-
jmp [r10]
59+
jmp r10
5760

5861
L_dyn_call_regs2:
59-
mov rdi, [r11]
60-
mov rsi, [r11 + 8]
62+
mov rsi, [r11]
63+
mov rdx, [r11 + 8]
6164
pop rbp
62-
jmp [r10]
65+
jmp r10
6366

6467
L_dyn_call_regs3:
65-
mov rdi, [r11]
66-
mov rsi, [r11 + 8]
67-
mov rdx, [r11 + 16]
68+
mov rsi, [r11]
69+
mov rdx, [r11 + 8]
70+
mov rcx, [r11 + 16]
6871
pop rbp
69-
jmp [r10]
72+
jmp r10
7073

7174
L_dyn_call_regs4:
72-
mov rdi, [r11]
73-
mov rsi, [r11 + 8]
74-
mov rdx, [r11 + 16]
75-
mov rcx, [r11 + 24]
75+
mov rsi, [r11]
76+
mov rdx, [r11 + 8]
77+
mov rcx, [r11 + 16]
78+
mov r8, [r11 + 24]
7679
pop rbp
77-
jmp [r10]
80+
jmp r10
7881

7982
L_dyn_call_regs5:
80-
mov rdi, [r11]
81-
mov rsi, [r11 + 8]
82-
mov rdx, [r11 + 16]
83-
mov rcx, [r11 + 24]
84-
mov r8, [r11 + 32]
83+
mov rsi, [r11]
84+
mov rdx, [r11 + 8]
85+
mov rcx, [r11 + 16]
86+
mov r8, [r11 + 24]
87+
mov r9, [r11 + 32]
8588
pop rbp
86-
jmp [r10]
87-
88-
L_dyn_call_regs6:
89-
mov rdi, [r11]
90-
mov rsi, [r11 + 8]
91-
mov rdx, [r11 + 16]
92-
mov rcx, [r11 + 24]
93-
mov r8, [r11 + 32]
94-
mov r9, [r11 + 40]
95-
pop rbp
96-
jmp [r10]
89+
jmp r10
9790

9891
L_dyn_call_spill:
9992
# If we hit this block, we have identified that there are
10093
# arguments to spill. We perform some setup for the actual
10194
# spilling, which is a loop built on `rep movsq`
95+
#
96+
# At this point, the following registers are occupied/hold these values:
97+
#
98+
# r10 <- callee
99+
# rdi <- process
100+
# r11 <- argv
101+
# rcx <- argc
102+
103+
# `rep movsq` uses rcx, rdi, and rsi, so stash argc (rcx) in r8 and the process pointer (rdi) in r9 temporarily
104+
mov r8, rcx
105+
mov r9, rdi
102106

103-
# Calculate spill count for later (rep uses rcx for the iteration count,
107+
# Calculate spill count for later (rep uses rcx for the iteration count `i`,
104108
# which in this case is the number of quadwords to copy)
105-
mov rcx, rdx
106109
sub rcx, 6
107110

108111
# Calculate spill space, and ensure it is rounded up to the nearest 16 bytes.
@@ -113,21 +116,22 @@ L_dyn_call_spill:
113116
sub rsp, rax
114117

115118
# load source pointer (last item of argv)
116-
lea rsi, [r11 + rdx * 8 - 8]
119+
lea rsi, [r11 + r8 * 8 - 8]
117120
# load destination pointer (top of spill region)
118-
lea rdi, [rsp + rcx * 8 - 8]
119-
# copy rcx quadwords from rsi to rdi, in reverse
121+
lea rdi, [rsp + rcx * 8 - 8]
122+
# copy `i` quadwords from source to destination, in reverse
120123
std
121124
rep movsq
122125
cld
123126

124-
# We've spilled arguments, so we have at least 6 args
125-
mov rdi, [r11]
126-
mov rsi, [r11 + 8]
127-
mov rdx, [r11 + 16]
128-
mov rcx, [r11 + 24]
129-
mov r8, [r11 + 32]
130-
mov r9, [r11 + 40]
127+
# We've spilled arguments, so every argument register is in use; move the process pointer and the first five argv entries into their
128+
# final destination registers in preparation for the call
129+
mov rdi, r9
130+
mov rsi, [r11]
131+
mov rdx, [r11 + 8]
132+
mov rcx, [r11 + 16]
133+
mov r8, [r11 + 24]
134+
mov r9, [r11 + 32]
131135

132136
L_dyn_call_exec:
133137
# If we spill arguments to the stack, we can't perform
@@ -141,7 +145,7 @@ L_dyn_call_exec:
141145
# This instruction will push the return address and jump,
142146
# and we can expect rbp to be the same as we left it upon
143147
# return.
144-
call [r10]
148+
call r10
145149

146150
L_dyn_call_ret:
147151
# Non-tail call completed successfully
@@ -156,21 +160,19 @@ L_dyn_call_end:
156160
# a variable number of register-based arguments
157161
.p2align 2
158162
.data_region jt32
159-
.set L_dyn_call_jt_entry0, L_dyn_call_exec-L_dyn_call_jt
163+
.set L_dyn_call_jt_entry0, L_dyn_call_regs0-L_dyn_call_jt
160164
.set L_dyn_call_jt_entry1, L_dyn_call_regs1-L_dyn_call_jt
161165
.set L_dyn_call_jt_entry2, L_dyn_call_regs2-L_dyn_call_jt
162166
.set L_dyn_call_jt_entry3, L_dyn_call_regs3-L_dyn_call_jt
163167
.set L_dyn_call_jt_entry4, L_dyn_call_regs4-L_dyn_call_jt
164168
.set L_dyn_call_jt_entry5, L_dyn_call_regs5-L_dyn_call_jt
165-
.set L_dyn_call_jt_entry6, L_dyn_call_regs6-L_dyn_call_jt
166169
L_dyn_call_jt:
167170
.long L_dyn_call_jt_entry0
168171
.long L_dyn_call_jt_entry1
169172
.long L_dyn_call_jt_entry2
170173
.long L_dyn_call_jt_entry3
171174
.long L_dyn_call_jt_entry4
172175
.long L_dyn_call_jt_entry5
173-
.long L_dyn_call_jt_entry6
174176
.end_data_region
175177

176178
# The following is the LSDA metadata for exception handling

0 commit comments

Comments
 (0)