Skip to content

Commit 107901d

Browse files
authored
Try to avoid julia becoming unkillable after fatal errors (#40056)
- don't smash the alt-stack when already using it
- handle jl_critical_error on the original stack, leaving our signal handling thread free to handle more signals (and helping lock corruption detection in some cases)
- unblock signals when handling signals: some libcs apparently like to block all signals, which can cause mild havoc, since we'd really like the user or bad data to still be able to kill the process (and not just be ignored or cause it to hang)
- reset signals to SIG_DFL earlier (so we recurse less)
- destroy some state from the Task we co-opted to run the exit handlers, so that it can't accidentally jump back into the running program after we've started tearing down the process, from an untimely ^C (previously ^C might cancel the exit) or a jlbacktrace call
- mark functions as leaf with CFI instead of (potentially) smashing the stack, and add a bit of red-zone if we are recursing (to keep pgcstack sensible)
- support safe_restore for the mach catch_exception_raise (while we're trying to generate the backtrace)
1 parent 6bdba43 commit 107901d

8 files changed

+247
-153
lines changed

src/gf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1825,7 +1825,7 @@ static void JL_NORETURN jl_method_error_bare(jl_function_t *f, jl_value_t *args,
18251825
jl_static_show((JL_STREAM*)STDERR_FILENO,args); jl_printf((JL_STREAM*)STDERR_FILENO,"\n");
18261826
jl_ptls_t ptls = jl_get_ptls_states();
18271827
ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0);
1828-
jl_critical_error(0, NULL, ptls->bt_data, &ptls->bt_size);
1828+
jl_critical_error(0, NULL);
18291829
abort();
18301830
}
18311831
// not reached

src/julia_internal.h

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,36 @@ void __tsan_switch_to_fiber(void *fiber, unsigned flags);
6969
# define JL_USE_IFUNC 0
7070
#endif
7171

72+
// If we've smashed the stack, (and not just normal NORETURN)
73+
// this will smash stack-unwind too
74+
#ifdef _OS_WINDOWS_
75+
#if defined(_CPU_X86_64_)
76+
// install the unhandled exception handler at the top of our stack
77+
// to call directly into our personality handler
78+
#define CFI_NORETURN \
79+
asm volatile ("\t.seh_handler __julia_personality, @except\n\t.text");
80+
#else
81+
#define CFI_NORETURN
82+
#endif
83+
#else
84+
// wipe out the call-stack unwind capability beyond this function
85+
// (we are noreturn, so it is not a total lie)
86+
#if defined(_CPU_X86_64_)
87+
// per nongnu libunwind: "x86_64 ABI specifies that end of call-chain is marked with a NULL RBP or undefined return address"
88+
// so we do all 3, to be extra certain of it
89+
#define CFI_NORETURN \
90+
asm volatile ("\t.cfi_undefined rip"); \
91+
asm volatile ("\t.cfi_undefined rbp"); \
92+
asm volatile ("\t.cfi_return_column rbp");
93+
#else
94+
// per nongnu libunwind: "DWARF spec says undefined return address location means end of stack"
95+
// we use whatever happens to be register 1 on this platform for this
96+
#define CFI_NORETURN \
97+
asm volatile ("\t.cfi_undefined 1"); \
98+
asm volatile ("\t.cfi_return_column 1");
99+
#endif
100+
#endif
101+
72102
// If this is detected in a backtrace of segfault, it means the functions
73103
// that use this value must be reworked into their async form with cb arg
74104
// provided and with JL_UV_LOCK used around the calls
@@ -913,7 +943,7 @@ size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t
913943
size_t rec_backtrace_ctx_dwarf(jl_bt_element_t *bt_data, size_t maxsize, bt_context_t *ctx, jl_gcframe_t *pgcstack) JL_NOTSAFEPOINT;
914944
#endif
915945
JL_DLLEXPORT jl_value_t *jl_get_backtrace(void);
916-
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size);
946+
void jl_critical_error(int sig, bt_context_t *context);
917947
JL_DLLEXPORT void jl_raise_debugger(void);
918948
int jl_getFunctionInfo(jl_frame_t **frames, uintptr_t pointer, int skipC, int noInline) JL_NOTSAFEPOINT;
919949
JL_DLLEXPORT void jl_gdblookup(void* ip) JL_NOTSAFEPOINT;

src/signal-handling.c

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,15 +231,44 @@ void jl_show_sigill(void *_ctx)
231231
#endif
232232
}
233233

234-
// what to do on a critical error
235-
void jl_critical_error(int sig, bt_context_t *context, jl_bt_element_t *bt_data, size_t *bt_size)
234+
// what to do on a critical error on a thread
235+
void jl_critical_error(int sig, bt_context_t *context)
236236
{
237-
// This function is not allowed to reference any TLS variables.
238-
// We need to explicitly pass in the TLS buffer pointer when
239-
// we make `jl_filename` and `jl_lineno` thread local.
237+
238+
jl_ptls_t ptls = jl_get_ptls_states();
239+
jl_bt_element_t *bt_data = ptls->bt_data;
240+
size_t *bt_size = &ptls->bt_size;
240241
size_t i, n = *bt_size;
241-
if (sig)
242+
if (sig) {
243+
// kill this task, so that we cannot get back to it accidentally (via an untimely ^C or jlbacktrace in jl_exit)
244+
ptls->pgcstack = NULL;
245+
ptls->safe_restore = NULL;
246+
if (ptls->current_task) {
247+
ptls->current_task->eh = NULL;
248+
ptls->current_task->excstack = NULL;
249+
}
250+
#ifndef _OS_WINDOWS_
251+
sigset_t sset;
252+
sigemptyset(&sset);
253+
// n.b. In `abort()`, Apple's libSystem "helpfully" blocks all signals
254+
// on all threads but SIGABRT. But we also don't know what the thread
255+
// was doing, so unblock all critical signals so that they will crash
256+
// hard, and not just get stuck.
257+
sigaddset(&sset, SIGSEGV);
258+
sigaddset(&sset, SIGBUS);
259+
sigaddset(&sset, SIGILL);
260+
// also unblock fatal signals now, so we won't get back here twice
261+
sigaddset(&sset, SIGTERM);
262+
sigaddset(&sset, SIGABRT);
263+
sigaddset(&sset, SIGQUIT);
264+
// and the original signal is now fatal too, in case it wasn't
265+
// something already listed (?)
266+
if (sig != SIGINT)
267+
sigaddset(&sset, sig);
268+
pthread_sigmask(SIG_UNBLOCK, &sset, NULL);
269+
#endif
242270
jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
271+
}
243272
jl_safe_printf("in expression starting at %s:%d\n", jl_filename, jl_lineno);
244273
if (context) {
245274
// Must avoid extended backtrace frames here unless we're sure bt_data

src/signals-mach.c

Lines changed: 79 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,16 @@ extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *);
8484
void *mach_segv_listener(void *arg)
8585
{
8686
(void)arg;
87+
(void)jl_get_ptls_states();
8788
while (1) {
8889
int ret = mach_msg_server(exc_server, 2048, segv_port, MACH_MSG_TIMEOUT_NONE);
8990
jl_safe_printf("mach_msg_server: %s\n", mach_error_string(ret));
9091
jl_exit(128 + SIGSEGV);
9192
}
9293
}
9394

94-
static void allocate_segv_handler()
95+
96+
static void allocate_mach_handler()
9597
{
9698
// ensure KEYMGR_GCC3_DW2_OBJ_LIST is initialized, as this requires malloc
9799
// and thus can deadlock when used without first initializing it.
@@ -122,7 +124,7 @@ static void allocate_segv_handler()
122124
jl_error("pthread_create failed");
123125
}
124126
pthread_attr_destroy(&attr);
125-
for (int16_t tid = 0;tid < jl_n_threads;tid++) {
127+
for (int16_t tid = 0; tid < jl_n_threads; tid++) {
126128
attach_exception_port(pthread_mach_thread_np(jl_all_tls_states[tid]->system_id), 0);
127129
}
128130
}
@@ -158,20 +160,31 @@ typedef arm_exception_state64_t host_exception_state_t;
158160
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
159161
void (*fptr)(void))
160162
{
161-
uint64_t rsp = (uint64_t)ptls2->signal_stack + sig_stack_size;
163+
#ifdef _CPU_X86_64_
164+
uintptr_t rsp = state->__rsp;
165+
#elif defined(_CPU_AARCH64_)
166+
uintptr_t rsp = state->__sp;
167+
#else
168+
#error "julia: throw-in-context not supported on this platform"
169+
#endif
170+
if (ptls2->signal_stack == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
171+
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
172+
}
173+
else {
174+
rsp = (uintptr_t)ptls2->signal_stack + sig_stack_size;
175+
}
162176
assert(rsp % 16 == 0);
163177

164178
#ifdef _CPU_X86_64_
165-
// push (null) $RIP onto the stack
166179
rsp -= sizeof(void*);
167-
*(void**)rsp = NULL;
168-
169180
state->__rsp = rsp; // set stack pointer
170181
state->__rip = (uint64_t)fptr; // "call" the function
171-
#else
182+
#elif defined(_CPU_AARCH64_)
172183
state->__sp = rsp;
173184
state->__pc = (uint64_t)fptr;
174185
state->__lr = 0;
186+
#else
187+
#error "julia: throw-in-context not supported on this platform"
175188
#endif
176189
}
177190

@@ -204,11 +217,22 @@ static void jl_throw_in_thread(int tid, mach_port_t thread, jl_value_t *exceptio
204217
ptls2->sig_exception = exception;
205218
}
206219
jl_call_in_state(ptls2, &state, &jl_sig_throw);
207-
ret = thread_set_state(thread, THREAD_STATE,
208-
(thread_state_t)&state, count);
220+
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
209221
HANDLE_MACH_ERROR("thread_set_state", ret);
210222
}
211223

224+
static void segv_handler(int sig, siginfo_t *info, void *context)
225+
{
226+
jl_ptls_t ptls = jl_get_ptls_states();
227+
assert(sig == SIGSEGV || sig == SIGBUS);
228+
if (ptls->safe_restore) { // restarting jl_ or jl_unwind_stepn
229+
jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw);
230+
}
231+
else {
232+
sigdie_handler(sig, info, context);
233+
}
234+
}
235+
212236
//exc_server uses dlsym to find symbol
213237
JL_DLLEXPORT
214238
kern_return_t catch_exception_raise(mach_port_t exception_port,
@@ -218,18 +242,16 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
218242
exception_data_t code,
219243
mach_msg_type_number_t code_count)
220244
{
221-
unsigned int count = THREAD_STATE_COUNT;
222245
unsigned int exc_count = HOST_EXCEPTION_STATE_COUNT;
223246
host_exception_state_t exc_state;
224-
host_thread_state_t state;
225247
#ifdef LLVMLIBUNWIND
226248
if (thread == mach_profiler_thread) {
227249
return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
228250
}
229251
#endif
230252
int16_t tid;
231253
jl_ptls_t ptls2 = NULL;
232-
for (tid = 0;tid < jl_n_threads;tid++) {
254+
for (tid = 0; tid < jl_n_threads; tid++) {
233255
jl_ptls_t _ptls2 = jl_all_tls_states[tid];
234256
if (pthread_mach_thread_np(_ptls2->system_id) == thread) {
235257
ptls2 = _ptls2;
@@ -298,11 +320,8 @@ kern_return_t catch_exception_raise(mach_port_t exception_port,
298320
return KERN_SUCCESS;
299321
}
300322
else {
301-
kern_return_t ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)&state, &count);
302-
HANDLE_MACH_ERROR("thread_get_state", ret);
303-
jl_critical_error(SIGSEGV, (unw_context_t*)&state,
304-
ptls2->bt_data, &ptls2->bt_size);
305-
return KERN_INVALID_ARGUMENT;
323+
jl_exit_thread0(128 + SIGSEGV, NULL, 0);
324+
return KERN_SUCCESS;
306325
}
307326
}
308327

@@ -317,24 +336,27 @@ static void attach_exception_port(thread_port_t thread, int segv_only)
317336
HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
318337
}
319338

320-
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
339+
static void jl_thread_suspend_and_get_state2(int tid, host_thread_state_t *ctx)
321340
{
322341
jl_ptls_t ptls2 = jl_all_tls_states[tid];
323-
mach_port_t tid_port = pthread_mach_thread_np(ptls2->system_id);
342+
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
324343

325-
kern_return_t ret = thread_suspend(tid_port);
344+
kern_return_t ret = thread_suspend(thread);
326345
HANDLE_MACH_ERROR("thread_suspend", ret);
327346

328347
// Do the actual sampling
329348
unsigned int count = THREAD_STATE_COUNT;
330-
static unw_context_t state;
331-
memset(&state, 0, sizeof(unw_context_t));
349+
memset(ctx, 0, sizeof(*ctx));
332350

333351
// Get the state of the suspended thread
334-
ret = thread_get_state(tid_port, THREAD_STATE, (thread_state_t)&state, &count);
352+
ret = thread_get_state(thread, THREAD_STATE, (thread_state_t)ctx, &count);
353+
}
335354

336-
// Initialize the unwind context with the suspend thread's state
337-
*ctx = &state;
355+
static void jl_thread_suspend_and_get_state(int tid, unw_context_t **ctx)
356+
{
357+
static host_thread_state_t state;
358+
jl_thread_suspend_and_get_state2(tid, &state);
359+
*ctx = (unw_context_t*)&state;
338360
}
339361

340362
static void jl_thread_resume(int tid, int sig)
@@ -376,29 +398,46 @@ static void jl_try_deliver_sigint(void)
376398
HANDLE_MACH_ERROR("thread_resume", ret);
377399
}
378400

379-
static void jl_exit_thread0(int exitstate)
401+
static void JL_NORETURN jl_exit_thread0_cb(int exitstate)
402+
{
403+
CFI_NORETURN
404+
jl_critical_error(exitstate - 128, NULL);
405+
jl_exit(exitstate);
406+
}
407+
408+
static void jl_exit_thread0(int exitstate, jl_bt_element_t *bt_data, size_t bt_size)
380409
{
381410
jl_ptls_t ptls2 = jl_all_tls_states[0];
382411
mach_port_t thread = pthread_mach_thread_np(ptls2->system_id);
383-
kern_return_t ret = thread_suspend(thread);
384-
HANDLE_MACH_ERROR("thread_suspend", ret);
412+
413+
host_thread_state_t state;
414+
jl_thread_suspend_and_get_state2(0, &state);
415+
unw_context_t *uc = (unw_context_t*)&state;
385416

386417
// This aborts `sleep` and other syscalls.
387-
ret = thread_abort(thread);
418+
kern_return_t ret = thread_abort(thread);
388419
HANDLE_MACH_ERROR("thread_abort", ret);
389420

390-
unsigned int count = THREAD_STATE_COUNT;
391-
host_thread_state_t state;
392-
ret = thread_get_state(thread, THREAD_STATE,
393-
(thread_state_t)&state, &count);
421+
if (bt_data == NULL) {
422+
// Must avoid extended backtrace frames here unless we're sure bt_data
423+
// is properly rooted.
424+
ptls2->bt_size = rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, uc, NULL);
425+
}
426+
else {
427+
ptls2->bt_size = bt_size; // <= JL_MAX_BT_SIZE
428+
memcpy(ptls2->bt_data, bt_data, ptls2->bt_size * sizeof(bt_data[0]));
429+
}
394430

395431
void (*exit_func)(int) = &_exit;
396432
if (thread0_exit_count <= 1) {
397-
exit_func = &jl_exit;
433+
exit_func = &jl_exit_thread0_cb;
398434
}
399435
else if (thread0_exit_count == 2) {
400436
exit_func = &exit;
401437
}
438+
else {
439+
exit_func = &_exit;
440+
}
402441

403442
#ifdef _CPU_X86_64_
404443
// First integer argument. Not portable but good enough =)
@@ -409,8 +448,8 @@ static void jl_exit_thread0(int exitstate)
409448
#error Fill in first integer argument here
410449
#endif
411450
jl_call_in_state(ptls2, &state, (void (*)(void))exit_func);
412-
ret = thread_set_state(thread, THREAD_STATE,
413-
(thread_state_t)&state, count);
451+
unsigned int count = THREAD_STATE_COUNT;
452+
ret = thread_set_state(thread, THREAD_STATE, (thread_state_t)&state, count);
414453
HANDLE_MACH_ERROR("thread_set_state", ret);
415454

416455
ret = thread_resume(thread);
@@ -508,8 +547,10 @@ void *mach_profile_listener(void *arg)
508547
break;
509548
}
510549

511-
unw_context_t *uc;
512-
jl_thread_suspend_and_get_state(i, &uc);
550+
host_thread_state_t state;
551+
jl_thread_suspend_and_get_state2(i, &state);
552+
unw_context_t *uc = (unw_context_t*)&state;
553+
513554
if (running) {
514555
#ifdef LLVMLIBUNWIND
515556
/*

0 commit comments

Comments
 (0)