From 0d09f3d0baa8843bd41fed013dad3fd9d69ae28d Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Fri, 11 Oct 2024 13:23:52 -0400 Subject: [PATCH] rearrange jl_delete_thread to be thread-safe (#56097) Prior to this, especially on macOS, the gc-safepoint here would cause the process to segfault as we had already freed the current_task state. Rearrange this code so that the GC interactions (except for the atomic store to current_task) are all handled before entering GC safe, and then signaling the thread is deleted (via setting current_task = NULL, published by jl_unlock_profile_wr to other threads) is last. ``` ERROR: Exception handler triggered on unmanaged thread. Process 53827 stopped * thread #5, stop reason = EXC_BAD_ACCESS (code=2, address=0x100018008) frame #0: 0x0000000100b74344 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_set(ptls=0x000000011f8b3200, state='\x02', old_state=) at julia_threads.h:272:9 [opt] 269 assert(old_state != JL_GC_CONCURRENT_COLLECTOR_THREAD); 270 jl_atomic_store_release(&ptls->gc_state, state); 271 if (state == JL_GC_STATE_UNSAFE || old_state == JL_GC_STATE_UNSAFE) -> 272 jl_gc_safepoint_(ptls); 273 return old_state; 274 } 275 STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls, Target 0: (julia) stopped. (lldb) up frame #1: 0x0000000100b74320 libjulia-internal.1.12.0.dylib`jl_delete_thread [inlined] jl_gc_state_save_and_set(ptls=0x000000011f8b3200, state='\x02') at julia_threads.h:278:12 [opt] 275 STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls, 276 int8_t state) 277 { -> 278 return jl_gc_state_set(ptls, state, jl_atomic_load_relaxed(&ptls->gc_state)); 279 } 280 #ifdef __clang_gcanalyzer__ 281 // these might not be a safepoint (if they are no-op safe=>safe transitions), but we have to assume it could be (statically) (lldb) frame #2: 0x0000000100b7431c libjulia-internal.1.12.0.dylib`jl_delete_thread(value=0x000000011f8b3200) at threading.c:537:11 [opt] 534 ptls->root_task = NULL; 535 jl_free_thread_gc_state(ptls); 536 // then park in safe-region -> 537 (void)jl_gc_safe_enter(ptls); 538 } ``` (test incorporated into https://github.com/JuliaLang/julia/pull/55793) --- src/threading.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/src/threading.c b/src/threading.c index 44b1192528531..c26028d2f3da2 100644 --- a/src/threading.c +++ b/src/threading.c @@ -464,6 +464,30 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER // prior unsafe-region (before we let it release the stack memory) (void)jl_gc_unsafe_enter(ptls); scheduler_delete_thread(ptls); + // need to clear pgcstack and eh, but we can clear everything now too + jl_task_t *ct = jl_atomic_load_relaxed(&ptls->current_task); + jl_task_frame_noreturn(ct); + if (jl_set_task_tid(ptls->root_task, ptls->tid)) { + // the system will probably free this stack memory soon + // so prevent any other thread from accessing it later + if (ct != ptls->root_task) + jl_task_frame_noreturn(ptls->root_task); + } + else { + // Uh oh. The user cleared the sticky bit so it started running + // elsewhere, then called pthread_exit on this thread from another + // Task, which will free the stack memory of that root task soon. This + // is not recoverable. Though we could just hang here, a fatal message + // is likely better. + jl_safe_printf("fatal: thread exited from wrong Task.\n"); + abort(); + } + ptls->previous_exception = NULL; + // allow the page root_task is on to be freed + ptls->root_task = NULL; + jl_free_thread_gc_state(ptls); + // park in safe-region from here on (this may run GC again) + (void)jl_gc_safe_enter(ptls); // try to free some state we do not need anymore #ifndef _OS_WINDOWS_ void *signal_stack = ptls->signal_stack; @@ -502,21 +526,7 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER #else pthread_mutex_lock(&in_signal_lock); #endif - // need to clear pgcstack and eh, but we can clear everything now too - jl_task_frame_noreturn(jl_atomic_load_relaxed(&ptls->current_task)); - if (jl_set_task_tid(ptls->root_task, ptls->tid)) { - // the system will probably free this stack memory soon - // so prevent any other thread from accessing it later - jl_task_frame_noreturn(ptls->root_task); - } - else { - // Uh oh. The user cleared the sticky bit so it started running - // elsewhere, then called pthread_exit on this thread. This is not - // recoverable. Though we could just hang here, a fatal message is better. - jl_safe_printf("fatal: thread exited from wrong Task.\n"); - abort(); - } - jl_atomic_store_relaxed(&ptls->current_task, NULL); // dead + jl_atomic_store_relaxed(&ptls->current_task, NULL); // indicate dead // finally, release all of the locks we had grabbed #ifdef _OS_WINDOWS_ jl_unlock_profile_wr(); @@ -529,12 +539,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER #endif free(ptls->bt_data); small_arraylist_free(&ptls->locks); - ptls->previous_exception = NULL; - // allow the page root_task is on to be freed - ptls->root_task = NULL; - jl_free_thread_gc_state(ptls); - // then park in safe-region - (void)jl_gc_safe_enter(ptls); } //// debugging hack: if we are exiting too fast for error message printing on threads,