From 4c13f8dc724438d12bf2480a907c4dc2b949e740 Mon Sep 17 00:00:00 2001 From: Karl Stenerud Date: Fri, 7 May 2021 13:25:40 +0200 Subject: [PATCH] Fix the SIGQUIT handling system to deal with multiple ANRs properly. --- CHANGELOG.md | 7 + .../src/main/jni/anr_google.c | 28 +- .../src/main/jni/anr_handler.c | 248 ++++++++++-------- .../src/main/jni/anr_handler.h | 3 + .../src/main/jni/bugsnag_anr.c | 2 +- .../src/main/jni/unwind_func.h | 2 - 6 files changed, 162 insertions(+), 128 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca7e003999..7a743225f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## TBD + +### Bug fixes + + * Fix bug that terminated the app when multiple ANRs occur + [#1235](https://github.com/bugsnag/bugsnag-android/pull/1235) + ## 5.9.1 (2021-04-22) ### Bug fixes diff --git a/bugsnag-plugin-android-anr/src/main/jni/anr_google.c b/bugsnag-plugin-android-anr/src/main/jni/anr_google.c index 580c255549..ed036789f8 100644 --- a/bugsnag-plugin-android-anr/src/main/jni/anr_google.c +++ b/bugsnag-plugin-android-anr/src/main/jni/anr_google.c @@ -38,20 +38,20 @@ // // - Register a SIGQUIT handler via sigaction() // -// - Our handler callback blocks SIGQUIT (allowing the Google handler to work -// again) and sets a flag like "should_report_anr" -// -// - We have another thread waiting on the "should_report_anr" flag, which then -// calls bsg_google_anr_call() after a short delay to raise a SIGQUIT on -// Google's handler thread. Re-raising the signal on the signal handler -// thread won't work because the signaling system in the OS won't finish -// updating the new blocking state until the current signal handler returns. -// -// - Do our handler stuff, delay a short while (like 2 seconds) for Google's -// handlers to finish, then exit our watchdog thread. It's important that -// our watchdog thread stops before the runtime's timeout (currently 20 -// seconds), or else we'll be force-killed, which breaks Google reporting -// and the ANR popup in some cases. +// - When our SIGQUIT handler triggers, we block SIGQUIT (so that the Google +// handler can run), then trigger another "worker" thread we have waiting +// to do the actual work, then return. The context switch to the "worker" +// thread debounces the signaling mechanism so that we can trigger the Google +// ANR thread safely. Our SIGQUIT handler MUST NOT run for more than 20 +// seconds or else we'll be force-killed, which breaks Google reporting and +// the ANR popup in some cases. +// +// - Our ANR "worker" thread waits for our trigger, calls bsg_google_anr_call() +// to raise a SIGQUIT on Google's handler thread, and then does any extra +// processing we want to happen (this thread doesn't require async-safety). +// +// - When everything is done, we unblock SIGQUIT again so that our handler will +// trigger on the next ANR, and then put the "worker" thread back to sleep. // // // Functionality in this file: diff --git a/bugsnag-plugin-android-anr/src/main/jni/anr_handler.c b/bugsnag-plugin-android-anr/src/main/jni/anr_handler.c index 17b6f40882..14ce5d407c 100644 --- a/bugsnag-plugin-android-anr/src/main/jni/anr_handler.c +++ b/bugsnag-plugin-android-anr/src/main/jni/anr_handler.c @@ -22,9 +22,8 @@ static bool installed = false; static pthread_t watchdog_thread; static bool should_wait_for_semaphore = false; -static sem_t reporter_thread_semaphore; -static volatile bool should_report_anr = false; -static struct sigaction original_sigquit_handler; +static sem_t anr_reporting_semaphore; +static volatile bool should_report_anr_flag = false; static JavaVM *bsg_jvm = NULL; static jmethodID mthd_notify_anr_detected = NULL; @@ -35,13 +34,13 @@ static jmethodID frame_init = NULL; static bugsnag_stackframe anr_stacktrace[BUGSNAG_FRAMES_MAX]; static ssize_t anr_stacktrace_length = 0; -unwind_func local_bsg_unwind_stack; +static unwind_func unwind_stack_function; // duplication required for JNI methods that originally came from // bugsnag-plugin-android-ndk. Until a shared C module is available // for sharing common code when PLAT-5794 is addressed, this // duplication is a necessary evil. -bool anr_bsg_check_and_clear_exc(JNIEnv *env) { +static bool check_and_clear_exc(JNIEnv *env) { if (env == NULL) { return false; } @@ -52,7 +51,7 @@ bool anr_bsg_check_and_clear_exc(JNIEnv *env) { return false; } -jclass anr_bsg_safe_find_class(JNIEnv *env, const char *clz_name) { +static jclass safe_find_class(JNIEnv *env, const char *clz_name) { if (env == NULL) { return NULL; } @@ -60,22 +59,22 @@ jclass anr_bsg_safe_find_class(JNIEnv *env, const char *clz_name) { return NULL; } jclass clz = (*env)->FindClass(env, clz_name); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); return clz; } -jmethodID anr_bsg_safe_get_method_id(JNIEnv *env, jclass clz, const char *name, - const char *sig) { +static jmethodID safe_get_method_id(JNIEnv *env, jclass clz, const char *name, + const char *sig) { if (env == NULL || clz == NULL || name == NULL || sig == NULL) { return NULL; } jmethodID methodId = (*env)->GetMethodID(env, clz, name, sig); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); return methodId; } -bool configure_anr_jni_impl( - JNIEnv *env) { // get a global reference to the AnrPlugin class +static bool configure_anr_jni_impl(JNIEnv *env) { + // get a global reference to the AnrPlugin class // https://developer.android.com/training/articles/perf-jni#faq:-why-didnt-findclass-find-my-class if (env == NULL) { return false; @@ -85,30 +84,28 @@ bool configure_anr_jni_impl( return false; } - jclass clz = anr_bsg_safe_find_class(env, "com/bugsnag/android/AnrPlugin"); - if (anr_bsg_check_and_clear_exc(env) || clz == NULL) { + jclass clz = safe_find_class(env, "com/bugsnag/android/AnrPlugin"); + if (check_and_clear_exc(env) || clz == NULL) { return false; } - mthd_notify_anr_detected = anr_bsg_safe_get_method_id( - env, clz, "notifyAnrDetected", "(Ljava/util/List;)V"); - frame_class = - anr_bsg_safe_find_class(env, "com/bugsnag/android/NativeStackframe"); + mthd_notify_anr_detected = + safe_get_method_id(env, clz, "notifyAnrDetected", "(Ljava/util/List;)V"); + frame_class = safe_find_class(env, "com/bugsnag/android/NativeStackframe"); frame_class = (*env)->NewGlobalRef(env, frame_class); - frame_init = anr_bsg_safe_get_method_id( + frame_init = safe_get_method_id( env, frame_class, "", "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/Number;Ljava/lang/" "Long;Ljava/lang/Long;Ljava/lang/Long;)V"); return true; } -void anr_bsg_safe_delete_local_ref(JNIEnv *env, jobject obj) { +static void safe_delete_local_ref(JNIEnv *env, jobject obj) { if (env != NULL) { (*env)->DeleteLocalRef(env, obj); } } -jobject anr_bsg_safe_new_object(JNIEnv *env, jclass clz, jmethodID method, - ...) { +static jobject safe_new_object(JNIEnv *env, jclass clz, jmethodID method, ...) { if (env == NULL || clz == NULL) { return NULL; } @@ -116,16 +113,16 @@ jobject anr_bsg_safe_new_object(JNIEnv *env, jclass clz, jmethodID method, va_start(args, method); jobject obj = (*env)->NewObjectV(env, clz, method, args); va_end(args); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); return obj; } -jstring anr_bsg_safe_new_string_utf(JNIEnv *env, const char *str) { +static jstring safe_new_string_utf(JNIEnv *env, const char *str) { if (env == NULL || str == NULL) { return NULL; } jstring jstr = (*env)->NewStringUTF(env, str); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); return jstr; } @@ -161,50 +158,47 @@ static void notify_anr_detected() { return; } - jclass list_class = anr_bsg_safe_find_class(env, "java/util/LinkedList"); - jmethodID list_init = - anr_bsg_safe_get_method_id(env, list_class, "", "()V"); - jmethodID list_add = anr_bsg_safe_get_method_id(env, list_class, "add", - "(Ljava/lang/Object;)Z"); - jclass int_class = anr_bsg_safe_find_class(env, "java/lang/Integer"); - jmethodID int_init = - anr_bsg_safe_get_method_id(env, int_class, "", "(I)V"); - jclass long_class = anr_bsg_safe_find_class(env, "java/lang/Long"); - jmethodID long_init = - anr_bsg_safe_get_method_id(env, long_class, "", "(J)V"); - - jobject jlist = anr_bsg_safe_new_object(env, list_class, list_init); + jclass list_class = safe_find_class(env, "java/util/LinkedList"); + jmethodID list_init = safe_get_method_id(env, list_class, "", "()V"); + jmethodID list_add = + safe_get_method_id(env, list_class, "add", "(Ljava/lang/Object;)Z"); + jclass int_class = safe_find_class(env, "java/lang/Integer"); + jmethodID int_init = safe_get_method_id(env, int_class, "", "(I)V"); + jclass long_class = safe_find_class(env, "java/lang/Long"); + jmethodID long_init = safe_get_method_id(env, long_class, "", "(J)V"); + + jobject jlist = safe_new_object(env, list_class, list_init); for (ssize_t i = 0; i < anr_stacktrace_length; i++) { bugsnag_stackframe *frame = anr_stacktrace + i; - jobject jmethod = anr_bsg_safe_new_string_utf(env, frame->method); - jobject jfilename = anr_bsg_safe_new_string_utf(env, frame->filename); - jobject jline_number = anr_bsg_safe_new_object(env, int_class, int_init, - (jint)frame->line_number); - jobject jframe_address = anr_bsg_safe_new_object( - env, long_class, long_init, (jlong)frame->frame_address); - jobject jsymbol_address = anr_bsg_safe_new_object( - env, long_class, long_init, (jlong)frame->symbol_address); - jobject jload_address = anr_bsg_safe_new_object(env, long_class, long_init, - (jlong)frame->load_address); - jobject jframe = anr_bsg_safe_new_object( - env, frame_class, frame_init, jmethod, jfilename, jline_number, - jframe_address, jsymbol_address, jload_address); + jobject jmethod = safe_new_string_utf(env, frame->method); + jobject jfilename = safe_new_string_utf(env, frame->filename); + jobject jline_number = + safe_new_object(env, int_class, int_init, (jint)frame->line_number); + jobject jframe_address = safe_new_object(env, long_class, long_init, + (jlong)frame->frame_address); + jobject jsymbol_address = safe_new_object(env, long_class, long_init, + (jlong)frame->symbol_address); + jobject jload_address = + safe_new_object(env, long_class, long_init, (jlong)frame->load_address); + jobject jframe = safe_new_object(env, frame_class, frame_init, jmethod, + jfilename, jline_number, jframe_address, + jsymbol_address, jload_address); if (jlist != NULL && list_add != NULL && jframe != NULL) { (*env)->CallBooleanMethod(env, jlist, list_add, jframe); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); } - anr_bsg_safe_delete_local_ref(env, jmethod); - anr_bsg_safe_delete_local_ref(env, jfilename); - anr_bsg_safe_delete_local_ref(env, jline_number); - anr_bsg_safe_delete_local_ref(env, jframe_address); - anr_bsg_safe_delete_local_ref(env, jsymbol_address); - anr_bsg_safe_delete_local_ref(env, jload_address); - anr_bsg_safe_delete_local_ref(env, jframe); + safe_delete_local_ref(env, jmethod); + safe_delete_local_ref(env, jfilename); + safe_delete_local_ref(env, jline_number); + safe_delete_local_ref(env, jframe_address); + safe_delete_local_ref(env, jsymbol_address); + safe_delete_local_ref(env, jload_address); + safe_delete_local_ref(env, jframe); } if (obj_plugin != NULL && mthd_notify_anr_detected != NULL && jlist != NULL) { (*env)->CallVoidMethod(env, obj_plugin, mthd_notify_anr_detected, jlist); - anr_bsg_check_and_clear_exc(env); + check_and_clear_exc(env); } if (should_detach) { @@ -213,60 +207,91 @@ static void notify_anr_detected() { } } -static void *sigquit_watchdog_thread_main(void *_) { - static const useconds_t delay_2sec = 2000000; - static const useconds_t delay_100ms = 100000; - static const useconds_t delay_10ms = 10000; +static inline void block_sigquit() { + sigset_t sigmask; + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGQUIT); + if (pthread_sigmask(SIG_BLOCK, &sigmask, NULL) != 0) { + BUGSNAG_LOG( + "Could not block SIGQUIT. Google's ANR handling will be disabled."); + } +} - // Wait until our SIGQUIT handler is ready for us to start. - // Use sem_wait if possible, falling back to polling. - if (!should_wait_for_semaphore || sem_wait(&reporter_thread_semaphore) != 0) { - while (!should_report_anr) { - usleep(delay_100ms); +static inline void unblock_sigquit() { + sigset_t anr_sigmask; + sigemptyset(&anr_sigmask); + sigaddset(&anr_sigmask, SIGQUIT); + if (pthread_sigmask(SIG_UNBLOCK, &anr_sigmask, NULL) != 0) { + BUGSNAG_LOG( + "Could not unblock SIGQUIT. Bugsnag's ANR handling will be disabled."); + } +} + +static inline void trigger_sigquit_watchdog_thread() { + // Set the trigger flag for the fallback spin-lock in + // sigquit_watchdog_thread_main() + should_report_anr_flag = true; + + if (should_wait_for_semaphore) { + // Although sem_post() is not officially marked as async-safe, the Android + // implementation simply does an atomic compare-and-exchange when there is + // only one thread waiting (which is the case here). + // https://cs.android.com/android/platform/superproject/+/master:bionic/libc/bionic/semaphore.cpp;l=289?q=sem_post&ss=android + if (sem_post(&anr_reporting_semaphore) != 0) { + // The only possible failure from sem_post is EOVERFLOW, which won't + // happen in this code. But just to be thorough... + BUGSNAG_LOG("Could not unlock Bugsnag sigquit handler semaphore"); } } +} - // Force at least one task switch after being triggered, ensuring that the - // signal masks are properly settled before triggering the Google handler. - usleep(delay_10ms); +static void *sigquit_watchdog_thread_main(__unused void *_) { + static const useconds_t delay_100ms = 100000; + + while (enabled) { + // Unblock SIGQUIT so that handle_sigquit() will be called. + unblock_sigquit(); + + // Wait until our SIGQUIT handler is ready for us to start. + // Use sem_wait() if possible, falling back to polling. + should_report_anr_flag = false; + if (!should_wait_for_semaphore || sem_wait(&anr_reporting_semaphore) != 0) { + while (!should_report_anr_flag) { + usleep(delay_100ms); + } + } + + if (!enabled) { + // This happens if bsg_handler_uninstall_anr() woke us. + break; + } - // Do our ANR processing - notify_anr_detected(); + // Trigger Google ANR processing (occurs on a different thread). + bsg_google_anr_call(); - // Trigger Google ANR processing - bsg_google_anr_call(); + // Do our ANR processing. + notify_anr_detected(); + } - // Give a little time for the Google handler to dump state, then exit this - // thread. - usleep(delay_2sec); return NULL; } -static void handle_sigquit(int signum, siginfo_t *info, void *user_context) { - // Re-block SIGQUIT so that the Google handler can trigger - sigset_t sigmask; - sigemptyset(&sigmask); - sigaddset(&sigmask, SIGQUIT); - pthread_sigmask(SIG_BLOCK, &sigmask, NULL); - sigaction(SIGQUIT, &original_sigquit_handler, NULL); +static void handle_sigquit(__unused int signum, siginfo_t *info, + void *user_context) { + // Re-block SIGQUIT so that the Google handler can trigger. + // Do it in this handler so that the signal pending flags flip on the next + // context switch and will be off when the next sigquit_watchdog_thread_main() + // loop runs. + block_sigquit(); // The unwind function will be non-null if the NDK plugin is loaded. - if (local_bsg_unwind_stack != NULL) { + if (unwind_stack_function != NULL) { anr_stacktrace_length = - local_bsg_unwind_stack(anr_stacktrace, info, user_context); + unwind_stack_function(anr_stacktrace, info, user_context); } - // Instruct our watchdog thread to report the ANR and also call Google - should_report_anr = true; - // Although sem_post is not officially marked as async-safe, the Android - // implementation simply does an atomic compare-and-exchange when there is - // only one thread waiting (which is the case here). - // https://cs.android.com/android/platform/superproject/+/master:bionic/libc/bionic/semaphore.cpp;l=289?q=sem_post&ss=android - if (sem_post(&reporter_thread_semaphore) != 0) { - // The only possible failure from sem_post is EOVERFLOW, which won't happen - // in this code. But implementations can change... - BUGSNAG_LOG("Could not unlock semaphore"); - } + // Tell sigquit_watchdog_thread_main() to report an ANR. + trigger_sigquit_watchdog_thread(); } static void install_signal_handler() { @@ -276,14 +301,14 @@ static void install_signal_handler() { // We can still report to Bugsnag, so continue. } - if (sem_init(&reporter_thread_semaphore, 0, 0) == 0) { + if (sem_init(&anr_reporting_semaphore, 0, 0) == 0) { should_wait_for_semaphore = true; } else { BUGSNAG_LOG("Failed to init semaphore"); - // We can still poll should_report_anr, so continue. + // We can still poll should_report_anr_flag, so continue. } - // Start the watchdog thread + // Start the watchdog thread sigquit_watchdog_thread_main(). if (pthread_create(&watchdog_thread, NULL, sigquit_watchdog_thread_main, NULL) != 0) { BUGSNAG_LOG( @@ -291,25 +316,22 @@ static void install_signal_handler() { return; } - // Install our signal handler + // Install our signal handler. struct sigaction handler; sigemptyset(&handler.sa_mask); handler.sa_sigaction = handle_sigquit; handler.sa_flags = SA_SIGINFO; - if (sigaction(SIGQUIT, &handler, &original_sigquit_handler) != 0) { + // Note: We do NOT save the old handler since the default SIGQUIT handler MUST + // NOT be called in an Android environment. See anr_google.c. + if (sigaction(SIGQUIT, &handler, NULL) != 0) { BUGSNAG_LOG( "Failed to install SIGQUIT handler: %s. ANRs won't be sent to Bugsnag.", strerror(errno)); return; } - // Unblock SIGQUIT so that our handler will be called - sigset_t anr_sigmask; - sigemptyset(&anr_sigmask); - sigaddset(&anr_sigmask, SIGQUIT); - if (pthread_sigmask(SIG_UNBLOCK, &anr_sigmask, NULL) != 0) { - BUGSNAG_LOG("Could not unblock SIGQUIT. ANRs won't be sent to Bugsnag."); - } + // Unblock SIGQUIT so that our newly installed handler can run. + unblock_sigquit(); } bool bsg_handler_install_anr(JNIEnv *env, jobject plugin) { @@ -328,5 +350,9 @@ bool bsg_handler_install_anr(JNIEnv *env, jobject plugin) { void bsg_handler_uninstall_anr() { pthread_mutex_lock(&bsg_anr_handler_config); enabled = false; + // Trigger sigquit_watchdog_thread_main() so that it can exit. + trigger_sigquit_watchdog_thread(); pthread_mutex_unlock(&bsg_anr_handler_config); } + +void bsg_set_unwind_function(unwind_func func) { unwind_stack_function = func; } diff --git a/bugsnag-plugin-android-anr/src/main/jni/anr_handler.h b/bugsnag-plugin-android-anr/src/main/jni/anr_handler.h index 96bd2c0d7b..0bde0b627f 100644 --- a/bugsnag-plugin-android-anr/src/main/jni/anr_handler.h +++ b/bugsnag-plugin-android-anr/src/main/jni/anr_handler.h @@ -1,5 +1,6 @@ #ifndef BUGSNAG_ANR_HANDLER_H #define BUGSNAG_ANR_HANDLER_H +#include "unwind_func.h" #include #include @@ -31,6 +32,8 @@ bool bsg_handler_install_anr(JNIEnv *env, jobject plugin); void bsg_handler_uninstall_anr(void); +void bsg_set_unwind_function(unwind_func func); + #ifdef __cplusplus } #endif diff --git a/bugsnag-plugin-android-anr/src/main/jni/bugsnag_anr.c b/bugsnag-plugin-android-anr/src/main/jni/bugsnag_anr.c index 66df25fc67..80e4b57c4b 100644 --- a/bugsnag-plugin-android-anr/src/main/jni/bugsnag_anr.c +++ b/bugsnag-plugin-android-anr/src/main/jni/bugsnag_anr.c @@ -19,7 +19,7 @@ JNIEXPORT void JNICALL Java_com_bugsnag_android_AnrPlugin_disableAnrReporting( JNIEXPORT void JNICALL Java_com_bugsnag_android_AnrPlugin_setUnwindFunction( JNIEnv *env, jobject thiz, jlong unwind_function) { - local_bsg_unwind_stack = (unwind_func)unwind_function; + bsg_set_unwind_function((unwind_func)unwind_function); } #ifdef __cplusplus diff --git a/bugsnag-plugin-android-anr/src/main/jni/unwind_func.h b/bugsnag-plugin-android-anr/src/main/jni/unwind_func.h index d15d844f21..002bd7e23e 100644 --- a/bugsnag-plugin-android-anr/src/main/jni/unwind_func.h +++ b/bugsnag-plugin-android-anr/src/main/jni/unwind_func.h @@ -13,8 +13,6 @@ typedef ssize_t (*unwind_func)( bugsnag_stackframe stacktrace[BUGSNAG_FRAMES_MAX], siginfo_t *info, void *user_context); -extern unwind_func local_bsg_unwind_stack; - #ifdef __cplusplus } #endif