From c6775bcec566631a3565b157a7be6fe7fcb16aa1 Mon Sep 17 00:00:00 2001 From: Karthik Ramanathan Date: Thu, 16 May 2024 11:47:00 -0700 Subject: [PATCH] [#18948] YSQL: Explicitly destruct YSQL webserver on SIGTERM Summary: The YSQL webserver has occasionally produced coredumps of the following form upon receiving a termination signal from postmaster. ``` #0 0x00007fbac35a9ae3 _ZNKSt3__112__hash_tableINS_17__hash_value_typeINS_12basic_string #1 0x00007fbac005485d _ZNKSt3__113unordered_mapINS_12basic_string (libyb_util.so) #2 0x00007fbac0053180 _ZN2yb16PrometheusWriter16WriteSingleEntryERKNSt3__113unordered_mapINS1_12basic_string #3 0x00007fbab21ff1eb _ZN2yb6pggateL26PgPrometheusMetricsHandlerERKNS_19WebCallbackRegistry10WebRequestEPNS1_11WebResponseE (libyb_pggate_webserver.so) .... .... ``` The coredump indicates corruption of a namespace-scoped variable of type unordered_map while attempting to serve a request after a termination signal has been received. The current code causes the webserver (postgres background worker) to call postgres' `proc_exit()` which consequently calls `exit()`. According to the [[ https://en.cppreference.com/w/cpp/utility/program/exit | C++ standard ]], a limited amount of cleanup is performed on exit(): - Notably destructors of variables with automatic storage duration are not invoked. This implies that the webserver's destructor is not called, and therefore the server is not stopped. - Namespace-scoped variables have [[ https://en.cppreference.com/w/cpp/language/storage_duration | static storage duration ]]. - Objects with static storage duration are destroyed. - This leads to a possibility of undefined behavior where the webserver may continue running for a short duration of time, while the static variables used to serve requests may have been GC'ed. This revision explicitly stops the webserver upon receiving a termination signal, by calling its destructor. It also adds logic to the handlers to return a `503 SERVICE_UNAVAILABLE` once termination has been initiated. Jira: DB-7796 Test Plan: To test this manually, use a HTTP load generation tool like locust to bombard the YSQL Webserver with requests to an endpoint like `
:13000/prometheus-metrics`. On a standard devserver, I configured locust to use 30 simultaneous users (30 requests per second) to reproduce the issue. The following bash script can be used to detect the coredumps: ``` #/bin/bash ITERATIONS=50 YBDB_PATH=/path/to/code/yugabyte-db # Count the number of dump files to avoid having to use `sudo coredumpctl` idumps=$(ls /var/lib/systemd/coredump/ | wc -l) for ((i = 0 ; i < $ITERATIONS ; i++ )) do echo "Iteration: $(($i + 1))"; $YBDB_PATH/bin/yb-ctl restart > /dev/null nservers=$(netstat -nlpt 2> /dev/null | grep 13000 | wc -l) if (( nservers != 1)); then echo "Web server has not come up. Exiting" exit 1; fi sleep 5s # Kill the webserver pkill -TERM -f 'YSQL webserver' # Count the number of coredumps # Please validate that the coredump produced is that of postgres/webserver ndumps=$(ls /var/lib/systemd/coredump/ | wc -l) if (( ndumps > idumps )); then echo "Core dumps: $(($ndumps - $idumps))" else echo "No new core dumps found" fi done ``` Run the script with the load generation tool running against the webserver in the background. - Without the fix in this revision, the above script produced 8 postgres/webserver core dumps in 50 iterations. - With the fix, no coredumps were observed. Reviewers: telgersma, fizaa Reviewed By: telgersma Subscribers: ybase, smishra, yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D35116 --- .../yb_pg_metrics/yb_pg_metrics.c | 6 ++++++ src/yb/server/webserver.cc | 20 ++++++++++++++++++- .../webserver/pgsql_webserver_wrapper.cc | 5 +++++ .../webserver/pgsql_webserver_wrapper.h | 1 + 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/postgres/yb-extensions/yb_pg_metrics/yb_pg_metrics.c b/src/postgres/yb-extensions/yb_pg_metrics/yb_pg_metrics.c index 0fe4f616744b..a39fc449dd89 100644 --- a/src/postgres/yb-extensions/yb_pg_metrics/yb_pg_metrics.c +++ b/src/postgres/yb-extensions/yb_pg_metrics/yb_pg_metrics.c @@ -437,6 +437,12 @@ webserver_worker_main(Datum unused) MemoryContextSwitchTo(oldcontext); } + if (webserver) + { + DestroyWebserver(webserver); + webserver = NULL; + } + if (rc & WL_POSTMASTER_DEATH) proc_exit(1); diff --git a/src/yb/server/webserver.cc b/src/yb/server/webserver.cc index 7b8e229b2a82..4825763f4dc2 100644 --- a/src/yb/server/webserver.cc +++ b/src/yb/server/webserver.cc @@ -254,6 +254,9 @@ class Webserver::Impl { // Mutex guarding against concurrenct calls to Stop(). std::mutex stop_mutex_; + // Variable to notify handlers to stop serving requests. + std::atomic stop_initiated = false; + mutable std::mutex auto_flags_mutex_; // The AutoFlags that are associated with this particular server. In LTO builds we use the same // process for both yb-master and yb-tserver, so the process may have more AutoFlags than this @@ -448,11 +451,19 @@ Status Webserver::Impl::Start() { } void Webserver::Impl::Stop() { + // Indicate to the handlers that they can now stop serving requests. If any further signals are + // received to terminate the webserver, the handlers will no longer access non-static variables + // and will return immediately. + stop_initiated = true; + std::lock_guard lock_(stop_mutex_); if (context_ != nullptr) { + LOG(INFO) << "Stopping webserver on " << http_address_; sq_stop(context_); context_ = nullptr; } + + LOG(INFO) << "Webserver stopped"; } Status Webserver::Impl::GetInputHostPort(HostPort* hp) const { @@ -583,6 +594,13 @@ sq_callback_result_t Webserver::Impl::BeginRequestCallback(struct sq_connection* sq_callback_result_t Webserver::Impl::RunPathHandler(const PathHandler& handler, struct sq_connection* connection, struct sq_request_info* request_info) { + constexpr auto SERVICE_UNAVAILABLE_MSG = "HTTP/1.1 503 Service Unavailable\r\n"; + // If we're in the process of stopping, do not parse or route the request. Just return. + if (PREDICT_FALSE(stop_initiated)) { + sq_printf(connection, SERVICE_UNAVAILABLE_MSG); + return SQ_HANDLED_CLOSE_CONNECTION; + } + // Should we render with css styles? bool use_style = true; @@ -647,7 +665,7 @@ sq_callback_result_t Webserver::Impl::RunPathHandler(const PathHandler& handler, for (const PathHandlerCallback& callback_ : handler.callbacks()) { callback_(req, resp_ptr); if (resp_ptr->code == 503) { - sq_printf(connection, "HTTP/1.1 503 Service Unavailable\r\n"); + sq_printf(connection, SERVICE_UNAVAILABLE_MSG); return SQ_HANDLED_CLOSE_CONNECTION; } } diff --git a/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.cc b/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.cc index b70c511b9241..d7253c0a0ee2 100644 --- a/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.cc +++ b/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.cc @@ -624,6 +624,11 @@ YBCStatus StartWebserver(WebserverWrapper *webserver_wrapper) { return ToYBCStatus(WithMaskedYsqlSignals([webserver]() { return webserver->Start(); })); } +void DestroyWebserver(struct WebserverWrapper *webserver) { + Webserver *webserver_impl = reinterpret_cast(webserver); + delete webserver_impl; +} + void SetWebserverConfig( WebserverWrapper *webserver_wrapper, bool enable_access_logging, bool enable_tcmalloc_logging, int webserver_profiler_sample_freq_bytes) { diff --git a/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.h b/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.h index 4212e778d32f..8a96c929b839 100644 --- a/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.h +++ b/src/yb/yql/pggate/webserver/pgsql_webserver_wrapper.h @@ -99,6 +99,7 @@ typedef struct { } YbConnectionMetrics; struct WebserverWrapper *CreateWebserver(char *listen_addresses, int port); +void DestroyWebserver(struct WebserverWrapper *webserver); void RegisterMetrics(ybpgmEntry *tab, int num_entries, char *metric_node_name); void RegisterRpczEntries( postgresCallbacks *callbacks, int *num_backends_ptr, rpczEntry **rpczEntriesPointer,