From 277784fe34d3036c9f6f37bc7deff7fb235012ee Mon Sep 17 00:00:00 2001 From: Mark Stapp Date: Mon, 25 Nov 2024 15:37:39 -0500 Subject: [PATCH] zebra: avoid a race during FPM dplane plugin shutdown During zebra shutdown, the main pthread and the FPM pthread can deadlock if the FPM pthread is in fpm_reconnect(). Each pthread tries to use event_cancel_async() to cancel tasks that may be scheduled for the other pthread - this leads to a deadlock as neither thread can progress. This adds an atomic boolean that's managed as each pthread enters and leaves the cleanup code in question, preventing the two threads from running into the deadlock. Signed-off-by: Mark Stapp --- zebra/dplane_fpm_nl.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index e6b4af367429..3ec1c9d65723 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -68,6 +68,8 @@ static const char *prov_name = "dplane_fpm_nl"; +static atomic_bool fpm_cleaning_up; + struct fpm_nl_ctx { /* data plane connection. */ int socket; @@ -524,6 +526,16 @@ static void fpm_connect(struct event *t); static void fpm_reconnect(struct fpm_nl_ctx *fnc) { + bool cleaning_p = false; + + /* This is being called in the FPM pthread: ensure we don't deadlock + * with similar code that may be run in the main pthread. + */ + if (!atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst, + memory_order_seq_cst)) + return; + /* Cancel all zebra threads first. */ event_cancel_async(zrouter.master, &fnc->t_lspreset, NULL); event_cancel_async(zrouter.master, &fnc->t_lspwalk, NULL); @@ -551,6 +563,12 @@ static void fpm_reconnect(struct fpm_nl_ctx *fnc) EVENT_OFF(fnc->t_read); EVENT_OFF(fnc->t_write); + /* Reset the barrier value */ + cleaning_p = true; + atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst, + memory_order_seq_cst); + /* FPM is disabled, don't attempt to connect. */ if (fnc->disabled) return; @@ -1624,6 +1642,16 @@ static int fpm_nl_start(struct zebra_dplane_provider *prov) static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc) { + bool cleaning_p = false; + + /* This is being called in the main pthread: ensure we don't deadlock + * with similar code that may be run in the FPM pthread. + */ + if (!atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst, + memory_order_seq_cst)) + return 0; + /* Disable all events and close socket. */ EVENT_OFF(fnc->t_lspreset); EVENT_OFF(fnc->t_lspwalk); @@ -1644,6 +1672,12 @@ static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc) fnc->socket = -1; } + /* Reset the barrier value */ + cleaning_p = true; + atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst, + memory_order_seq_cst); + return 0; }