summaryrefslogtreecommitdiff
path: root/zebra
diff options
context:
space:
mode:
authorMark Stapp <mjs@cisco.com>2024-11-25 15:37:39 -0500
committerMark Stapp <mjs@cisco.com>2024-11-25 15:37:39 -0500
commit277784fe34d3036c9f6f37bc7deff7fb235012ee (patch)
treeee262bf513d225568a6dc04b729fd6ed95fa34b2 /zebra
parent0bacbc6493ae0fb9b946e2df475367ea4541a5c8 (diff)
zebra: avoid a race during FPM dplane plugin shutdown
During zebra shutdown, the main pthread and the FPM pthread can deadlock if the FPM pthread is in fpm_reconnect(). Each pthread tries to use event_cancel_async() to cancel tasks that may be scheduled for the other pthread - this leads to a deadlock as neither thread can progress. This adds an atomic boolean that's managed as each pthread enters and leaves the cleanup code in question, preventing the two threads from running into the deadlock. Signed-off-by: Mark Stapp <mjs@cisco.com>
Diffstat (limited to 'zebra')
-rw-r--r--zebra/dplane_fpm_nl.c34
1 files changed, 34 insertions, 0 deletions
diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c
index e6b4af3674..3ec1c9d657 100644
--- a/zebra/dplane_fpm_nl.c
+++ b/zebra/dplane_fpm_nl.c
@@ -68,6 +68,8 @@
static const char *prov_name = "dplane_fpm_nl";
+static atomic_bool fpm_cleaning_up;
+
struct fpm_nl_ctx {
/* data plane connection. */
int socket;
@@ -524,6 +526,16 @@ static void fpm_connect(struct event *t);
static void fpm_reconnect(struct fpm_nl_ctx *fnc)
{
+ bool cleaning_p = false;
+
+ /* This is being called in the FPM pthread: ensure we don't deadlock
+ * with similar code that may be run in the main pthread.
+ */
+ if (!atomic_compare_exchange_strong_explicit(
+ &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst,
+ memory_order_seq_cst))
+ return;
+
/* Cancel all zebra threads first. */
event_cancel_async(zrouter.master, &fnc->t_lspreset, NULL);
event_cancel_async(zrouter.master, &fnc->t_lspwalk, NULL);
@@ -551,6 +563,12 @@ static void fpm_reconnect(struct fpm_nl_ctx *fnc)
EVENT_OFF(fnc->t_read);
EVENT_OFF(fnc->t_write);
+ /* Reset the barrier value */
+ cleaning_p = true;
+ atomic_compare_exchange_strong_explicit(
+ &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst,
+ memory_order_seq_cst);
+
/* FPM is disabled, don't attempt to connect. */
if (fnc->disabled)
return;
@@ -1624,6 +1642,16 @@ static int fpm_nl_start(struct zebra_dplane_provider *prov)
static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc)
{
+ bool cleaning_p = false;
+
+ /* This is being called in the main pthread: ensure we don't deadlock
+ * with similar code that may be run in the FPM pthread.
+ */
+ if (!atomic_compare_exchange_strong_explicit(
+ &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst,
+ memory_order_seq_cst))
+ return 0;
+
/* Disable all events and close socket. */
EVENT_OFF(fnc->t_lspreset);
EVENT_OFF(fnc->t_lspwalk);
@@ -1644,6 +1672,12 @@ static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc)
fnc->socket = -1;
}
+ /* Reset the barrier value */
+ cleaning_p = true;
+ atomic_compare_exchange_strong_explicit(
+ &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst,
+ memory_order_seq_cst);
+
return 0;
}