git.puffer.fish Git - mirror/frr.git/commitdiff
zebra: avoid a race during FPM dplane plugin shutdown 17504/head
author: Mark Stapp <mjs@cisco.com>
Mon, 25 Nov 2024 20:37:39 +0000 (15:37 -0500)
committer: Mark Stapp <mjs@cisco.com>
Mon, 25 Nov 2024 20:37:39 +0000 (15:37 -0500)
During zebra shutdown, the main pthread and the FPM pthread can
deadlock if the FPM pthread is in fpm_reconnect(). Each pthread
tries to use event_cancel_async() to cancel tasks that may be
scheduled for the other pthread - this leads to a deadlock as
neither thread can progress.

This adds an atomic boolean that each pthread sets on entering the
cleanup code in question and clears on leaving it. A pthread that
finds the flag already set skips its own cleanup and returns, so
the two threads can no longer run into the deadlock.

Signed-off-by: Mark Stapp <mjs@cisco.com>
zebra/dplane_fpm_nl.c

index e6b4af3674293812db8cae0c93c91f95dd27416b..3ec1c9d65723318c600dad939ddf4ecf9953d1d8 100644 (file)
@@ -68,6 +68,8 @@
 
 static const char *prov_name = "dplane_fpm_nl";
 
+static atomic_bool fpm_cleaning_up;
+
 struct fpm_nl_ctx {
        /* data plane connection. */
        int socket;
@@ -524,6 +526,16 @@ static void fpm_connect(struct event *t);
 
 static void fpm_reconnect(struct fpm_nl_ctx *fnc)
 {
+       bool cleaning_p = false;
+
+       /* This is being called in the FPM pthread: ensure we don't deadlock
+        * with similar code that may be run in the main pthread.
+        */
+       if (!atomic_compare_exchange_strong_explicit(
+                   &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst,
+                   memory_order_seq_cst))
+               return;
+
        /* Cancel all zebra threads first. */
        event_cancel_async(zrouter.master, &fnc->t_lspreset, NULL);
        event_cancel_async(zrouter.master, &fnc->t_lspwalk, NULL);
@@ -551,6 +563,12 @@ static void fpm_reconnect(struct fpm_nl_ctx *fnc)
        EVENT_OFF(fnc->t_read);
        EVENT_OFF(fnc->t_write);
 
+       /* Reset the barrier value */
+       cleaning_p = true;
+       atomic_compare_exchange_strong_explicit(
+               &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst,
+               memory_order_seq_cst);
+
        /* FPM is disabled, don't attempt to connect. */
        if (fnc->disabled)
                return;
@@ -1624,6 +1642,16 @@ static int fpm_nl_start(struct zebra_dplane_provider *prov)
 
 static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc)
 {
+       bool cleaning_p = false;
+
+       /* This is being called in the main pthread: ensure we don't deadlock
+        * with similar code that may be run in the FPM pthread.
+        */
+       if (!atomic_compare_exchange_strong_explicit(
+                   &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst,
+                   memory_order_seq_cst))
+               return 0;
+
        /* Disable all events and close socket. */
        EVENT_OFF(fnc->t_lspreset);
        EVENT_OFF(fnc->t_lspwalk);
@@ -1644,6 +1672,12 @@ static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc)
                fnc->socket = -1;
        }
 
+       /* Reset the barrier value */
+       cleaning_p = true;
+       atomic_compare_exchange_strong_explicit(
+               &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst,
+               memory_order_seq_cst);
+
        return 0;
 }