diff options
| author | Mark Stapp <mjs@cisco.com> | 2024-11-25 15:37:39 -0500 |
|---|---|---|
| committer | Mark Stapp <mjs@cisco.com> | 2024-11-25 15:37:39 -0500 |
| commit | 277784fe34d3036c9f6f37bc7deff7fb235012ee (patch) | |
| tree | ee262bf513d225568a6dc04b729fd6ed95fa34b2 | |
| parent | 0bacbc6493ae0fb9b946e2df475367ea4541a5c8 (diff) | |
zebra: avoid a race during FPM dplane plugin shutdown
During zebra shutdown, the main pthread and the FPM pthread can
deadlock if the FPM pthread is in fpm_reconnect(). Each pthread
tries to use event_cancel_async() to cancel tasks that may be
scheduled for the other pthread - this leads to a deadlock as
neither thread can progress.
This adds an atomic boolean that's managed as each pthread
enters and leaves the cleanup code in question, preventing the
two threads from running into the deadlock.
Signed-off-by: Mark Stapp <mjs@cisco.com>
| -rw-r--r-- | zebra/dplane_fpm_nl.c | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c index e6b4af3674..3ec1c9d657 100644 --- a/zebra/dplane_fpm_nl.c +++ b/zebra/dplane_fpm_nl.c @@ -68,6 +68,8 @@ static const char *prov_name = "dplane_fpm_nl"; +static atomic_bool fpm_cleaning_up; + struct fpm_nl_ctx { /* data plane connection. */ int socket; @@ -524,6 +526,16 @@ static void fpm_connect(struct event *t); static void fpm_reconnect(struct fpm_nl_ctx *fnc) { + bool cleaning_p = false; + + /* This is being called in the FPM pthread: ensure we don't deadlock + * with similar code that may be run in the main pthread. + */ + if (!atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst, + memory_order_seq_cst)) + return; + /* Cancel all zebra threads first. */ event_cancel_async(zrouter.master, &fnc->t_lspreset, NULL); event_cancel_async(zrouter.master, &fnc->t_lspwalk, NULL); @@ -551,6 +563,12 @@ static void fpm_reconnect(struct fpm_nl_ctx *fnc) EVENT_OFF(fnc->t_read); EVENT_OFF(fnc->t_write); + /* Reset the barrier value */ + cleaning_p = true; + atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst, + memory_order_seq_cst); + /* FPM is disabled, don't attempt to connect. */ if (fnc->disabled) return; @@ -1624,6 +1642,16 @@ static int fpm_nl_start(struct zebra_dplane_provider *prov) static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc) { + bool cleaning_p = false; + + /* This is being called in the main pthread: ensure we don't deadlock + * with similar code that may be run in the FPM pthread. + */ + if (!atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, true, memory_order_seq_cst, + memory_order_seq_cst)) + return 0; + /* Disable all events and close socket. */ EVENT_OFF(fnc->t_lspreset); EVENT_OFF(fnc->t_lspwalk); @@ -1644,6 +1672,12 @@ static int fpm_nl_finish_early(struct fpm_nl_ctx *fnc) fnc->socket = -1; } + /* Reset the barrier value */ + cleaning_p = true; + atomic_compare_exchange_strong_explicit( + &fpm_cleaning_up, &cleaning_p, false, memory_order_seq_cst, + memory_order_seq_cst); + return 0; } |
