git.puffer.fish Git - mirror/frr.git/commitdiff
zebra: Prevent possible wedged fpm write
author Donald Sharp <sharpd@nvidia.com>
Thu, 14 Dec 2023 13:19:36 +0000 (08:19 -0500)
committer Donald Sharp <sharpd@nvidia.com>
Thu, 14 Dec 2023 14:12:46 +0000 (09:12 -0500)
An operator is reporting that the dplane_fpm_nl connection has
started to accumulate contexts.  One path that could cause
this is that the obuf in use is full and stays full.  This would
imply that whatever is on the receiving end has gotten wedged
and is not reading from the stream of data being sent its way.
If there is no response after 15 seconds, let's declare the
connection dead and reset it.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
zebra/dplane_fpm_nl.c

index c98655fdb87740ec21976254a51fb27004e622b7..31e93d232322a8295f6abafdfe5bc415bc7a3a70 100644 (file)
 #define SOUTHBOUND_DEFAULT_ADDR INADDR_LOOPBACK
 #define SOUTHBOUND_DEFAULT_PORT 2620
 
+/*
+ * Time in seconds to wait for the other end to start responding
+ * again.  If it has not by then, something terrible has gone wrong
+ * and the connection is reset.
+ */
+#define DPLANE_FPM_NL_WEDGIE_TIME 15
+
 /**
  * FPM header:
  * {
@@ -93,6 +99,7 @@ struct fpm_nl_ctx {
        struct event *t_event;
        struct event *t_nhg;
        struct event *t_dequeue;
+       struct event *t_wedged;
 
        /* zebra events. */
        struct event *t_lspreset;
@@ -1367,6 +1374,18 @@ static void fpm_rmac_reset(struct event *t)
                        &fnc->t_rmacwalk);
 }
 
+static void fpm_process_wedged(struct event *t)
+{      /*
+        * Timer callback: log a warning, count a connection error and
+        * request a reconnect.  fpm_process_queue adds this timer with a
+        * DPLANE_FPM_NL_WEDGIE_TIME second timeout when it runs out of
+        * buffer space, and turns it off on a pass that does not.
+        */
+       struct fpm_nl_ctx *fnc = EVENT_ARG(t); /* context given when the timer was added */
+
+       zlog_warn("%s: Connection unable to write to peer for over %u seconds, resetting",
+                 __func__, DPLANE_FPM_NL_WEDGIE_TIME);
+
+       /* Recorded in the same counters struct the rest of the context uses. */
+       atomic_fetch_add_explicit(&fnc->counters.connection_errors, 1,
+                                 memory_order_relaxed);
+       FPM_RECONNECT(fnc); /* macro defined outside this hunk; presumably resets the connection - confirm */
+}
+
 static void fpm_process_queue(struct event *t)
 {
        struct fpm_nl_ctx *fnc = EVENT_ARG(t);
@@ -1411,9 +1430,13 @@ static void fpm_process_queue(struct event *t)
                                  processed_contexts, memory_order_relaxed);
 
        /* Re-schedule if we ran out of buffer space */
-       if (no_bufs)
+       if (no_bufs) {
                event_add_timer(fnc->fthread->master, fpm_process_queue, fnc, 0,
                                &fnc->t_dequeue);
+               event_add_timer(fnc->fthread->master, fpm_process_wedged, fnc,
+                               DPLANE_FPM_NL_WEDGIE_TIME, &fnc->t_wedged);
+       } else
+               EVENT_OFF(fnc->t_wedged);
 
        /*
         * Let the dataplane thread know if there are items in the