]> git.puffer.fish Git - mirror/frr.git/commitdiff
bgpd: implement SendHoldTimer 11225/head
authorDavid Lamparter <equinox@opensourcerouting.org>
Thu, 22 Apr 2021 09:04:52 +0000 (11:04 +0200)
committerDavid Lamparter <equinox@opensourcerouting.org>
Thu, 19 May 2022 10:14:40 +0000 (12:14 +0200)
As described by
https://www.ietf.org/archive/id/draft-spaghetti-idr-bgp-sendholdtimer-04.html

Since this replicates the HoldTime check on the receiver that is already
part of the protocol, I do not believe it necessary to wait for IETF
progress on this draft.  It's just replicating an existing element of
the protocol at the other side of the session.

Signed-off-by: David Lamparter <equinox@opensourcerouting.org>
bgpd/bgp_errors.c
bgpd/bgp_errors.h
bgpd/bgp_io.c
bgpd/bgp_packet.c
bgpd/bgpd.h

index 193c96a1698e4dc00cbdb29cc6e05492b5b52525..9f87f8ce7a6fcf382e9621c628f421269978e250 100644 (file)
@@ -181,6 +181,12 @@ static struct log_ref ferr_bgp_warn[] = {
                .description = "BGP is in the process of building NLRI information for a peer and has discovered an inconsistent internal state",
                .suggestion = "Gather log files and open an Issue, restart FRR",
        },
+       {
+               .code = EC_BGP_SENDQ_STUCK_WARN,
+               .title = "BGP has been unable to send anything to a peer for an extended time",
+               .description = "The BGP peer does not seem to be receiving or processing any data received from us, causing updates to be delayed.",
+               .suggestion = "Check connectivity to the peer and that it is not overloaded",
+       },
        {
                .code = END_FERR,
        }
@@ -480,6 +486,12 @@ static struct log_ref ferr_bgp_err[] = {
                .description = "BGP when using a v6 peer requires a v6 LL address to be configured on the outgoing interface as per RFC 4291 section 2.1",
                .suggestion = "Add a v6 LL address to the outgoing interfaces as per RFC",
        },
+       {
+               .code = EC_BGP_SENDQ_STUCK_PROPER,
+               .title = "BGP is shutting down a peer due to being unable to send anything for an extended time",
+               .description = "No BGP updates were successfully sent to the peer for more than twice the holdtime.",
+               .suggestion = "Check connectivity to the peer and that it is not overloaded",
+       },
        {
                .code = END_FERR,
        }
index 0b71af3fc60a1586fc18b787893f360f930ad724..0c0917c49e9606e7827923cf3ff9a92fff240115 100644 (file)
@@ -102,6 +102,8 @@ enum bgp_log_refs {
        EC_BGP_INVALID_BGP_INSTANCE,
        EC_BGP_INVALID_ROUTE,
        EC_BGP_NO_LL_ADDRESS_AVAILABLE,
+       EC_BGP_SENDQ_STUCK_WARN,
+       EC_BGP_SENDQ_STUCK_PROPER,
 };
 
 extern void bgp_error_init(void);
index bd0dfb3a6dd9388078022c3a5d866233bf158db1..75d34a84e02d8bf7f2d5c263529a6f4577876479 100644 (file)
@@ -298,6 +298,7 @@ static uint16_t bgp_write(struct peer *peer)
        unsigned int iovsz;
        unsigned int strmsz;
        unsigned int total_written;
+       time_t now;
 
        wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
                                               memory_order_relaxed);
@@ -430,19 +431,22 @@ static uint16_t bgp_write(struct peer *peer)
        }
 
 done : {
+       now = bgp_clock();
        /*
         * Update last_update if UPDATEs were written.
         * Note: that these are only updated at end,
         *       not per message (i.e., per loop)
         */
        if (uo)
-               atomic_store_explicit(&peer->last_update, bgp_clock(),
+               atomic_store_explicit(&peer->last_update, now,
                                      memory_order_relaxed);
 
        /* If we TXed any flavor of packet */
-       if (update_last_write)
-               atomic_store_explicit(&peer->last_write, bgp_clock(),
+       if (update_last_write) {
+               atomic_store_explicit(&peer->last_write, now,
                                      memory_order_relaxed);
+               peer->last_sendq_ok = now;
+       }
 }
 
        return status;
index cacc4995fcead0dce63a5f474b07d371080b154a..e4697fdb2bd45cb94f49f112af4d7ca18b47310b 100644 (file)
@@ -122,8 +122,45 @@ void bgp_packet_set_size(struct stream *s)
  */
 static void bgp_packet_add(struct peer *peer, struct stream *s)
 {
+       intmax_t delta;
+       uint32_t holdtime;
+
        frr_with_mutex(&peer->io_mtx) {
+               /* if the queue is empty, reset the "last OK" timestamp to
+                * now, otherwise if we write another packet immediately
+                * after it'll get confused
+                */
+               if (!stream_fifo_count_safe(peer->obuf))
+                       peer->last_sendq_ok = bgp_clock();
+
                stream_fifo_push(peer->obuf, s);
+
+               delta = bgp_clock() - peer->last_sendq_ok;
+               holdtime = atomic_load_explicit(&peer->holdtime,
+                                               memory_order_relaxed);
+
+               /* Note that when we're here, we're adding some packet to the
+                * OutQ.  That includes keepalives when there is nothing to
+                * do, so there's a guarantee we pass by here once in a while.
+                *
+                * That implies there is no need to go set up another separate
+                * timer that ticks down SendHoldTime, as we'll be here sooner
+                * or later anyway and will see the checks below failing.
+                */
+               if (delta > 2 * (intmax_t)holdtime) {
+                       flog_err(
+                               EC_BGP_SENDQ_STUCK_PROPER,
+                               "%s has not made any SendQ progress for 2 holdtimes, terminating session",
+                               peer->host);
+                       BGP_EVENT_ADD(peer, TCP_fatal_error);
+               } else if (delta > (intmax_t)holdtime &&
+                          bgp_clock() - peer->last_sendq_warn > 5) {
+                       flog_warn(
+                               EC_BGP_SENDQ_STUCK_WARN,
+                               "%s has not made any SendQ progress for 1 holdtime, peer overloaded?",
+                               peer->host);
+                       peer->last_sendq_warn = bgp_clock();
+               }
        }
 }
 
index 6b58c101a78457235a7ab10e4ab1cd3c5db996d9..d380d9d67844a3f872998e7d39494e014ffcd08b 100644 (file)
@@ -1533,6 +1533,11 @@ struct peer {
        /* timestamp when the last msg was written */
        _Atomic time_t last_update;
 
+       /* only updated under io_mtx.
+        * last_sendq_warn is only for ratelimiting log warning messages.
+        */
+       time_t last_sendq_ok, last_sendq_warn;
+
        /* Notify data. */
        struct bgp_notify notify;