]> git.puffer.fish Git - mirror/frr.git/commitdiff
bgpd: put BGP keepalives in a pthread
authorQuentin Young <qlyoung@cumulusnetworks.com>
Thu, 5 Jan 2017 23:13:16 +0000 (23:13 +0000)
committerQuentin Young <qlyoung@cumulusnetworks.com>
Thu, 30 Nov 2017 21:17:57 +0000 (16:17 -0500)
This patch, in tandem with moving packet writes into a dedicated kernel
thread, fixes session flaps caused by long-running internal operations
starving the (old) userspace write thread.

BGP keepalives are now produced by a kernel thread and placed onto the
peer's output queue. These are then consumed by the write thread. Both
of these tasks are concurrent with the rest of bgpd, obviating the
session flaps described above.

Signed-off-by: Quentin Young <qlyoung@cumulusnetworks.com>
bgpd/Makefile.am
bgpd/bgp_fsm.c
bgpd/bgp_keepalives.c [new file with mode: 0644]
bgpd/bgp_keepalives.h [new file with mode: 0644]
bgpd/bgp_main.c

index fa1dcbb7626ca7d608c9269c7e4606d2252c9998..c8791687ea66948a60d52f17b38664a1b0ba803d 100644 (file)
@@ -85,7 +85,8 @@ libbgp_a_SOURCES = \
        bgp_damp.c bgp_table.c bgp_advertise.c bgp_vty.c bgp_mpath.c \
        bgp_nht.c bgp_updgrp.c bgp_updgrp_packet.c bgp_updgrp_adv.c bgp_bfd.c \
        bgp_encap_tlv.c $(BGP_VNC_RFAPI_SRC) bgp_attr_evpn.c \
-       bgp_evpn.c bgp_evpn_vty.c bgp_vpn.c bgp_label.c bgp_rd.c
+       bgp_evpn.c bgp_evpn_vty.c bgp_vpn.c bgp_label.c bgp_rd.c \
+       bgp_keepalives.c
 
 noinst_HEADERS = \
        bgp_memory.h \
@@ -97,7 +98,7 @@ noinst_HEADERS = \
        bgp_advertise.h bgp_vty.h bgp_mpath.h bgp_nht.h \
        bgp_updgrp.h bgp_bfd.h bgp_encap_tlv.h bgp_encap_types.h \
        $(BGP_VNC_RFAPI_HD) bgp_attr_evpn.h bgp_evpn.h bgp_evpn_vty.h \
-        bgp_vpn.h bgp_label.h bgp_rd.h bgp_evpn_private.h
+        bgp_vpn.h bgp_label.h bgp_rd.h bgp_evpn_private.h bgp_keepalives.h \
 
 bgpd_SOURCES = bgp_main.c
 bgpd_LDADD = libbgp.a  $(BGP_VNC_RFP_LIB) ../lib/libfrr.la @LIBCAP@ @LIBM@
index 176089ebd0f2a45975d623909c8c1b739da56d71..aa883f73645d69ccd6123c4085164404de6a5cf1 100644 (file)
@@ -49,6 +49,7 @@
 #include "bgpd/bgp_nht.h"
 #include "bgpd/bgp_bfd.h"
 #include "bgpd/bgp_memory.h"
+#include "bgpd/bgp_keepalives.h"
 
 DEFINE_HOOK(peer_backward_transition, (struct peer * peer), (peer))
 DEFINE_HOOK(peer_established, (struct peer * peer), (peer))
@@ -86,7 +87,6 @@ int bgp_event(struct thread *);
 static int bgp_start_timer(struct thread *);
 static int bgp_connect_timer(struct thread *);
 static int bgp_holdtime_timer(struct thread *);
-static int bgp_keepalive_timer(struct thread *);
 
 /* BGP FSM functions. */
 static int bgp_start(struct peer *);
@@ -253,7 +253,7 @@ void bgp_timer_set(struct peer *peer)
                }
                BGP_TIMER_OFF(peer->t_connect);
                BGP_TIMER_OFF(peer->t_holdtime);
-               BGP_TIMER_OFF(peer->t_keepalive);
+               peer_keepalives_off(peer);
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
 
@@ -265,7 +265,7 @@ void bgp_timer_set(struct peer *peer)
                BGP_TIMER_ON(peer->t_connect, bgp_connect_timer,
                             peer->v_connect);
                BGP_TIMER_OFF(peer->t_holdtime);
-               BGP_TIMER_OFF(peer->t_keepalive);
+               peer_keepalives_off(peer);
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
 
@@ -282,7 +282,7 @@ void bgp_timer_set(struct peer *peer)
                                     peer->v_connect);
                }
                BGP_TIMER_OFF(peer->t_holdtime);
-               BGP_TIMER_OFF(peer->t_keepalive);
+               peer_keepalives_off(peer);
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
 
@@ -296,7 +296,7 @@ void bgp_timer_set(struct peer *peer)
                } else {
                        BGP_TIMER_OFF(peer->t_holdtime);
                }
-               BGP_TIMER_OFF(peer->t_keepalive);
+               peer_keepalives_off(peer);
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
 
@@ -309,12 +309,11 @@ void bgp_timer_set(struct peer *peer)
                   timer and KeepAlive timers are not started. */
                if (peer->v_holdtime == 0) {
                        BGP_TIMER_OFF(peer->t_holdtime);
-                       BGP_TIMER_OFF(peer->t_keepalive);
+                       peer_keepalives_off(peer);
                } else {
                        BGP_TIMER_ON(peer->t_holdtime, bgp_holdtime_timer,
                                     peer->v_holdtime);
-                       BGP_TIMER_ON(peer->t_keepalive, bgp_keepalive_timer,
-                                    peer->v_keepalive);
+                       peer_keepalives_on(peer);
                }
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
@@ -329,12 +328,11 @@ void bgp_timer_set(struct peer *peer)
                   and keepalive must be turned off. */
                if (peer->v_holdtime == 0) {
                        BGP_TIMER_OFF(peer->t_holdtime);
-                       BGP_TIMER_OFF(peer->t_keepalive);
+                       peer_keepalives_off(peer);
                } else {
                        BGP_TIMER_ON(peer->t_holdtime, bgp_holdtime_timer,
                                     peer->v_holdtime);
-                       BGP_TIMER_ON(peer->t_keepalive, bgp_keepalive_timer,
-                                    peer->v_keepalive);
+                       peer_keepalives_on(peer);
                }
                break;
        case Deleted:
@@ -346,7 +344,7 @@ void bgp_timer_set(struct peer *peer)
                BGP_TIMER_OFF(peer->t_start);
                BGP_TIMER_OFF(peer->t_connect);
                BGP_TIMER_OFF(peer->t_holdtime);
-               BGP_TIMER_OFF(peer->t_keepalive);
+               peer_keepalives_off(peer);
                BGP_TIMER_OFF(peer->t_routeadv);
                break;
        }
@@ -412,24 +410,6 @@ static int bgp_holdtime_timer(struct thread *thread)
        return 0;
 }
 
-/* BGP keepalive fire ! */
-static int bgp_keepalive_timer(struct thread *thread)
-{
-       struct peer *peer;
-
-       peer = THREAD_ARG(thread);
-       peer->t_keepalive = NULL;
-
-       if (bgp_debug_neighbor_events(peer))
-               zlog_debug("%s [FSM] Timer (keepalive timer expire)",
-                          peer->host);
-
-       THREAD_VAL(thread) = KeepAlive_timer_expired;
-       bgp_event(thread); /* bgp_event unlocks peer */
-
-       return 0;
-}
-
 int bgp_routeadv_timer(struct thread *thread)
 {
        struct peer *peer;
@@ -963,9 +943,6 @@ int bgp_stop(struct peer *peer)
        char orf_name[BUFSIZ];
        int ret = 0;
 
-       // immediately remove from threads
-       peer_writes_off(peer);
-
        if (peer_dynamic_neighbor(peer)
            && !(CHECK_FLAG(peer->flags, PEER_FLAG_DELETE))) {
                if (bgp_debug_neighbor_events(peer))
@@ -1053,7 +1030,7 @@ int bgp_stop(struct peer *peer)
        BGP_TIMER_OFF(peer->t_start);
        BGP_TIMER_OFF(peer->t_connect);
        BGP_TIMER_OFF(peer->t_holdtime);
-       BGP_TIMER_OFF(peer->t_keepalive);
+       peer_keepalives_off(peer);
        BGP_TIMER_OFF(peer->t_routeadv);
 
        /* Stream reset. */
@@ -1401,13 +1378,6 @@ static int bgp_fsm_open(struct peer *peer)
        return 0;
 }
 
-/* Keepalive send to peer. */
-static int bgp_fsm_keepalive_expire(struct peer *peer)
-{
-       bgp_keepalive_send(peer);
-       return 0;
-}
-
 /* FSM error, unexpected event.  This is error of BGP connection. So cut the
    peer and change to Idle status. */
 static int bgp_fsm_event_error(struct peer *peer)
@@ -1761,9 +1731,8 @@ static const struct {
                {bgp_stop, Clearing},      /* TCP_fatal_error              */
                {bgp_stop, Clearing},      /* ConnectRetry_timer_expired   */
                {bgp_fsm_holdtime_expire, Clearing}, /* Hold_Timer_expired */
-               {bgp_fsm_keepalive_expire,
-                Established},  /* KeepAlive_timer_expired      */
-               {bgp_stop, Clearing}, /* Receive_OPEN_message         */
+               {bgp_ignore, Established}, /* KeepAlive_timer_expired      */
+               {bgp_stop, Clearing},      /* Receive_OPEN_message         */
                {bgp_fsm_keepalive,
                 Established}, /* Receive_KEEPALIVE_message    */
                {bgp_fsm_update, Established}, /* Receive_UPDATE_message */
diff --git a/bgpd/bgp_keepalives.c b/bgpd/bgp_keepalives.c
new file mode 100644 (file)
index 0000000..d142e31
--- /dev/null
@@ -0,0 +1,247 @@
+/* BGP Keepalives.
+ *
+ * Implemented server-style in a pthread.
+ * --------------------------------------
+ * Copyright (C) 2017 Cumulus Networks, Inc.
+ *
+ * This file is part of Free Range Routing.
+ *
+ * Free Range Routing is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any later
+ * version.
+ *
+ * Free Range Routing is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GN5U General Public License along
+ * with Free Range Routing; see the file COPYING.  If not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+#include <zebra.h>
+#include <signal.h>
+#include <sys/time.h>
+
+#include "thread.h"
+#include "log.h"
+#include "vty.h"
+#include "monotime.h"
+
+#include "bgpd/bgpd.h"
+#include "bgpd/bgp_keepalives.h"
+#include "bgpd/bgp_debug.h"
+#include "bgpd/bgp_attr.h"
+#include "bgpd/bgp_packet.h"
+
+/**
+ * Peer KeepAlive Timer.
+ * Associates a peer with the time of its last keepalive.
+ */
+struct pkat {
+       // the peer to send keepalives to
+       struct peer *peer;
+       // absolute time of last keepalive sent
+       struct timeval last;
+};
+
+/* List of peers we are sending keepalives for, and associated mutex. */
+static pthread_mutex_t peerlist_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t peerlist_cond;
+static struct list *peerlist;
+
+/* Thread control flag. */
+bool bgp_keepalives_thread_run;
+
+static struct pkat *pkat_new(struct peer *peer)
+{
+       struct pkat *pkat = XMALLOC(MTYPE_TMP, sizeof(struct pkat));
+       pkat->peer = peer;
+       monotime(&pkat->last);
+       return pkat;
+}
+
+static void pkat_del(void *pkat)
+{
+       XFREE(MTYPE_TMP, pkat);
+}
+/**
+ * Cleanup thread resources at termination.
+ *
+ * @param arg not used
+ */
+static void cleanup_handler(void *arg)
+{
+       if (peerlist)
+               list_delete(peerlist);
+
+       peerlist = NULL;
+
+       pthread_mutex_unlock(&peerlist_mtx);
+       pthread_cond_destroy(&peerlist_cond);
+       memset(&peerlist_cond, 0, sizeof(peerlist_cond));
+}
+
+/*
+ * Walks the list of peers, sending keepalives to those that are due for them.
+ *
+ * For any given peer, if the elapsed time since its last keepalive exceeds its
+ * configured keepalive timer, a keepalive is sent to the peer and its
+ * last-sent time is reset. Additionally, If the elapsed time does not exceed
+ * the configured keepalive timer, but the time until the next keepalive is due
+ * is within a hardcoded tolerance, a keepalive is sent as if the configured
+ * timer was exceeded. Doing this helps alleviate nanosecond sleeps between
+ * ticks by grouping together peers who are due for keepalives at roughly the
+ * same time. This tolerance value is arbitrarily chosen to be 100ms.
+ *
+ * In addition, this function calculates the maximum amount of time that the
+ * keepalive thread can sleep before another tick needs to take place. This is
+ * equivalent to shortest time until a keepalive is due for any one peer.
+ *
+ * @return maximum time to wait until next update (0 if infinity)
+ */
+static struct timeval update()
+{
+       struct listnode *ln;
+       struct pkat *pkat;
+
+       int update_set = 0;             // whether next_update has been set
+       struct timeval next_update;     // max sleep until next tick
+       static struct timeval elapsed;  // elapsed time since keepalive
+       static struct timeval ka = {0}; // peer->v_keepalive as a timeval
+       static struct timeval diff;     // ka - elapsed
+
+       // see function comment
+       static struct timeval tolerance = {0, 100000};
+
+       for (ALL_LIST_ELEMENTS_RO(peerlist, ln, pkat)) {
+               // calculate elapsed time since last keepalive
+               monotime_since(&pkat->last, &elapsed);
+
+               // calculate difference between elapsed time and configured time
+               ka.tv_sec = pkat->peer->v_keepalive;
+               timersub(&ka, &elapsed, &diff);
+
+               int send_keepalive = elapsed.tv_sec >= ka.tv_sec
+                                    || timercmp(&diff, &tolerance, <);
+
+               if (send_keepalive) {
+                       if (bgp_debug_neighbor_events(pkat->peer))
+                               zlog_debug(
+                                       "%s [FSM] Timer (keepalive timer expire)",
+                                       pkat->peer->host);
+
+                       bgp_keepalive_send(pkat->peer);
+                       monotime(&pkat->last);
+                       memset(&elapsed, 0x00, sizeof(struct timeval));
+                       diff = ka; // time until next keepalive == peer
+                                  // keepalive time
+               }
+
+               // if calculated next update for this peer < current delay, use
+               // it
+               if (!update_set || timercmp(&diff, &next_update, <)) {
+                       next_update = diff;
+                       update_set = 1;
+               }
+       }
+
+       return next_update;
+}
+
+void *peer_keepalives_start(void *arg)
+{
+       struct timeval currtime = {0, 0};
+       struct timeval next_update = {0, 0};
+       struct timespec next_update_ts = {0, 0};
+
+       // initialize synchronization primitives
+       pthread_mutex_lock(&peerlist_mtx);
+
+       // use monotonic clock with condition variable
+       pthread_condattr_t attrs;
+       pthread_condattr_init(&attrs);
+       pthread_condattr_setclock(&attrs, CLOCK_MONOTONIC);
+       pthread_cond_init(&peerlist_cond, &attrs);
+
+       // initialize peerlist
+       peerlist = list_new();
+       peerlist->del = pkat_del;
+
+       // register cleanup handlers
+       pthread_cleanup_push(&cleanup_handler, NULL);
+
+       bgp_keepalives_thread_run = true;
+
+       while (bgp_keepalives_thread_run) {
+               if (peerlist->count > 0)
+                       pthread_cond_timedwait(&peerlist_cond, &peerlist_mtx,
+                                              &next_update_ts);
+               else
+                       while (peerlist->count == 0
+                              && bgp_keepalives_thread_run)
+                               pthread_cond_wait(&peerlist_cond,
+                                                 &peerlist_mtx);
+
+               monotime(&currtime);
+               next_update = update();
+               timeradd(&currtime, &next_update, &next_update);
+               TIMEVAL_TO_TIMESPEC(&next_update, &next_update_ts);
+       }
+
+       // clean up
+       pthread_cleanup_pop(1);
+
+       return NULL;
+}
+
+/* --- thread external functions ------------------------------------------- */
+
+void peer_keepalives_on(struct peer *peer)
+{
+       pthread_mutex_lock(&peerlist_mtx);
+       {
+               struct listnode *ln, *nn;
+               struct pkat *pkat;
+
+               for (ALL_LIST_ELEMENTS(peerlist, ln, nn, pkat))
+                       if (pkat->peer == peer) {
+                               pthread_mutex_unlock(&peerlist_mtx);
+                               return;
+                       }
+
+               pkat = pkat_new(peer);
+               listnode_add(peerlist, pkat);
+               peer_lock(peer);
+       }
+       pthread_mutex_unlock(&peerlist_mtx);
+       peer_keepalives_wake();
+}
+
+void peer_keepalives_off(struct peer *peer)
+{
+       pthread_mutex_lock(&peerlist_mtx);
+       {
+               struct listnode *ln, *nn;
+               struct pkat *pkat;
+
+               for (ALL_LIST_ELEMENTS(peerlist, ln, nn, pkat))
+                       if (pkat->peer == peer) {
+                               XFREE(MTYPE_TMP, pkat);
+                               list_delete_node(peerlist, ln);
+                               peer_unlock(peer);
+                       }
+       }
+       pthread_mutex_unlock(&peerlist_mtx);
+}
+
+void peer_keepalives_wake()
+{
+       pthread_mutex_lock(&peerlist_mtx);
+       {
+               pthread_cond_signal(&peerlist_cond);
+       }
+       pthread_mutex_unlock(&peerlist_mtx);
+}
diff --git a/bgpd/bgp_keepalives.h b/bgpd/bgp_keepalives.h
new file mode 100644 (file)
index 0000000..281c32f
--- /dev/null
@@ -0,0 +1,83 @@
+/* BGP Keepalives.
+ *
+ * Implemented server-style in a pthread.
+ * --------------------------------------
+ * Copyright (C) 2017 Cumulus Networks, Inc.
+ *
+ * This file is part of Free Range Routing.
+ *
+ * Free Range Routing is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any later
+ * version.
+ *
+ * Free Range Routing is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GN5U General Public License along
+ * with Free Range Routing; see the file COPYING.  If not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+#ifndef _BGP_KEEPALIVES_H_
+#define _BGP_KEEPALIVES_H_
+
+#include "bgpd.h"
+
+/* Thread control flag.
+ *
+ * Setting this flag to 'false' while the thread is running will result in
+ * thread termination.
+ */
+extern bool bgp_keepalives_thread_run;
+
+/* Turns on keepalives for a peer.
+ *
+ * This function adds the peer to an internal list of peers to generate
+ * keepalives for.
+ *
+ * At set intervals, a BGP KEEPALIVE packet is generated and placed on
+ * peer->obuf. This operation is thread-safe with respect to peer->obuf.
+ *
+ * peer->v_keepalive determines the interval. Changing this value before
+ * unregistering this peer with peer_keepalives_off() results in undefined
+ * behavior.
+ *
+ * If the peer is already registered for keepalives via this function, nothing
+ * happens.
+ */
+extern void peer_keepalives_on(struct peer *);
+
+/* Turns off keepalives for a peer.
+ *
+ * Removes the peer from the internal list of peers to generate keepalives for.
+ *
+ * If the peer is already unregistered for keepalives, nothing happens.
+ */
+extern void peer_keepalives_off(struct peer *);
+
+/* Entry function for keepalives pthread.
+ *
+ * This function loops over an internal list of peers, generating keepalives at
+ * regular intervals as determined by each peer's keepalive timer.
+ *
+ * See peer_keepalives_on() for additional details.
+ *
+ * @param arg pthread arg, not used
+ */
+extern void *peer_keepalives_start(void *arg);
+
+/* Poking function for keepalives pthread.
+ *
+ * Under normal circumstances the pthread will automatically wake itself
+ * whenever it is necessary to do work. This function may be used to force the
+ * thread to wake up and see if there is any work to do, or if it is time to
+ * die.
+ *
+ * It is not necessary to call this after peer_keepalives_on().
+ */
+extern void peer_keepalives_wake(void);
+
+#endif /* _BGP_KEEPALIVES_H */
index b4ea45b3e0581d888ead9d75aa0b3180bcdaa765..d8cd7d51731b202c5d7004ba106a37ffead6cbcd 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <zebra.h>
 
+#include <pthread.h>
 #include "vector.h"
 #include "command.h"
 #include "getopt.h"
@@ -55,6 +56,7 @@
 #include "bgpd/bgp_filter.h"
 #include "bgpd/bgp_zebra.h"
 #include "bgpd/bgp_packet.h"
+#include "bgpd/bgp_keepalives.h"
 
 #ifdef ENABLE_BGP_VNC
 #include "bgpd/rfapi/rfapi_backend.h"
@@ -393,8 +395,9 @@ int main(int argc, char **argv)
        snprintf(bgpd_di.startinfo, sizeof(bgpd_di.startinfo), ", bgp@%s:%d",
                 (bm->address ? bm->address : "<all>"), bm->port);
 
-       pthread_t packet_writes;
+       pthread_t packet_writes, keepalives;
        pthread_create(&packet_writes, NULL, &peer_writes_start, NULL);
+       pthread_create(&keepalives, NULL, &keepalives_start, NULL);
 
        frr_config_fork();
        frr_run(bm->master);