From cb1faec92248549307ffcfd93b24713d339cb8b0 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 19 May 2015 17:40:37 -0700 Subject: [PATCH] bgpd: bgpd-mrai.patch BGP: Event-driven route announcement taking into account min route advertisement interval ISSUE BGP starts the routeadv timer (peer->t_routeadv) to expire in 1 sec when a peer is established. From then on, the timer expires periodically based on the configured MRAI value (default: 30sec for EBGP, 5sec for IBGP). At the expiry, the write thread is triggered that takes the routes from peer's sync FIFO (adj-rib-out) and sends UPDATEs. This has a few drawbacks: (1) Delay in new route announcement: Even when the last UPDATE message was sent a while back, the next route change will necessarily have to wait for routeadv expiry (2) CPU usage: The timer is always armed. If the operator chooses to configure a lower value of MRAI (zero second is a preferred choice in many deployments) for better convergence, it leads to high CPU usage for BGP process, even at the times of no network churn. PATCH Make the route advertisement event-driven - When routes are added to peer's sync FIFO, check if the routeadv timer needs to be adjusted (or started). Conversely, do not arm the routeadv timer unconditionally. The patch also addresses route announcements during read-only mode (update-delay). During read-only mode operation, the routeadv timer is not started. When BGP comes out of read-only mode and all the routes are processed, the timer is started for all peers with zero expiry, so that the UPDATEs can be sent all at once. This leads to (near-)optimal UPDATE packing. Finally, the patch makes the "max # packets to write to peer socket at a time" configurable. Currently it is hard-coded to 10. The command is at the top router-bgp mode and is called "write-quanta ". It is a useful convergence parameter to tweak. Signed-off-by: Pradosh Mohapatra Reviewed-by: Daniel Walton --- bgpd/bgp_advertise.c | 3 + bgpd/bgp_fsm.c | 142 +++++++++++++++++++++++++++++++++++++++---- bgpd/bgp_fsm.h | 18 ++++++ bgpd/bgp_packet.c | 13 +++- bgpd/bgp_route.c | 64 ++++++++++++++++--- bgpd/bgp_route.h | 6 ++ bgpd/bgp_vty.c | 64 ++++++++++++++++++- bgpd/bgp_vty.h | 1 + bgpd/bgpd.c | 5 ++ bgpd/bgpd.h | 3 + 10 files changed, 293 insertions(+), 26 deletions(-) diff --git a/bgpd/bgp_advertise.c b/bgpd/bgp_advertise.c index adf40a0bb9..9043c83b24 100644 --- a/bgpd/bgp_advertise.c +++ b/bgpd/bgp_advertise.c @@ -267,6 +267,9 @@ bgp_adj_out_set (struct bgp_node *rn, struct peer *peer, struct prefix *p, /* Add new advertisement to advertisement attribute list. */ bgp_advertise_add (adv->baa, adv); + if (FIFO_EMPTY(&peer->sync[afi][safi]->update)) + bgp_adjust_routeadv(peer); + BGP_ADV_FIFO_ADD (&peer->sync[afi][safi]->update, &adv->fifo); } diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c index 653dd7b1be..9dbe904e88 100644 --- a/bgpd/bgp_fsm.c +++ b/bgpd/bgp_fsm.c @@ -294,6 +294,22 @@ bgp_keepalive_timer (struct thread *thread) return 0; } +static int +bgp_routeq_empty (struct peer *peer) +{ + afi_t afi; + safi_t safi; + + for (afi = AFI_IP; afi < AFI_MAX; afi++) + for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++) + { + if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) || + !FIFO_EMPTY(&peer->sync[afi][safi]->update)) + return 0; + } + return 1; +} + static int bgp_routeadv_timer (struct thread *thread) { @@ -311,8 +327,14 @@ bgp_routeadv_timer (struct thread *thread) BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd); - BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, - peer->v_routeadv); + /* + * If there is no UPDATE to send, don't start the timer. We will start + * it when the queues go non-empty. + */ + if (bgp_routeq_empty(peer)) + return 0; + + BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, peer->v_routeadv); return 0; } @@ -448,6 +470,13 @@ bgp_update_delay_end (struct bgp *bgp) quagga_timestamp(3, bgp->update_delay_end_time, sizeof(bgp->update_delay_end_time)); + /* + * Add an end-of-initial-update marker to the main process queues so that + * the route advertisement timer for the peers can be started. + */ + bgp_add_eoiu_mark(bgp, BGP_TABLE_MAIN); + bgp_add_eoiu_mark(bgp, BGP_TABLE_RSCLIENT); + /* Route announcements were postponed for all the peers during read-only mode, send those now. */ for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer)) @@ -459,6 +488,92 @@ bgp_update_delay_end (struct bgp *bgp) work_queue_unplug(bm->process_rsclient_queue); } +/** + * see bgp_fsm.h + */ +void +bgp_start_routeadv (struct bgp *bgp) +{ + struct listnode *node, *nnode; + struct peer *peer; + + for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer)) + { + if (peer->status != Established) + continue; + BGP_TIMER_OFF(peer->t_routeadv); + BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0); + } +} + +/** + * see bgp_fsm.h + */ +void +bgp_adjust_routeadv (struct peer *peer) +{ + time_t nowtime = bgp_clock(); + double diff; + unsigned long remain; + + /* + * CASE I: + * If the last update was written more than MRAI back, expire the timer + * instantly so that we can send the update out sooner. + * + * <------- MRAI ---------> + * |-----------------|-----------------------| + * <------------- m ------------> + * ^ ^ ^ + * | | | + * | | current time + * | timer start + * last write + * + * m > MRAI + */ + diff = difftime(nowtime, peer->last_write); + if (diff > (double) peer->v_routeadv) + { + BGP_TIMER_OFF(peer->t_routeadv); + BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0); + if (BGP_DEBUG (update, UPDATE_OUT)) + zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire instantly\n", + peer->host); + return; + } + + /* + * CASE II: + * - Find when to expire the MRAI timer. + * If MRAI timer is not active, assume we can start it now. + * + * <------- MRAI ---------> + * |------------|-----------------------| + * <-------- m ----------><----- r -----> + * ^ ^ ^ + * | | | + * | | current time + * | timer start + * last write + * + * (MRAI - m) < r + */ + if (peer->t_routeadv) + remain = thread_timer_remain_second(peer->t_routeadv); + else + remain = peer->v_routeadv; + diff = peer->v_routeadv - diff; + if (diff <= (double) remain) + { + BGP_TIMER_OFF(peer->t_routeadv); + BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, diff); + if (BGP_DEBUG (update, UPDATE_OUT)) + zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire in %f secs\n", + peer->host, diff); + } +} + /* The update delay timer expiry callback. */ static int bgp_update_delay_timer (struct thread *thread) @@ -920,16 +1035,12 @@ bgp_fsm_open (struct peer *peer) static int bgp_fsm_keepalive_expire (struct peer *peer) { - afi_t afi; - safi_t safi; - - for (afi = AFI_IP; afi < AFI_MAX; afi++) - for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++) - { - if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) || - !FIFO_EMPTY(&peer->sync[afi][safi]->update)) - return 0; - } + /* + * If there are UPDATE messages to send, no need to send keepalive. The + * peer will note our progress through the UPDATEs. + */ + if (!bgp_routeq_empty(peer)) + return 0; bgp_keepalive_send (peer); return 0; @@ -1062,7 +1173,12 @@ bgp_establish (struct peer *peer) bgp_announce_route_all (peer); - BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 1); + /* Start the route advertisement timer to send updates to the peer - if BGP + * is not in read-only mode. If it is, the timer will be started at the end + * of read-only mode. + */ + if (!bgp_update_delay_active(peer->bgp)) + BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 0); return 0; } diff --git a/bgpd/bgp_fsm.h b/bgpd/bgp_fsm.h index bef5e0a596..eecd131d66 100644 --- a/bgpd/bgp_fsm.h +++ b/bgpd/bgp_fsm.h @@ -71,6 +71,8 @@ Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA thread_cancel_event (master, (P)); \ } while (0) +#define BGP_MSEC_JITTER 10 + /* Prototypes. */ extern int bgp_event (struct thread *); extern int bgp_stop (struct peer *peer); @@ -79,4 +81,20 @@ extern void bgp_fsm_change_status (struct peer *peer, int status); extern const char *peer_down_str[]; extern void bgp_update_delay_end (struct bgp *); +/** + * Start the route advertisement timer (that honors MRAI) for all the + * peers. Typically called at the end of initial convergence, coming + * out of read-only mode. + */ +extern void bgp_start_routeadv (struct bgp *); + +/** + * See if the route advertisement timer needs to be adjusted for a + * peer. For example, if the last update was written to the peer a + * long while back, we don't need to wait for the periodic advertisement + * timer to expire to send the new set of prefixes. It should fire + * instantly and updates should go out sooner. + */ +extern void bgp_adjust_routeadv (struct peer *); + #endif /* _QUAGGA_BGP_FSM_H */ diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c index 674892515b..0c12df891c 100644 --- a/bgpd/bgp_packet.c +++ b/bgpd/bgp_packet.c @@ -616,7 +616,7 @@ bgp_write_packet (struct peer *peer) adv = FIFO_HEAD (&peer->sync[afi][safi]->update); if (adv) { - if (adv->binfo && adv->binfo->uptime < peer->synctime) + if (adv->binfo && adv->binfo->uptime <= peer->synctime) { if (CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_RCV) && CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_ADV) @@ -689,6 +689,7 @@ bgp_write (struct thread *thread) struct stream *s; int num; unsigned int count = 0; + int oc = 0; /* Yes first of all get peer pointer. */ peer = THREAD_ARG (thread); @@ -707,6 +708,8 @@ bgp_write (struct thread *thread) sockopt_cork (peer->fd, 1); + oc = peer->update_out; + /* Nonblocking write until TCP output buffer is full. */ do { @@ -774,13 +777,17 @@ bgp_write (struct thread *thread) /* OK we send packet so delete it. */ bgp_packet_delete (peer); } - while (++count < BGP_WRITE_PACKET_MAX && + while (++count < peer->bgp->wpkt_quanta && (s = bgp_write_packet (peer)) != NULL); - + if (bgp_write_proceed (peer)) BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd); done: + /* Update the last write if some updates were written. */ + if (peer->update_out > oc) + peer->last_write = bgp_clock (); + sockopt_cork (peer->fd, 0); return 0; } diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index 134c7c07dc..b2ec4811af 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -1521,8 +1521,17 @@ bgp_process_rsclient (struct work_queue *wq, void *data) struct bgp_info *old_select; struct bgp_info_pair old_and_new; struct listnode *node, *nnode; - struct peer *rsclient = bgp_node_table (rn)->owner; - + struct peer *rsclient; + + /* Is it end of initial update? (after startup) */ + if (!rn) + { + bgp_start_routeadv(bgp); + return WQ_SUCCESS; + } + + rsclient = bgp_node_table (rn)->owner; + /* Best path selection. */ bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new); new_select = old_and_new.new; @@ -1585,7 +1594,14 @@ bgp_process_main (struct work_queue *wq, void *data) struct bgp_info_pair old_and_new; struct listnode *node, *nnode; struct peer *peer; - + + /* Is it end of initial update? (after startup) */ + if (!rn) + { + bgp_start_routeadv(bgp); + return WQ_SUCCESS; + } + /* Best path selection. */ bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new); old_select = old_and_new.old; @@ -1652,11 +1668,15 @@ static void bgp_processq_del (struct work_queue *wq, void *data) { struct bgp_process_queue *pq = data; - struct bgp_table *table = bgp_node_table (pq->rn); - + struct bgp_table *table; + bgp_unlock (pq->bgp); - bgp_unlock_node (pq->rn); - bgp_table_unlock (table); + if (pq->rn) + { + table = bgp_node_table (pq->rn); + bgp_unlock_node (pq->rn); + bgp_table_unlock (table); + } XFREE (MTYPE_BGP_PROCESS_QUEUE, pq); } @@ -1724,6 +1744,36 @@ bgp_process (struct bgp *bgp, struct bgp_node *rn, afi_t afi, safi_t safi) return; } +void +bgp_add_eoiu_mark (struct bgp *bgp, bgp_table_t type) +{ + struct bgp_process_queue *pqnode; + + if ( (bm->process_main_queue == NULL) || + (bm->process_rsclient_queue == NULL) ) + bgp_process_queue_init (); + + pqnode = XCALLOC (MTYPE_BGP_PROCESS_QUEUE, + sizeof (struct bgp_process_queue)); + if (!pqnode) + return; + + pqnode->rn = NULL; + pqnode->bgp = bgp; + bgp_lock (bgp); + switch (type) + { + case BGP_TABLE_MAIN: + work_queue_add (bm->process_main_queue, pqnode); + break; + case BGP_TABLE_RSCLIENT: + work_queue_add (bm->process_rsclient_queue, pqnode); + break; + } + + return; +} + static int bgp_maximum_prefix_restart_timer (struct thread *thread) { diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h index 7f31a38204..b8d2ed7c73 100644 --- a/bgpd/bgp_route.h +++ b/bgpd/bgp_route.h @@ -237,6 +237,12 @@ extern int bgp_withdraw (struct peer *, struct prefix *, struct attr *, /* for bgp_nexthop and bgp_damp */ extern void bgp_process (struct bgp *, struct bgp_node *, afi_t, safi_t); + +/* + * Add an end-of-initial-update marker to the process queue. This is just a + * queue element with NULL bgp node. + */ +extern void bgp_add_eoiu_mark (struct bgp *, bgp_table_t); extern int bgp_config_write_table_map (struct vty *, struct bgp *, afi_t, safi_t, int *); extern int bgp_config_write_network (struct vty *, struct bgp *, afi_t, safi_t, int *); diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c index 12e6b8b845..db09d583b7 100644 --- a/bgpd/bgp_vty.c +++ b/bgpd/bgp_vty.c @@ -49,6 +49,7 @@ Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "bgpd/bgp_table.h" #include "bgpd/bgp_vty.h" #include "bgpd/bgp_mpath.h" +#include "bgpd/bgp_packet.h" extern struct in_addr router_id_zebra; @@ -796,6 +797,54 @@ ALIAS (no_bgp_update_delay, "Wait for peers to be established\n" "Seconds\n") +int +bgp_wpkt_quanta_config_vty (struct vty *vty, char *num, char set) +{ + struct bgp *bgp; + u_int16_t update_delay; + + bgp = vty->index; + + if (set) + VTY_GET_INTEGER_RANGE ("write-quanta", bgp->wpkt_quanta, num, + 1, 4294967295); + else + bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX; + + return CMD_SUCCESS; +} + +int +bgp_config_write_wpkt_quanta (struct vty *vty, struct bgp *bgp) +{ + if (bgp->wpkt_quanta != BGP_WRITE_PACKET_MAX) + vty_out (vty, " write-quanta %d%s", + bgp->wpkt_quanta, VTY_NEWLINE); + + return 0; +} + + +/* Update-delay configuration */ +DEFUN (bgp_wpkt_quanta, + bgp_wpkt_quanta_cmd, + "write-quanta <1-4294967295>", + "How many packets to write to peer socket per run\n" + "Number of packets\n") +{ + return bgp_wpkt_quanta_config_vty(vty, argv[0], 1); +} + +/* Update-delay deconfiguration */ +DEFUN (no_bgp_wpkt_quanta, + no_bgp_wpkt_quanta_cmd, + "no write-quanta <1-4294967295>", + "How many packets to write to peer socket per run\n" + "Number of packets\n") +{ + return bgp_wpkt_quanta_config_vty(vty, argv[0], 0); +} + /* Maximum-paths configuration */ DEFUN (bgp_maxpaths, bgp_maxpaths_cmd, @@ -7835,9 +7884,11 @@ bgp_show_peer (struct vty *vty, struct peer *p) /* read timer */ vty_out (vty, " Last read %s", peer_uptime (p->readtime, timebuf, BGP_UPTIME_LEN)); + vty_out (vty, ", Last write %s%s", + peer_uptime (p->last_write, timebuf, BGP_UPTIME_LEN), VTY_NEWLINE); /* Configured timer values. */ - vty_out (vty, ", hold time is %d, keepalive interval is %d seconds%s", + vty_out (vty, " Hold time is %d, keepalive interval is %d seconds%s", p->v_holdtime, p->v_keepalive, VTY_NEWLINE); if (CHECK_FLAG (p->config, PEER_CONFIG_TIMER)) { @@ -8133,8 +8184,12 @@ bgp_show_peer (struct vty *vty, struct peer *p) if (p->t_connect) vty_out (vty, "Next connect timer due in %ld seconds%s", thread_timer_remain_second (p->t_connect), VTY_NEWLINE); - - vty_out (vty, "Read thread: %s Write thread: %s%s", + if (p->t_routeadv) + vty_out (vty, "MRAI (interval %ld) timer expires in %ld seconds%s", + p->v_routeadv, thread_timer_remain_second (p->t_routeadv), + VTY_NEWLINE); + + vty_out (vty, "Read thread: %s Write thread: %s%s", p->t_read ? "on" : "off", p->t_write ? "on" : "off", VTY_NEWLINE); @@ -9383,6 +9438,9 @@ bgp_vty_init (void) install_element (BGP_NODE, &bgp_update_delay_establish_wait_cmd); install_element (BGP_NODE, &no_bgp_update_delay_establish_wait_cmd); + install_element (BGP_NODE, &bgp_wpkt_quanta_cmd); + install_element (BGP_NODE, &no_bgp_wpkt_quanta_cmd); + /* "maximum-paths" commands. */ install_element (BGP_NODE, &bgp_maxpaths_cmd); install_element (BGP_NODE, &no_bgp_maxpaths_cmd); diff --git a/bgpd/bgp_vty.h b/bgpd/bgp_vty.h index e9dc09a061..9caf0baace 100644 --- a/bgpd/bgp_vty.h +++ b/bgpd/bgp_vty.h @@ -26,5 +26,6 @@ Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA extern void bgp_vty_init (void); extern const char *afi_safi_print (afi_t, safi_t); extern int bgp_config_write_update_delay (struct vty *, struct bgp *); +extern int bgp_config_write_wpkt_quanta(struct vty *vty, struct bgp *bgp); #endif /* _QUAGGA_BGP_VTY_H */ diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index acca578c80..f1903630f0 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -1997,6 +1997,8 @@ bgp_create (as_t *as, const char *name) if (name) bgp->name = strdup (name); + bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX; + THREAD_TIMER_ON (master, bgp->t_startup, bgp_startup_timer_expire, bgp, bgp->restart_time); @@ -5388,6 +5390,9 @@ bgp_config_write (struct vty *vty) /* BGP update-delay. */ bgp_config_write_update_delay (vty, bgp); + /* write quanta */ + bgp_config_write_wpkt_quanta (vty, bgp); + /* BGP graceful-restart. */ if (bgp->stalepath_time != BGP_DEFAULT_STALEPATH_TIME) vty_out (vty, " bgp graceful-restart stalepath-time %d%s", diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index b2d421ed1f..d32bdcf043 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -196,6 +196,8 @@ struct bgp u_int16_t ibgp_flags; #define BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN (1 << 0) } maxpaths[AFI_MAX][SAFI_MAX]; + + u_int32_t wpkt_quanta; /* per peer packet quanta to write */ }; /* BGP peer-group support. */ @@ -535,6 +537,7 @@ struct peer /* Syncronization list and time. */ struct bgp_synchronize *sync[AFI_MAX][SAFI_MAX]; time_t synctime; + time_t last_write; /* timestamp when the last UPDATE msg was written */ /* Send prefix count. */ unsigned long scount[AFI_MAX][SAFI_MAX]; -- 2.39.5