BGP: Event-driven route announcement taking into account min route advertisement interval
ISSUE
BGP starts the routeadv timer (peer->t_routeadv) to expire in 1 sec
when a peer is established. From then on, the timer expires
periodically based on the configured MRAI value (default: 30sec for
EBGP, 5sec for IBGP). At the expiry, the write thread is triggered
that takes the routes from peer's sync FIFO (adj-rib-out) and sends
UPDATEs. This has a few drawbacks:
(1) Delay in new route announcement: Even when the last UPDATE message
was sent a while back, the next route change will necessarily have
to wait for routeadv expiry
(2) CPU usage: The timer is always armed. If the operator chooses to
configure a lower value of MRAI (zero second is a preferred choice
in many deployments) for better convergence, it leads to high CPU
usage for BGP process, even at the times of no network churn.
PATCH
Make the route advertisement event-driven - When routes are added to
peer's sync FIFO, check if the routeadv timer needs to be adjusted (or
started). Conversely, do not arm the routeadv timer unconditionally.
The patch also addresses route announcements during read-only mode
(update-delay). During read-only mode operation, the routeadv timer
is not started. When BGP comes out of read-only mode and all the
routes are processed, the timer is started for all peers with zero
expiry, so that the UPDATEs can be sent all at once. This leads to
(near-)optimal UPDATE packing.
Finally, the patch makes the "max # packets to write to peer socket at
a time" configurable. Currently it is hard-coded to 10. The command is
at the top router-bgp mode and is called "write-quanta <number>". It
is a useful convergence parameter to tweak.
Signed-off-by: Pradosh Mohapatra <pmohapat@cumulusnetworks.com>
Reviewed-by: Daniel Walton <dwalton@cumulusnetworks.com>
/* Add new advertisement to advertisement attribute list. */
bgp_advertise_add (adv->baa, adv);
+ if (FIFO_EMPTY(&peer->sync[afi][safi]->update))
+ bgp_adjust_routeadv(peer);
+
BGP_ADV_FIFO_ADD (&peer->sync[afi][safi]->update, &adv->fifo);
}
return 0;
}
+static int
+bgp_routeq_empty (struct peer *peer)
+{
+ afi_t afi;
+ safi_t safi;
+
+ for (afi = AFI_IP; afi < AFI_MAX; afi++)
+ for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
+ {
+ if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) ||
+ !FIFO_EMPTY(&peer->sync[afi][safi]->update))
+ return 0;
+ }
+ return 1;
+}
+
static int
bgp_routeadv_timer (struct thread *thread)
{
BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd);
- BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer,
- peer->v_routeadv);
+ /*
+ * If there is no UPDATE to send, don't start the timer. We will start
+ * it when the queues go non-empty.
+ */
+ if (bgp_routeq_empty(peer))
+ return 0;
+
+ BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, peer->v_routeadv);
return 0;
}
quagga_timestamp(3, bgp->update_delay_end_time,
sizeof(bgp->update_delay_end_time));
+ /*
+ * Add an end-of-initial-update marker to the main process queues so that
+ * the route advertisement timer for the peers can be started.
+ */
+ bgp_add_eoiu_mark(bgp, BGP_TABLE_MAIN);
+ bgp_add_eoiu_mark(bgp, BGP_TABLE_RSCLIENT);
+
/* Route announcements were postponed for all the peers during read-only mode,
send those now. */
for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer))
work_queue_unplug(bm->process_rsclient_queue);
}
+/**
+ * see bgp_fsm.h
+ */
+void
+bgp_start_routeadv (struct bgp *bgp)
+{
+ struct listnode *node, *nnode;
+ struct peer *peer;
+
+ for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer))
+ {
+ if (peer->status != Established)
+ continue;
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0);
+ }
+}
+
+/**
+ * see bgp_fsm.h
+ */
+void
+bgp_adjust_routeadv (struct peer *peer)
+{
+ time_t nowtime = bgp_clock();
+ double diff;
+ unsigned long remain;
+
+ /*
+ * CASE I:
+ * If the last update was written more than MRAI back, expire the timer
+ * instantly so that we can send the update out sooner.
+ *
+ * <------- MRAI --------->
+ * |-----------------|-----------------------|
+ * <------------- m ------------>
+ * ^ ^ ^
+ * | | |
+ * | | current time
+ * | timer start
+ * last write
+ *
+ * m > MRAI
+ */
+ diff = difftime(nowtime, peer->last_write);
+ if (diff > (double) peer->v_routeadv)
+ {
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0);
+ if (BGP_DEBUG (update, UPDATE_OUT))
+ zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire instantly\n",
+ peer->host);
+ return;
+ }
+
+ /*
+ * CASE II:
+ * - Find when to expire the MRAI timer.
+ * If MRAI timer is not active, assume we can start it now.
+ *
+ * <------- MRAI --------->
+ * |------------|-----------------------|
+ * <-------- m ----------><----- r ----->
+ * ^ ^ ^
+ * | | |
+ * | | current time
+ * | timer start
+ * last write
+ *
+ * (MRAI - m) < r
+ */
+ if (peer->t_routeadv)
+ remain = thread_timer_remain_second(peer->t_routeadv);
+ else
+ remain = peer->v_routeadv;
+ diff = peer->v_routeadv - diff;
+ if (diff <= (double) remain)
+ {
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, diff);
+ if (BGP_DEBUG (update, UPDATE_OUT))
+ zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire in %f secs\n",
+ peer->host, diff);
+ }
+}
+
/* The update delay timer expiry callback. */
static int
bgp_update_delay_timer (struct thread *thread)
static int
bgp_fsm_keepalive_expire (struct peer *peer)
{
- afi_t afi;
- safi_t safi;
-
- for (afi = AFI_IP; afi < AFI_MAX; afi++)
- for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
- {
- if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) ||
- !FIFO_EMPTY(&peer->sync[afi][safi]->update))
- return 0;
- }
+ /*
+ * If there are UPDATE messages to send, no need to send keepalive. The
+ * peer will note our progress through the UPDATEs.
+ */
+ if (!bgp_routeq_empty(peer))
+ return 0;
bgp_keepalive_send (peer);
return 0;
bgp_announce_route_all (peer);
- BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 1);
+ /* Start the route advertisement timer to send updates to the peer - if BGP
+ * is not in read-only mode. If it is, the timer will be started at the end
+ * of read-only mode.
+ */
+ if (!bgp_update_delay_active(peer->bgp))
+ BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 0);
return 0;
}
thread_cancel_event (master, (P)); \
} while (0)
+#define BGP_MSEC_JITTER 10
+
/* Prototypes. */
extern int bgp_event (struct thread *);
extern int bgp_stop (struct peer *peer);
extern const char *peer_down_str[];
extern void bgp_update_delay_end (struct bgp *);
+/**
+ * Start the route advertisement timer (that honors MRAI) for all the
+ * peers. Typically called at the end of initial convergence, coming
+ * out of read-only mode.
+ */
+extern void bgp_start_routeadv (struct bgp *);
+
+/**
+ * See if the route advertisement timer needs to be adjusted for a
+ * peer. For example, if the last update was written to the peer a
+ * long while back, we don't need to wait for the periodic advertisement
+ * timer to expire to send the new set of prefixes. It should fire
+ * instantly and updates should go out sooner.
+ */
+extern void bgp_adjust_routeadv (struct peer *);
+
#endif /* _QUAGGA_BGP_FSM_H */
adv = FIFO_HEAD (&peer->sync[afi][safi]->update);
if (adv)
{
- if (adv->binfo && adv->binfo->uptime < peer->synctime)
+ if (adv->binfo && adv->binfo->uptime <= peer->synctime)
{
if (CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_RCV)
&& CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_ADV)
struct stream *s;
int num;
unsigned int count = 0;
+ int oc = 0;
/* Yes first of all get peer pointer. */
peer = THREAD_ARG (thread);
sockopt_cork (peer->fd, 1);
+ oc = peer->update_out;
+
/* Nonblocking write until TCP output buffer is full. */
do
{
/* OK we send packet so delete it. */
bgp_packet_delete (peer);
}
- while (++count < BGP_WRITE_PACKET_MAX &&
+ while (++count < peer->bgp->wpkt_quanta &&
(s = bgp_write_packet (peer)) != NULL);
-
+
if (bgp_write_proceed (peer))
BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd);
done:
+ /* Update the last write if some updates were written. */
+ if (peer->update_out > oc)
+ peer->last_write = bgp_clock ();
+
sockopt_cork (peer->fd, 0);
return 0;
}
struct bgp_info *old_select;
struct bgp_info_pair old_and_new;
struct listnode *node, *nnode;
- struct peer *rsclient = bgp_node_table (rn)->owner;
-
+ struct peer *rsclient;
+
+ /* Is it end of initial update? (after startup) */
+ if (!rn)
+ {
+ bgp_start_routeadv(bgp);
+ return WQ_SUCCESS;
+ }
+
+ rsclient = bgp_node_table (rn)->owner;
+
/* Best path selection. */
bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new);
new_select = old_and_new.new;
struct bgp_info_pair old_and_new;
struct listnode *node, *nnode;
struct peer *peer;
-
+
+ /* Is it end of initial update? (after startup) */
+ if (!rn)
+ {
+ bgp_start_routeadv(bgp);
+ return WQ_SUCCESS;
+ }
+
/* Best path selection. */
bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new);
old_select = old_and_new.old;
bgp_processq_del (struct work_queue *wq, void *data)
{
struct bgp_process_queue *pq = data;
- struct bgp_table *table = bgp_node_table (pq->rn);
-
+ struct bgp_table *table;
+
bgp_unlock (pq->bgp);
- bgp_unlock_node (pq->rn);
- bgp_table_unlock (table);
+ if (pq->rn)
+ {
+ table = bgp_node_table (pq->rn);
+ bgp_unlock_node (pq->rn);
+ bgp_table_unlock (table);
+ }
XFREE (MTYPE_BGP_PROCESS_QUEUE, pq);
}
return;
}
+void
+bgp_add_eoiu_mark (struct bgp *bgp, bgp_table_t type)
+{
+ struct bgp_process_queue *pqnode;
+
+ if ( (bm->process_main_queue == NULL) ||
+ (bm->process_rsclient_queue == NULL) )
+ bgp_process_queue_init ();
+
+ pqnode = XCALLOC (MTYPE_BGP_PROCESS_QUEUE,
+ sizeof (struct bgp_process_queue));
+ if (!pqnode)
+ return;
+
+ pqnode->rn = NULL;
+ pqnode->bgp = bgp;
+ bgp_lock (bgp);
+ switch (type)
+ {
+ case BGP_TABLE_MAIN:
+ work_queue_add (bm->process_main_queue, pqnode);
+ break;
+ case BGP_TABLE_RSCLIENT:
+ work_queue_add (bm->process_rsclient_queue, pqnode);
+ break;
+ }
+
+ return;
+}
+
static int
bgp_maximum_prefix_restart_timer (struct thread *thread)
{
/* for bgp_nexthop and bgp_damp */
extern void bgp_process (struct bgp *, struct bgp_node *, afi_t, safi_t);
+
+/*
+ * Add an end-of-initial-update marker to the process queue. This is just a
+ * queue element with NULL bgp node.
+ */
+extern void bgp_add_eoiu_mark (struct bgp *, bgp_table_t);
extern int bgp_config_write_table_map (struct vty *, struct bgp *, afi_t, safi_t,
int *);
extern int bgp_config_write_network (struct vty *, struct bgp *, afi_t, safi_t, int *);
#include "bgpd/bgp_table.h"
#include "bgpd/bgp_vty.h"
#include "bgpd/bgp_mpath.h"
+#include "bgpd/bgp_packet.h"
extern struct in_addr router_id_zebra;
"Wait for peers to be established\n"
"Seconds\n")
+int
+bgp_wpkt_quanta_config_vty (struct vty *vty, char *num, char set)
+{
+ struct bgp *bgp;
+ u_int16_t update_delay;
+
+ bgp = vty->index;
+
+ if (set)
+ VTY_GET_INTEGER_RANGE ("write-quanta", bgp->wpkt_quanta, num,
+ 1, 4294967295);
+ else
+ bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX;
+
+ return CMD_SUCCESS;
+}
+
+int
+bgp_config_write_wpkt_quanta (struct vty *vty, struct bgp *bgp)
+{
+ if (bgp->wpkt_quanta != BGP_WRITE_PACKET_MAX)
+ vty_out (vty, " write-quanta %d%s",
+ bgp->wpkt_quanta, VTY_NEWLINE);
+
+ return 0;
+}
+
+
+/* Update-delay configuration */
+DEFUN (bgp_wpkt_quanta,
+ bgp_wpkt_quanta_cmd,
+ "write-quanta <1-4294967295>",
+ "How many packets to write to peer socket per run\n"
+ "Number of packets\n")
+{
+ return bgp_wpkt_quanta_config_vty(vty, argv[0], 1);
+}
+
+/* Update-delay deconfiguration */
+DEFUN (no_bgp_wpkt_quanta,
+ no_bgp_wpkt_quanta_cmd,
+ "no write-quanta <1-4294967295>",
+ "How many packets to write to peer socket per run\n"
+ "Number of packets\n")
+{
+ return bgp_wpkt_quanta_config_vty(vty, argv[0], 0);
+}
+
/* Maximum-paths configuration */
DEFUN (bgp_maxpaths,
bgp_maxpaths_cmd,
/* read timer */
vty_out (vty, " Last read %s", peer_uptime (p->readtime, timebuf, BGP_UPTIME_LEN));
+ vty_out (vty, ", Last write %s%s",
+ peer_uptime (p->last_write, timebuf, BGP_UPTIME_LEN), VTY_NEWLINE);
/* Configured timer values. */
- vty_out (vty, ", hold time is %d, keepalive interval is %d seconds%s",
+ vty_out (vty, " Hold time is %d, keepalive interval is %d seconds%s",
p->v_holdtime, p->v_keepalive, VTY_NEWLINE);
if (CHECK_FLAG (p->config, PEER_CONFIG_TIMER))
{
if (p->t_connect)
vty_out (vty, "Next connect timer due in %ld seconds%s",
thread_timer_remain_second (p->t_connect), VTY_NEWLINE);
-
- vty_out (vty, "Read thread: %s Write thread: %s%s",
+ if (p->t_routeadv)
+ vty_out (vty, "MRAI (interval %ld) timer expires in %ld seconds%s",
+ p->v_routeadv, thread_timer_remain_second (p->t_routeadv),
+ VTY_NEWLINE);
+
+ vty_out (vty, "Read thread: %s Write thread: %s%s",
p->t_read ? "on" : "off",
p->t_write ? "on" : "off",
VTY_NEWLINE);
install_element (BGP_NODE, &bgp_update_delay_establish_wait_cmd);
install_element (BGP_NODE, &no_bgp_update_delay_establish_wait_cmd);
+ install_element (BGP_NODE, &bgp_wpkt_quanta_cmd);
+ install_element (BGP_NODE, &no_bgp_wpkt_quanta_cmd);
+
/* "maximum-paths" commands. */
install_element (BGP_NODE, &bgp_maxpaths_cmd);
install_element (BGP_NODE, &no_bgp_maxpaths_cmd);
extern void bgp_vty_init (void);
extern const char *afi_safi_print (afi_t, safi_t);
extern int bgp_config_write_update_delay (struct vty *, struct bgp *);
+extern int bgp_config_write_wpkt_quanta(struct vty *vty, struct bgp *bgp);
#endif /* _QUAGGA_BGP_VTY_H */
if (name)
bgp->name = strdup (name);
+ bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX;
+
THREAD_TIMER_ON (master, bgp->t_startup, bgp_startup_timer_expire,
bgp, bgp->restart_time);
/* BGP update-delay. */
bgp_config_write_update_delay (vty, bgp);
+ /* write quanta */
+ bgp_config_write_wpkt_quanta (vty, bgp);
+
/* BGP graceful-restart. */
if (bgp->stalepath_time != BGP_DEFAULT_STALEPATH_TIME)
vty_out (vty, " bgp graceful-restart stalepath-time %d%s",
u_int16_t ibgp_flags;
#define BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN (1 << 0)
} maxpaths[AFI_MAX][SAFI_MAX];
+
+ u_int32_t wpkt_quanta; /* per peer packet quanta to write */
};
/* BGP peer-group support. */
/* Syncronization list and time. */
struct bgp_synchronize *sync[AFI_MAX][SAFI_MAX];
time_t synctime;
+ time_t last_write; /* timestamp when the last UPDATE msg was written */
/* Send prefix count. */
unsigned long scount[AFI_MAX][SAFI_MAX];