From: Donald Sharp Date: Wed, 20 May 2015 00:40:42 +0000 (-0700) Subject: bgpd-scale-update-delay-packing.patch X-Git-Tag: frr-2.0-rc1~1530 X-Git-Url: https://git.puffer.fish/?a=commitdiff_plain;h=4a16ae86a600e716850e672c2d7222f378dc1bde;p=mirror%2Ffrr.git bgpd-scale-update-delay-packing.patch ISSUE: During startup, BGP update prefix packing wasn't optimal and route installation was found to be spread out over time. SOLUTION: With this patch, update-delay post processing is serialized to achieve: a. better peer update packing (which helps in reducing the total number of BGP update packets) b. installation of the resulting routes in zebra as close to each other as possible (which can help zebra batch its processing and its updates to the kernel better). --- diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c index 35037fd37c..7693a5af92 100644 --- a/bgpd/bgp_fsm.c +++ b/bgpd/bgp_fsm.c @@ -563,9 +563,6 @@ bgp_update_delay_configured (struct bgp *bgp) void bgp_update_delay_end (struct bgp *bgp) { - struct listnode *node, *nnode; - struct peer *peer; - THREAD_TIMER_OFF (bgp->t_update_delay); THREAD_TIMER_OFF (bgp->t_establish_wait); @@ -581,15 +578,22 @@ bgp_update_delay_end (struct bgp *bgp) /* * Add an end-of-initial-update marker to the main process queues so that - * the route advertisement timer for the peers can be started. + * the route advertisement timer for the peers can be started. Also set + * the zebra and peer update hold flags. These flags are used to achieve + * three stages in the update-delay post processing: + * 1. Finish best-path selection for all the prefixes held on the queues. + * (routes in BGP are updated, and peers sync queues are populated too) + * 2. As the eoiu mark is reached in the bgp process routine, ship all the + * routes to zebra. With that zebra should see updates from BGP close + * to each other. + * 3. Unblock the peer update writes. With that peer update packing with + * the prefixes should be at its maximum. 
*/ bgp_add_eoiu_mark(bgp, BGP_TABLE_MAIN); bgp_add_eoiu_mark(bgp, BGP_TABLE_RSCLIENT); - - /* Route announcements were postponed for all the peers during read-only mode, - send those now. */ - for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer)) - bgp_announce_route_all (peer); + bgp->main_zebra_update_hold = 1; + bgp->main_peers_update_hold = 1; + bgp->rsclient_peers_update_hold = 1; /* Resume the queue processing. This should trigger the event that would take care of processing any work that was queued during the read-only mode. */ @@ -606,6 +610,15 @@ bgp_start_routeadv (struct bgp *bgp) struct listnode *node, *nnode; struct peer *peer; + zlog_info("bgp_start_routeadv(), update hold status - main: %d, rsclient: %d", + bgp->main_peers_update_hold, bgp->rsclient_peers_update_hold); + + if (bgp->main_peers_update_hold || bgp->rsclient_peers_update_hold) + return; + + quagga_timestamp(3, bgp->update_delay_peers_resume_time, + sizeof(bgp->update_delay_peers_resume_time)); + for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer)) { if (peer->status != Established) diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c index 6afbe7d76c..cd96cbcf1a 100644 --- a/bgpd/bgp_packet.c +++ b/bgpd/bgp_packet.c @@ -439,9 +439,6 @@ bgp_default_update_send (struct peer *peer, struct attr *attr, if (DISABLE_BGP_ANNOUNCE) return; - if (bgp_update_delay_active(peer->bgp)) - return; - if (afi == AFI_IP) str2prefix ("0.0.0.0/0", &p); #ifdef HAVE_IPV6 @@ -515,9 +512,6 @@ bgp_default_withdraw_send (struct peer *peer, afi_t afi, safi_t safi) if (DISABLE_BGP_ANNOUNCE) return; - if (bgp_update_delay_active(peer->bgp)) - return; - if (afi == AFI_IP) str2prefix ("0.0.0.0/0", &p); #ifdef HAVE_IPV6 @@ -598,6 +592,12 @@ bgp_write_packet (struct peer *peer) if (s) return s; + /* The code beyond this part deals with update packets, check if updates + are on hold as part of the update-delay post processing stages. 
*/ + if (peer->bgp && (peer->bgp->main_peers_update_hold || + peer->bgp->rsclient_peers_update_hold)) + return NULL; + for (afi = AFI_IP; afi < AFI_MAX; afi++) for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++) { diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c index d9936b23c4..3fb6e1cd40 100644 --- a/bgpd/bgp_route.c +++ b/bgpd/bgp_route.c @@ -1526,6 +1526,12 @@ bgp_process_rsclient (struct work_queue *wq, void *data) /* Is it end of initial update? (after startup) */ if (!rn) { + /* This is just to keep the display sane in case all the peers are + rsclients only */ + quagga_timestamp(3, bgp->update_delay_zebra_resume_time, + sizeof(bgp->update_delay_zebra_resume_time)); + + bgp->rsclient_peers_update_hold = 0; bgp_start_routeadv(bgp); return WQ_SUCCESS; } @@ -1598,6 +1604,17 @@ bgp_process_main (struct work_queue *wq, void *data) /* Is it end of initial update? (after startup) */ if (!rn) { + quagga_timestamp(3, bgp->update_delay_zebra_resume_time, + sizeof(bgp->update_delay_zebra_resume_time)); + + bgp->main_zebra_update_hold = 0; + for (afi = AFI_IP; afi < AFI_MAX; afi++) + for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++) + { + bgp_zebra_announce_table(bgp, afi, safi); + } + bgp->main_peers_update_hold = 0; + bgp_start_routeadv(bgp); return WQ_SUCCESS; } @@ -2657,19 +2674,9 @@ bgp_announce_table (struct peer *peer, afi_t afi, safi_t safi, if (! table) table = (rsclient) ? peer->rib[afi][safi] : peer->bgp->rib[afi][safi]; - if (safi != SAFI_MPLS_VPN) - { - if (CHECK_FLAG (peer->af_flags[afi][safi], PEER_FLAG_DEFAULT_ORIGINATE)) - { - bgp_default_originate (peer, afi, safi, 0); - } - else - { - /* Send the withdraw if it was postponed during read-only mode. 
*/ - if (CHECK_FLAG (peer->af_flags[afi][safi], PEER_STATUS_DEFAULT_ORIGINATE)) - bgp_default_originate (peer, afi, safi, 1); - } - } + if (safi != SAFI_MPLS_VPN + && CHECK_FLAG (peer->af_flags[afi][safi], PEER_FLAG_DEFAULT_ORIGINATE)) + bgp_default_originate (peer, afi, safi, 0); /* It's initialized in bgp_announce_[check|check_rsclient]() */ attr.extra = &extra; @@ -2721,9 +2728,6 @@ bgp_announce_route_all (struct peer *peer) afi_t afi; safi_t safi; - if (bgp_update_delay_active(peer->bgp)) - return; - for (afi = AFI_IP; afi < AFI_MAX; afi++) for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++) bgp_announce_route (peer, afi, safi); diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c index afd5d0c78a..c638484a0c 100644 --- a/bgpd/bgp_vty.c +++ b/bgpd/bgp_vty.c @@ -7388,8 +7388,12 @@ bgp_show_summary (struct vty *vty, struct bgp *bgp, int afi, int safi, char *del { vty_out (vty, " First neighbor established: %s%s", bgp->update_delay_begin_time, VTY_NEWLINE); - vty_out (vty, " Best-paths/updates resumed: %s%s", + vty_out (vty, " Best-paths resumed: %s%s", bgp->update_delay_end_time, VTY_NEWLINE); + vty_out (vty, " zebra update resumed: %s%s", + bgp->update_delay_zebra_resume_time, VTY_NEWLINE); + vty_out (vty, " peers update resumed: %s%s", + bgp->update_delay_peers_resume_time, VTY_NEWLINE); } } } diff --git a/bgpd/bgp_zebra.c b/bgpd/bgp_zebra.c index 72634ae1fd..58990c5030 100644 --- a/bgpd/bgp_zebra.c +++ b/bgpd/bgp_zebra.c @@ -881,6 +881,9 @@ bgp_zebra_announce (struct prefix *p, struct bgp_info *info, struct bgp *bgp, if (! 
zclient->redist[ZEBRA_ROUTE_BGP]) return; + if (bgp->main_zebra_update_hold) + return; + flags = 0; peer = info->peer; @@ -1171,6 +1174,7 @@ bgp_zebra_announce_table (struct bgp *bgp, afi_t afi, safi_t safi) struct bgp_info *ri; table = bgp->rib[afi][safi]; + if (!table) return; for (rn = bgp_table_top (table); rn; rn = bgp_route_next (rn)) for (ri = rn->info; ri; ri = ri->next) @@ -1193,6 +1197,10 @@ bgp_zebra_withdraw (struct prefix *p, struct bgp_info *info, safi_t safi) return; peer = info->peer; + + if (peer->bgp && peer->bgp->main_zebra_update_hold) + return; + flags = 0; if (peer->sort == BGP_PEER_IBGP) diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 0ebbe2a0eb..486a5f96af 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -117,10 +117,15 @@ struct bgp struct thread *t_update_delay; struct thread *t_establish_wait; u_char update_delay_over; + u_char main_zebra_update_hold; + u_char main_peers_update_hold; + u_char rsclient_peers_update_hold; u_int16_t v_update_delay; u_int16_t v_establish_wait; char update_delay_begin_time[64]; char update_delay_end_time[64]; + char update_delay_zebra_resume_time[64]; + char update_delay_peers_resume_time[64]; u_int32_t established; u_int32_t restarted_peers; u_int32_t implicit_eors;