 bgpd/bgp_fsm.c                                       |  15
 bgpd/bgp_io.c                                        |  16
 bgpd/bgp_io.h                                        |   1
 bgpd/bgp_main.c                                      |   8
 bgpd/bgp_packet.c                                    | 127
 bgpd/bgp_route.c                                     |  18
 bgpd/bgpd.c                                          |   4
 bgpd/bgpd.h                                          |  16
 tests/topotests/high_ecmp/r1/frr.conf                |   5
 tests/topotests/high_ecmp/r1/frr_ipv4_bgp.conf       |   3
 tests/topotests/high_ecmp/r1/frr_ipv6_bgp.conf       |   3
 tests/topotests/high_ecmp/r1/frr_unnumbered_bgp.conf |   3
 tests/topotests/high_ecmp/r2/frr.conf                |   5
 tests/topotests/high_ecmp/r2/frr_ipv4_bgp.conf       |   3
 tests/topotests/high_ecmp/r2/frr_ipv6_bgp.conf       |   3
 tests/topotests/high_ecmp/r2/frr_unnumbered_bgp.conf |   3
 zebra/rib.h                                          |   2
 zebra/zebra_rib.c                                    |  26
 zebra/zserv.c                                        |   6
 19 files changed, 209 insertions(+), 58 deletions(-)
diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c
index 940d93a8df..1a830abd90 100644
--- a/bgpd/bgp_fsm.c
+++ b/bgpd/bgp_fsm.c
@@ -184,7 +184,11 @@ static struct peer *peer_xfer_conn(struct peer *from_peer)
 	EVENT_OFF(keeper->t_delayopen);
 	EVENT_OFF(keeper->t_connect_check_r);
 	EVENT_OFF(keeper->t_connect_check_w);
-	EVENT_OFF(keeper->t_process_packet);
+
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (peer_connection_fifo_member(&bm->connection_fifo, keeper))
+			peer_connection_fifo_del(&bm->connection_fifo, keeper);
+	}
 
 	/*
 	 * At this point in time, it is possible that there are packets pending
@@ -305,8 +309,13 @@ static struct peer *peer_xfer_conn(struct peer *from_peer)
 	bgp_reads_on(keeper);
 	bgp_writes_on(keeper);
-	event_add_event(bm->master, bgp_process_packet, keeper, 0,
-			&keeper->t_process_packet);
+
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (!peer_connection_fifo_member(&bm->connection_fifo, keeper)) {
+			peer_connection_fifo_add_tail(&bm->connection_fifo, keeper);
+		}
+	}
+	event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
 
 	return (peer);
 }
diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c
index 729a8fe299..dac915a73d 100644
--- a/bgpd/bgp_io.c
+++ b/bgpd/bgp_io.c
@@ -99,7 +99,11 @@ void bgp_reads_off(struct peer_connection *connection)
 	assert(fpt->running);
 
 	event_cancel_async(fpt->master, &connection->t_read, NULL);
-	EVENT_OFF(connection->t_process_packet);
+
+	frr_with_mutex (&bm->peer_connection_mtx) {
+		if (peer_connection_fifo_member(&bm->connection_fifo, connection))
+			peer_connection_fifo_del(&bm->connection_fifo, connection);
+	}
 
 	UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON);
 }
@@ -292,9 +296,13 @@ done:
 	event_add_read(fpt->master, bgp_process_reads, connection, connection->fd,
 		       &connection->t_read);
 
-	if (added_pkt)
-		event_add_event(bm->master, bgp_process_packet, connection, 0,
-				&connection->t_process_packet);
+	if (added_pkt) {
+		frr_with_mutex (&bm->peer_connection_mtx) {
+			if (!peer_connection_fifo_member(&bm->connection_fifo, connection))
+				peer_connection_fifo_add_tail(&bm->connection_fifo, connection);
+		}
+		event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
+	}
 }
 
 /*
diff --git a/bgpd/bgp_io.h b/bgpd/bgp_io.h
index 8d481129e5..278980fde6 100644
--- a/bgpd/bgp_io.h
+++ b/bgpd/bgp_io.h
@@ -10,6 +10,7 @@
 
 #define BGP_WRITE_PACKET_MAX 64U
 #define BGP_READ_PACKET_MAX 10U
+#define BGP_PACKET_PROCESS_LIMIT 100
 
 #include "bgpd/bgpd.h"
 #include "frr_pthread.h"
diff --git a/bgpd/bgp_main.c b/bgpd/bgp_main.c
index 9dbef791b0..1dbac2b864 100644
--- a/bgpd/bgp_main.c
+++ b/bgpd/bgp_main.c
@@ -161,6 +161,14 @@ __attribute__((__noreturn__)) void sigint(void)
 
 	bgp_exit(0);
 
+	/*
+	 * This is being done after bgp_exit because items may be removed
+	 * from the connection_fifo
+	 */
+	peer_connection_fifo_fini(&bm->connection_fifo);
+	EVENT_OFF(bm->e_process_packet);
+	pthread_mutex_destroy(&bm->peer_connection_mtx);
+
 	exit(0);
 }
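The four files above all make the same structural change: the per-connection `t_process_packet` event is replaced by membership on a shared, mutex-guarded FIFO plus a single `bm->e_process_packet` event that carries a `NULL` argument. A minimal sketch of that coalesced-event pattern follows; the names here (`work_item`, `work_schedule`, the `event_armed` flag) are illustrative stand-ins, not FRR's API:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct work_item {
	struct work_item *next;
	bool queued; /* cheap stand-in for a membership check */
};

static struct sched {
	pthread_mutex_t mtx;
	struct work_item *head, **tail;
	bool event_armed; /* models bm->e_process_packet being scheduled */
} sched = { .mtx = PTHREAD_MUTEX_INITIALIZER, .tail = &sched.head };

/* Enqueue is idempotent: an item already on the FIFO is left in place,
 * mirroring the peer_connection_fifo_member() checks in the diff. */
void work_schedule(struct work_item *w)
{
	pthread_mutex_lock(&sched.mtx);
	if (!w->queued) {
		w->queued = true;
		w->next = NULL;
		*sched.tail = w;
		sched.tail = &w->next;
	}
	bool arm = !sched.event_armed;
	sched.event_armed = true;
	pthread_mutex_unlock(&sched.mtx);

	if (arm) {
		/* the real code calls event_add_event(bm->master,
		 * bgp_process_packet, NULL, 0, &bm->e_process_packet) here */
	}
}

int main(void)
{
	struct work_item a = { 0 }, b = { 0 };

	work_schedule(&a);
	work_schedule(&a); /* duplicate: left where it already is */
	work_schedule(&b);
	return 0;
}
```

Checking membership before adding keeps the enqueue idempotent, so a connection whose packets arrive in bursts is scheduled exactly once no matter how many reads complete before the event fires.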
diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c
index 4c2eb03f3a..f3ee837498 100644
--- a/bgpd/bgp_packet.c
+++ b/bgpd/bgp_packet.c
@@ -3974,6 +3974,18 @@ int bgp_capability_receive(struct peer_connection *connection,
  * would not, making event flow difficult to understand. Please think twice
  * before hacking this.
  *
+ * packet_processing is now a FIFO of connections that need to be handled.
+ * This loop has a maximum run of 100 (BGP_PACKET_PROCESS_LIMIT) packets,
+ * but each individual connection can only handle the quanta value as
+ * specified in bgp_vty.c.  If the connection still has work to do, place
+ * it on the back of the queue for more work.  Note that event_should_yield
+ * is also called to figure out if processing should stop and be picked up
+ * again after other items have had a chance to run.  This was added *after*
+ * withdrawals started being processed at scale and this function was taking
+ * the cpu for 40+ seconds.  On my machine we get 2-3 packets before a yield
+ * should happen in the update case.  Withdrawal is 1 packet processed (note
+ * this is a very, very fast computer) before other items should be run.
+ *
  * Thread type: EVENT_EVENT
  * @param thread
  * @return 0
@@ -3986,30 +3998,54 @@ void bgp_process_packet(struct event *thread)
 	uint32_t rpkt_quanta_old; // how many packets to read
 	int fsm_update_result;    // return code of bgp_event_update()
 	int mprc;                 // message processing return code
+	uint32_t processed = 0, curr_connection_processed = 0;
+	bool more_work = false;
+	size_t count;
+	uint32_t total_packets_to_process, total_processed = 0;
+
+	frr_with_mutex (&bm->peer_connection_mtx)
+		connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+	if (!connection)
+		goto done;
 
-	connection = EVENT_ARG(thread);
+	total_packets_to_process = BGP_PACKET_PROCESS_LIMIT;
 	peer = connection->peer;
 	rpkt_quanta_old = atomic_load_explicit(&peer->bgp->rpkt_quanta,
 					       memory_order_relaxed);
+
 	fsm_update_result = 0;
 
-	/* Guard against scheduled events that occur after peer deletion. */
-	if (connection->status == Deleted || connection->status == Clearing)
-		return;
+	while ((processed < total_packets_to_process) && connection) {
+		total_processed++;
+		/* Guard against scheduled events that occur after peer deletion. */
+		if (connection->status == Deleted || connection->status == Clearing) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
 
-	unsigned int processed = 0;
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
 
-	while (processed < rpkt_quanta_old) {
 		uint8_t type = 0;
 		bgp_size_t size;
 		char notify_data_length[2];
 
-		frr_with_mutex (&connection->io_mtx) {
+		frr_with_mutex (&connection->io_mtx)
 			peer->curr = stream_fifo_pop(connection->ibuf);
-		}
 
-		if (peer->curr == NULL) // no packets to process, hmm...
-			return;
+		if (peer->curr == NULL) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
 
 		/* skip the marker and copy the packet length */
 		stream_forward_getp(peer->curr, BGP_MARKER_SIZE);
@@ -4113,32 +4149,81 @@ void bgp_process_packet(struct event *thread)
 		stream_free(peer->curr);
 		peer->curr = NULL;
 		processed++;
+		curr_connection_processed++;
 
 		/* Update FSM */
 		if (mprc != BGP_PACKET_NOOP)
 			fsm_update_result = bgp_event_update(connection, mprc);
-		else
-			continue;
 
 		/*
 		 * If peer was deleted, do not process any more packets. This
 		 * is usually due to executing BGP_Stop or a stub deletion.
 		 */
-		if (fsm_update_result == FSM_PEER_TRANSFERRED
-		    || fsm_update_result == FSM_PEER_STOPPED)
-			break;
+		if (fsm_update_result == FSM_PEER_TRANSFERRED ||
+		    fsm_update_result == FSM_PEER_STOPPED) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
+
+		bool yield = event_should_yield(thread);
+		if (curr_connection_processed >= rpkt_quanta_old || yield) {
+			curr_connection_processed = 0;
+			frr_with_mutex (&bm->peer_connection_mtx) {
+				if (!peer_connection_fifo_member(&bm->connection_fifo, connection))
+					peer_connection_fifo_add_tail(&bm->connection_fifo,
+								      connection);
+				if (!yield)
+					connection = peer_connection_fifo_pop(&bm->connection_fifo);
+				else
+					connection = NULL;
+			}
+
+			if (connection)
+				peer = connection->peer;
+
+			continue;
+		}
+
+		frr_with_mutex (&connection->io_mtx) {
+			if (connection->ibuf->count > 0)
+				more_work = true;
+			else
+				more_work = false;
+		}
+
+		if (!more_work) {
+			frr_with_mutex (&bm->peer_connection_mtx)
+				connection = peer_connection_fifo_pop(&bm->connection_fifo);
+
+			if (connection)
+				peer = connection->peer;
+		}
 	}
 
-	if (fsm_update_result != FSM_PEER_TRANSFERRED
-	    && fsm_update_result != FSM_PEER_STOPPED) {
+	if (connection) {
 		frr_with_mutex (&connection->io_mtx) {
-			// more work to do, come back later
 			if (connection->ibuf->count > 0)
-				event_add_event(bm->master, bgp_process_packet,
-						connection, 0,
-						&connection->t_process_packet);
+				more_work = true;
+			else
+				more_work = false;
+		}
+		frr_with_mutex (&bm->peer_connection_mtx) {
+			if (more_work &&
+			    !peer_connection_fifo_member(&bm->connection_fifo, connection))
+				peer_connection_fifo_add_tail(&bm->connection_fifo, connection);
 		}
 	}
+
+done:
+	frr_with_mutex (&bm->peer_connection_mtx)
+		count = peer_connection_fifo_count(&bm->connection_fifo);
+
+	if (count)
+		event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet);
 }
 
 /* Send EOR when routes are processed by selection deferral timer */
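The reworked `bgp_process_packet()` interleaves three limits: a global per-event budget (`BGP_PACKET_PROCESS_LIMIT`), a per-connection quanta (`rpkt_quanta_old`), and a cooperative `event_should_yield()` check. A standalone model of just that scheduling policy, under stated assumptions (the types, queue, and `should_yield()` here are stand-ins, not FRR code):

```c
#include <stdbool.h>
#include <stdio.h>

#define PROCESS_LIMIT 100 /* mirrors BGP_PACKET_PROCESS_LIMIT */
#define QUANTA 10         /* mirrors rpkt_quanta_old */
#define MAXQ 64

typedef struct conn { int pending; } conn_t;

/* toy FIFO of connection pointers */
static conn_t *q[MAXQ];
static int qhead, qtail;

static void q_push(conn_t *c) { q[qtail % MAXQ] = c; qtail++; }
static conn_t *q_pop(void) { return qhead == qtail ? NULL : q[qhead++ % MAXQ]; }

/* stand-in for event_should_yield(): pretend the slice never runs out */
static bool should_yield(void) { return false; }

static int drain(void)
{
	int processed = 0, curr = 0;
	conn_t *c = q_pop();

	while (processed < PROCESS_LIMIT && c) {
		if (c->pending == 0) { /* drained: move to the next connection */
			c = q_pop();
			curr = 0;
			continue;
		}
		c->pending--; /* "process one packet" */
		processed++;
		curr++;

		bool yield = should_yield();
		if (curr >= QUANTA || yield) { /* quanta used up or slice over */
			q_push(c); /* back of the queue, as in the diff */
			c = yield ? NULL : q_pop();
			curr = 0;
		}
	}
	if (c && c->pending > 0)
		q_push(c); /* unfinished work: reschedule, like the post-loop check */
	return processed;
}

int main(void)
{
	conn_t a = { .pending = 25 }, b = { .pending = 5 };

	q_push(&a);
	q_push(&b);
	printf("drained %d packets\n", drain()); /* 30 */
	return 0;
}
```

The round-robin rotation is what keeps one update-heavy peer from starving the rest: after at most `QUANTA` packets, the connection goes to the back of the line.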
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index 71a1149d10..0c3f983f42 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -4201,12 +4201,30 @@ static wq_item_status meta_queue_process(struct work_queue *dummy, void *data)
 {
 	struct meta_queue *mq = data;
 	uint32_t i;
+	uint32_t peers_on_fifo;
+	static uint32_t total_runs = 0;
+
+	total_runs++;
+
+	frr_with_mutex (&bm->peer_connection_mtx)
+		peers_on_fifo = peer_connection_fifo_count(&bm->connection_fifo);
+
+	/*
+	 * If the number of peers on the fifo is greater than 10,
+	 * let's yield this run of the MetaQ to allow the packet processing
+	 * to make progress against the incoming packets.  But we should
+	 * also attempt to allow this to run occasionally.  Let's run
+	 * something every 10 attempts to process the work queue.
+	 */
+	if (peers_on_fifo > 10 && total_runs % 10 != 0)
+		return WQ_QUEUE_BLOCKED;
 
 	for (i = 0; i < MQ_SIZE; i++)
 		if (process_subq(mq->subq[i], i)) {
 			mq->size--;
 			break;
		}
+
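The `WQ_QUEUE_BLOCKED` return above gives packet processing priority over the MetaQ while still letting route processing make progress one attempt in ten. The dynamics are easy to see in isolation; this sketch uses the same thresholds as the diff:

```c
#include <stdbool.h>
#include <stdio.h>

static unsigned total_runs;

/* Mirrors the check in meta_queue_process(): while more than 10 peers
 * are waiting on the packet FIFO, only every 10th work-queue run does
 * real work; the rest report "blocked" so packet processing gets the CPU. */
bool metaq_should_run(unsigned peers_on_fifo)
{
	total_runs++;
	if (peers_on_fifo > 10 && total_runs % 10 != 0)
		return false; /* WQ_QUEUE_BLOCKED in the diff */
	return true;
}

int main(void)
{
	unsigned ran = 0;

	for (int i = 0; i < 100; i++)
		if (metaq_should_run(50)) /* sustained heavy packet load */
			ran++;
	printf("%u of 100 runs proceeded\n", ran); /* 10 */
	return 0;
}
```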
 	return mq->size ? WQ_REQUEUE : WQ_SUCCESS;
 }
diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
index 77150786ce..5ebcebd1d6 100644
--- a/bgpd/bgpd.c
+++ b/bgpd/bgpd.c
@@ -8683,6 +8683,10 @@ void bgp_master_init(struct event_loop *master, const int buffer_size,
 
 	bm = &bgp_master;
 
+	/* Initialize the peer connection FIFO list */
+	peer_connection_fifo_init(&bm->connection_fifo);
+	pthread_mutex_init(&bm->peer_connection_mtx, NULL);
+
 	zebra_announce_init(&bm->zebra_announce_head);
 	zebra_l2_vni_init(&bm->zebra_l2_vni_head);
 	zebra_l3_vni_init(&bm->zebra_l3_vni_head);
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 4b5833151f..bbc45994b4 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -107,6 +107,9 @@ enum bgp_af_index {
 extern struct frr_pthread *bgp_pth_io;
 extern struct frr_pthread *bgp_pth_ka;
 
+/* FIFO list for peer connections */
+PREDECL_LIST(peer_connection_fifo);
+
 /* BGP master for system wide configurations and variables. */
 struct bgp_master {
 	/* BGP instance list. */
@@ -121,6 +124,11 @@ struct bgp_master {
 	/* BGP port number. */
 	uint16_t port;
 
+	/* FIFO list head for peer connections */
+	struct peer_connection_fifo_head connection_fifo;
+	struct event *e_process_packet;
+	pthread_mutex_t peer_connection_mtx;
+
 	/* Listener addresses */
 	struct list *addresses;
 
@@ -1378,7 +1386,6 @@ struct peer_connection {
 	struct event *t_pmax_restart;
 	struct event *t_routeadv;
 
-	struct event *t_process_packet;
 
 	struct event *t_stop_with_notify;
@@ -1394,7 +1401,14 @@ struct peer_connection {
 
 	union sockunion *su_local;  /* Sockunion of local address. */
 	union sockunion *su_remote; /* Sockunion of remote address. */
+
+	/* For FIFO list */
+	struct peer_connection_fifo_item fifo_item;
 };
+
+/* Declare the FIFO list implementation */
+DECLARE_LIST(peer_connection_fifo, struct peer_connection, fifo_item);
+
 const char *bgp_peer_get_connection_direction(struct peer_connection *connection);
 extern struct peer_connection *bgp_peer_connection_new(struct peer *peer);
 extern void bgp_peer_connection_free(struct peer_connection **connection);
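`PREDECL_LIST`/`DECLARE_LIST` are FRR's typesafe-list macros from `lib/typesafe.h`; they generate the `peer_connection_fifo_*()` functions used throughout this diff from the `fifo_item` node embedded in `struct peer_connection`, so enqueueing a connection never allocates. A conceptual plain-C model of that intrusive-FIFO idea (an illustration of the principle, not the macros' actual expansion):

```c
#include <stddef.h>

struct fifo_node { struct fifo_node *next; };
struct fifo_head { struct fifo_node *first, *last; size_t count; };

struct peer_connection_model {
	int fd; /* ...peer state... */
	struct fifo_node fifo_item; /* embedded node, as in bgpd.h above */
};

/* recover the containing item from its embedded node */
#define node_to_conn(n)                                              \
	((struct peer_connection_model *)((char *)(n) -              \
		offsetof(struct peer_connection_model, fifo_item)))

void fifo_add_tail(struct fifo_head *h, struct peer_connection_model *c)
{
	c->fifo_item.next = NULL;
	if (h->last)
		h->last->next = &c->fifo_item;
	else
		h->first = &c->fifo_item;
	h->last = &c->fifo_item;
	h->count++;
}

struct peer_connection_model *fifo_pop(struct fifo_head *h)
{
	struct fifo_node *n = h->first;

	if (!n)
		return NULL;
	h->first = n->next;
	if (!h->first)
		h->last = NULL;
	h->count--;
	return node_to_conn(n);
}

int main(void)
{
	struct fifo_head h = { 0 };
	struct peer_connection_model c1 = { .fd = 1 }, c2 = { .fd = 2 };

	fifo_add_tail(&h, &c1);
	fifo_add_tail(&h, &c2);
	/* pops come back in insertion order, no allocation involved */
	while (h.count)
		(void)fifo_pop(&h);
	return 0;
}
```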
diff --git a/tests/topotests/high_ecmp/r1/frr.conf b/tests/topotests/high_ecmp/r1/frr.conf
index 957e17e914..e64fb2e416 100644
--- a/tests/topotests/high_ecmp/r1/frr.conf
+++ b/tests/topotests/high_ecmp/r1/frr.conf
@@ -2577,3 +2577,8 @@ interface r1-eth514
  ipv6 address 2001:db8:3:5::1/64
  no shut
 !
+router bgp 1001
+ timers bgp 5 60
+ no bgp ebgp-requires-policy
+ read-quanta 1
+!
diff --git a/tests/topotests/high_ecmp/r1/frr_ipv4_bgp.conf b/tests/topotests/high_ecmp/r1/frr_ipv4_bgp.conf
index df64a146bc..ad7b9df919 100644
--- a/tests/topotests/high_ecmp/r1/frr_ipv4_bgp.conf
+++ b/tests/topotests/high_ecmp/r1/frr_ipv4_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1001
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor 10.1.1.2 remote-as external
  neighbor 10.1.2.2 remote-as external
  neighbor 10.1.3.2 remote-as external
diff --git a/tests/topotests/high_ecmp/r1/frr_ipv6_bgp.conf b/tests/topotests/high_ecmp/r1/frr_ipv6_bgp.conf
index 15137a8715..9f4b961d82 100644
--- a/tests/topotests/high_ecmp/r1/frr_ipv6_bgp.conf
+++ b/tests/topotests/high_ecmp/r1/frr_ipv6_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1001
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor 2001:db8:1:1::2 remote-as external
  neighbor 2001:db8:1:2::2 remote-as external
  neighbor 2001:db8:1:3::2 remote-as external
diff --git a/tests/topotests/high_ecmp/r1/frr_unnumbered_bgp.conf b/tests/topotests/high_ecmp/r1/frr_unnumbered_bgp.conf
index 7985fe6e99..f48eb23405 100644
--- a/tests/topotests/high_ecmp/r1/frr_unnumbered_bgp.conf
+++ b/tests/topotests/high_ecmp/r1/frr_unnumbered_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1001
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor r1-eth0 interface remote-as external
  neighbor r1-eth1 interface remote-as external
  neighbor r1-eth2 interface remote-as external
diff --git a/tests/topotests/high_ecmp/r2/frr.conf b/tests/topotests/high_ecmp/r2/frr.conf
index 151f6da45c..1e98e31236 100644
--- a/tests/topotests/high_ecmp/r2/frr.conf
+++ b/tests/topotests/high_ecmp/r2/frr.conf
@@ -2577,3 +2577,8 @@ interface r2-eth514
  ipv6 address 2001:db8:3:5::2/64
  no shutdown
 !
+router bgp 1002
+ timers bgp 5 60
+ no bgp ebgp-requires-policy
+ read-quanta 1
+!
diff --git a/tests/topotests/high_ecmp/r2/frr_ipv4_bgp.conf b/tests/topotests/high_ecmp/r2/frr_ipv4_bgp.conf
index 48842bbfc2..afc3a30c7d 100644
--- a/tests/topotests/high_ecmp/r2/frr_ipv4_bgp.conf
+++ b/tests/topotests/high_ecmp/r2/frr_ipv4_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1002
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor 10.1.1.1 remote-as external
  neighbor 10.1.2.1 remote-as external
  neighbor 10.1.3.1 remote-as external
diff --git a/tests/topotests/high_ecmp/r2/frr_ipv6_bgp.conf b/tests/topotests/high_ecmp/r2/frr_ipv6_bgp.conf
index e3258cabd4..b0728d3069 100644
--- a/tests/topotests/high_ecmp/r2/frr_ipv6_bgp.conf
+++ b/tests/topotests/high_ecmp/r2/frr_ipv6_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1002
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor 2001:db8:1:1::1 remote-as external
  neighbor 2001:db8:1:2::1 remote-as external
  neighbor 2001:db8:1:3::1 remote-as external
diff --git a/tests/topotests/high_ecmp/r2/frr_unnumbered_bgp.conf b/tests/topotests/high_ecmp/r2/frr_unnumbered_bgp.conf
index 23827789fb..0b12e83e1f 100644
--- a/tests/topotests/high_ecmp/r2/frr_unnumbered_bgp.conf
+++ b/tests/topotests/high_ecmp/r2/frr_unnumbered_bgp.conf
@@ -1,7 +1,4 @@
 router bgp 1002
- timers bgp 5 20
- no bgp ebgp-requires-policy
- read-quanta 1
  neighbor r2-eth0 interface remote-as external
  neighbor r2-eth1 interface remote-as external
  neighbor r2-eth2 interface remote-as external
diff --git a/zebra/rib.h b/zebra/rib.h
index 8484fe1291..7eaeb50c7e 100644
--- a/zebra/rib.h
+++ b/zebra/rib.h
@@ -462,6 +462,8 @@ extern void meta_queue_free(struct meta_queue *mq, struct zebra_vrf *zvrf);
 extern int zebra_rib_labeled_unicast(struct route_entry *re);
 extern struct route_table *rib_table_ipv6;
 
+extern uint32_t zebra_rib_meta_queue_size(void);
+
 extern void rib_unlink(struct route_node *rn, struct route_entry *re);
 extern int rib_gc_dest(struct route_node *rn);
 extern struct route_table *rib_tables_iter_next(rib_tables_iter_t *iter);
diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c
index 20ec25a431..0c4f7e02f8 100644
--- a/zebra/zebra_rib.c
+++ b/zebra/zebra_rib.c
@@ -3302,8 +3302,8 @@ static int rib_meta_queue_add(struct meta_queue *mq, void *data)
 	mq->size++;
 
 	if (IS_ZEBRA_DEBUG_RIB_DETAILED)
-		rnode_debug(rn, re->vrf_id, "queued rn %p into sub-queue %s",
-			    (void *)rn, subqueue2str(qindex));
+		rnode_debug(rn, re->vrf_id, "queued rn %p into sub-queue %s mq size %u", (void *)rn,
+			    subqueue2str(qindex), zrouter.mq->size);
 
 	return 0;
 }
@@ -3335,8 +3335,8 @@ static int rib_meta_queue_nhg_ctx_add(struct meta_queue *mq, void *data)
 	mq->size++;
 
 	if (IS_ZEBRA_DEBUG_RIB_DETAILED)
-		zlog_debug("NHG Context id=%u queued into sub-queue %s",
-			   ctx->id, subqueue2str(qindex));
+		zlog_debug("NHG Context id=%u queued into sub-queue %s mq size %u", ctx->id,
+			   subqueue2str(qindex), zrouter.mq->size);
 
 	return 0;
 }
@@ -3363,8 +3363,8 @@ static int rib_meta_queue_nhg_process(struct meta_queue *mq, void *data,
 	mq->size++;
 
 	if (IS_ZEBRA_DEBUG_RIB_DETAILED)
-		zlog_debug("NHG id=%u queued into sub-queue %s", nhe->id,
-			   subqueue2str(qindex));
+		zlog_debug("NHG id=%u queued into sub-queue %s mq size %u", nhe->id,
+			   subqueue2str(qindex), zrouter.mq->size);
 
 	return 0;
 }
@@ -3410,6 +3410,11 @@ static int mq_add_handler(void *data,
 	return mq_add_func(zrouter.mq, data);
 }
 
+uint32_t zebra_rib_meta_queue_size(void)
+{
+	return zrouter.mq->size;
+}
+
 void mpls_ftn_uninstall(struct zebra_vrf *zvrf, enum lsp_types_t type,
 			struct prefix *prefix, uint8_t route_type,
 			uint8_t route_instance)
@@ -4226,7 +4231,7 @@ static int rib_meta_queue_gr_run_add(struct meta_queue *mq, void *data)
 	mq->size++;
 
 	if (IS_ZEBRA_DEBUG_RIB_DETAILED)
-		zlog_debug("Graceful Run adding");
+		zlog_debug("Graceful Run adding mq size %u", zrouter.mq->size);
 
 	return 0;
 }
@@ -4241,10 +4246,9 @@ static int rib_meta_queue_early_route_add(struct meta_queue *mq, void *data)
 	if (IS_ZEBRA_DEBUG_RIB_DETAILED) {
 		struct vrf *vrf = vrf_lookup_by_id(ere->re->vrf_id);
 
-		zlog_debug("Route %pFX(%s) (%s) queued for processing into sub-queue %s",
-			   &ere->p, VRF_LOGNAME(vrf),
-			   ere->deletion ? "delete" : "add",
-			   subqueue2str(META_QUEUE_EARLY_ROUTE));
+		zlog_debug("Route %pFX(%s) (%s) queued for processing into sub-queue %s mq size %u",
+			   &ere->p, VRF_LOGNAME(vrf), ere->deletion ? "delete" : "add",
+			   subqueue2str(META_QUEUE_EARLY_ROUTE), zrouter.mq->size);
 	}
 
 	return 0;
diff --git a/zebra/zserv.c b/zebra/zserv.c
index d477cd051f..aab1bd0062 100644
--- a/zebra/zserv.c
+++ b/zebra/zserv.c
@@ -530,6 +530,12 @@ static void zserv_process_messages(struct event *thread)
 	struct stream_fifo *cache = stream_fifo_new();
 	uint32_t p2p = zrouter.packets_to_process;
 	bool need_resched = false;
+	uint32_t meta_queue_size = zebra_rib_meta_queue_size();
+
+	if (meta_queue_size < p2p)
+		p2p = p2p - meta_queue_size;
+	else
+		p2p = 0;
 
 	frr_with_mutex (&client->ibuf_mtx) {
 		uint32_t i;
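The `zserv.c` hunk applies the same back-pressure idea on the zebra side: the per-wakeup message budget shrinks by however much work is already sitting on the RIB meta queue, so zebra stops reading new client messages faster than the RIB can drain them. The arithmetic in isolation (the 1000 budget below is illustrative, standing in for `zrouter.packets_to_process`):

```c
#include <stdint.h>
#include <stdio.h>

/* Reduce this wakeup's read budget by the meta-queue backlog,
 * clamping at zero, exactly as the hunk above does. */
uint32_t adjusted_budget(uint32_t packets_to_process, uint32_t meta_queue_size)
{
	if (meta_queue_size < packets_to_process)
		return packets_to_process - meta_queue_size;
	return 0;
}

int main(void)
{
	printf("%u\n", adjusted_budget(1000, 250));  /* 750 */
	printf("%u\n", adjusted_budget(1000, 5000)); /* 0: fully back-pressured */
	return 0;
}
```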
