From 12bf042c688fedf82637fab9ff77aa1eab271160 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Fri, 21 Mar 2025 07:48:50 -0400 Subject: [PATCH] bgpd: Modify bgp to handle packet events in a FIFO Current behavor of BGP is to have a event per connection. Given that on startup of BGP with a high number of neighbors you end up with 2 * # of peers events that are being processed. Additionally once BGP has selected the connection this still only comes down to 512 events. This number of events is swamping the event system and in addition delaying any other work from being done in BGP at all because the the 512 events are always going to take precedence over everything else. The other main events are the handling of the metaQ(1 event), update group events( 1 per update group ) and the zebra batching event. These are being swamped. Modify the BGP code to have a FIFO of connections. As new data comes in to read, place the connection on the end of the FIFO. Have the bgp_process_packet handle up to 100 packets spread across the individual peers where each peer/connection is limited to the original quanta. During testing I noticed that withdrawal events at very very large scale are taking up to 40 seconds to process so I added a check for yielding to further limit the number of packets being processed. This change also allow for BGP to be interactive again on scale setups on initial convergence. Prior to this change any vtysh command entered would be delayed by 10's of seconds in my setup while BGP was doing other work. Signed-off-by: Donald Sharp --- bgpd/bgp_fsm.c | 15 ++++-- bgpd/bgp_io.c | 16 ++++-- bgpd/bgp_io.h | 1 + bgpd/bgp_main.c | 8 +++ bgpd/bgp_packet.c | 127 ++++++++++++++++++++++++++++++++++++++-------- bgpd/bgpd.c | 4 ++ bgpd/bgpd.h | 16 +++++- 7 files changed, 158 insertions(+), 29 deletions(-) diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c index 478f8c9136..2c1cbee6d0 100644 --- a/bgpd/bgp_fsm.c +++ b/bgpd/bgp_fsm.c @@ -184,7 +184,11 @@ static struct peer *peer_xfer_conn(struct peer *from_peer) EVENT_OFF(keeper->t_delayopen); EVENT_OFF(keeper->t_connect_check_r); EVENT_OFF(keeper->t_connect_check_w); - EVENT_OFF(keeper->t_process_packet); + + frr_with_mutex (&bm->peer_connection_mtx) { + if (peer_connection_fifo_member(&bm->connection_fifo, keeper)) + peer_connection_fifo_del(&bm->connection_fifo, keeper); + } /* * At this point in time, it is possible that there are packets pending @@ -305,8 +309,13 @@ static struct peer *peer_xfer_conn(struct peer *from_peer) bgp_reads_on(keeper); bgp_writes_on(keeper); - event_add_event(bm->master, bgp_process_packet, keeper, 0, - &keeper->t_process_packet); + + frr_with_mutex (&bm->peer_connection_mtx) { + if (!peer_connection_fifo_member(&bm->connection_fifo, keeper)) { + peer_connection_fifo_add_tail(&bm->connection_fifo, keeper); + } + } + event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet); return (peer); } diff --git a/bgpd/bgp_io.c b/bgpd/bgp_io.c index 729a8fe299..dac915a73d 100644 --- a/bgpd/bgp_io.c +++ b/bgpd/bgp_io.c @@ -99,7 +99,11 @@ void bgp_reads_off(struct peer_connection *connection) assert(fpt->running); event_cancel_async(fpt->master, &connection->t_read, NULL); - EVENT_OFF(connection->t_process_packet); + + frr_with_mutex (&bm->peer_connection_mtx) { + if (peer_connection_fifo_member(&bm->connection_fifo, connection)) + peer_connection_fifo_del(&bm->connection_fifo, connection); + } UNSET_FLAG(connection->thread_flags, PEER_THREAD_READS_ON); } @@ -292,9 +296,13 @@ done: event_add_read(fpt->master, bgp_process_reads, connection, connection->fd, &connection->t_read); - if (added_pkt) - event_add_event(bm->master, bgp_process_packet, connection, 0, - &connection->t_process_packet); + if (added_pkt) { + frr_with_mutex (&bm->peer_connection_mtx) { + if (!peer_connection_fifo_member(&bm->connection_fifo, connection)) + peer_connection_fifo_add_tail(&bm->connection_fifo, connection); + } + event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet); + } } /* diff --git a/bgpd/bgp_io.h b/bgpd/bgp_io.h index 8d481129e5..278980fde6 100644 --- a/bgpd/bgp_io.h +++ b/bgpd/bgp_io.h @@ -10,6 +10,7 @@ #define BGP_WRITE_PACKET_MAX 64U #define BGP_READ_PACKET_MAX 10U +#define BGP_PACKET_PROCESS_LIMIT 100 #include "bgpd/bgpd.h" #include "frr_pthread.h" diff --git a/bgpd/bgp_main.c b/bgpd/bgp_main.c index 9dbef791b0..1dbac2b864 100644 --- a/bgpd/bgp_main.c +++ b/bgpd/bgp_main.c @@ -161,6 +161,14 @@ __attribute__((__noreturn__)) void sigint(void) bgp_exit(0); + /* + * This is being done after bgp_exit because items may be removed + * from the connection_fifo + */ + peer_connection_fifo_fini(&bm->connection_fifo); + EVENT_OFF(bm->e_process_packet); + pthread_mutex_destroy(&bm->peer_connection_mtx); + exit(0); } diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c index 3d33a02db0..afd8f1e83a 100644 --- a/bgpd/bgp_packet.c +++ b/bgpd/bgp_packet.c @@ -3967,6 +3967,18 @@ int bgp_capability_receive(struct peer_connection *connection, * would not, making event flow difficult to understand. Please think twice * before hacking this. * + * packet_processing is now a FIFO of connections that need to be handled + * This loop has a maximum run of 100(BGP_PACKET_PROCESS_LIMIT) packets, + * but each individual connection can only handle the quanta value as + * specified in bgp_vty.c. If the connection still has work to do, place it + * back on the back of the queue for more work. Do note that event_should_yield + * is also being called to figure out if processing should stop and work + * picked up after other items can run. This was added *After* withdrawals + * started being processed at scale and this function was taking cpu for 40+ seconds + * On my machine we are getting 2-3 packets before a yield should happen in the + * update case. Withdrawal is 1 packet being processed(note this is a very very + * fast computer) before other items should be run. + * * Thread type: EVENT_EVENT * @param thread * @return 0 @@ -3979,30 +3991,54 @@ void bgp_process_packet(struct event *thread) uint32_t rpkt_quanta_old; // how many packets to read int fsm_update_result; // return code of bgp_event_update() int mprc; // message processing return code + uint32_t processed = 0, curr_connection_processed = 0; + bool more_work = false; + size_t count; + uint32_t total_packets_to_process, total_processed = 0; + + frr_with_mutex (&bm->peer_connection_mtx) + connection = peer_connection_fifo_pop(&bm->connection_fifo); + + if (!connection) + goto done; - connection = EVENT_ARG(thread); + total_packets_to_process = BGP_PACKET_PROCESS_LIMIT; peer = connection->peer; rpkt_quanta_old = atomic_load_explicit(&peer->bgp->rpkt_quanta, memory_order_relaxed); + fsm_update_result = 0; - /* Guard against scheduled events that occur after peer deletion. */ - if (connection->status == Deleted || connection->status == Clearing) - return; + while ((processed < total_packets_to_process) && connection) { + total_processed++; + /* Guard against scheduled events that occur after peer deletion. */ + if (connection->status == Deleted || connection->status == Clearing) { + frr_with_mutex (&bm->peer_connection_mtx) + connection = peer_connection_fifo_pop(&bm->connection_fifo); - unsigned int processed = 0; + if (connection) + peer = connection->peer; + + continue; + } - while (processed < rpkt_quanta_old) { uint8_t type = 0; bgp_size_t size; char notify_data_length[2]; - frr_with_mutex (&connection->io_mtx) { + frr_with_mutex (&connection->io_mtx) peer->curr = stream_fifo_pop(connection->ibuf); - } - if (peer->curr == NULL) // no packets to process, hmm... - return; + if (peer->curr == NULL) { + frr_with_mutex (&bm->peer_connection_mtx) + connection = peer_connection_fifo_pop(&bm->connection_fifo); + + + if (connection) + peer = connection->peer; + + continue; + } /* skip the marker and copy the packet length */ stream_forward_getp(peer->curr, BGP_MARKER_SIZE); @@ -4106,32 +4142,81 @@ void bgp_process_packet(struct event *thread) stream_free(peer->curr); peer->curr = NULL; processed++; + curr_connection_processed++; /* Update FSM */ if (mprc != BGP_PACKET_NOOP) fsm_update_result = bgp_event_update(connection, mprc); - else - continue; /* * If peer was deleted, do not process any more packets. This * is usually due to executing BGP_Stop or a stub deletion. */ - if (fsm_update_result == FSM_PEER_TRANSFERRED - || fsm_update_result == FSM_PEER_STOPPED) - break; + if (fsm_update_result == FSM_PEER_TRANSFERRED || + fsm_update_result == FSM_PEER_STOPPED) { + frr_with_mutex (&bm->peer_connection_mtx) + connection = peer_connection_fifo_pop(&bm->connection_fifo); + + if (connection) + peer = connection->peer; + + continue; + } + + bool yield = event_should_yield(thread); + if (curr_connection_processed >= rpkt_quanta_old || yield) { + curr_connection_processed = 0; + frr_with_mutex (&bm->peer_connection_mtx) { + if (!peer_connection_fifo_member(&bm->connection_fifo, connection)) + peer_connection_fifo_add_tail(&bm->connection_fifo, + connection); + if (!yield) + connection = peer_connection_fifo_pop(&bm->connection_fifo); + else + connection = NULL; + } + if (connection) + peer = connection->peer; + + continue; + } + + frr_with_mutex (&connection->io_mtx) { + if (connection->ibuf->count > 0) + more_work = true; + else + more_work = false; + } + + if (!more_work) { + frr_with_mutex (&bm->peer_connection_mtx) + connection = peer_connection_fifo_pop(&bm->connection_fifo); + + if (connection) + peer = connection->peer; + } } - if (fsm_update_result != FSM_PEER_TRANSFERRED - && fsm_update_result != FSM_PEER_STOPPED) { + if (connection) { frr_with_mutex (&connection->io_mtx) { - // more work to do, come back later if (connection->ibuf->count > 0) - event_add_event(bm->master, bgp_process_packet, - connection, 0, - &connection->t_process_packet); + more_work = true; + else + more_work = false; + } + frr_with_mutex (&bm->peer_connection_mtx) { + if (more_work && + !peer_connection_fifo_member(&bm->connection_fifo, connection)) + peer_connection_fifo_add_tail(&bm->connection_fifo, connection); } } + +done: + frr_with_mutex (&bm->peer_connection_mtx) + count = peer_connection_fifo_count(&bm->connection_fifo); + + if (count) + event_add_event(bm->master, bgp_process_packet, NULL, 0, &bm->e_process_packet); } /* Send EOR when routes are processed by selection deferral timer */ diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c index 77150786ce..5ebcebd1d6 100644 --- a/bgpd/bgpd.c +++ b/bgpd/bgpd.c @@ -8683,6 +8683,10 @@ void bgp_master_init(struct event_loop *master, const int buffer_size, bm = &bgp_master; + /* Initialize the peer connection FIFO list */ + peer_connection_fifo_init(&bm->connection_fifo); + pthread_mutex_init(&bm->peer_connection_mtx, NULL); + zebra_announce_init(&bm->zebra_announce_head); zebra_l2_vni_init(&bm->zebra_l2_vni_head); zebra_l3_vni_init(&bm->zebra_l3_vni_head); diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h index 4b5833151f..bbc45994b4 100644 --- a/bgpd/bgpd.h +++ b/bgpd/bgpd.h @@ -107,6 +107,9 @@ enum bgp_af_index { extern struct frr_pthread *bgp_pth_io; extern struct frr_pthread *bgp_pth_ka; +/* FIFO list for peer connections */ +PREDECL_LIST(peer_connection_fifo); + /* BGP master for system wide configurations and variables. */ struct bgp_master { /* BGP instance list. */ @@ -121,6 +124,11 @@ struct bgp_master { /* BGP port number. */ uint16_t port; + /* FIFO list head for peer connections */ + struct peer_connection_fifo_head connection_fifo; + struct event *e_process_packet; + pthread_mutex_t peer_connection_mtx; + /* Listener addresses */ struct list *addresses; @@ -1378,7 +1386,6 @@ struct peer_connection { struct event *t_pmax_restart; struct event *t_routeadv; - struct event *t_process_packet; struct event *t_stop_with_notify; @@ -1394,7 +1401,14 @@ struct peer_connection { union sockunion *su_local; /* Sockunion of local address. */ union sockunion *su_remote; /* Sockunion of remote address. */ + + /* For FIFO list */ + struct peer_connection_fifo_item fifo_item; }; + +/* Declare the FIFO list implementation */ +DECLARE_LIST(peer_connection_fifo, struct peer_connection, fifo_item); + const char *bgp_peer_get_connection_direction(struct peer_connection *connection); extern struct peer_connection *bgp_peer_connection_new(struct peer *peer); extern void bgp_peer_connection_free(struct peer_connection **connection); -- 2.39.5