bgpd: Allow batch clear to do partial work and continue later
author     Mark Stapp <mjs@cisco.com>
           Wed, 12 Mar 2025 13:55:53 +0000 (09:55 -0400)
committer  Donald Sharp <sharpd@nvidia.com>
           Thu, 20 Mar 2025 13:33:52 +0000 (09:33 -0400)
Modify the batch clear code to be able to stop after processing
some of the work and to pick back up again later.  This allows the
very expensive batch-clearing work to be spread out over time,
keeping bgpd responsive.

Signed-off-by: Mark Stapp <mjs@cisco.com>
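
The core pattern in this commit is cooperative yielding: a helper processes a
bounded slice of work, records where it stopped, and the scheduled callback
re-arms itself until the helper reports completion.  A minimal standalone C
sketch of that shape follows; the names (work_ctx, work_helper,
MAX_PER_CALLBACK) are invented for illustration, and the loop in main() merely
stands in for FRR's event library re-queueing the callback.

/*
 * Illustrative sketch only, not FRR code: bounded work per callback,
 * with a cursor recording the resume point.
 */
#include <stdio.h>

#define TOTAL_ITEMS      1000
#define MAX_PER_CALLBACK 100 /* analogous to bm->peer_clearing_batch_max_dests */

struct work_ctx {
	int cursor; /* resume point, like cinfo's "last" afi/safi/prefix */
};

/* Process up to MAX_PER_CALLBACK items; return -1 if work remains. */
static int work_helper(struct work_ctx *ctx)
{
	int count = 0;

	while (ctx->cursor < TOTAL_ITEMS) {
		/* ...clear one item here... */
		ctx->cursor++;
		if (++count >= MAX_PER_CALLBACK)
			return (ctx->cursor < TOTAL_ITEMS) ? -1 : 0;
	}
	return 0;
}

int main(void)
{
	struct work_ctx ctx = { 0 };
	int passes = 0;

	/* Stands in for event_add_event() re-queueing the callback; in the
	 * real daemon, other events get a chance to run between passes.
	 */
	while (work_helper(&ctx) != 0)
		passes++;

	printf("done after %d callbacks\n", passes + 1);
	return 0;
}

In the commit itself, clear_batch_rib_helper() plays the role of work_helper(),
and clear_dests_callback() re-queues itself via event_add_event() for as long
as the helper returns nonzero.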
bgpd/bgp_route.c
bgpd/bgpd.c
bgpd/bgpd.h

bgpd/bgp_route.c
index 95564bc0d868e2ab7b0c62774760678216830d2e..1a5f4c2777e3f67ba5bbb3dd0636c90528f33eae 100644 (file)
@@ -119,6 +119,8 @@ static const struct message bgp_pmsi_tnltype_str[] = {
 #define VRFID_NONE_STR "-"
 #define SOFT_RECONFIG_TASK_MAX_PREFIX 25000
 
+static int clear_batch_rib_helper(struct bgp_clearing_info *cinfo);
+
 static inline char *bgp_route_dump_path_info_flags(struct bgp_path_info *pi,
                                                   char *buf, size_t len)
 {
@@ -6457,124 +6459,165 @@ void bgp_clear_route(struct peer *peer, afi_t afi, safi_t safi)
 }
 
 /*
- * Callback scheduled to process prefixes/dests for batch clearing; the
- * dests were found via a rib walk.
- * The one-peer version of this uses a per-peer workqueue to manage
- * rescheduling, but we're just using a fixed limit here.
+ * Clear one path-info during clearing processing
  */
-
-/* Limit the number of dests we'll process per callback */
-#define BGP_CLEARING_BATCH_MAX_DESTS 100
-
-static void bgp_clear_batch_dests_task(struct event *event)
+static void clearing_clear_one_pi(struct bgp_table *table, struct bgp_dest *dest,
+                                 struct bgp_path_info *pi)
 {
-       struct bgp_clearing_info *cinfo = EVENT_ARG(event);
-       struct bgp_dest *dest;
-       struct bgp_path_info *pi, *next;
-       struct bgp_table *table;
-       struct bgp *bgp;
        afi_t afi;
        safi_t safi;
-       int counter = 0;
-
-       bgp = cinfo->bgp;
-
-next_dest:
-
-       dest = bgp_clearing_batch_next_dest(cinfo);
-       if (dest == NULL)
-               goto done;
+       struct bgp *bgp;
 
-       table = bgp_dest_table(dest);
+       bgp = table->bgp;
        afi = table->afi;
        safi = table->safi;
 
-       /* Have to check every path: it is possible that we have multiple paths
-        * for a prefix from a peer if that peer is using AddPath.
-        * Note that the clearing action may change the pi list; we try to do
-        * a "safe" iteration.
-        */
-       for (pi = bgp_dest_get_bgp_path_info(dest); pi; pi = next) {
-               next = pi ? pi->next : NULL;
+       /* graceful restart STALE flag set. */
+       if (((CHECK_FLAG(pi->peer->sflags, PEER_STATUS_NSF_WAIT)
+             && pi->peer->nsf[afi][safi])
+            || CHECK_FLAG(pi->peer->af_sflags[afi][safi],
+                          PEER_STATUS_ENHANCED_REFRESH))
+           && !CHECK_FLAG(pi->flags, BGP_PATH_STALE)
+           && !CHECK_FLAG(pi->flags, BGP_PATH_UNUSEABLE)) {
 
-               if (!bgp_clearing_batch_check_peer(cinfo, pi->peer))
-                       continue;
+               bgp_path_info_set_flag(dest, pi, BGP_PATH_STALE);
+       } else {
+               /* If this is an EVPN route, process for
+                * un-import. */
+               if (safi == SAFI_EVPN)
+                       bgp_evpn_unimport_route(
+                               bgp, afi, safi,
+                               bgp_dest_get_prefix(dest), pi);
+               /* Handle withdraw for VRF route-leaking and L3VPN */
+               if (SAFI_UNICAST == safi
+                   && (bgp->inst_type == BGP_INSTANCE_TYPE_VRF ||
+                       bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT)) {
+                       vpn_leak_from_vrf_withdraw(bgp_get_default(),
+                                                  bgp, pi);
+               }
+               if (SAFI_MPLS_VPN == safi &&
+                   bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT) {
+                       vpn_leak_to_vrf_withdraw(pi);
+               }
 
-               /* graceful restart STALE flag set. */
-               if (((CHECK_FLAG(pi->peer->sflags, PEER_STATUS_NSF_WAIT)
-                     && pi->peer->nsf[afi][safi])
-                    || CHECK_FLAG(pi->peer->af_sflags[afi][safi],
-                                  PEER_STATUS_ENHANCED_REFRESH))
-                   && !CHECK_FLAG(pi->flags, BGP_PATH_STALE)
-                   && !CHECK_FLAG(pi->flags, BGP_PATH_UNUSEABLE))
-                       bgp_path_info_set_flag(dest, pi, BGP_PATH_STALE);
-               else {
-                       /* If this is an EVPN route, process for
-                        * un-import. */
-                       if (safi == SAFI_EVPN)
-                               bgp_evpn_unimport_route(
-                                       bgp, afi, safi,
-                                       bgp_dest_get_prefix(dest), pi);
-                       /* Handle withdraw for VRF route-leaking and L3VPN */
-                       if (SAFI_UNICAST == safi
-                           && (bgp->inst_type == BGP_INSTANCE_TYPE_VRF ||
-                               bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT)) {
-                               vpn_leak_from_vrf_withdraw(bgp_get_default(),
-                                                          bgp, pi);
-                       }
-                       if (SAFI_MPLS_VPN == safi &&
-                           bgp->inst_type == BGP_INSTANCE_TYPE_DEFAULT) {
-                               vpn_leak_to_vrf_withdraw(pi);
-                       }
+               bgp_rib_remove(dest, pi, pi->peer, afi, safi);
+       }
+}
 
-                       bgp_rib_remove(dest, pi, pi->peer, afi, safi);
-               }
+/*
+ * Helper to capture interrupt/resume context info for clearing processing. We
+ * may be iterating at two levels, so we may need to capture two levels of context
+ * or keying data.
+ */
+static void set_clearing_resume_info(struct bgp_clearing_info *cinfo,
+                                    const struct bgp_table *table,
+                                    const struct bgp_dest *dest, bool inner_p)
+{
+       if (bgp_debug_neighbor_events(NULL))
+               zlog_debug("%s: %sinfo for %s/%s %pFX", __func__,
+                          inner_p ? "inner " : "", afi2str(table->afi),
+                          safi2str(table->safi), &dest->rn->p);
+
+       SET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_RESUME);
+
+       if (inner_p) {
+               cinfo->inner_afi = table->afi;
+               cinfo->inner_safi = table->safi;
+               cinfo->inner_pfx = dest->rn->p;
+               SET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_INNER);
+       } else {
+               cinfo->last_afi = table->afi;
+               cinfo->last_safi = table->safi;
+               cinfo->last_pfx = dest->rn->p;
        }
+}
 
-       /* Unref this dest and table */
-       bgp_dest_unlock_node(dest);
-       bgp_table_unlock(table);
+/*
+ * Helper to establish position in a table, possibly using "resume" info stored
+ * during an iteration
+ */
+static struct bgp_dest *clearing_dest_helper(struct bgp_table *table,
+                                            struct bgp_clearing_info *cinfo,
+                                            bool inner_p)
+{
+       struct bgp_dest *dest;
+       const struct prefix *pfx;
 
-       counter++;
-       if (counter < BGP_CLEARING_BATCH_MAX_DESTS)
-               goto next_dest;
+       /* Iterate at start of table, or resume using inner or outer prefix */
+       dest = bgp_table_top(table);
 
-done:
+       if (CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_RESUME)) {
+               pfx = NULL;
+               if (inner_p) {
+                       if (CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_INNER))
+                               pfx = &(cinfo->inner_pfx);
+               } else {
+                       pfx = &(cinfo->last_pfx);
+               }
 
-       /* If there are still dests to process, reschedule. */
-       if (bgp_clearing_batch_dests_present(cinfo)) {
-               if (bgp_debug_neighbor_events(NULL))
-                       zlog_debug("%s: Batch %p: Rescheduled after processing %d dests",
-                                  __func__, cinfo, counter);
+               if (pfx) {
+                       dest = bgp_node_match(table, pfx);
+                       if (dest) {
+                               /* if 'dest' matches or precedes the 'last' prefix
+                                * visited, then advance.
+                                */
+                               while (dest && (prefix_cmp(&(dest->rn->p), pfx) <= 0))
+                                       dest = bgp_route_next(dest);
+                       }
+               }
+       }
 
-               event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo,
-                               0, &cinfo->t_sched);
-       } else {
-               if (bgp_debug_neighbor_events(NULL))
-                       zlog_debug("%s: Batch %p: Done after processing %d dests",
-                                  __func__, cinfo, counter);
+       return dest;
+}
+
+/*
+ * Callback to begin or resume the rib-walk for peer clearing, with info carried in
+ * a clearing context.
+ */
+static void clear_dests_callback(struct event *event)
+{
+       int ret;
+       struct bgp_clearing_info *cinfo = EVENT_ARG(event);
+
+       /* Begin, or continue, work */
+       ret = clear_batch_rib_helper(cinfo);
+       if (ret == 0) {
+               /* All done, clean up context */
                bgp_clearing_batch_completed(cinfo);
+       } else {
+               /* Need to resume the work, with 'cinfo' */
+               event_add_event(bm->master, clear_dests_callback, cinfo, 0,
+                               &cinfo->t_sched);
        }
-
-       return;
 }
 
 /*
- * Walk a single table for batch peer clearing processing
+ * Walk a single table for batch peer clearing processing. Limit the number of dests
+ * examined, and return when reaching the limit. Capture "last" info about the
+ * last dest we process so we can resume later.
  */
-static void clear_batch_table_helper(struct bgp_clearing_info *cinfo,
-                                    struct bgp_table *table)
+static int walk_batch_table_helper(struct bgp_clearing_info *cinfo,
+                                  struct bgp_table *table, bool inner_p)
 {
+       int ret = 0;
        struct bgp_dest *dest;
        bool force = (cinfo->bgp->process_queue == NULL);
-       uint32_t examined = 0, queued = 0;
+       uint32_t examined = 0, processed = 0;
 
-       for (dest = bgp_table_top(table); dest; dest = bgp_route_next(dest)) {
+       /* Locate starting dest, possibly using "resume" info */
+       dest = clearing_dest_helper(table, cinfo, inner_p);
+       if (dest == NULL) {
+               /* Nothing more to do for this table? */
+               return 0;
+       }
+
+       for ( ; dest; dest = bgp_route_next(dest)) {
                struct bgp_path_info *pi, *next;
                struct bgp_adj_in *ain;
                struct bgp_adj_in *ain_next;
 
                examined++;
+               cinfo->curr_counter++;
 
                ain = dest->adj_in;
                while (ain) {
@@ -6593,27 +6636,40 @@ static void clear_batch_table_helper(struct bgp_clearing_info *cinfo,
                        if (!bgp_clearing_batch_check_peer(cinfo, pi->peer))
                                continue;
 
-                       queued++;
+                       processed++;
 
                        if (force) {
                                bgp_path_info_reap(dest, pi);
                        } else {
-                               /* Unlocked after processing */
-                               bgp_table_lock(bgp_dest_table(dest));
-                               bgp_dest_lock_node(dest);
-
-                               bgp_clearing_batch_add_dest(cinfo, dest);
-                               break;
+                               /* Do clearing for this pi */
+                               clearing_clear_one_pi(table, dest, pi);
                        }
                }
+
+               if (cinfo->curr_counter >= bm->peer_clearing_batch_max_dests) {
+                       /* Capture info about last dest seen and break */
+                       if (bgp_debug_neighbor_events(NULL))
+                               zlog_debug("%s: %s/%s: pfx %pFX reached limit %u",
+                                          __func__, afi2str(table->afi),
+                                          safi2str(table->safi), &dest->rn->p,
+                                          cinfo->curr_counter);
+
+                       /* Reset the counter */
+                       cinfo->curr_counter = 0;
+                       set_clearing_resume_info(cinfo, table, dest, inner_p);
+                       ret = -1;
+                       break;
+               }
        }
 
        if (examined > 0) {
                if (bgp_debug_neighbor_events(NULL))
-                       zlog_debug("%s: %s/%s: examined %u, queued %u",
+                       zlog_debug("%s: %s/%s: examined %u dests, processed %u paths",
                                   __func__, afi2str(table->afi),
-                                  safi2str(table->safi), examined, queued);
+                                  safi2str(table->safi), examined, processed);
        }
+
+       return ret;
 }
 
 /*
@@ -6621,37 +6677,96 @@ static void clear_batch_table_helper(struct bgp_clearing_info *cinfo,
  * dests that are affected by the peers in the batch, enqueue the dests for
  * async processing.
  */
-static void clear_batch_rib_helper(struct bgp_clearing_info *cinfo)
+static int clear_batch_rib_helper(struct bgp_clearing_info *cinfo)
 {
+       int ret = 0;
        afi_t afi;
        safi_t safi;
        struct bgp_dest *dest;
-       struct bgp_table *table;
+       struct bgp_table *table, *outer_table;
 
-       FOREACH_AFI_SAFI (afi, safi) {
-               /* Identify table to be examined */
-               if (safi != SAFI_MPLS_VPN && safi != SAFI_ENCAP &&
-                   safi != SAFI_EVPN) {
-                       table = cinfo->bgp->rib[afi][safi];
-                       if (!table)
+       /* Maybe resume afi/safi iteration */
+       if (CHECK_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_RESUME)) {
+               afi = cinfo->last_afi;
+               safi = cinfo->last_safi;
+       } else {
+               afi = AFI_IP;
+               safi = SAFI_UNICAST;
+       }
+
+       /* Iterate through afi/safi combos */
+       for (; afi < AFI_MAX; afi++) {
+               for (; safi < SAFI_MAX; safi++) {
+                       /* Identify table to be examined: special handling
+                        * for some SAFIs
+                        */
+                       if (bgp_debug_neighbor_events(NULL))
+                               zlog_debug("%s: examining AFI/SAFI %s/%s", __func__, afi2str(afi),
+                                          safi2str(safi));
+
+                       /* Record the tables we've seen and don't repeat */
+                       if (cinfo->table_map[afi][safi] > 0)
                                continue;
 
-                       clear_batch_table_helper(cinfo, table);
-               } else {
-                       for (dest = bgp_table_top(cinfo->bgp->rib[afi][safi]);
-                            dest; dest = bgp_route_next(dest)) {
-                               table = bgp_dest_get_bgp_table_info(dest);
-                               if (!table)
+                       if (safi != SAFI_MPLS_VPN && safi != SAFI_ENCAP && safi != SAFI_EVPN) {
+                               table = cinfo->bgp->rib[afi][safi];
+                               if (!table) {
+                                       /* Invalid table: don't use 'resume' info */
+                                       UNSET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_RESUME);
                                        continue;
+                               }
 
-                               /* TODO -- record the tables we've seen
-                                * and don't repeat any?
-                                */
+                               ret = walk_batch_table_helper(cinfo, table, false /*inner*/);
+                               if (ret != 0)
+                                       break;
+
+                               cinfo->table_map[afi][safi] = 1;
+
+                       } else {
+                               /* Process "inner" table for these SAFIs */
+                               outer_table = cinfo->bgp->rib[afi][safi];
+
+                               /* Begin or resume iteration in "outer" table */
+                               dest = clearing_dest_helper(outer_table, cinfo, false);
 
-                               clear_batch_table_helper(cinfo, table);
+                               for (; dest; dest = bgp_route_next(dest)) {
+                                       table = bgp_dest_get_bgp_table_info(dest);
+                                       if (!table) {
+                                               /* If we resumed to an inner afi/safi, but
+                                                * it's no longer valid, reset resume info.
+                                                */
+                                               UNSET_FLAG(cinfo->flags,
+                                                          BGP_CLEARING_INFO_FLAG_RESUME);
+                                               continue;
+                                       }
+
+                                       /* This will resume the "inner" walk if necessary */
+                                       ret = walk_batch_table_helper(cinfo, table, true /*inner*/);
+                                       if (ret != 0) {
+                                               /* The "inner" resume info will be set; 
+                                                * capture the resume info we need
+                                                * from the outer afi/safi and dest
+                                                */
+                                               set_clearing_resume_info(cinfo, outer_table, dest,
+                                                                        false);
+                                               break;
+                                       }
+                               }
+
+                               if (ret != 0)
+                                       break;
+
+                               cinfo->table_map[afi][safi] = 1;
                        }
+
+                       /* We've finished with a table: ensure we don't try to use stale
+                        * resume info.
+                        */
+                       UNSET_FLAG(cinfo->flags, BGP_CLEARING_INFO_FLAG_RESUME);
                }
+               safi = SAFI_UNICAST;
        }
+       return ret;
 }
 
 /*
@@ -6660,21 +6775,30 @@ static void clear_batch_rib_helper(struct bgp_clearing_info *cinfo)
  */
 void bgp_clear_route_batch(struct bgp_clearing_info *cinfo)
 {
-       if (bgp_debug_neighbor_events(NULL))
-               zlog_debug("%s: BGP %s, batch %p", __func__,
-                          cinfo->bgp->name_pretty, cinfo);
-
-       /* Walk the rib, checking the peers in the batch */
-       clear_batch_rib_helper(cinfo);
+       int ret;
 
-       /* If we found some prefixes, schedule a task to begin work. */
-       if (bgp_clearing_batch_dests_present(cinfo))
-               event_add_event(bm->master, bgp_clear_batch_dests_task, cinfo,
-                               0, &cinfo->t_sched);
+       if (bgp_debug_neighbor_events(NULL))
+               zlog_debug("%s: BGP %s, batch %u", __func__,
+                          cinfo->bgp->name_pretty, cinfo->id);
 
-       /* NB -- it's the caller's job to clean up, release refs, etc. if
-        * we didn't find any dests
+       /* Walk the rib, checking the peers in the batch. If the rib walk needs
+        * to continue, a task will be scheduled
         */
+       ret = clear_batch_rib_helper(cinfo);
+       if (ret == 0) {
+               /* All done - clean up. */
+               bgp_clearing_batch_completed(cinfo);
+       } else {
+               /* Handle pause/resume for the walk: we've captured key info
+                * in cinfo so we can resume later.
+                */
+               if (bgp_debug_neighbor_events(NULL))
+                       zlog_debug("%s: reschedule cinfo at %s/%s, %pFX", __func__,
+                                  afi2str(cinfo->last_afi),
+                                  safi2str(cinfo->last_safi), &(cinfo->last_pfx));
+               event_add_event(bm->master, clear_dests_callback, cinfo, 0,
+                               &cinfo->t_sched);
+       }
 }
 
 void bgp_clear_route_all(struct peer *peer)
@@ -6682,17 +6806,14 @@ void bgp_clear_route_all(struct peer *peer)
        afi_t afi;
        safi_t safi;
 
+       if (bgp_debug_neighbor_events(peer))
+               zlog_debug("%s: peer %pBP", __func__, peer);
+
        /* We may be able to batch multiple peers' clearing work: check
         * and see.
         */
-       if (bgp_clearing_batch_add_peer(peer->bgp, peer)) {
-               if (bgp_debug_neighbor_events(peer))
-                       zlog_debug("%s: peer %pBP batched", __func__, peer);
+       if (bgp_clearing_batch_add_peer(peer->bgp, peer))
                return;
-       }
-
-       if (bgp_debug_neighbor_events(peer))
-               zlog_debug("%s: peer %pBP", __func__, peer);
 
        FOREACH_AFI_SAFI (afi, safi)
                bgp_clear_route(peer, afi, safi);
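
One detail worth noting in the bgp_route.c changes above: instead of holding
locks on each queued dest as the old code did, the walk now records the last
prefix visited and re-finds its position on resume by comparing prefixes and
advancing past everything already seen (see clearing_dest_helper()).  A
standalone sketch of that re-location step, using a sorted int array as a
stand-in for the prefix-ordered bgp_table; all names here are hypothetical:

/* Illustrative sketch only, not FRR code: resume an interrupted walk by
 * key comparison rather than by a locked pointer.
 */
#include <stddef.h>
#include <stdio.h>

static const int table[] = { 2, 3, 5, 8, 13, 21, 34 };
#define TABLE_LEN (sizeof(table) / sizeof(table[0]))

/* Return the index of the first entry strictly greater than 'last_key',
 * i.e. where an interrupted walk should resume; TABLE_LEN if done.
 */
static size_t resume_index(int last_key)
{
	size_t i;

	for (i = 0; i < TABLE_LEN; i++) {
		if (table[i] > last_key) /* like the prefix_cmp(...) <= 0 check */
			break;
	}
	return i;
}

int main(void)
{
	/* Pretend the first slice stopped after visiting key 8 */
	int last_key = 8;
	size_t i = resume_index(last_key);

	for (; i < TABLE_LEN; i++)
		printf("resumed visit: %d\n", table[i]);
	return 0;
}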
bgpd/bgpd.c
index 9000dc851a4e77074088bdd76dfff48b0be6bb1a..70ae4d20143d1a77ea326d96f83c00c148f780fe 100644 (file)
@@ -8698,6 +8698,11 @@ void bgp_master_init(struct event_loop *master, const int buffer_size,
        bm->t_bgp_zebra_l2_vni = NULL;
        bm->t_bgp_zebra_l3_vni = NULL;
 
+       bm->peer_clearing_batch_id = 1;
+       /* TODO -- make these configurable */
+       bm->peer_conn_errs_dequeue_limit = BGP_CONN_ERROR_DEQUEUE_MAX;
+       bm->peer_clearing_batch_max_dests = BGP_CLEARING_BATCH_MAX_DESTS;
+
        bgp_mac_init();
        /* init the rd id space.
           assign 0th index in the bitfield,
@@ -9096,6 +9101,7 @@ static void bgp_clearing_batch_begin(struct bgp *bgp)
        cinfo = XCALLOC(MTYPE_CLEARING_BATCH, sizeof(struct bgp_clearing_info));
 
        cinfo->bgp = bgp;
+       cinfo->id = bm->peer_clearing_batch_id++;
 
        /* Init hash of peers and list of dests */
        bgp_clearing_hash_init(&cinfo->peers);
@@ -9131,15 +9137,10 @@ static void bgp_clearing_batch_end(struct bgp *bgp)
        /* Do a RIB walk for the current batch. If it finds dests/prefixes
         * to work on, this will schedule a task to process
         * the dests/prefixes in the batch.
+        * NB this will free the batch if it finishes, or if there was no work
+        * to do.
         */
        bgp_clear_route_batch(cinfo);
-
-       /* If we found no prefixes/dests, just discard the batch,
-        * remembering that we're holding a ref for each peer.
-        */
-       if (bgp_clearing_destlist_count(&cinfo->destlist) == 0) {
-               bgp_clearing_batch_completed(cinfo);
-       }
 }
 
 /* Check whether a dest's peer is relevant to a clearing batch */
@@ -9242,6 +9243,10 @@ bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer)
 
                        bgp_clearing_hash_add(&cinfo->peers, peer);
                        SET_FLAG(peer->flags, PEER_FLAG_CLEARING_BATCH);
+
+                       if (bgp_debug_neighbor_events(peer))
+                               zlog_debug("%s: peer %pBP batched in %#x", __func__,
+                                          peer, cinfo->id);
                }
                return true;
        }
@@ -9254,17 +9259,12 @@ bool bgp_clearing_batch_add_peer(struct bgp *bgp, struct peer *peer)
  * encountered in the io pthread. We avoid having the io pthread try
  * to enqueue fsm events or mess with the peer struct.
  */
-
-/* TODO -- should this be configurable? */
-/* Max number of peers to process without rescheduling */
-#define BGP_CONN_ERROR_DEQUEUE_MAX 10
-
 static void bgp_process_conn_error(struct event *event)
 {
        struct bgp *bgp;
        struct peer *peer;
        struct peer_connection *connection;
-       int counter = 0;
+       uint32_t counter = 0;
        size_t list_count = 0;
        bool more_p = false;
 
@@ -9311,7 +9311,7 @@ static void bgp_process_conn_error(struct event *event)
                bgp_event_update(connection, connection->connection_errcode);
 
                counter++;
-               if (counter >= BGP_CONN_ERROR_DEQUEUE_MAX)
+               if (counter >= bm->peer_conn_errs_dequeue_limit)
                        break;
 
                connection = bgp_dequeue_conn_err(bgp, &more_p);
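
The bgpd.c hunks above also move the connection-error dequeue limit into the
new bm->peer_conn_errs_dequeue_limit field.  The shape of that loop, bounded
work per callback with a reschedule whenever the queue is non-empty, can be
sketched standalone like this; err_queue and its fields are invented for
illustration:

/* Illustrative sketch only, not FRR code: bounded dequeue per callback. */
#include <stdbool.h>
#include <stdio.h>

struct err_queue {
	unsigned int pending; /* queued errored connections */
	unsigned int limit;   /* like bm->peer_conn_errs_dequeue_limit */
};

/* One callback invocation; returns true if it must be rescheduled. */
static bool process_conn_errors(struct err_queue *q)
{
	unsigned int counter = 0;

	while (q->pending > 0 && counter < q->limit) {
		q->pending--; /* ...handle one connection error... */
		counter++;
	}
	return q->pending > 0;
}

int main(void)
{
	struct err_queue q = { .pending = 25, .limit = 10 };

	while (process_conn_errors(&q))
		; /* stands in for event_add_event() re-queueing */
	printf("queue drained\n");
	return 0;
}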
bgpd/bgpd.h
index 9bedf4477a49bd00e5359709a9e1c8d25a763a2f..5656774bc5d6680ae74432cf5f6640630d0b719d 100644 (file)
@@ -48,6 +48,11 @@ DECLARE_HOOK(bgp_hook_config_write_vrf, (struct vty *vty, struct vrf *vrf),
 /* Default interval for IPv6 RAs when triggered by BGP unnumbered neighbor. */
 #define BGP_UNNUM_DEFAULT_RA_INTERVAL 10
 
+/* Max number of peers to process without rescheduling */
+#define BGP_CONN_ERROR_DEQUEUE_MAX 10
+/* Limit the number of clearing dests we'll process per callback */
+#define BGP_CLEARING_BATCH_MAX_DESTS 100
+
 struct update_subgroup;
 struct bpacket;
 struct bgp_pbr_config;
@@ -214,6 +219,16 @@ struct bgp_master {
        /* To preserve ordering of processing of BGP-VRFs for L3 VNIs */
        struct zebra_l3_vni_head zebra_l3_vni_head;
 
+       /* ID value for peer clearing batches */
+       uint32_t peer_clearing_batch_id;
+
+       /* Limits for batched peer clearing code:
+        * Max number of errored peers to process without rescheduling
+        */
+       uint32_t peer_conn_errs_dequeue_limit;
+       /* Limit the number of clearing dests we'll process per callback */
+       uint32_t peer_clearing_batch_max_dests;
+
        QOBJ_FIELDS;
 };
 DECLARE_QOBJ_TYPE(bgp_master);
@@ -414,6 +429,9 @@ struct bgp_clearing_info {
        /* Hash of peers */
        struct bgp_clearing_hash_head peers;
 
+       /* Batch ID, for debugging/logging */
+       uint32_t id;
+
        /* Flags */
        uint32_t flags;
 
@@ -423,9 +441,28 @@ struct bgp_clearing_info {
        /* Event to schedule/reschedule processing */
        struct event *t_sched;
 
-       /* TODO -- id, serial number, for debugging/logging? */
+       /* Info for rescheduling the RIB walk */
+       afi_t last_afi;
+       safi_t last_safi;
+       struct prefix last_pfx;
 
-       /* TODO -- info for rescheduling the RIB walk? future? */
+       /* For some afi/safi (vpn/evpn e.g.), bgp may do an inner walk
+        * for a related table; the 'last' info represents the outer walk,
+        * and this info represents the inner walk.
+        */
+       afi_t inner_afi;
+       safi_t inner_safi;
+       struct prefix inner_pfx;
+
+       /* Map of afi/safi so we don't re-walk any tables */
+       uint8_t table_map[AFI_MAX][SAFI_MAX];
+
+       /* Counters: current iteration, overall total, and processed count. */
+       uint32_t curr_counter;
+       uint32_t total_counter;
+       uint32_t total_processed;
+
+       /* TODO -- id, serial number, for debugging/logging? */
 
        /* Linkage for list of batches per bgp */
        struct bgp_clearing_info_item link;
@@ -433,6 +470,10 @@ struct bgp_clearing_info {
 
 /* Batch is open, new peers can be added */
 #define BGP_CLEARING_INFO_FLAG_OPEN  (1 << 0)
+/* Batch is resuming iteration after yielding */
+#define BGP_CLEARING_INFO_FLAG_RESUME (1 << 1)
+/* Batch has 'inner' resume info set */
+#define BGP_CLEARING_INFO_FLAG_INNER (1 << 2)
 
 /* BGP instance structure.  */
 struct bgp {
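
Finally, the table_map[afi][safi] array added to struct bgp_clearing_info
ensures a resumed walk never revisits a table it already finished: a table is
marked only after it completes, so a partially walked table stays eligible for
resumption.  A simplified standalone sketch of that bookkeeping; the
dimensions and names below are stand-ins, not AFI_MAX/SAFI_MAX:

/* Illustrative sketch only, not FRR code: skip tables already completed. */
#include <stdint.h>
#include <stdio.h>

#define N_AFI  3
#define N_SAFI 8

static uint8_t table_map[N_AFI][N_SAFI];

static void walk_all_tables(void)
{
	for (int afi = 0; afi < N_AFI; afi++) {
		for (int safi = 0; safi < N_SAFI; safi++) {
			if (table_map[afi][safi]) /* already finished */
				continue;
			/* ...walk the table; the real code may yield here... */
			table_map[afi][safi] = 1; /* set only on completion */
		}
	}
}

int main(void)
{
	walk_all_tables(); /* first pass */
	walk_all_tables(); /* a resumed pass skips completed tables */
	printf("all tables walked once\n");
	return 0;
}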