From dc65cd999d85bcbd356997a112feecddce441620 Mon Sep 17 00:00:00 2001 From: Stephen Worley Date: Thu, 22 Apr 2021 17:21:12 -0400 Subject: [PATCH] zebra: handle gracefulRS/retain with proto NHGs Properly handle refcounting of Proto-owned NHGs when zebra is operating under graceful restart and retain conditions. We have an extra refcnt of 1 we keep for proto-owned NHGs to indicate the upper level proto has created and owns it. When we are reading these in from the kernel, we need to set them to 1 as appropriate. Without this, we fail in the assert() during zebra_nhg_proto_add() after the owning daemon resends the NHG and the refcnts are off by one. Also add in the same logic we use for routes when sweeping with respect to uptimes. Signed-off-by: Stephen Worley --- zebra/zebra_nhg.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/zebra/zebra_nhg.c b/zebra/zebra_nhg.c index 9640d05f05..1d3da4e673 100644 --- a/zebra/zebra_nhg.c +++ b/zebra/zebra_nhg.c @@ -1193,6 +1193,13 @@ static int nhg_ctx_process_new(struct nhg_ctx *ctx) if (IS_ZEBRA_DEBUG_NHG_DETAIL) zlog_debug("%s: nhe %p (%u) is new", __func__, nhe, nhe->id); + /* + * If daemon nhg from the kernel, add a refcnt here to indicate the + * daemon owns it. + */ + if (PROTO_OWNED(nhe)) + zebra_nhg_increment_ref(nhe); + SET_FLAG(nhe->flags, NEXTHOP_GROUP_VALID); SET_FLAG(nhe->flags, NEXTHOP_GROUP_INSTALLED); @@ -2929,7 +2936,31 @@ static void zebra_nhg_sweep_entry(struct hash_bucket *bucket, void *arg) nhe = (struct nhg_hash_entry *)bucket->data; - /* If its being ref'd, just let it be uninstalled via a route removal */ + /* + * same logic as with routes. + * + * If older than startup time, we know we read them in from the + * kernel and have not gotten an update for them since startup + * from an upper level proto. 
+ */ + if (zrouter.startup_time < nhe->uptime) + return; + + /* + * If it's proto-owned and not being used by a route, remove it since + * we haven't gotten an update about it from the proto since startup. + * This means that either the config for it was removed or the daemon + * didn't get started. This handles graceful restart & retain scenario. + */ + if (PROTO_OWNED(nhe) && nhe->refcnt == 1) { + zebra_nhg_decrement_ref(nhe); + return; + } + + /* + * If it's being ref'd by routes, just let it be uninstalled via a route + * removal. + */ if (ZEBRA_NHG_CREATED(nhe) && nhe->refcnt <= 0) zebra_nhg_uninstall_kernel(nhe); } -- 2.39.5