From de168795ab5d75151f1b410fcdad88e2c28ef40e Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Tue, 11 Mar 2025 12:15:32 -0700 Subject: [PATCH] zebra: Fix reinstalling nexthops in NHGs upon interface flaps Trigger: Imagine a route utilizing an NHG with six nexthops (Intf swp1-swp6). If interfaces swp1-swp4 flaps, the NHG remains the same but now only references two nexthops (swp5-6) instead of all six. This behavior occurs due to how NHGs with recursive nexthops are managed within Zebra. In the scenario below, NHG 370 has all six nexthops installed in the kernel. However, Zebra maintains a list of recursive NHGs that NHG 370 references i.e., Depends: (371), (372), (373) which are not directly installed in the kernel. - When an interface comes up, its nexthop and corresponding dependents are installed. - These dependents (counterparts to 371-373) are non-recursive and are installed as well. - However, when attempting to install the recursive ones in zebra_nhg_install_kernel(), they resolve to the already installed counterparts, resulting in a NO-OP. Fixing this by iterating all dependents of the recursively resolved NHGs and reinstalling them. Trigger: Flap swp1 to swp4 Before Fix: root@leaf-11:mgmt:/var/home/cumulus# ip route show | grep 6.0.0.5 6.0.0.5 nhid 370 proto bgp metric 20 ip -d next show id 337 via 2000:1:0:1:0:f:0:9 dev swp6 scope link proto zebra id 339 via 2000:1:0:1:0:e:0:9 dev swp5 scope link proto zebra id 341 via 2000:1:0:1:0:8:0:8 dev swp4 scope link proto zebra id 343 via 2000:1:0:1:0:7:0:8 dev swp3 scope link proto zebra id 346 via 2000:1:0:1:0:1:0:7 dev swp2 scope link proto zebra id 348 via 2000:1:0:1::7 dev swp1 scope link proto zebra id 370 group 346/348/341/343/337/339 scope global proto zebra After Trigger: root@leaf-11:mgmt:/var/home/cumulus# ip route show | grep 6.0.0.5 6.0.0.5 nhid 370 proto bgp metric 20 root@leaf-11:mgmt:/var/home/cumulus# ip -d next show id 337 via 2000:1:0:1:0:f:0:9 dev swp6 scope link proto zebra id 339 via 2000:1:0:1:0:e:0:9 dev swp5 scope link proto zebra id 370 group 337/339 scope global proto zebra After Fix: root@leaf-11:mgmt:/var/home/cumulus# ip route show | grep 6.0.0.5 6.0.0.5 nhid 432 proto bgp metric 20 ip -d next show id 432 group 395/397/400/402/405/407 scope global proto zebra After Trigger root@leaf-11:mgmt:/var/home/cumulus# ip route show | grep 6.0.0.5 6.0.0.5 nhid 432 proto bgp metric 20 root@leaf-11:mgmt:/var/home/cumulus# ip -d next show id 432 group 395/397/400/402/405/407 scope global proto zebra Ticket :# Signed-off-by: Rajasekar Raja Signed-off-by: Donald Sharp --- .../test_all_protocol_startup.py | 28 +++++++++++++++++++ zebra/zebra_nhg.c | 18 ++++++++++++ 2 files changed, 46 insertions(+) diff --git a/tests/topotests/all_protocol_startup/test_all_protocol_startup.py b/tests/topotests/all_protocol_startup/test_all_protocol_startup.py index 06a350c8e9..80903b134e 100644 --- a/tests/topotests/all_protocol_startup/test_all_protocol_startup.py +++ b/tests/topotests/all_protocol_startup/test_all_protocol_startup.py @@ -652,6 +652,34 @@ def test_nexthop_groups(): nhg_id ) + ## Validate NHG's installed in kernel has same nexthops with Interface flaps + pre_out = net["r1"].cmd('ip route show | grep "5.5.5.1"') + pre_nhg = re.search(r"nhid\s+(\d+)", pre_out) + pre_nh_show = net["r1"].cmd("ip next show id {}".format(pre_nhg.group(1))) + pre_total_nhs = len((re.search(r"group ([\d/]+)", pre_nh_show)).group(1).split("/")) + + net["r1"].cmd( + "ip link set r1-eth1 down;ip link set r1-eth2 down;ip link set r1-eth3 down;ip link set r1-eth4 down" + ) + sleep(1) + net["r1"].cmd( + "ip link set r1-eth1 up;ip link set r1-eth2 up;ip link set r1-eth3 up;ip link set r1-eth4 up" + ) + sleep(5) + + post_out = net["r1"].cmd('ip route show | grep "5.5.5.1"') + post_nhg = re.search(r"nhid\s+(\d+)", post_out) + post_nh_show = net["r1"].cmd("ip next show id {}".format(post_nhg.group(1))) + post_total_nhs = len( + (re.search(r"group ([\d/]+)", post_nh_show)).group(1).split("/") + ) + + assert ( + post_total_nhs == pre_total_nhs + ), "Expected same nexthops(pre-{}: post-{}) in NHG (pre-{}:post-{}) after few Interface flaps".format( + pre_total_nhs, post_total_nhs, pre_nhg.group(1), post_nhg.group(1) + ) + ## Remove all NHG routes net["r1"].cmd('vtysh -c "sharp remove routes 2.2.2.1 1"') diff --git a/zebra/zebra_nhg.c b/zebra/zebra_nhg.c index f5141c8f23..b20aa3f2a3 100644 --- a/zebra/zebra_nhg.c +++ b/zebra/zebra_nhg.c @@ -1165,11 +1165,29 @@ static void zebra_nhg_handle_install(struct nhg_hash_entry *nhe, bool install) { /* Update validity of groups depending on it */ struct nhg_connected *rb_node_dep; + struct nhg_connected *rb_node_indirect_dep = NULL; frr_each_safe (nhg_connected_tree, &nhe->nhg_dependents, rb_node_dep) { zebra_nhg_set_valid(rb_node_dep->nhe, true); /* install dependent NHG into kernel */ if (install) { + if (CHECK_FLAG(nhe->flags, NEXTHOP_GROUP_INSTALLED) && + CHECK_FLAG(rb_node_dep->nhe->flags, NEXTHOP_GROUP_RECURSIVE)) { + frr_each_safe (nhg_connected_tree, &rb_node_dep->nhe->nhg_dependents, + rb_node_indirect_dep) { + SET_FLAG(rb_node_indirect_dep->nhe->flags, + NEXTHOP_GROUP_REINSTALL); + if (IS_ZEBRA_DEBUG_NHG_DETAIL) + zlog_debug("%s nh id %u (flags 0x%x) associated dependents NHG %pNG (flags 0x%x) Re-install", + __func__, rb_node_dep->nhe->id, + rb_node_dep->nhe->flags, + rb_node_indirect_dep->nhe, + rb_node_indirect_dep->nhe->flags); + zebra_nhg_install_kernel(rb_node_indirect_dep->nhe, + ZEBRA_ROUTE_MAX); + } + } + if (IS_ZEBRA_DEBUG_NHG_DETAIL) zlog_debug( "%s nh id %u (flags 0x%x) associated dependent NHG %pNG install", -- 2.39.5