From cd0558e629b66916f8e8f59d4871fe2cb20d18fc Mon Sep 17 00:00:00 2001 From: Rajasekar Raja Date: Sun, 23 Jul 2023 05:43:12 +0000 Subject: [PATCH] zebra: fix nhe refcnt when frr service goes down MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When frr.service is going down(restart or stop), zebra core can be seen. Sequence of events leading to crash: Increments of nhe refcnt: - Upper level creates a new nhe(say NHE1) —> nhe->refcnt=1 - Two RE’s (Say RE1 & RE2) associate with NHE1 —> nhe->refcnt = 3 Decrements of nhe refcnt: - BGP sends a zapi msg to zebra to delete NHG. —> nhe->refcnt = 2 - RE1 is queued for delete in META-Q - As zebra is dissociating with its clients, zebra_nhg_score_proto() is invoked -> nhe->refcnt=1 - RE2 is no more associated with the NHE1 —>nhe->refcnt=0 & hence NHE IS FREED - Now RE1 is dequeued from META-Q for processing the re delete. —> At this point re->nhe is pointing to freed pointer. CRASH CRASH!!!! Fix: - When we iterate zebra_nhg_score_proto_entry() to delete the upper proto specific nhe’s, we need to skip the additional nhe->refcnt decrement in case nhe->flags has NEXTHOP_GROUP_PROTO_RELEASED set. Backtrace-1 0x00007fa8449ce8eb in raise () from /lib/x86_64-linux-gnu/libc.so.6 0x00007fa8449b9535 in abort () from /lib/x86_64-linux-gnu/libc.so.6 0x00007fa844d32f86 in _zlog_assert_failed (xref=xref@entry=0x55fa37871040 <_xref.28142>, extra=extra@entry=0x0) at lib/zlog.c:680 0x000055fa3778f770 in rib_re_nhg_free (re=0x55fa39e33770) at zebra/zebra_rib.c:2578 rib_unlink (rn=0x55fa39e27a60, re=0x55fa39e33770) at zebra/zebra_rib.c:3930 0x000055fa3778ff18 in rib_process (rn=0x55fa39e27a60) at zebra/zebra_rib.c:1439 0x000055fa37790b1c in process_subq_route (qindex=8 '\b', lnode=0x55fa39e1c1b0) at zebra/zebra_rib.c:2549 process_subq (qindex=META_QUEUE_BGP, subq=0x55fa3999c580) at zebra/zebra_rib.c:3107 meta_queue_process (dummy=, data=0x55fa3999c480) at zebra/zebra_rib.c:3146 0x00007fa844d232b8 in work_queue_run (thread=0x7ffffbdf6cb0) at lib/workqueue.c:285 0x00007fa844d195fd in thread_call (thread=thread@entry=0x7ffffbdf6cb0) at lib/thread.c:2008 0x00007fa844cd3888 in frr_run (master=0x55fa397b7630) at lib/libfrr.c:1223 0x000055fa3771e294 in main (argc=12, argv=0x7ffffbdf7098) at zebra/main.c:526 Backtrace-2 0x00007f125af3f535 in abort () from /lib/x86_64-linux-gnu/libc.so.6 0x00007f125b2b8f96 in _zlog_assert_failed (xref=xref@entry=0x7f125b344260 <_xref.18768>, extra=extra@entry=0x0) at lib/zlog.c:680 0x00007f125b268190 in nexthop_copy_no_recurse (copy=copy@entry=0x5606dd726f10, nexthop=nexthop@entry=0x7f125b0d7f90, rparent=) at lib/nexthop.c:806 0x00007f125b2681b2 in nexthop_copy (copy=0x5606dd726f10, nexthop=0x7f125b0d7f90, rparent=) at lib/nexthop.c:836 0x00007f125b268249 in nexthop_dup (nexthop=nexthop@entry=0x7f125b0d7f90, rparent=rparent@entry=0x0) at lib/nexthop.c:860 0x00007f125b26b67b in copy_nexthops (tnh=tnh@entry=0x5606dd9ec748, nh=, rparent=rparent@entry=0x0) at lib/nexthop_group.c:457 0x00007f125b26b6ba in nexthop_group_copy (to=to@entry=0x5606dd9ec748, from=from@entry=0x5606dd9ee9f8) at lib/nexthop_group.c:291 0x00005606db6ec678 in zebra_nhe_copy (orig=0x5606dd9ee9d0, id=id@entry=0) at zebra/zebra_nhg.c:431 0x00005606db6ddc63 in mpls_ftn_uninstall_all (zvrf=zvrf@entry=0x5606dd6e7cd0, afi=afi@entry=2, lsp_type=ZEBRA_LSP_NONE) at zebra/zebra_mpls.c:3410 0x00005606db6de108 in zebra_mpls_cleanup_zclient_labels (client=0x5606dd8e03b0) at ./zebra/zebra_mpls.h:471 0x00005606db73e575 in hook_call_zserv_client_close (client=0x5606dd8e03b0) at zebra/zserv.c:566 zserv_client_free (client=0x5606dd8e03b0) at zebra/zserv.c:585 zserv_close_client (client=0x5606dd8e03b0) at zebra/zserv.c:706 0x00007f125b29f60d in thread_call (thread=thread@entry=0x7ffc2a740290) at lib/thread.c:2008 0x00007f125b259888 in frr_run (master=0x5606dd3b7630) at lib/libfrr.c:1223 0x00005606db68d298 in main (argc=12, argv=0x7ffc2a740678) at zebra/main.c:534 Issue: 3492031 Ticket# 3492031 Signed-off-by: Rajasekar Raja --- zebra/zebra_nhg.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/zebra/zebra_nhg.c b/zebra/zebra_nhg.c index 977ea425d5..94db148840 100644 --- a/zebra/zebra_nhg.c +++ b/zebra/zebra_nhg.c @@ -3635,7 +3635,18 @@ unsigned long zebra_nhg_score_proto(int type) * This should be the last ref if we remove client routes too, * and thus should remove and free them. */ - zebra_nhg_decrement_ref(nhe); + if (!CHECK_FLAG(nhe->flags, NEXTHOP_GROUP_PROTO_RELEASED)) + zebra_nhg_decrement_ref(nhe); + else { + + /* protocol sends explicit delete of nhg, the + * nhe->refcount is decremented in zread_nhg_del() + */ + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + zlog_debug( + "%s: nhe %u (%p) refcount %u already decremented in zread_nhg_del", + __func__, nhe->id, nhe, nhe->refcnt); + } } count = iter.found->count; -- 2.39.5