From 07b9ebca65832813cc00722401f282a51a11ac17 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Wed, 1 Dec 2021 16:28:42 -0500 Subject: [PATCH] zebra: Ensure zebra_nhg_sweep_table accounts for double deletes I'm seeing this crash in various forms: Program terminated with signal SIGSEGV, Segmentation fault. 50 ../sysdeps/unix/sysv/linux/raise.c: No such file or directory. [Current thread is 1 (Thread 0x7f418efbc7c0 (LWP 3580253))] (gdb) bt (gdb) f 4 267 (*func)(hb, arg); (gdb) p hb $1 = (struct hash_bucket *) 0x558cdaafb250 (gdb) p *hb $2 = {len = 0, next = 0x0, key = 0, data = 0x0} (gdb) I've also seen a crash where data is 0x03. My suspicion is that hash_iterate is calling zebra_nhg_sweep_entry which does delete the particular entry we are looking at as well as possibly other entries when the ref count for those entries gets set to 0 as well. Then we have this loop in hash_iterate.c: for (i = 0; i < hash->size; i++) for (hb = hash->index[i]; hb; hb = hbnext) { /* get pointer to next hash bucket here, in case (*func) * decides to delete hb by calling hash_release */ hbnext = hb->next; (*func)(hb, arg); } Suppose in the previous loop hbnext is set to hb->next and we call zebra_nhg_sweep_entry. This deletes the previous entry and also happens to cause the hbnext entry to be deleted as well, because of nhg refcounts. At this point in time the memory pointed to by hbnext is not owned by the pthread anymore and we can end up on a state where it's overwritten by another pthread in zebra with data for other incoming events. What to do? Let's change the sweep function to a hash_walk and have it stop iterating and to start over if there is a possible double delete operation. Signed-off-by: Donald Sharp --- zebra/zebra_nhg.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/zebra/zebra_nhg.c b/zebra/zebra_nhg.c index aa015992d5..fac312cf7c 100644 --- a/zebra/zebra_nhg.c +++ b/zebra/zebra_nhg.c @@ -2995,7 +2995,7 @@ void zebra_nhg_dplane_result(struct zebra_dplane_ctx *ctx) dplane_ctx_fini(&ctx); } -static void zebra_nhg_sweep_entry(struct hash_bucket *bucket, void *arg) +static int zebra_nhg_sweep_entry(struct hash_bucket *bucket, void *arg) { struct nhg_hash_entry *nhe = NULL; @@ -3009,7 +3009,7 @@ static void zebra_nhg_sweep_entry(struct hash_bucket *bucket, void *arg) * from an upper level proto. */ if (zrouter.startup_time < nhe->uptime) - return; + return HASHWALK_CONTINUE; /* * If it's proto-owned and not being used by a route, remove it since @@ -3019,20 +3019,41 @@ static void zebra_nhg_sweep_entry(struct hash_bucket *bucket, void *arg) */ if (PROTO_OWNED(nhe) && nhe->refcnt == 1) { zebra_nhg_decrement_ref(nhe); - return; + return HASHWALK_ABORT; } /* * If its being ref'd by routes, just let it be uninstalled via a route * removal. */ - if (ZEBRA_NHG_CREATED(nhe) && nhe->refcnt <= 0) + if (ZEBRA_NHG_CREATED(nhe) && nhe->refcnt <= 0) { zebra_nhg_uninstall_kernel(nhe); + return HASHWALK_ABORT; + } + + return HASHWALK_CONTINUE; } void zebra_nhg_sweep_table(struct hash *hash) { - hash_iterate(hash, zebra_nhg_sweep_entry, NULL); + uint32_t count; + + /* + * Yes this is extremely odd. Effectively nhg's have + * other nexthop groups that depend on them and when you + * remove them, you can have other entries blown up. + * our hash code does not work with deleting multiple + * entries at a time and will possibly cause crashes + * So what to do? Whenever zebra_nhg_sweep_entry + * deletes an entry it will return HASHWALK_ABORT, + * cause that deletion might have triggered more. + * then we can just keep sweeping this table + * until nothing more is found to do. + */ + do { + count = hashcount(hash); + hash_walk(hash, zebra_nhg_sweep_entry, NULL); + } while (count != hashcount(hash)); } static void zebra_nhg_mark_keep_entry(struct hash_bucket *bucket, void *arg) -- 2.39.5