git.puffer.fish Git - matthieu/frr.git/commitdiff
tests: Fix ospf[6]_gr_topo1 tests to work better under load
author: Donald Sharp <sharpd@nvidia.com>
Fri, 8 Oct 2021 11:37:15 +0000 (07:37 -0400)
committer: Donald Sharp <sharpd@nvidia.com>
Fri, 8 Oct 2021 12:35:16 +0000 (08:35 -0400)
2 things:

a) Each test was setting up for graceful restart with calls to
`graceful-restart prepare ip[v6] ospf`, then sleeping for
3 or 5 seconds, and then killing the ospf process.  Under heavy
load there is no guarantee that zebra has received/processed
this signal.  Write some code to ensure that this happens.

b) Tests are issuing commands in this order:
   1) issue gr prepare command
   2) kill router
   3) <ensure routes were still installed in zebra>
   4) start router
   5) <ensure routes were still installed in zebra>

Imagine that the system is under some load and there is
a small amount of time before step 5 happens.  In this
case ospf could have come up and started neighbor relations
and also started installing routes.  If zebra receives
a new route before step 5 is issued then the route could
be in a state where it is not installed, because it is
being sent to the kernel for installation.  This would
fail the test because it would only look 1 time.  This
is fixed by giving time on restart for the routes to
be in the installed state.

Signed-off-by: Donald Sharp <sharpd@nvidia.com>
tests/topotests/ospf6_gr_topo1/test_ospf6_gr_topo1.py
tests/topotests/ospf_gr_topo1/test_ospf_gr_topo1.py

index ccbcadb8b12b4715b5320979caacfaf376e929c7..d50223191d56cef3544c6cbdf6a54497b34a5cb9 100755 (executable)
@@ -175,10 +175,21 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
     for rname in ["rt1", "rt2", "rt3", "rt4", "rt5", "rt6", "rt7"]:
         # Check the RIB first, which should be preserved across restarts in
         # all routers of the routing domain.
+        # If we are not on initial convergence *but* we are checking
+        # after a restart.  Looking in the zebra rib for installed
+        # is a recipe for test failure.  Why?  because if we are restarting
+        # then ospf is in the process of establishing neighbors and passing
+        # new routes to zebra.  Zebra will not mark the route as installed
+        # when it receives a replacement from ospf until it has finished
+        # processing it.  Let's give it a few seconds to allow this to happen
+        # under load.
         if initial_convergence == True:
             tries = 240
         else:
-            tries = 1
+            if restarting != None:
+                tries = 40
+            else:
+                tries = 1
         router_compare_json_output(
             rname, "show ipv6 route ospf json", "show_ipv6_route.json", tries
         )
@@ -212,6 +223,26 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
             )
 
 
+def ensure_gr_is_in_zebra(rname):
+    retry = True
+    retry_times = 10
+    tgen = get_topogen()
+
+    while retry and retry_times > 0:
+        out = tgen.net[rname].cmd(
+            'vtysh -c "show zebra client" | grep "Client: ospf6$" -A 40 | grep "Capabilities "'
+        )
+
+        if "Graceful Restart" not in out:
+            sleep(2)
+            retry_times -= 1
+        else:
+            retry = False
+
+    assertmsg = "%s does not appear to have Graceful Restart setup" % rname
+    assert not retry and retry_times > 0, assertmsg
+
+
 #
 # Test initial network convergence
 #
@@ -238,10 +269,9 @@ def test_gr_rt1():
         pytest.skip(tgen.errors)
 
     tgen.net["rt1"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt1")
     kill_router_daemons(tgen, "rt1", ["ospf6d"], save_config=False)
     check_routers(exiting="rt1")
-
     start_router_daemons(tgen, "rt1", ["ospf6d"])
     check_routers(restarting="rt1")
 
@@ -258,7 +288,7 @@ def test_gr_rt2():
         pytest.skip(tgen.errors)
 
     tgen.net["rt2"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt2")
     kill_router_daemons(tgen, "rt2", ["ospf6d"], save_config=False)
     check_routers(exiting="rt2")
 
@@ -278,7 +308,7 @@ def test_gr_rt3():
         pytest.skip(tgen.errors)
 
     tgen.net["rt3"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt3")
     kill_router_daemons(tgen, "rt3", ["ospf6d"], save_config=False)
     check_routers(exiting="rt3")
 
@@ -298,7 +328,7 @@ def test_gr_rt4():
         pytest.skip(tgen.errors)
 
     tgen.net["rt4"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt4")
     kill_router_daemons(tgen, "rt4", ["ospf6d"], save_config=False)
     check_routers(exiting="rt4")
 
@@ -318,7 +348,7 @@ def test_gr_rt5():
         pytest.skip(tgen.errors)
 
     tgen.net["rt5"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt5")
     kill_router_daemons(tgen, "rt5", ["ospf6d"], save_config=False)
     check_routers(exiting="rt5")
 
@@ -338,7 +368,7 @@ def test_gr_rt6():
         pytest.skip(tgen.errors)
 
     tgen.net["rt6"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt6")
     kill_router_daemons(tgen, "rt6", ["ospf6d"], save_config=False)
     check_routers(exiting="rt6")
 
@@ -358,7 +388,7 @@ def test_gr_rt7():
         pytest.skip(tgen.errors)
 
     tgen.net["rt7"].cmd('vtysh -c "graceful-restart prepare ipv6 ospf"')
-    sleep(5)
+    ensure_gr_is_in_zebra("rt7")
     kill_router_daemons(tgen, "rt7", ["ospf6d"], save_config=False)
     check_routers(exiting="rt7")
 
index 7d9cc684128cafcc59fe3e7e017177958a76f080..1432d53ffc825709667754747075c71b11134197 100755 (executable)
@@ -184,10 +184,21 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
     for rname in ["rt1", "rt2", "rt3", "rt4", "rt5", "rt6", "rt7"]:
         # Check the RIB first, which should be preserved across restarts in
         # all routers of the routing domain.
+        # If we are not on initial convergence *but* we are checking
+        # after a restart.  Looking in the zebra rib for installed
+        # is a recipe for test failure.  Why?  because if we are restarting
+        # then ospf is in the process of establishing neighbors and passing
+        # new routes to zebra.  Zebra will not mark the route as installed
+        # when it receives a replacement from ospf until it has finished
+        # processing it.  Let's give it a few seconds to allow this to happen
+        # under load.
         if initial_convergence == True:
             tries = 240
         else:
-            tries = 1
+            if restarting != None:
+                tries = 40
+            else:
+                tries = 1
         router_compare_json_output(
             rname, "show ip route ospf json", "show_ip_route.json", tries
         )
@@ -215,6 +226,26 @@ def check_routers(initial_convergence=False, exiting=None, restarting=None):
             )
 
 
+def ensure_gr_is_in_zebra(rname):
+    retry = True
+    retry_times = 10
+    tgen = get_topogen()
+
+    while retry and retry_times > 0:
+        out = tgen.net[rname].cmd(
+            'vtysh -c "show zebra client" | grep "Client: ospf$" -A 40 | grep "Capabilities "'
+        )
+
+        if "Graceful Restart" not in out:
+            sleep(2)
+            retry_times -= 1
+        else:
+            retry = False
+
+    assertmsg = "%s does not appear to have Graceful Restart setup" % rname
+    assert not retry and retry_times > 0, assertmsg
+
+
 #
 # Test initial network convergence
 #
@@ -241,7 +272,7 @@ def test_gr_rt1():
         pytest.skip(tgen.errors)
 
     tgen.net["rt1"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt1")
     kill_router_daemons(tgen, "rt1", ["ospfd"], save_config=False)
     check_routers(exiting="rt1")
 
@@ -261,7 +292,7 @@ def test_gr_rt2():
         pytest.skip(tgen.errors)
 
     tgen.net["rt2"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt2")
     kill_router_daemons(tgen, "rt2", ["ospfd"], save_config=False)
     check_routers(exiting="rt2")
 
@@ -281,7 +312,7 @@ def test_gr_rt3():
         pytest.skip(tgen.errors)
 
     tgen.net["rt3"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt3")
     kill_router_daemons(tgen, "rt3", ["ospfd"], save_config=False)
     check_routers(exiting="rt3")
 
@@ -301,7 +332,7 @@ def test_gr_rt4():
         pytest.skip(tgen.errors)
 
     tgen.net["rt4"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt4")
     kill_router_daemons(tgen, "rt4", ["ospfd"], save_config=False)
     check_routers(exiting="rt4")
 
@@ -321,7 +352,7 @@ def test_gr_rt5():
         pytest.skip(tgen.errors)
 
     tgen.net["rt5"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt5")
     kill_router_daemons(tgen, "rt5", ["ospfd"], save_config=False)
     check_routers(exiting="rt5")
 
@@ -341,7 +372,7 @@ def test_gr_rt6():
         pytest.skip(tgen.errors)
 
     tgen.net["rt6"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt6")
     kill_router_daemons(tgen, "rt6", ["ospfd"], save_config=False)
     check_routers(exiting="rt6")
 
@@ -361,7 +392,7 @@ def test_gr_rt7():
         pytest.skip(tgen.errors)
 
     tgen.net["rt7"].cmd('vtysh -c "graceful-restart prepare ip ospf"')
-    sleep(3)
+    ensure_gr_is_in_zebra("rt7")
     kill_router_daemons(tgen, "rt7", ["ospfd"], save_config=False)
     check_routers(exiting="rt7")