From f3e836217c2dd84b96938d292a3fe94f346ccdc8 Mon Sep 17 00:00:00 2001 From: Slawek Kaplonski Date: Tue, 11 Jan 2022 09:56:32 +0100 Subject: [PATCH] [Functional] Add extra logs to the L3 HA router transitions This patch adds extra logs to log current and expected ha state of the routers on various fake agents during the functional tests, log on which agent router is "primary" and where it is "backup" and when failover of the router should happen. Those logs should allow us better understand what happens during those functional tests and why some of them are failing from time to time. Related-Bug: #1956958 Change-Id: I567036470b7256275f67e8ef3546ed780c81b5ae --- .../tests/functional/agent/l3/framework.py | 17 +++++++ .../functional/agent/l3/test_dvr_router.py | 22 ++++----- .../functional/agent/l3/test_ha_router.py | 48 +++++++++---------- 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/neutron/tests/functional/agent/l3/framework.py b/neutron/tests/functional/agent/l3/framework.py index 8edb14350ac..98c966ed735 100644 --- a/neutron/tests/functional/agent/l3/framework.py +++ b/neutron/tests/functional/agent/l3/framework.py @@ -689,13 +689,30 @@ class L3AgentTestFramework(base.BaseSudoTestCase): lambda: self._check_external_device(primary_router)) common_utils.wait_until_true( lambda: backup_router.ha_state == 'backup') + + LOG.debug("Found primary router %s and backup router %s", + primary_router.router_id, backup_router.router_id) return primary_router, backup_router def fail_ha_router(self, router): device_name = router.get_ha_device_name() + LOG.debug("Failing HA router %s by setting device %s to DOWN", + router.router_id, device_name) ha_device = ip_lib.IPDevice(device_name, router.ha_namespace) ha_device.link.set_down() + @staticmethod + def wait_until_ha_router_has_state(router, expected_state): + + def router_has_expected_state(): + state = router.ha_state + LOG.debug("Router %s; current state is '%s', " + "expected state is '%s'", + router.router_id, state, expected_state) + return state == expected_state + + common_utils.wait_until_true(router_has_expected_state) + @staticmethod def fail_gw_router_port(router): r_br = ip_lib.IPDevice(router.driver.conf.OVS.integration_bridge) diff --git a/neutron/tests/functional/agent/l3/test_dvr_router.py b/neutron/tests/functional/agent/l3/test_dvr_router.py index 80e3baf3813..2d6fe70f94d 100644 --- a/neutron/tests/functional/agent/l3/test_dvr_router.py +++ b/neutron/tests/functional/agent/l3/test_dvr_router.py @@ -599,7 +599,7 @@ class TestDvrRouter(DvrRouterTestFramework, framework.L3AgentTestFramework): interface_name = router.get_external_device_name(port['id']) self._assert_no_ip_addresses_on_interface(router.ha_namespace, interface_name) - utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') # Keepalived notifies of a state transition when it starts, # not when it ends. Thus, we have to wait until keepalived finishes @@ -1611,7 +1611,7 @@ class TestDvrRouter(DvrRouterTestFramework, framework.L3AgentTestFramework): self.assertFalse( ip_lib.device_exists(primary_ha_device, snat_namespace_name)) - utils.wait_until_true(lambda: backup.ha_state == 'primary') + self.wait_until_ha_router_has_state(backup, 'primary') self._assert_ip_addresses_in_dvr_ha_snat_namespace(backup) self.assertTrue( ip_lib.device_exists(backup_ha_device, backup.ha_namespace)) @@ -1641,16 +1641,16 @@ class TestDvrRouter(DvrRouterTestFramework, framework.L3AgentTestFramework): primary, backup = self._get_primary_and_backup_routers( router1, router2, check_external_device=False) - utils.wait_until_true(lambda: primary.ha_state == 'primary') - utils.wait_until_true(lambda: backup.ha_state == 'backup') + self.wait_until_ha_router_has_state(primary, 'primary') + self.wait_until_ha_router_has_state(backup, 'backup') self._assert_ip_addresses_in_dvr_ha_snat_namespace_with_fip(primary) self._assert_no_ip_addresses_in_dvr_ha_snat_namespace_with_fip(backup) self.fail_ha_router(primary) - utils.wait_until_true(lambda: backup.ha_state == 'primary') - utils.wait_until_true(lambda: primary.ha_state == 'backup') + self.wait_until_ha_router_has_state(backup, 'primary') + self.wait_until_ha_router_has_state(primary, 'backup') self._assert_ip_addresses_in_dvr_ha_snat_namespace_with_fip(backup) self._assert_no_ip_addresses_in_dvr_ha_snat_namespace_with_fip(primary) @@ -1671,16 +1671,16 @@ class TestDvrRouter(DvrRouterTestFramework, framework.L3AgentTestFramework): primary, backup = self._get_primary_and_backup_routers( router1, router2, check_external_device=False) - utils.wait_until_true(lambda: primary.ha_state == 'primary') - utils.wait_until_true(lambda: backup.ha_state == 'backup') + self.wait_until_ha_router_has_state(primary, 'primary') + self.wait_until_ha_router_has_state(backup, 'backup') self._assert_ip_addresses_in_dvr_ha_snat_namespace(primary) self._assert_no_ip_addresses_in_dvr_ha_snat_namespace(backup) self.fail_ha_router(primary) - utils.wait_until_true(lambda: backup.ha_state == 'primary') - utils.wait_until_true(lambda: primary.ha_state == 'backup') + self.wait_until_ha_router_has_state(backup, 'primary') + self.wait_until_ha_router_has_state(primary, 'backup') self._assert_ip_addresses_in_dvr_ha_snat_namespace(backup) self._assert_no_ip_addresses_in_dvr_ha_snat_namespace(primary) @@ -1717,7 +1717,7 @@ class TestDvrRouter(DvrRouterTestFramework, framework.L3AgentTestFramework): r2_chsfr = mock.patch.object(self.failover_agent, 'check_ha_state_for_router').start() - utils.wait_until_true(lambda: router1.ha_state == 'primary') + self.wait_until_ha_router_has_state(router1, 'primary') self.agent._process_updated_router(router1.router) self.assertTrue(r1_chsfr.called) diff --git a/neutron/tests/functional/agent/l3/test_ha_router.py b/neutron/tests/functional/agent/l3/test_ha_router.py index 722c4b25f26..23e519c4e2f 100644 --- a/neutron/tests/functional/agent/l3/test_ha_router.py +++ b/neutron/tests/functional/agent/l3/test_ha_router.py @@ -45,10 +45,10 @@ class L3HATestCase(framework.L3AgentTestFramework): side_effect=self.change_router_state).start() router_info = self.generate_router_info(enable_ha=True) router = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') self.fail_ha_router(router) - common_utils.wait_until_true(lambda: router.ha_state == 'backup') + self.wait_until_ha_router_has_state(router, 'backup') def enqueue_call_count_match(): LOG.debug("enqueue_mock called %s times.", enqueue_mock.call_count) @@ -78,8 +78,8 @@ class L3HATestCase(framework.L3AgentTestFramework): router_info = self.generate_router_info(enable_ha=True) router2 = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router1.ha_state == 'backup') - common_utils.wait_until_true(lambda: router2.ha_state == 'primary') + self.wait_until_ha_router_has_state(router1, 'backup') + self.wait_until_ha_router_has_state(router2, 'primary') common_utils.wait_until_true( lambda: self._expected_rpc_report( {router1.router_id: 'standby', router2.router_id: 'active'})) @@ -118,10 +118,10 @@ class L3HATestCase(framework.L3AgentTestFramework): router_info = l3_test_common.prepare_router_data( enable_snat=True, enable_ha=True, dual_stack=True, enable_gw=False) router = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') if state == 'backup': self.fail_ha_router(router) - common_utils.wait_until_true(lambda: router.ha_state == 'backup') + self.wait_until_ha_router_has_state(router, 'backup') _ext_dev_name, ex_port = l3_test_common.prepare_ext_gw_test( mock.Mock(), router, dual_stack=enable_v6_gw) router_info['gw_port'] = ex_port @@ -225,7 +225,7 @@ class L3HATestCase(framework.L3AgentTestFramework): router_info = self.generate_router_info( ip_version=constants.IP_VERSION_6, enable_ha=True) router1 = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router1.ha_state == 'primary') + self.wait_until_ha_router_has_state(router1, 'primary') common_utils.wait_until_true(lambda: router1.radvd.enabled) def _check_lla_status(router, expected): @@ -243,7 +243,7 @@ class L3HATestCase(framework.L3AgentTestFramework): ha_device = ip_lib.IPDevice(device_name, namespace=router1.ns_name) ha_device.link.set_down() - common_utils.wait_until_true(lambda: router1.ha_state == 'backup') + self.wait_until_ha_router_has_state(router1, 'backup') common_utils.wait_until_true( lambda: not router1.radvd.enabled, timeout=10) _check_lla_status(router1, False) @@ -269,7 +269,7 @@ class L3HATestCase(framework.L3AgentTestFramework): ipv6_subnet_modes=[slaac_mode], interface_id=interface_id) router.process() - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') # Verify that router internal interface is present and is configured # with IP address from both the subnets. @@ -301,10 +301,10 @@ class L3HATestCase(framework.L3AgentTestFramework): def test_delete_external_gateway_on_standby_router(self): router_info = self.generate_router_info(enable_ha=True) router = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') self.fail_ha_router(router) - common_utils.wait_until_true(lambda: router.ha_state == 'backup') + self.wait_until_ha_router_has_state(router, 'backup') # The purpose of the test is to simply make sure no exception is raised port = router.get_ex_gw_port() @@ -316,7 +316,7 @@ class L3HATestCase(framework.L3AgentTestFramework): router = self.manage_router(self.agent, router_info) ex_gw_port = router.get_ex_gw_port() interface_name = router.get_external_device_interface_name(ex_gw_port) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') self._add_fip(router, '172.168.1.20', fixed_address='10.0.0.3') router.process() router.router[constants.FLOATINGIP_KEY] = [] @@ -330,12 +330,12 @@ class L3HATestCase(framework.L3AgentTestFramework): router_info[constants.HA_INTERFACE_KEY]['status'] = ( constants.PORT_STATUS_DOWN) router1 = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router1.ha_state == 'backup') + self.wait_until_ha_router_has_state(router1, 'backup') router1.router[constants.HA_INTERFACE_KEY]['status'] = ( constants.PORT_STATUS_ACTIVE) self.agent._process_updated_router(router1.router) - common_utils.wait_until_true(lambda: router1.ha_state == 'primary') + self.wait_until_ha_router_has_state(router1, 'primary') def test_ha_router_namespace_has_ip_nonlocal_bind_disabled(self): router_info = self.generate_router_info(enable_ha=True) @@ -362,14 +362,14 @@ class L3HATestCase(framework.L3AgentTestFramework): external_device_name = router.get_external_device_name( external_port['id']) - common_utils.wait_until_true(lambda: router.ha_state == 'backup') + self.wait_until_ha_router_has_state(router, 'backup') self._wait_until_ipv6_forwarding_has_state( router.ns_name, external_device_name, 0) router.router[constants.HA_INTERFACE_KEY]['status'] = ( constants.PORT_STATUS_ACTIVE) self.agent._process_updated_router(router.router) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') self._wait_until_ipv6_forwarding_has_state( router.ns_name, external_device_name, 1) @@ -381,13 +381,13 @@ class L3HATestCase(framework.L3AgentTestFramework): constants.PORT_STATUS_DOWN) router = self.manage_router(self.agent, router_info) - common_utils.wait_until_true(lambda: router.ha_state == 'backup') + self.wait_until_ha_router_has_state(router, 'backup') self._wait_until_ipv6_forwarding_has_state(router.ns_name, 'all', 0) router.router[constants.HA_INTERFACE_KEY]['status'] = ( constants.PORT_STATUS_ACTIVE) self.agent._process_updated_router(router.router) - common_utils.wait_until_true(lambda: router.ha_state == 'primary') + self.wait_until_ha_router_has_state(router, 'primary') self._wait_until_ipv6_forwarding_has_state(router.ns_name, 'all', 1) def test_router_interface_mtu_update(self): @@ -503,10 +503,8 @@ class L3HATestFailover(framework.L3AgentTestFramework): self._assert_ipv6_accept_ra(backup_router, False) self._assert_ipv6_forwarding(backup_router, False, False) - common_utils.wait_until_true( - lambda: primary_router.ha_state == 'primary') - common_utils.wait_until_true( - lambda: backup_router.ha_state == 'backup') + self.wait_until_ha_router_has_state(primary_router, 'primary') + self.wait_until_ha_router_has_state(backup_router, 'backup') self.fail_ha_router(router1) @@ -559,10 +557,8 @@ class L3HATestFailover(framework.L3AgentTestFramework): self.fail_gw_router_port(primary_router) self.fail_gw_router_port(backup_router) - common_utils.wait_until_true( - lambda: primary_router.ha_state == 'primary') - common_utils.wait_until_true( - lambda: backup_router.ha_state == 'primary') + self.wait_until_ha_router_has_state(primary_router, 'primary') + self.wait_until_ha_router_has_state(backup_router, 'primary') self.restore_gw_router_port(primary_router)