From 4cc611d319d0afe1ee04df6e4419014f1133df09 Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Fri, 25 Nov 2022 16:39:31 +0100 Subject: [PATCH] Fix handling the restart of ovn-controllers The previous `getattr(old, 'nb_cfg', False)` would evaluate to `False` if the `old` row either did not contain an `nb_cfg` value or the value was 0. As 0 is the value set on startup of the ovn-controller, this causes the neutron-api to ignore any event an ovn-controller directly sends after startup. In turn, this causes us to miss the information that the agent is synchronized, causing the agent to appear as down, until something bumps the `nb_cfg` value globally. Closes-Bug: #1997982 Change-Id: Icec8fee93e64b871999f38674e305238e9705fd4 --- .../ovn/mech_driver/ovsdb/ovsdb_monitor.py | 2 +- .../mech_driver/ovsdb/test_ovsdb_monitor.py | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py index bb4259996d8..f122b16488b 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovsdb_monitor.py @@ -336,7 +336,7 @@ class ChassisAgentWriteEvent(ChassisAgentEvent): # don't update the AgentCache. We use chassis_private.chassis to return # data about the agent. 
return event == self.ROW_CREATE or ( - getattr(old, 'nb_cfg', False) and not + hasattr(old, 'nb_cfg') and not (self.table == 'Chassis_Private' and not row.chassis)) def run(self, event, row, old): diff --git a/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_ovsdb_monitor.py b/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_ovsdb_monitor.py index d130ad7e40d..9e25655bed1 100644 --- a/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_ovsdb_monitor.py +++ b/neutron/tests/functional/plugins/ml2/drivers/ovn/mech_driver/ovsdb/test_ovsdb_monitor.py @@ -484,6 +484,66 @@ class TestAgentMonitor(base.TestOVNFunctionalBase): self.fail('Chassis timestamp: %s, agent updated_at: %s' % (chassis_ts, str(agent.updated_at))) + def test_agent_restart(self): + def check_agent_up(): + agent = neutron_agent.AgentCache()[self.chassis_name] + return agent.alive + + def check_agent_down(): + return not check_agent_up() + + def check_nb_cfg_timestamp_is_not_null(): + agent = neutron_agent.AgentCache()[self.chassis_name] + return agent.updated_at != 0 + + if not self.sb_api.is_table_present('Chassis_Private'): + self.skipTest('Ovn sb not support Chassis_Private') + + # Set nb_cfg to some realistic value, so that the alive check can + # actually work + self.nb_api.db_set( + 'NB_Global', '.', ('nb_cfg', 1337)).execute(check_error=True) + self.sb_api.db_set( + 'Chassis_Private', self.chassis_name, ('nb_cfg', 1337) + ).execute(check_error=True) + + chassis_uuid = self.sb_api.db_get( + 'Chassis', self.chassis_name, 'uuid').execute(check_error=True) + + self.assertTrue(check_agent_up()) + n_utils.wait_until_true(check_nb_cfg_timestamp_is_not_null, timeout=5) + + # Lets start by shutting down the ovn-controller + # (where it will remove the Chassis_Private table entry) + self.sb_api.db_destroy( + 'Chassis_Private', self.chassis_name).execute(check_error=True) + try: + n_utils.wait_until_true(check_agent_down, timeout=5) + except 
n_utils.WaitTimeout: + self.fail('Agent did not go down after Chassis_Private removal') + + # Now the ovn-controller starts up again and has not yet synced with + # the southbound database + self.sb_api.db_create( + 'Chassis_Private', name=self.chassis_name, + external_ids={}, chassis=chassis_uuid, + nb_cfg_timestamp=0, nb_cfg=0 + ).execute(check_error=True) + self.assertTrue(check_agent_down()) + + # Now the ovn-controller has synced with the southbound database + nb_cfg_timestamp = timeutils.utcnow_ts() * 1000 + with self.sb_api.transaction() as txn: + txn.add(self.sb_api.db_set('Chassis_Private', self.chassis_name, + ('nb_cfg_timestamp', nb_cfg_timestamp))) + txn.add(self.sb_api.db_set('Chassis_Private', self.chassis_name, + ('nb_cfg', 1337))) + try: + n_utils.wait_until_true(check_agent_up, timeout=5) + except n_utils.WaitTimeout: + self.fail('Agent did not go up after sync is done') + self.assertTrue(check_nb_cfg_timestamp_is_not_null()) + class TestOvnIdlProbeInterval(base.TestOVNFunctionalBase): def setUp(self):