Fix handling the restart of ovn-controllers

The previous `getattr(old, 'nb_cfg', False)` would evaluate to `False`
if the `old` row either did not contain a `nb_cfg` value or if the value
was 0.

As 0 is the value set on startup of the ovn-controller this causes the
neutron-api to ignore any event a ovn-controller directly sends after
startup. In turn this causes us to miss the information that the agent
is synchronized, causing the agent to appear as down, until something
bumps the `nb_cfg` value globally.

Closes-Bug: #1997982

Change-Id: Icec8fee93e64b871999f38674e305238e9705fd4
This commit is contained in:
Felix Huettner 2022-11-25 16:39:31 +01:00
parent 0384b3193b
commit 4cc611d319
2 changed files with 61 additions and 1 deletions

View File

@ -336,7 +336,7 @@ class ChassisAgentWriteEvent(ChassisAgentEvent):
# don't update the AgentCache. We use chassis_private.chassis to return # don't update the AgentCache. We use chassis_private.chassis to return
# data about the agent. # data about the agent.
return event == self.ROW_CREATE or ( return event == self.ROW_CREATE or (
getattr(old, 'nb_cfg', False) and not hasattr(old, 'nb_cfg') and not
(self.table == 'Chassis_Private' and not row.chassis)) (self.table == 'Chassis_Private' and not row.chassis))
def run(self, event, row, old): def run(self, event, row, old):

View File

@ -484,6 +484,66 @@ class TestAgentMonitor(base.TestOVNFunctionalBase):
self.fail('Chassis timestamp: %s, agent updated_at: %s' % self.fail('Chassis timestamp: %s, agent updated_at: %s' %
(chassis_ts, str(agent.updated_at))) (chassis_ts, str(agent.updated_at)))
def test_agent_restart(self):
def check_agent_up():
agent = neutron_agent.AgentCache()[self.chassis_name]
return agent.alive
def check_agent_down():
return not check_agent_up()
def check_nb_cfg_timestamp_is_not_null():
agent = neutron_agent.AgentCache()[self.chassis_name]
return agent.updated_at != 0
if not self.sb_api.is_table_present('Chassis_Private'):
self.skipTest('Ovn sb not support Chassis_Private')
# Set nb_cfg to some realistic value, so that the alive check can
# actually work
self.nb_api.db_set(
'NB_Global', '.', ('nb_cfg', 1337)).execute(check_error=True)
self.sb_api.db_set(
'Chassis_Private', self.chassis_name, ('nb_cfg', 1337)
).execute(check_error=True)
chassis_uuid = self.sb_api.db_get(
'Chassis', self.chassis_name, 'uuid').execute(check_error=True)
self.assertTrue(check_agent_up())
n_utils.wait_until_true(check_nb_cfg_timestamp_is_not_null, timeout=5)
# Lets start by shutting down the ovn-controller
# (where it will remove the Chassis_Private table entry)
self.sb_api.db_destroy(
'Chassis_Private', self.chassis_name).execute(check_error=True)
try:
n_utils.wait_until_true(check_agent_down, timeout=5)
except n_utils.WaitTimeout:
self.fail('Agent did not go down after Chassis_Private removal')
# Now the ovn-controller starts up again and has not yet synced with
# the southbound database
self.sb_api.db_create(
'Chassis_Private', name=self.chassis_name,
external_ids={}, chassis=chassis_uuid,
nb_cfg_timestamp=0, nb_cfg=0
).execute(check_error=True)
self.assertTrue(check_agent_down())
# Now the ovn-controller has synced with the southbound database
nb_cfg_timestamp = timeutils.utcnow_ts() * 1000
with self.sb_api.transaction() as txn:
txn.add(self.sb_api.db_set('Chassis_Private', self.chassis_name,
('nb_cfg_timestamp', nb_cfg_timestamp)))
txn.add(self.sb_api.db_set('Chassis_Private', self.chassis_name,
('nb_cfg', 1337)))
try:
n_utils.wait_until_true(check_agent_up, timeout=5)
except n_utils.WaitTimeout:
self.fail('Agent did not go up after sync is done')
self.assertTrue(check_nb_cfg_timestamp_is_not_null())
class TestOvnIdlProbeInterval(base.TestOVNFunctionalBase): class TestOvnIdlProbeInterval(base.TestOVNFunctionalBase):
def setUp(self): def setUp(self):