[ovn] Agent liveness - allow time to propagate checks

Right now neutron-server bumps the nb_cfg parameter in the NB_Global
table, which then needs to be propagated by northd to SB_Global,
processed by the agents, and written back into SB_Global.
This requires processing by neutron-server, but unfortunately
the server checks straight away and, many times, the value read
is behind the expected value.

All this results in frequent false positives showing dead agents
when they are not.

This patch relaxes the check by allowing a difference of 1
between the read and expected values.

Change-Id: Id91481b690ad569c5dcfa5bd404f497f591d729d
Closes-Bug: 1860436
Signed-off-by: Daniel Alvarez <dalvarez@redhat.com>
This commit is contained in:
Daniel Alvarez 2020-01-21 14:26:22 +01:00
parent 3b03b509ee
commit 18410097f2
2 changed files with 16 additions and 3 deletions

View File

@ -967,7 +967,9 @@ class OVNMechanismDriver(api.MechanismDriver):
except KeyError:
updated_at = timeutils.utcnow(with_timezone=True)
if self._nb_ovn.nb_global.nb_cfg == nb_cfg:
# Allow a maximum of 1 difference between expected and read values
# to avoid false positives.
if self._nb_ovn.nb_global.nb_cfg - nb_cfg <= 1:
# update the time of our successful check
value = timeutils.utcnow(with_timezone=True).isoformat()
self._sb_ovn.db_set('Chassis', chassis.uuid,

View File

@ -1542,11 +1542,22 @@ class TestOVNMechanismDriver(test_plugin.Ml2PluginV2TestCase):
chassis = self._add_chassis_agent(5, agent_type)
self.assertTrue(self.mech_driver.agent_alive(chassis, agent_type))
def test_agent_alive_true_one_diff(self):
    """Agent is reported alive when the nb_cfg delta is exactly 1.

    For both the OVN controller and the OVN metadata agent types, the
    NB_Global nb_cfg is set to 5 and a chassis agent is added with 4 and
    a last-update timestamp older than ``agent_down_time``. Even with
    that stale timestamp, ``agent_alive`` must return True.
    """
    for agent_type in (ovn_const.OVN_CONTROLLER_AGENT,
                       ovn_const.OVN_METADATA_AGENT):
        self.mech_driver._nb_ovn.nb_global.nb_cfg = 5
        now = timeutils.utcnow()
        # timedelta's first positional argument is *days*; name the unit
        # so the timestamp is just past the down-time threshold.
        # NOTE(review): assumes agent_down_time is in seconds - confirm.
        updated_at = now - datetime.timedelta(
            seconds=cfg.CONF.agent_down_time + 1)
        chassis = self._add_chassis_agent(4, agent_type, updated_at)
        self.assertTrue(self.mech_driver.agent_alive(chassis, agent_type))
def test_agent_alive_not_timed_out(self):
for agent_type in (ovn_const.OVN_CONTROLLER_AGENT,
ovn_const.OVN_METADATA_AGENT):
self.mech_driver._nb_ovn.nb_global.nb_cfg = 5
chassis = self._add_chassis_agent(4, agent_type)
chassis = self._add_chassis_agent(3, agent_type)
self.assertTrue(self.mech_driver.agent_alive(chassis, agent_type),
"Agent type %s is not alive" % agent_type)
@ -1556,7 +1567,7 @@ class TestOVNMechanismDriver(test_plugin.Ml2PluginV2TestCase):
self.mech_driver._nb_ovn.nb_global.nb_cfg = 5
now = timeutils.utcnow()
updated_at = now - datetime.timedelta(cfg.CONF.agent_down_time + 1)
chassis = self._add_chassis_agent(4, agent_type, updated_at)
chassis = self._add_chassis_agent(3, agent_type, updated_at)
self.assertFalse(self.mech_driver.agent_alive(chassis, agent_type))
def _test__update_dnat_entry_if_needed(self, up=True):