Browse Source

Fix evacuation when host dies uncleanly

This is a squashed backport of fix and relevant test

1.  Fix evacuation when host dies uncleanly

If a host crashes and ovn-controller doesn't clean up the
Port_Binding chassis column, the Logical_Switch_Port is never set
to DOWN, so we never detect an up->down transition and therefore never
update the port status with the driver. This patch watches for
Port_Binding chassis column changes and, if the port is up, goes ahead
and updates the driver.

Conflicts:
        networking_ovn/ovsdb/ovsdb_monitor.py

(cherry picked from commit 4f73c0f016)

2. Test PortBindingChassisUpdateEvent

While other tests cover what happens when a Port_Binding is updated
and this event doesn't match, there was no test that covered the
case where it does. This test primarily tests implementation
details, but will have to do until we can test the actual use case,
which involves uncleanly killing a compute server and doing a
'host evacuate' from it.

Conflicts:
        networking_ovn/tests/unit/ovsdb/test_ovsdb_monitor.py

(cherry picked from commit f234deba18)

Change-Id: I8a5e6ad2e98b79a140977ce003609ed5b21e3499
Closes-Bug: #1840876
(cherry picked from commit 36f06a32bb)
changes/40/678240/6
Terry Wilson 2 years ago
parent
commit
964205b2a9
  1. 45
      networking_ovn/ovsdb/ovsdb_monitor.py
  2. 29
      networking_ovn/tests/unit/ovsdb/test_ovsdb_monitor.py

45
networking_ovn/ovsdb/ovsdb_monitor.py

@ -128,6 +128,46 @@ class ChassisEvent(row_event.RowEvent):
self.l3_plugin.schedule_unhosted_gateways()
class PortBindingChassisUpdateEvent(BaseEvent):
    """Match a Port_Binding that moved chassis while its LSP is up.

    If the LSP is up and the Port_Binding chassis has just changed, there
    is a good chance the host died without cleaning up the chassis column
    on the Port_Binding. The port never goes down, so the driver is not
    updated through LogicalSwitchPortUpdateUpEvent, which only monitors
    transitions from DOWN to UP. This event updates the driver instead.
    """

    table = 'Port_Binding'
    events = (BaseEvent.ROW_UPDATE,)

    def __init__(self, driver):
        """Keep a reference to the driver that run() will notify."""
        self.driver = driver
        super(PortBindingChassisUpdateEvent, self).__init__()

    def match_fn(self, event, row, old=None):
        """Return True when the chassis changed and the LSP is up.

        :param event: the event type being evaluated (not inspected here)
        :param row: the updated Port_Binding row
        :param old: object holding the previous values of changed columns
        :returns: False if either chassis value is empty, the chassis is
                  unchanged, the port type is 'chassisredirect', or the
                  Logical_Switch_Port cannot be found; otherwise the
                  truthiness of the Logical_Switch_Port 'up' column
        """
        # NOTE(twilson) ROW_UPDATE events always pass old, but chassis will
        # only be set if chassis has changed
        previous_chassis = getattr(old, 'chassis', None)
        current_chassis = row.chassis
        if not current_chassis or not previous_chassis:
            return False
        if current_chassis == previous_chassis:
            return False
        if row.type == 'chassisredirect':
            return False
        nb_idl = self.driver._nb_ovn
        try:
            lsp = nb_idl.lookup('Logical_Switch_Port', row.logical_port)
        except idlutils.RowNotFound:
            LOG.warning("Logical Switch Port %(port)s not found for "
                        "Port_Binding %(binding)s",
                        {'port': row.logical_port, 'binding': row.uuid})
            return False
        else:
            return bool(lsp.up)

    def run(self, event, row, old=None):
        """Tell the driver to set the row's logical port status up."""
        port_name = row.logical_port
        self.driver.set_port_status_up(port_name)
class PortBindingChassisEvent(row_event.RowEvent):
"""Port_Binding update event - set chassis for chassisredirect port.
@ -404,8 +444,9 @@ class OvnSbIdl(OvnIdl):
"""
self._chassis_event = ChassisEvent(self.driver)
self._portbinding_event = PortBindingChassisEvent(self.driver)
self.notify_handler.watch_events([self._chassis_event,
self._portbinding_event])
self.notify_handler.watch_events(
[self._chassis_event, self._portbinding_event,
PortBindingChassisUpdateEvent(self.driver)])
def _check_and_set_ssl_files(schema_name):

29
networking_ovn/tests/unit/ovsdb/test_ovsdb_monitor.py

@ -429,3 +429,32 @@ class TestChassisMetadataAgentEvent(base.TestCase):
ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY: '1'}})
self.assertTrue(self.event.match_fn(ROW_UPDATE, row, old=old))
class TestPortBindingChassisUpdateEvent(base.TestCase):
    """Unit tests for ovsdb_monitor.PortBindingChassisUpdateEvent."""

    def setUp(self):
        super(TestPortBindingChassisUpdateEvent, self).setUp()
        # A bare Mock stands in for the driver so calls can be asserted on.
        self.driver = mock.Mock()
        self.event = ovsdb_monitor.PortBindingChassisUpdateEvent(self.driver)

    def _test_event(self, event, row, old):
        """Run the event when it matches and check the driver call state.

        Passes on either branch: a matching event must have called
        set_port_status_up, a non-matching one must not have.
        """
        if self.event.matches(event, row, old):
            self.event.run(event, row, old)
            self.driver.set_port_status_up.assert_called()
        else:
            self.driver.set_port_status_up.assert_not_called()

    def test_event_matches(self):
        # NOTE(twilson) This primarily tests implementation details. If a
        # scenario test is written that handles shutting down a compute
        # node uncleanly and performing a 'host-evacuate', this can be removed
        pbtable = fakes.FakeOvsdbTable.create_one_ovsdb_table(
            attrs={'name': 'Port_Binding'})
        ovsdb_row = fakes.FakeOvsdbRow.create_one_ovsdb_row
        self.driver._nb_ovn.lookup.return_value = ovsdb_row(attrs={'up': True})
        self._test_event(
            self.event.ROW_UPDATE,
            ovsdb_row(attrs={'_table': pbtable, 'chassis': 'one',
                             'type': '_fake_', 'logical_port': 'foo'}),
            ovsdb_row(attrs={'_table': pbtable, 'chassis': 'two',
                             'type': '_fake_'}))
        # _test_event also succeeds when the event does not match, so pin
        # the expected outcome here: the chassis changed and the LSP is up,
        # hence the driver must have been told about this exact port.
        self.driver.set_port_status_up.assert_called_once_with('foo')
Loading…
Cancel
Save