From 4f75c6a616d3cb549153fcc496926358dfc9178a Mon Sep 17 00:00:00 2001 From: Slawek Kaplonski Date: Tue, 5 Apr 2022 11:26:32 +0200 Subject: [PATCH] Retry port_update in the OVN if revision mismatch during live-migration This is terrible hack but it seems that there is no other way to fix/workaround the race which may happen during live-migration between: - port update event comming from the OVN db (port DOWN on the src node), - API call from nova-compute to activate port binding on the destination node. If those 2 events will be executed in specific order by different workers it may happen that port binding activation will not update "requested_chassis" of the port in OVN northd due to revision mismatch (ovn_revision and neutron_revision will be already bumped by the worker which processes "port update" OVN event). If "requested_chassis" will not be updated, OVN will not claim port on the dest node thus connectivity to the vm will be broken. To workaround that issue, port_update_postcommit method from the OVN mechanism driver will catch RevisionMismatch exception raised by the ovn_client and in case that this was port_update after live_migration, will get port data from neutron db and try to update port in the OVN northd once again. Closes-bug: #1967144 Change-Id: If6e1c6e0fc772101bcd3427601800aaae84381dd --- .../drivers/ovn/mech_driver/mech_driver.py | 31 ++++- .../ovn/mech_driver/ovsdb/impl_idl_ovn.py | 3 + .../ovn/mech_driver/ovsdb/ovn_client.py | 3 +- .../ovn/mech_driver/test_mech_driver.py | 111 ++++++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py index 002e28b34f3..42d7f5311b3 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py @@ -44,6 +44,7 @@ from ovsdbapp.backend.ovs_idl import idlutils from neutron._i18n import _ from neutron.common.ovn import acl as ovn_acl from neutron.common.ovn import constants as ovn_const +from neutron.common.ovn import exceptions as ovn_exceptions from neutron.common.ovn import extensions as ovn_extensions from neutron.common.ovn import utils as ovn_utils from neutron.common import utils @@ -756,6 +757,32 @@ class OVNMechanismDriver(api.MechanismDriver): 'device_owner': original_port['device_owner']}) raise OVNPortUpdateError(resource='port', msg=msg) + def _ovn_update_port(self, plugin_context, port, original_port, + retry_on_revision_mismatch): + try: + self._ovn_client.update_port(plugin_context, port, + port_object=original_port) + except ovn_exceptions.RevisionConflict: + if retry_on_revision_mismatch: + # NOTE(slaweq): I know this is terrible hack but there is no + # other way to workaround possible race between port update + # event from the OVN (port down on the src node) and API + # request from nova-compute to activate binding of the port on + # the dest node. + original_port_migrating_to = original_port.get( + portbindings.PROFILE, {}).get('migrating_to') + port_host_id = port.get(portbindings.HOST_ID) + if (original_port_migrating_to is not None and + original_port_migrating_to == port_host_id): + LOG.debug("Revision number of the port %s has changed " + "probably during live migration. Retrying " + "update port in OVN.", port) + db_port = self._plugin.get_port(plugin_context, + port['id']) + port['revision_number'] = db_port['revision_number'] + self._ovn_update_port(plugin_context, port, original_port, + retry_on_revision_mismatch=False) + def create_port_postcommit(self, context): """Create a port. @@ -847,8 +874,8 @@ class OVNMechanismDriver(api.MechanismDriver): # will fail that OVN has port with bigger revision. return - self._ovn_client.update_port(context._plugin_context, port, - port_object=original_port) + self._ovn_update_port(context._plugin_context, port, original_port, + retry_on_revision_mismatch=True) self._notify_dhcp_updated(port['id']) def delete_port_postcommit(self, context): diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py index f0e05c0b9d4..835635c7647 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/impl_idl_ovn.py @@ -263,11 +263,14 @@ class OvsdbNbOvnIdl(nb_impl_idl.OvnNbApiIdlImpl, Backend): This method is just a wrapper around the ovsdbapp transaction to handle revision conflicts correctly. """ + revision_mismatch_raise = kwargs.pop('revision_mismatch_raise', False) try: with super(OvsdbNbOvnIdl, self).transaction(*args, **kwargs) as t: yield t except ovn_exc.RevisionConflict as e: LOG.info('Transaction aborted. Reason: %s', e) + if revision_mismatch_raise: + raise e def create_lswitch_port(self, lport_name, lswitch_name, may_exist=True, **columns): diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovn_client.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovn_client.py index 133701cc663..fd761661beb 100644 --- a/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovn_client.py +++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/ovsdb/ovn_client.py @@ -594,7 +594,8 @@ class OVNClient(object): check_rev_cmd = self._nb_idl.check_revision_number( port['id'], port, ovn_const.TYPE_PORTS) - with self._nb_idl.transaction(check_error=True) as txn: + with self._nb_idl.transaction(check_error=True, + revision_mismatch_raise=True) as txn: txn.add(check_rev_cmd) columns_dict = {} if utils.is_lsp_router_port(port): diff --git a/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py index 12e4a022223..93bce25d511 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py +++ b/neutron/tests/unit/plugins/ml2/drivers/ovn/mech_driver/test_mech_driver.py @@ -45,6 +45,7 @@ from webob import exc from neutron.common.ovn import acl as ovn_acl from neutron.common.ovn import constants as ovn_const +from neutron.common.ovn import exceptions as ovn_exceptions from neutron.common.ovn import hash_ring_manager from neutron.common.ovn import utils as ovn_utils from neutron.conf.plugins.ml2.drivers.ovn import ovn_conf @@ -1960,6 +1961,116 @@ class TestOVNMechanismDriver(TestOVNMechanismDriverBase): self.plugin.update_port_status.assert_called_once_with( fake_context, fake_port['id'], const.PORT_STATUS_ACTIVE) + @mock.patch.object(mech_driver.OVNMechanismDriver, + '_is_port_provisioning_required', lambda *_: True) + @mock.patch.object(mech_driver.OVNMechanismDriver, '_notify_dhcp_updated') + @mock.patch.object(ovn_client.OVNClient, 'update_port') + def test_update_port_postcommit_live_migration_revision_mismatch_always( + self, mock_update_port, mock_notify_dhcp): + self.plugin.update_port_status = mock.Mock() + self.plugin.get_port = mock.Mock(return_value=mock.MagicMock()) + + fake_context = mock.MagicMock() + fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_ACTIVE, + portbindings.PROFILE: {}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + original_fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_ACTIVE, + portbindings.PROFILE: { + ovn_const.MIGRATING_ATTR: fake_port[portbindings.HOST_ID]}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + + fake_ctx = mock.Mock(current=fake_port, original=original_fake_port, + _plugin_context=fake_context) + mock_update_port.side_effect = ovn_exceptions.RevisionConflict( + resource_id=fake_port['id'], + resource_type=ovn_const.TYPE_PORTS) + + self.mech_driver.update_port_postcommit(fake_ctx) + + self.plugin.update_port_status.assert_not_called() + self.plugin.get_port.assert_called_once_with( + mock.ANY, fake_port['id']) + self.assertEqual(2, mock_update_port.call_count) + mock_notify_dhcp.assert_called_with(fake_port['id']) + + @mock.patch.object(mech_driver.OVNMechanismDriver, + '_is_port_provisioning_required', lambda *_: True) + @mock.patch.object(mech_driver.OVNMechanismDriver, '_notify_dhcp_updated') + @mock.patch.object(ovn_client.OVNClient, 'update_port') + def test_update_port_postcommit_live_migration_revision_mismatch_once( + self, mock_update_port, mock_notify_dhcp): + self.plugin.update_port_status = mock.Mock() + self.plugin.get_port = mock.Mock(return_value=mock.MagicMock()) + + fake_context = mock.MagicMock() + fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_ACTIVE, + portbindings.PROFILE: {}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + original_fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_ACTIVE, + portbindings.PROFILE: { + ovn_const.MIGRATING_ATTR: fake_port[portbindings.HOST_ID]}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + + fake_ctx = mock.Mock(current=fake_port, original=original_fake_port, + _plugin_context=fake_context) + mock_update_port.side_effect = [ + ovn_exceptions.RevisionConflict( + resource_id=fake_port['id'], + resource_type=ovn_const.TYPE_PORTS), + None] + + self.mech_driver.update_port_postcommit(fake_ctx) + + self.plugin.update_port_status.assert_not_called() + self.plugin.get_port.assert_called_once_with( + mock.ANY, fake_port['id']) + self.assertEqual(2, mock_update_port.call_count) + mock_notify_dhcp.assert_called_with(fake_port['id']) + + @mock.patch.object(mech_driver.OVNMechanismDriver, + '_is_port_provisioning_required', lambda *_: True) + @mock.patch.object(mech_driver.OVNMechanismDriver, '_notify_dhcp_updated') + @mock.patch.object(ovn_client.OVNClient, 'update_port') + def test_update_port_postcommit_revision_mismatch_not_after_live_migration( + self, mock_update_port, mock_notify_dhcp): + self.plugin.update_port_status = mock.Mock() + self.plugin.get_port = mock.Mock(return_value=mock.MagicMock()) + + fake_context = mock.MagicMock() + fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_ACTIVE, + portbindings.PROFILE: {}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + original_fake_port = fakes.FakePort.create_one_port( + attrs={ + 'status': const.PORT_STATUS_DOWN, + portbindings.PROFILE: {}, + portbindings.VIF_TYPE: portbindings.VIF_TYPE_OVS}).info() + + fake_ctx = mock.Mock(current=fake_port, original=original_fake_port, + _plugin_context=fake_context) + mock_update_port.side_effect = [ + ovn_exceptions.RevisionConflict( + resource_id=fake_port['id'], + resource_type=ovn_const.TYPE_PORTS), + None] + + self.mech_driver.update_port_postcommit(fake_ctx) + + self.plugin.update_port_status.assert_not_called() + self.plugin.get_port.assert_not_called() + self.assertEqual(1, mock_update_port.call_count) + mock_notify_dhcp.assert_called_with(fake_port['id']) + def _add_chassis(self, nb_cfg): chassis_private = mock.Mock() chassis_private.nb_cfg = nb_cfg