From f8a22c7d4aa654eaad3b683073849c873ea3beff Mon Sep 17 00:00:00 2001
From: Rodolfo Alonso Hernandez
Date: Thu, 3 Dec 2020 11:58:35 +0000
Subject: [PATCH] [OVS] Fix live-migration connection disruption

The goal of this patch is to avoid the connection disruption during
live-migration when using OVS.

Since [1], when a port is migrated, both the source and the destination
hosts are added to the port binding information. Initially, the source
host binding is active and the destination one is inactive.

Previously, when the port was created in the destination host (created
by Nova), it was not configured because its binding was not active. The
binding (that is, the creation of all the OpenFlow rules) was done only
when Nova sent the port activation, which happened when the VM was
already running in the destination host. If the OVS agent was heavily
loaded, the port was bound several seconds after the port activation.
Instead, this patch enables the OpenFlow rule creation in the
destination host as soon as the port is created.

Another problem is the "neutron-vif-plugged" events sent by Neutron to
Nova to inform about the port binding. Nova expects one single event
informing about the destination port binding; at that moment, Nova
considers the port bound and ready to transmit data. Several triggers
were unexpectedly firing this event:
- When the port binding was updated, the port was set to down and then
  up again, forcing this event.
- When the port binding was updated, the binding was first deleted and
  then updated with the new information. That caused the source host to
  set the port down and then up again, sending the event.

This patch removes those events and sends the "neutron-vif-plugged"
event only when the port is bound to the destination host (and, as
described above, this now happens regardless of the binding activation
status).

This feature depends on [2]. If that Nova patch is not in place, Nova
will never plug the port in the destination host and Neutron won't be
able to send the vif-plugged event to Nova to finish the live-migration
process. Because Neutron cannot query Nova to know whether that patch
is in place, a new temporary configuration option has been created to
enable this feature. The default value is "False", which means Neutron
behaves as before.
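As an illustration of the event gating described above, here is a
minimal sketch (not part of this patch; the function name and the
"l2_agent" trigger literal are placeholders for the logic in
Ml2Plugin._port_provisioned and provisioning_blocks.L2_AGENT_ENTITY):

    def should_notify_nova(live_migration_events, port_is_migrating,
                           trigger):
        # Legacy behavior: any entity that completes provisioning (DHCP
        # agent or L2 agent) lets the port go ACTIVE and Nova is notified.
        if not (live_migration_events and port_is_migrating):
            return True
        # New behavior: during live-migration, only the L2 agent wiring
        # the port on the destination host triggers "neutron-vif-plugged".
        return trigger == 'l2_agent'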

[1]https://bugs.launchpad.net/neutron/+bug/1580880
[2]https://review.opendev.org/c/openstack/nova/+/767368

Closes-Bug: #1901707
Change-Id: Iee323943ac66e566e5a5e92de1861832e86fc7fc
---
 neutron/agent/rpc.py                          | 38 +++++++++++++++++--
 neutron/conf/common.py                        | 18 +++++++++
 neutron/db/provisioning_blocks.py             |  3 +-
 neutron/notifiers/nova.py                     | 16 ++++++++
 .../openvswitch/agent/ovs_neutron_agent.py    | 23 +++++++----
 neutron/plugins/ml2/plugin.py                 | 16 +++++++-
 neutron/tests/unit/agent/test_rpc.py          | 21 ++++++++++
 .../agent/test_ovs_neutron_agent.py           | 15 ++++----
 zuul.d/tempest-multinode.yaml                 |  5 +++
 9 files changed, 134 insertions(+), 21 deletions(-)

diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py
index ebfb53e815f..990aef9d209 100644
--- a/neutron/agent/rpc.py
+++ b/neutron/agent/rpc.py
@@ -25,8 +25,10 @@ from neutron_lib.callbacks import resources as callback_resources
 from neutron_lib import constants
 from neutron_lib.plugins import utils
 from neutron_lib import rpc as lib_rpc
+from oslo_config import cfg
 from oslo_log import log as logging
 import oslo_messaging
+from oslo_serialization import jsonutils
 from oslo_utils import uuidutils
 
 from neutron.agent import resource_cache
@@ -329,8 +331,10 @@ class CacheBackedPluginApi(PluginApi):
         binding = utils.get_port_binding_by_status_and_host(
             port_obj.bindings, constants.ACTIVE, raise_if_not_found=True,
             port_id=port_obj.id)
-        if (port_obj.device_owner.startswith(
-                constants.DEVICE_OWNER_COMPUTE_PREFIX) and
+        migrating_to = migrating_to_host(port_obj.bindings)
+        if (not (migrating_to and cfg.CONF.nova.live_migration_events) and
+                port_obj.device_owner.startswith(
+                    constants.DEVICE_OWNER_COMPUTE_PREFIX) and
                 binding[pb_ext.HOST] != host):
             LOG.debug("Device %s has no active binding in this host",
                       port_obj)
@@ -366,7 +370,8 @@ class CacheBackedPluginApi(PluginApi):
             'profile': binding.profile,
             'vif_type': binding.vif_type,
             'vnic_type': binding.vnic_type,
-            'security_groups': list(port_obj.security_group_ids)
+            'security_groups': list(port_obj.security_group_ids),
+            'migrating_to': migrating_to,
         }
         LOG.debug("Returning: %s", entry)
         return entry
@@ -381,3 +386,30 @@ class CacheBackedPluginApi(PluginApi):
         rcache = resource_cache.RemoteResourceCache(self.RESOURCE_TYPES)
         rcache.start_watcher()
         self.remote_resource_cache = rcache
+
+
+# TODO(ralonsoh): move this method to neutron_lib.plugins.utils
+def migrating_to_host(bindings, host=None):
+    """Return the host the port is being migrated to.
+
+    If "host" is passed, it is compared to the "migrating_to" value of the
+    port binding profile (the migration destination host). If not passed,
+    the method returns the destination host if the port is being migrated
+    ("migrating_to" is present in any port binding profile).
+
+    The function returns None or the matching host.
+    """
+    for binding in (binding for binding in bindings if
+                    binding[pb_ext.STATUS] == constants.ACTIVE):
+        profile = binding.get('profile')
+        if not profile:
+            continue
+        profile = (jsonutils.loads(profile) if isinstance(profile, str) else
+                   profile)
+        migrating_to = profile.get('migrating_to')
+        if migrating_to:
+            if not host:  # Just return the migration destination host.
+                return migrating_to
+            if migrating_to == host:
+                return migrating_to
+    return None
diff --git a/neutron/conf/common.py b/neutron/conf/common.py
index 57e875ca737..b44d09dd568 100644
--- a/neutron/conf/common.py
+++ b/neutron/conf/common.py
@@ -174,6 +174,24 @@ nova_opts = [
                help=_('Type of the nova endpoint to use. This endpoint will'
                      ' be looked up in the keystone catalog and should be'
                      ' one of public, internal or admin.')),
+    cfg.BoolOpt('live_migration_events', default=False,
+                help=_('When this option is enabled, during a live '
+                       'migration the OVS agent will only send the '
+                       '"vif-plugged-event" when the destination host '
+                       'interface is bound. This option also prevents any '
+                       'other agent (e.g. DHCP) from sending this event to '
+                       'Nova when the port is provisioned. '
+                       'This option can be enabled if the Nova patch '
+                       'https://review.opendev.org/c/openstack/nova/+/767368 '
+                       'is in place. '
+                       'This option is temporary and will be removed in Y; '
+                       'the behavior will then always be "True".'),
+                deprecated_for_removal=True,
+                deprecated_reason=(
+                    'In Y the Nova patch '
+                    'https://review.opendev.org/c/openstack/nova/+/767368 '
+                    'will be in the code even when running a Nova server in '
+                    'X.')),
 ]
 
 
diff --git a/neutron/db/provisioning_blocks.py b/neutron/db/provisioning_blocks.py
index 22c48277179..8e5ee499e76 100644
--- a/neutron/db/provisioning_blocks.py
+++ b/neutron/db/provisioning_blocks.py
@@ -138,8 +138,7 @@ def provisioning_complete(context, object_id, object_type, entity):
             context, standard_attr_id=standard_attr_id):
         LOG.debug("Provisioning complete for %(otype)s %(oid)s triggered by "
                   "entity %(entity)s.", log_dict)
-        registry.publish(object_type, PROVISIONING_COMPLETE,
-                         'neutron.db.provisioning_blocks',
+        registry.publish(object_type, PROVISIONING_COMPLETE, entity,
                          payload=events.DBEventPayload(
                              context, resource_id=object_id))
 
diff --git a/neutron/notifiers/nova.py b/neutron/notifiers/nova.py
index 804fb73964e..5165cb4ac0a 100644
--- a/neutron/notifiers/nova.py
+++ b/neutron/notifiers/nova.py
@@ -13,6 +13,8 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import contextlib
+
 from keystoneauth1 import loading as ks_loading
 from neutron_lib.callbacks import events
 from neutron_lib.callbacks import registry
@@ -66,6 +68,16 @@ class Notifier(object):
                            if ext.name == "server_external_events"]
         self.batch_notifier = batch_notifier.BatchNotifier(
             cfg.CONF.send_events_interval, self.send_events)
+        self._enabled = True
+
+    @contextlib.contextmanager
+    def context_enabled(self, enabled):
+        stored_enabled = self._enabled
+        try:
+            self._enabled = enabled
+            yield
+        finally:
+            self._enabled = stored_enabled
 
     def _get_nova_client(self):
         global_id = common_context.generate_request_id()
@@ -164,6 +176,10 @@ class Notifier(object):
             return self._get_network_changed_event(port)
 
     def _can_notify(self, port):
+        if not self._enabled:
+            LOG.debug("Nova notifier disabled")
+            return False
+
         if not port.id:
             LOG.warning("Port ID not set! Nova will not be notified of "
                         "port status change.")
diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
index 3924d8c7915..8693012b025 100644
--- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
+++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
@@ -181,8 +181,8 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
                         "DVR and tunneling are enabled, setting to True.")
             self.arp_responder_enabled = True
 
-        host = self.conf.host
-        self.agent_id = 'ovs-agent-%s' % host
+        self.host = self.conf.host
+        self.agent_id = 'ovs-agent-%s' % self.host
 
         # Validate agent configurations
         self._check_agent_configurations()
@@ -267,7 +267,7 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
             self.phys_ofports,
             self.patch_int_ofport,
             self.patch_tun_ofport,
-            host,
+            self.host,
             self.enable_tunneling,
             self.enable_distributed_routing)
 
@@ -312,7 +312,7 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         # or which are used by specific extensions.
         self.agent_state = {
             'binary': n_const.AGENT_PROCESS_OVS,
-            'host': host,
+            'host': self.host,
             'topic': n_const.L2_AGENT_TOPIC,
             'configurations': {'bridge_mappings': self.bridge_mappings,
                                n_const.RP_BANDWIDTHS: self.rp_bandwidths,
@@ -1890,6 +1890,7 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         skipped_devices = []
         need_binding_devices = []
         binding_no_activated_devices = set()
+        migrating_devices = set()
         agent_restarted = self.iter_num == 0
         devices_details_list = (
             self.plugin_rpc.get_devices_details_list_and_failed_devices(
@@ -1919,6 +1920,12 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
             if not port.ofport or port.ofport == ovs_lib.INVALID_OFPORT:
                 devices_not_in_datapath.add(device)
 
+            migrating_to = details.get('migrating_to')
+            if migrating_to and migrating_to != self.host:
+                LOG.info('Port %(device)s is being migrated to host %(host)s.',
+                         {'device': device, 'host': migrating_to})
+                migrating_devices.add(device)
+
             if 'port_id' in details:
                 LOG.info("Port %(device)s updated. Details: %(details)s",
                          {'device': device, 'details': details})
@@ -1956,7 +1963,8 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
                 if (port and port.ofport != -1):
                     self.port_dead(port)
         return (skipped_devices, binding_no_activated_devices,
-                need_binding_devices, failed_devices, devices_not_in_datapath)
+                need_binding_devices, failed_devices, devices_not_in_datapath,
+                migrating_devices)
 
     def _update_port_network(self, port_id, network_id):
         self._clean_network_ports(port_id)
@@ -2051,11 +2059,12 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         skipped_devices = set()
         binding_no_activated_devices = set()
         devices_not_in_datapath = set()
+        migrating_devices = set()
         start = time.time()
         if devices_added_updated:
             (skipped_devices, binding_no_activated_devices,
              need_binding_devices, failed_devices['added'],
-             devices_not_in_datapath) = (
+             devices_not_in_datapath, migrating_devices) = (
                 self.treat_devices_added_or_updated(
                     devices_added_updated, provisioning_needed, re_added))
             LOG.info("process_network_ports - iteration:%(iter_num)d - "
@@ -2078,7 +2087,7 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         # TODO(salv-orlando): Optimize avoiding applying filters
         # unnecessarily, (eg: when there are no IP address changes)
         added_ports = (port_info.get('added', set()) - skipped_devices -
-                       binding_no_activated_devices)
+                       binding_no_activated_devices - migrating_devices)
         self._add_port_tag_info(need_binding_devices)
         self.process_install_ports_egress_flows(need_binding_devices)
         added_to_datapath = added_ports - devices_not_in_datapath
diff --git a/neutron/plugins/ml2/plugin.py b/neutron/plugins/ml2/plugin.py
index c9ffc340ef6..d5a8f5c3772 100644
--- a/neutron/plugins/ml2/plugin.py
+++ b/neutron/plugins/ml2/plugin.py
@@ -91,6 +91,7 @@ from sqlalchemy import or_
 from sqlalchemy.orm import exc as sa_exc
 
 from neutron._i18n import _
+from neutron.agent import rpc as agent_rpc
 from neutron.agent import securitygroups_rpc as sg_rpc
 from neutron.api.rpc.agentnotifiers import dhcp_rpc_agent_api
 from neutron.api.rpc.handlers import dhcp_rpc
@@ -342,8 +343,19 @@ class Ml2Plugin(db_base_plugin_v2.NeutronDbPluginV2,
             LOG.debug("Port %s is administratively disabled so it will "
                       "not transition to active.", port_id)
             return
-        self.update_port_status(
-            payload.context, port_id, const.PORT_STATUS_ACTIVE)
+
+        host_migrating = agent_rpc.migrating_to_host(
+            getattr(port, 'port_bindings', []))
+        if (host_migrating and cfg.CONF.nova.live_migration_events and
+                self.nova_notifier):
+            send_nova_event = bool(trigger ==
+                                   provisioning_blocks.L2_AGENT_ENTITY)
+            with self.nova_notifier.context_enabled(send_nova_event):
+                self.update_port_status(payload.context, port_id,
+                                        const.PORT_STATUS_ACTIVE)
+        else:
+            self.update_port_status(payload.context, port_id,
+                                    const.PORT_STATUS_ACTIVE)
 
     @log_helpers.log_method_call
     def _start_rpc_notifiers(self):
diff --git a/neutron/tests/unit/agent/test_rpc.py b/neutron/tests/unit/agent/test_rpc.py
index c6b312dbb0d..9d61f8c91b2 100644
--- a/neutron/tests/unit/agent/test_rpc.py
+++ b/neutron/tests/unit/agent/test_rpc.py
@@ -22,7 +22,9 @@ from neutron_lib.callbacks import events
 from neutron_lib.callbacks import resources
 from neutron_lib import constants
 from neutron_lib import rpc as n_rpc
+from oslo_config import cfg
 from oslo_context import context as oslo_context
+from oslo_serialization import jsonutils
 from oslo_utils import uuidutils
 
 from neutron.agent import rpc
@@ -305,6 +307,7 @@ class TestCacheBackedPluginApi(base.BaseTestCase):
         self.assertEqual(self._port_id, entry['port_id'])
         self.assertEqual(self._network_id, entry['network_id'])
         self.assertNotIn(constants.NO_ACTIVE_BINDING, entry)
+        self.assertIsNone(entry['migrating_to'])
 
     def test_get_device_details_binding_not_in_host(self):
         self._api.remote_resource_cache.get_resource_by_id.side_effect = [
@@ -314,8 +317,26 @@ class TestCacheBackedPluginApi(base.BaseTestCase):
         self.assertEqual(self._port_id, entry['device'])
         self.assertNotIn('port_id', entry)
         self.assertNotIn('network_id', entry)
+        self.assertNotIn('migrating_to', entry)
         self.assertIn(constants.NO_ACTIVE_BINDING, entry)
 
+    def test_get_device_details_migrating_to_host(self):
+        for live_migration_events, migrating_to in ((True, 'host2'),
+                                                     (False, 'irrelevant')):
+            cfg.CONF.set_override('live_migration_events',
+                                  live_migration_events, group='nova')
+            profile = jsonutils.dumps({'migrating_to': migrating_to})
+            self._port.bindings[0].profile = profile
+            self._api.remote_resource_cache.get_resource_by_id.side_effect = [
+                self._port, self._network]
+            entry = self._api.get_device_details(mock.ANY, self._port_id,
+                                                 mock.ANY, 'host2')
+            if live_migration_events:
+                self.assertEqual('host2', entry['migrating_to'])
+            else:
+                self.assertTrue(entry[constants.NO_ACTIVE_BINDING])
+                self.assertNotIn('migrating_to', entry)
+
     @mock.patch('neutron.agent.resource_cache.RemoteResourceCache')
     def test_initialization_with_default_resources(self, rcache_class):
         rcache_obj = mock.MagicMock()
diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
index aafc9449a0b..5a3fbf29cb6 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
@@ -884,7 +884,7 @@ class TestOvsNeutronAgent(object):
                                'get_port_tag_dict',
                                return_value={}),\
                 mock.patch.object(self.agent, func_name) as func:
-            skip_devs, _, need_bound_devices, _, _ = (
+            skip_devs, _, need_bound_devices, _, _, _ = (
                 self.agent.treat_devices_added_or_updated([], False, set()))
             # The function should not raise
             self.assertFalse(skip_devs)
@@ -902,7 +902,7 @@ class TestOvsNeutronAgent(object):
                                'get_vifs_by_ids',
                                return_value={details['device']: port}),\
                 mock.patch.object(self.agent, 'port_dead') as func:
-            skip_devs, binding_no_activated_devices, _, _, _ = (
+            skip_devs, binding_no_activated_devices, _, _, _, _ = (
                 self.agent.treat_devices_added_or_updated([], False, set()))
             self.assertFalse(skip_devs)
             self.assertTrue(func.called)
@@ -979,8 +979,9 @@ class TestOvsNeutronAgent(object):
                 [], False, set())
             # The function should return False for resync and no device
             # processed
-            self.assertEqual((['the_skipped_one'], set(), [], set(), set()),
-                             skip_devs)
+            self.assertEqual(
+                (['the_skipped_one'], set(), [], set(), set(), set()),
+                skip_devs)
             ext_mgr_delete_port.assert_called_once_with(
                 self.agent.context, {'port_id': 'the_skipped_one'})
             treat_vif_port.assert_not_called()
@@ -997,7 +998,7 @@ class TestOvsNeutronAgent(object):
                 mock.patch.object(self.agent,
                                   'treat_vif_port') as treat_vif_port:
             failed_devices = {'added': set(), 'removed': set()}
-            (_, _, _, failed_devices['added'], _) = (
+            (_, _, _, failed_devices['added'], _, _) = (
                 self.agent.treat_devices_added_or_updated([], False, set()))
             # The function should return False for resync and no device
             # processed
@@ -1028,7 +1029,7 @@ class TestOvsNeutronAgent(object):
                                return_value={}),\
                 mock.patch.object(self.agent,
                                   'treat_vif_port') as treat_vif_port:
-            skip_devs, _, need_bound_devices, _, _ = (
+            skip_devs, _, need_bound_devices, _, _, _ = (
                 self.agent.treat_devices_added_or_updated([], False, set()))
             # The function should return False for resync
             self.assertFalse(skip_devs)
@@ -1138,7 +1139,7 @@ class TestOvsNeutronAgent(object):
                     return_value=(
                         skipped_devices, binding_no_activated_devices,
                         [], failed_devices['added'],
-                        set())) as device_added_updated,\
+                        set(), set())) as device_added_updated,\
                 mock.patch.object(self.agent.int_br, "get_ports_attributes",
                                   return_value=[]),\
                 mock.patch.object(self.agent,
diff --git a/zuul.d/tempest-multinode.yaml b/zuul.d/tempest-multinode.yaml
index f0987bddf9e..faa3c8f4886 100644
--- a/zuul.d/tempest-multinode.yaml
+++ b/zuul.d/tempest-multinode.yaml
@@ -136,6 +136,11 @@
         s-container: false
         s-object: false
        s-proxy: false
+      devstack_local_conf:
+        post-config:
+          $NEUTRON_CONF:
+            nova:
+              live_migration_events: True
     group-vars:
       subnode:
         devstack_services:
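
Reviewer note (not part of the patch): the sketch below mirrors the
migrating_to_host() helper added to neutron/agent/rpc.py, but uses plain
dict bindings, a literal 'ACTIVE' status and the json module instead of
Neutron's port binding objects, constants and oslo_serialization, so the
multiple-port-binding profile it inspects can be seen with concrete data.

    import json

    def migrating_to_host(bindings, host=None):
        """Return the migration destination host, or None."""
        for binding in (b for b in bindings if b['status'] == 'ACTIVE'):
            profile = binding.get('profile')
            if not profile:
                continue
            if isinstance(profile, str):
                profile = json.loads(profile)
            migrating_to = profile.get('migrating_to')
            if migrating_to and (not host or migrating_to == host):
                return migrating_to
        return None

    # During live-migration the active (source) binding profile carries the
    # destination host (see the helper docstring in neutron/agent/rpc.py).
    bindings = [
        {'status': 'ACTIVE', 'host': 'host1',
         'profile': json.dumps({'migrating_to': 'host2'})},
        {'status': 'INACTIVE', 'host': 'host2', 'profile': ''},
    ]
    print(migrating_to_host(bindings))           # -> host2
    print(migrating_to_host(bindings, 'host2'))  # -> host2
    print(migrating_to_host(bindings, 'host3'))  # -> None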