diff --git a/doc/source/_static/images/live-migration.svg b/doc/source/_static/images/live-migration.svg
index 4c26a7f3ae7a..293461594e98 100644
--- a/doc/source/_static/images/live-migration.svg
+++ b/doc/source/_static/images/live-migration.svg
[SVG diff elided: only the text runs of the rendered markup survived extraction.
The live-migration sequence diagram (Conductor / Source / Destination) is
regenerated from its embedded seqdiag source. The update adds a failure leg to
the flow: when pre_live_migration fails on the destination, the destination
handles the exception locally ("except Exception: disconnect volumes, delete
new attachments") before re-raising, and the source then casts
rollback_live_migration_at_destination. The pre-existing legs
(check_can_live_migrate_destination/source, live_migrate, pre_live_migration,
driver.live_migration success/failure, post_live_migration,
post_live_migration_at_destination, _rollback_live_migration,
remove_volume_connections, rollback_live_migration_at_destination) are
unchanged.]
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 73e3f82930c1..426ac66199a2 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -9316,43 +9316,56 @@ class ComputeManager(manager.Manager):
            bdm.attachment_id = attach_ref['id']
            bdm.save()

+        # Retrieve connection_info for the destination. Note that it is not
+        # saved back to the database yet.
block_device_info = self._get_instance_block_device_info( context, instance, refresh_conn_info=True, bdms=bdms) - # The driver pre_live_migration will plug vifs on the host - migrate_data = self.driver.pre_live_migration(context, - instance, - block_device_info, - network_info, - disk, - migrate_data) - LOG.debug('driver pre_live_migration data is %s', migrate_data) - # driver.pre_live_migration is what plugs vifs on the destination - # host so now we can set the wait_for_vif_plugged flag in the - # migrate_data object which the source compute will use to - # determine if it should wait for a 'network-vif-plugged' event - # from neutron before starting the actual guest transfer in the - # hypervisor - using_multiple_port_bindings = ( - 'vifs' in migrate_data and migrate_data.vifs) - migrate_data.wait_for_vif_plugged = ( - CONF.compute.live_migration_wait_for_vif_plug and - using_multiple_port_bindings - ) + # The driver pre_live_migration will plug vifs and connect volumes + # on the host + try: + migrate_data = self.driver.pre_live_migration( + context, instance, block_device_info, network_info, disk, + migrate_data) - # NOTE(tr3buchet): setup networks on destination host - self.network_api.setup_networks_on_host(context, instance, - self.host) + LOG.debug('driver pre_live_migration data is %s', migrate_data) + # driver.pre_live_migration is what plugs vifs on the + # destination host so now we can set the wait_for_vif_plugged + # flag in the migrate_data object which the source compute will + # use to determine if it should wait for a + # 'network-vif-plugged' event from neutron before starting the + # actual guest transfer in the hypervisor + using_multiple_port_bindings = ( + 'vifs' in migrate_data and migrate_data.vifs) + migrate_data.wait_for_vif_plugged = ( + CONF.compute.live_migration_wait_for_vif_plug and + using_multiple_port_bindings + ) - # NOTE(lyarwood): The above call to driver.pre_live_migration - # can result in the virt drivers attempting to stash additional - # metadata into the connection_info of the underlying bdm. - # Ensure this is saved to the database by calling .save() against - # the driver BDMs we passed down via block_device_info. - for driver_bdm in block_device_info['block_device_mapping']: - driver_bdm.save() + # NOTE(tr3buchet): setup networks on destination host + self.network_api.setup_networks_on_host(context, instance, + self.host) + # NOTE(lyarwood): The above call to driver.pre_live_migration + # can result in the virt drivers attempting to stash additional + # metadata into the connection_info of the underlying bdm. + # Ensure this is saved to the database by calling .save() + # against the driver BDMs we passed down via block_device_info. + for driver_bdm in block_device_info['block_device_mapping']: + driver_bdm.save() + except Exception: + # NOTE(melwitt): Try to disconnect any volumes which may have + # been connected during driver pre_live_migration(). By the + # time this error is received by the source host, BDM records + # in the database will refer only to the source host. Detach + # volumes while we still have connection_info about the + # destination host. + for driver_bdm in block_device_info['block_device_mapping']: + driver_bdm.driver_detach( + context, instance, self.volume_api, self.driver) + # Re-raise to perform any remaining rollback actions. + raise except Exception: # If we raise, migrate_data with the updated attachment ids # will not be returned to the source host for rollback. 
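[Editor's note on the hunk above: the essence of the fix is that whatever the
destination connected during pre_live_migration, the destination itself
disconnects before the exception travels back to the source. By the time the
source sees the failure, the BDM records have been rolled back to point at the
source, so only the destination still holds usable destination-side
connection_info. A minimal, self-contained sketch of that pattern follows;
FakeVolume and the simulated failure are illustrative stand-ins, not Nova code.]

```python
# Sketch of the local-rollback pattern: connect resources on this host,
# and on any later failure disconnect them *here* before re-raising,
# because only this host still knows how they were connected.

class FakeVolume:
    def __init__(self, name):
        self.name = name
        self.connected = False


def pre_live_migration(volumes):
    connected = []
    try:
        for vol in volumes:
            vol.connected = True      # stands in for a driver volume connect
            connected.append(vol)
        # A later step (e.g. network setup) fails after the volumes are
        # already connected.
        raise RuntimeError('simulated failure after volumes connected')
    except Exception:
        # Roll back locally, then re-raise so the caller (the "source")
        # can run its remaining rollback actions.
        for vol in connected:
            vol.connected = False
        raise


vols = [FakeVolume('vol1'), FakeVolume('vol2')]
try:
    pre_live_migration(vols)
except RuntimeError:
    pass
assert not any(v.connected for v in vols)
```

[The real handler does the same with driver_bdm.driver_detach() before
re-raising into the outer except block.]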
@@ -10321,8 +10334,15 @@ class ComputeManager(manager.Manager): # TODO(lyarwood): Turn the following into a lookup method within # BlockDeviceMappingList. vol_bdms = [bdm for bdm in bdms if bdm.is_volume] - self._remove_remote_volume_connections(context, dest, vol_bdms, - instance) + + if not pre_live_migration: + # This will do both a driver detach and a Cinder attachment delete. + # If we are in here due to a pre_live_migration failure, BDMs have + # already been rolled back to contain info for the source, so don't + # try to remove volume connections on the destination. + # See ComputeManager.pre_live_migration() for details. + self._remove_remote_volume_connections( + context, dest, vol_bdms, instance) self._rollback_volume_bdms(context, vol_bdms, source_bdms, instance) self._notify_about_instance_usage(context, instance, @@ -10469,6 +10489,17 @@ class ComputeManager(manager.Manager): # from remote volumes if necessary block_device_info = self._get_instance_block_device_info(context, instance) + # NOTE(melwitt): By the time we get here, the instance BDMs have + # already been rolled back to contain info for the source during + # _rollback_live_migration(). + # The code above predates the addition of rollback of the instance + # BDM records to point at the source. It also predates the addition + # of a driver detach call to remove_volume_connection(). + # Set the list for Cinder volumes to empty to avoid attempting to + # disconnect volumes during driver.cleanup() on the destination. + bdi_without_volumes = copy.deepcopy(block_device_info) + bdi_without_volumes['block_device_mapping'] = [] + # free any instance PCI claims done on destination during # check_can_live_migrate_destination() self.rt.free_pci_device_claims_for_instance(context, instance) @@ -10478,7 +10509,7 @@ class ComputeManager(manager.Manager): # specific resources like vpmem with instance.mutated_migration_context(): self.driver.rollback_live_migration_at_destination( - context, instance, network_info, block_device_info, + context, instance, network_info, bdi_without_volumes, destroy_disks=destroy_disks, migrate_data=migrate_data) self._notify_about_instance_usage( diff --git a/nova/tests/functional/regressions/test_bug_1899835.py b/nova/tests/functional/regressions/test_bug_1899835.py index 96301d1de2aa..780a98f511ec 100644 --- a/nova/tests/functional/regressions/test_bug_1899835.py +++ b/nova/tests/functional/regressions/test_bug_1899835.py @@ -13,6 +13,7 @@ from unittest import mock import fixtures +from oslo_serialization import jsonutils from nova import context from nova import objects @@ -20,8 +21,6 @@ from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional.libvirt import base -from oslo_serialization import jsonutils - class TestVolumeDisconnectDuringPreLiveMigrationRollback(base.ServersTestBase): """Regression test for bug #1899835 @@ -75,13 +74,6 @@ class TestVolumeDisconnectDuringPreLiveMigrationRollback(base.ServersTestBase): mock_src_connector.assert_called_once() mock_src_connect.assert_called_once() - # Fetch the connection_info from the src - ctxt = context.get_admin_context() - bdm = objects.BlockDeviceMapping.get_by_volume_id( - ctxt, nova_fixtures.CinderFixture.IMAGE_BACKED_VOL, - instance_uuid=server['id']) - src_connection_info = jsonutils.loads(bdm.connection_info) - with test.nested( mock.patch.object( self.computes['dest'].driver, 'get_volume_connector'), @@ -106,19 +98,106 @@ class TestVolumeDisconnectDuringPreLiveMigrationRollback(base.ServersTestBase): 
        # Assert that connect_volume hasn't been called on the dest
        mock_dest_connect.assert_not_called()

-        # FIXME(lyarwood): This is bug #1899835, disconnect_volume shouldn't be
-        # called on the destination host without connect_volume first being
-        # called and especially using with the connection_info from the source
-        self.assertEqual(2, mock_dest_disconnect.call_count)
-        # First call is from ComputeManager._remove_volume_connection() called
-        # eventually from ComputeManager._rollback_live_migration() on the
-        # source.
-        call1 = mock.call(
-            mock.ANY, src_connection_info, mock.ANY, encryption=mock.ANY)
-        # Second call is from LibvirtDriver.destroy() =>
-        # LibvirtDriver.cleanup() on the destination as part of
-        # ComputeManager.rollback_live_migration_at_destination().
-        call2 = mock.call(
-            mock.ANY, src_connection_info, mock.ANY, destroy_secrets=True,
-            force=True)
-        mock_dest_disconnect.assert_has_calls([call1, call2])
+        # Assert that disconnect_volume was not called on the destination host
+        self.assertEqual(0, mock_dest_disconnect.call_count)
+
+    def test_pre_live_migration_failed_after_new_attachment_created(self):
+        """Test the case where live migration fails during pre_live_migration
+        after the BDM record's attachment_id has been updated to a new Cinder
+        attachment but its connection_info has not yet been updated.
+
+        In other words, BlockDeviceMapping.attachment_id is pointing at the
+        destination but BlockDeviceMapping.connection_info is still pointing
+        at the source.
+
+        _disconnect_volume() should run on the destination with the
+        destination's connection_info despite the BDM database record still
+        containing the source's connection_info.
+        """
+        server = {
+            'name': 'test',
+            'imageRef': '',
+            'flavorRef': 1,
+            'networks': 'none',
+            'host': 'src',
+            'block_device_mapping_v2': [{
+                'source_type': 'volume',
+                'destination_type': 'volume',
+                'boot_index': 0,
+                'uuid': nova_fixtures.CinderFixture.IMAGE_BACKED_VOL
+            }]
+        }
+
+        with test.nested(
+            mock.patch.object(
+                self.computes['src'].driver, 'get_volume_connector'),
+            mock.patch.object(
+                self.computes['src'].driver, '_connect_volume'),
+        ) as (
+            mock_src_connector, mock_src_connect
+        ):
+            server = self.api.post_server({'server': server})
+            server = self._wait_for_state_change(server, 'ACTIVE')
+
+        # Assert that we called the src connector and connect mocks
+        mock_src_connector.assert_called_once()
+        mock_src_connect.assert_called_once()
+
+        # Fetch the connection_info from the src
+        ctxt = context.get_admin_context()
+        bdm = objects.BlockDeviceMapping.get_by_volume_id(
+            ctxt, nova_fixtures.CinderFixture.IMAGE_BACKED_VOL,
+            instance_uuid=server['id'])
+        src_connection_info = jsonutils.loads(bdm.connection_info)
+
+        with test.nested(
+            mock.patch.object(
+                self.computes['dest'].driver, 'get_volume_connector'),
+            mock.patch.object(
+                self.computes['dest'].driver, '_connect_volume'),
+            mock.patch.object(
+                self.computes['dest'].driver, '_disconnect_volume'),
+            mock.patch.object(
+                self.computes['dest'].manager.network_api,
+                'setup_networks_on_host',
+                # Fail setup networks during pre_live_migration but have it
+                # succeed during rollback
+                side_effect=[test.TestingException, None], autospec=False),
+        ) as (
+            mock_dest_connector, mock_dest_connect, mock_dest_disconnect,
+            mock_setup_networks
+        ):
+            # Attempt to live migrate and ensure it ends up marked as failed
+            self._live_migrate(server, 'failed')
+
+        # Assert that we called the dest connector and setup networks mocks
+        mock_dest_connector.assert_called_once()
+        mock_setup_networks.assert_called()
+
+        # Assert that connect_volume has been called on the dest
+        mock_dest_connect.assert_called()
+
+        # Assert that disconnect_volume was called on the destination host
+        # and with connection_info for the destination
+        mock_dest_disconnect.assert_called_once()
+        # def _disconnect_volume(self, context, connection_info, instance, ...)
+        dest_connection_info = mock_dest_disconnect.call_args.args[1]
+        volumes_attached = server['os-extended-volumes:volumes_attached']
+        self.assertEqual(1, len(volumes_attached))
+        # The volume_id in the destination connection_info should match the
+        # volume attached to the server
+        self.assertEqual(
+            volumes_attached[0]['id'],
+            dest_connection_info['data']['volume_id'])
+        # The attachment_id should not be the same as the source (new
+        # attachment_id)
+        self.assertNotEqual(
+            src_connection_info['data']['attachment_id'],
+            dest_connection_info['data']['attachment_id'])
+
+        # The connection_info in the database should still be for the source
+        bdm = objects.BlockDeviceMapping.get_by_volume_id(
+            ctxt, nova_fixtures.CinderFixture.IMAGE_BACKED_VOL,
+            instance_uuid=server['id'])
+        current_connection_info = jsonutils.loads(bdm.connection_info)
+        self.assertEqual(src_connection_info, current_connection_info)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index cb2dcaa8f252..0a64a47d74a6 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -6826,9 +6826,6 @@ class ComputeTestCase(BaseTestCase,
         final_bdm.save.assert_called()
         mock_get_bdms.assert_called_with(c, instance.uuid)

-        mock_remove.assert_has_calls([
-            mock.call(c, instance, uuids.volume_id_1, dest_host),
-            mock.call(c, instance, uuids.volume_id_2, dest_host)])
         mock_rollback.assert_called_once_with(c, instance, dest_host,
                                               destroy_disks=True,
                                               migrate_data=test.MatchType(
diff --git a/releasenotes/notes/pre-live-migration-rollback-7de399d9b3979f79.yaml b/releasenotes/notes/pre-live-migration-rollback-7de399d9b3979f79.yaml
new file mode 100644
index 000000000000..d86b6d66fe7d
--- /dev/null
+++ b/releasenotes/notes/pre-live-migration-rollback-7de399d9b3979f79.yaml
@@ -0,0 +1,9 @@
+fixes:
+  - |
+    When live migration fails during pre_live_migration on the destination,
+    rollback will now disconnect Cinder volumes from the destination locally
+    instead of remotely over RPC from the source. This ensures that only the
+    destination's connection_info is used to disconnect volumes from the
+    destination.
+    See `bug #1899835 <https://bugs.launchpad.net/nova/+bug/1899835>`_ for
+    more details.
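[A closing note on the new functional test: the failure injection relies on
unittest.mock's list-valued side_effect, where consecutive calls consume
successive elements and any element that is an exception class or instance is
raised. That is how setup_networks_on_host fails during pre_live_migration and
then succeeds during the rollback. A standalone demonstration follows; the
NetworkAPI class is a hypothetical stand-in, only the mock behavior is the
point.]

```python
from unittest import mock


class NetworkAPI:
    """Hypothetical stand-in for the compute manager's network API."""

    def setup_networks_on_host(self, host):
        pass


api = NetworkAPI()
with mock.patch.object(
        api, 'setup_networks_on_host',
        side_effect=[RuntimeError('fail during pre_live_migration'), None]):
    try:
        api.setup_networks_on_host('dest')  # first call: raises
    except RuntimeError:
        pass
    api.setup_networks_on_host('dest')      # second call (rollback): succeeds
    assert api.setup_networks_on_host.call_count == 2
```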