From c741258f8fc069f223d233f46598f790834eb37b Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Thu, 14 Feb 2019 16:37:30 -0500
Subject: [PATCH] Add confirm_snapshot_based_resize_at_source compute method

This adds the confirm_snapshot_based_resize_at_source compute service
method which will be called over RPC from (super)conductor when
confirming a cross-cell resize.

This method is fairly simple and similar to confirm_resize(), but not
identical. It cleans up the guest disks, port bindings and volume
attachments, and frees up resource usage in the resource tracker and
placement.

The changes made to the instance object in the traditional
confirm_resize flow are not made here since conductor will simply
destroy the instance in the source cell database once the source host
has been cleaned up.

Another difference is that this method is called with the copy of the
instance from the source cell database, which still has its host/node
pointing at the source host/node. That is unlike a traditional resize,
where the instance host/node are set to the target host prior to
confirmation.

Also, the resize.confirm.start/end notifications are not sent from
here; conductor sends them because the confirmation process spans both
the source and target cells.

Part of blueprint cross-cell-resize

Change-Id: If6cdd627f66a66c412f3c1d4997baab3552a9a9c
---
 nova/compute/manager.py                     | 155 ++++++++++++++-
 nova/compute/rpcapi.py                      |  36 ++++
 nova/objects/service.py                     |   4 +-
 nova/tests/unit/compute/test_compute_mgr.py | 206 ++++++++++++++++++++
 nova/tests/unit/compute/test_rpcapi.py      |  27 +++
 5 files changed, 426 insertions(+), 2 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 27a519292281..1465848278d8 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -520,7 +520,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
 class ComputeManager(manager.Manager):
     """Manages the running instances from creation to destruction."""
 
-    target = messaging.Target(version='5.7')
+    target = messaging.Target(version='5.8')
 
     def __init__(self, compute_driver=None, *args, **kwargs):
         """Load configuration options and connect to the hypervisor."""
@@ -4377,6 +4377,159 @@ class ComputeManager(manager.Manager):
                        'migration_uuid': migration.uuid})
             raise
 
+    @wrap_exception()
+    @wrap_instance_event(prefix='compute')
+    @errors_out_migration
+    @wrap_instance_fault
+    def confirm_snapshot_based_resize_at_source(
+            self, ctxt, instance, migration):
+        """Confirms a snapshot-based resize on the source host.
+
+        Cleans the guest from the source hypervisor including disks and drops
+        the MoveClaim which will free up "old_flavor" usage from the
+        ResourceTracker.
+
+        Deletes the allocations held by the migration consumer against the
+        source compute node resource provider.
+
+        :param ctxt: nova auth request context targeted at the source cell
+        :param instance: Instance object being resized which should have the
+            "old_flavor" attribute set
+        :param migration: Migration object for the resize operation
+        """
+
+        @utils.synchronized(instance.uuid)
+        def do_confirm():
+            LOG.info('Confirming resize on source host.', instance=instance)
+            with self._error_out_instance_on_exception(ctxt, instance):
+                # TODO(mriedem): Could probably make this try/except/finally
+                # a context manager to share with confirm_resize().
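+                # The try/except below only adds logging and the operator
+                # hint on failure; the actual cleanup happens in the helper
+                # method, and the finally block ensures the migration-held
+                # allocations are removed even if that cleanup fails.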
+                try:
+                    self._confirm_snapshot_based_resize_at_source(
+                        ctxt, instance, migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    # TODO(mriedem): Should we catch and just log
+                    # AllocationDeleteFailed? What is the user's recourse if
+                    # we got this far but this fails? At this point the
+                    # instance is on the target host and the allocations
+                    # could just be manually cleaned up by the operator.
+                    self._delete_allocation_after_move(ctxt, instance,
+                                                       migration)
+        do_confirm()
+
+    def _confirm_snapshot_based_resize_at_source(
+            self, ctxt, instance, migration):
+        """Private version of confirm_snapshot_based_resize_at_source
+
+        This allows the main method to be decorated with error handlers.
+
+        :param ctxt: nova auth request context targeted at the source cell
+        :param instance: Instance object being resized which should have the
+            "old_flavor" attribute set
+        :param migration: Migration object for the resize operation
+        """
+        # Cleanup the guest from the hypervisor including local disks.
+        network_info = self.network_api.get_instance_nw_info(ctxt, instance)
+        LOG.debug('Cleaning up guest from source hypervisor including disks.',
+                  instance=instance)
+
+        # FIXME(mriedem): Per bug 1809095, _confirm_resize calls
+        # _get_updated_nw_info_with_pci_mapping here prior to unplugging
+        # VIFs on the source, but in our case we have already unplugged
+        # VIFs during prep_snapshot_based_resize_at_source, so what do we
+        # need to do about those kinds of ports? Do we need to wait to unplug
+        # VIFs until confirm like normal resize?
+
+        # Note that prep_snapshot_based_resize_at_source already destroyed the
+        # guest which disconnected volumes and unplugged VIFs but did not
+        # destroy disks in case something failed during the resize and the
+        # instance needed to be rebooted or rebuilt on the source host. Now
+        # that we are confirming the resize we want to cleanup the disks left
+        # on the source host. We call cleanup() instead of destroy() to avoid
+        # any InstanceNotFound confusion from the driver since the guest was
+        # already destroyed on this host. block_device_info=None and
+        # destroy_vifs=False means cleanup() will not try to disconnect
+        # volumes or unplug VIFs.
+        self.driver.cleanup(
+            ctxt, instance, network_info, block_device_info=None,
+            destroy_disks=True, destroy_vifs=False)
+
+        # Delete port bindings for the source host.
+        self._confirm_snapshot_based_resize_delete_port_bindings(
+            ctxt, instance, migration)
+
+        # Delete volume attachments for the source host.
+        self._delete_volume_attachments(ctxt, instance.get_bdms())
+
+        # Free up the old_flavor usage from the resource tracker for this
+        # host.
+        self.rt.drop_move_claim(
+            ctxt, instance, migration.source_node, instance.old_flavor,
+            prefix='old_')
+        instance.drop_migration_context()
+
+        migration.status = 'confirmed'
+        migration.save()
+
+    def _confirm_snapshot_based_resize_delete_port_bindings(
+            self, ctxt, instance, migration):
+        """Delete port bindings for the source host when confirming
+        snapshot-based resize on the source host.
+
+        :param ctxt: nova auth RequestContext
+        :param instance: Instance object that was resized/cold migrated
+        :param migration: Migration object for the resize/cold migrate
+        """
+        # setup_networks_on_host relies on the instance.host not being the
+        # same as the host we pass in, so we have to mutate the instance to
+        # effectively trick this code.
+        with utils.temporary_mutation(instance, host=migration.dest_compute):
+            LOG.debug('Deleting port bindings for source host.',
+                      instance=instance)
+            try:
+                self.network_api.setup_networks_on_host(
+                    ctxt, instance, host=self.host, teardown=True)
+            except exception.PortBindingDeletionFailed as e:
+                # Do not let this stop us from cleaning up since the guest
+                # is already gone.
+                LOG.error('Failed to delete port bindings from source host. '
+                          'Error: %s', six.text_type(e), instance=instance)
+
+    def _delete_volume_attachments(self, ctxt, bdms):
+        """Deletes volume attachment records for the given bdms.
+
+        This method will log but not re-raise any exceptions if the volume
+        attachment delete fails.
+
+        :param ctxt: nova auth request context used to make
+            DELETE /attachments/{attachment_id} requests to cinder.
+        :param bdms: objects.BlockDeviceMappingList representing volume
+            attachments to delete based on BlockDeviceMapping.attachment_id.
+        """
+        for bdm in bdms:
+            if bdm.attachment_id:
+                try:
+                    self.volume_api.attachment_delete(ctxt, bdm.attachment_id)
+                except Exception as e:
+                    LOG.error('Failed to delete volume attachment with ID '
+                              '%s. Error: %s', bdm.attachment_id,
+                              six.text_type(e),
+                              instance_uuid=bdm.instance_uuid)
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_event(prefix='compute')
diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py
index 75c3b804e642..837156939bc5 100644
--- a/nova/compute/rpcapi.py
+++ b/nova/compute/rpcapi.py
@@ -373,6 +373,7 @@ class ComputeAPI(object):
         * 5.5 - Add prep_snapshot_based_resize_at_dest()
         * 5.6 - Add prep_snapshot_based_resize_at_source()
         * 5.7 - Add finish_snapshot_based_resize_at_dest()
+        * 5.8 - Add confirm_snapshot_based_resize_at_source()
     '''
 
     VERSION_ALIASES = {
@@ -599,6 +600,41 @@ class ComputeAPI(object):
         return rpc_method(ctxt, 'confirm_resize',
                           instance=instance, migration=migration)
 
+    def confirm_snapshot_based_resize_at_source(
+            self, ctxt, instance, migration):
+        """Confirms a snapshot-based resize on the source host.
+
+        Cleans the guest from the source hypervisor including disks and drops
+        the MoveClaim which will free up "old_flavor" usage from the
+        ResourceTracker.
+
+        Deletes the allocations held by the migration consumer against the
+        source compute node resource provider.
+
+        This is a synchronous RPC call using the ``long_rpc_timeout``
+        configuration option.
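+
+        A hypothetical sketch of the caller side (per the commit message,
+        conductor drives this during cross-cell resize confirmation; the
+        ``compute_rpcapi`` attribute name is illustrative only)::
+
+            # In a conductor task confirming a cross-cell resize:
+            self.compute_rpcapi.confirm_snapshot_based_resize_at_source(
+                ctxt, instance, migration)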
+
+        :param ctxt: nova auth request context targeted at the source cell
+        :param instance: Instance object being resized which should have the
+            "old_flavor" attribute set
+        :param migration: Migration object for the resize operation
+        :raises: nova.exception.MigrationError if the source compute is too
+            old to perform the operation
+        :raises: oslo_messaging.exceptions.MessagingTimeout if the RPC call
+            times out
+        """
+        version = '5.8'
+        client = self.router.client(ctxt)
+        if not client.can_send_version(version):
+            raise exception.MigrationError(reason=_('Compute too old'))
+        cctxt = client.prepare(server=migration.source_compute,
+                               version=version,
+                               call_monitor_timeout=CONF.rpc_response_timeout,
+                               timeout=CONF.long_rpc_timeout)
+        return cctxt.call(
+            ctxt, 'confirm_snapshot_based_resize_at_source',
+            instance=instance, migration=migration)
+
     def detach_interface(self, ctxt, instance, port_id):
         version = '5.0'
         cctxt = self.router.client(ctxt).prepare(
diff --git a/nova/objects/service.py b/nova/objects/service.py
index 9ea0fc2a2c2e..88a67696858c 100644
--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
 
 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 44
+SERVICE_VERSION = 45
 
 
 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -169,6 +169,8 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '5.6'},
     # Version 44: Compute RPC version 5.7: finish_snapshot_based_resize_at_dest
     {'compute_rpc': '5.7'},
+    # Version 45: Compute RPC v5.8: confirm_snapshot_based_resize_at_source
+    {'compute_rpc': '5.8'},
 )
 
 
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 2b1ef19c471b..f9e96136df41 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -10908,6 +10908,212 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
             allocations=get_allocs.return_value,
             network_info=nwinfo,
             block_device_info=_prep_block_device.return_value, power_on=True)
 
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_delete_allocation_after_move')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_confirm_snapshot_based_resize_at_source_error_handling(
+            self, mock_add_fault, mock_delete_allocs, mock_inst_save):
+        """Tests the error handling in confirm_snapshot_based_resize_at_source
+        when a failure occurs.
+        """
+        error = test.TestingException('oops')
+        with mock.patch.object(
+                self.compute, '_confirm_snapshot_based_resize_at_source',
+                side_effect=error) as confirm_mock:
+            self.assertRaises(
+                test.TestingException,
+                self.compute.confirm_snapshot_based_resize_at_source,
+                self.context, self.instance, self.migration)
+        confirm_mock.assert_called_once_with(
+            self.context, self.instance, self.migration)
+        self.assertIn('Confirm resize failed on source host',
+                      self.stdlog.logger.output)
+        # _error_out_instance_on_exception should set the instance to ERROR
+        self.assertEqual(vm_states.ERROR, self.instance.vm_state)
+        mock_inst_save.assert_called()
+        mock_delete_allocs.assert_called_once_with(
+            self.context, self.instance, self.migration)
+        # wrap_instance_fault should record a fault
+        mock_add_fault.assert_called_once_with(
+            self.context, self.instance, error, test.MatchType(tuple))
+        # errors_out_migration should set the migration status to 'error'
+        self.assertEqual('error', self.migration.status)
+        self.migration.save.assert_called_once_with()
+        # Assert wrap_exception is called.
+        self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS))
+        self.assertEqual(
+            'compute.%s' % fields.NotificationAction.EXCEPTION,
+            fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type'])
+
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_confirm_snapshot_based_resize_at_source')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_confirm_snapshot_based_resize_at_source_delete_alloc_fails(
+            self, mock_add_fault, mock_confirm, mock_inst_save):
+        """Tests the error handling in confirm_snapshot_based_resize_at_source
+        when _delete_allocation_after_move fails.
+        """
+        error = exception.AllocationDeleteFailed(
+            consumer_uuid=self.migration.uuid,
+            error='placement down')
+        with mock.patch.object(
+                self.compute, '_delete_allocation_after_move',
+                side_effect=error) as mock_delete_allocs:
+            self.assertRaises(
+                exception.AllocationDeleteFailed,
+                self.compute.confirm_snapshot_based_resize_at_source,
+                self.context, self.instance, self.migration)
+        mock_confirm.assert_called_once_with(
+            self.context, self.instance, self.migration)
+        # _error_out_instance_on_exception should set the instance to ERROR
+        self.assertEqual(vm_states.ERROR, self.instance.vm_state)
+        mock_inst_save.assert_called()
+        mock_delete_allocs.assert_called_once_with(
+            self.context, self.instance, self.migration)
+        # wrap_instance_fault should record a fault
+        mock_add_fault.assert_called_once_with(
+            self.context, self.instance, error, test.MatchType(tuple))
+        # errors_out_migration should set the migration status to 'error'
+        self.assertEqual('error', self.migration.status)
+        self.migration.save.assert_called_once_with()
+        # Assert wrap_exception is called.
+        self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS))
+        self.assertEqual(
+            'compute.%s' % fields.NotificationAction.EXCEPTION,
+            fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type'])
+
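+    # The happy path test below asserts the full cleanup sequence on the
+    # source host: guest cleanup, port bindings, volume attachments, move
+    # claim, migration context, migration status and, finally, allocations.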
+    @mock.patch('nova.objects.Instance.get_bdms')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_delete_allocation_after_move')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_confirm_snapshot_based_resize_delete_port_bindings')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_delete_volume_attachments')
+    @mock.patch('nova.objects.Instance.drop_migration_context')
+    def test_confirm_snapshot_based_resize_at_source(
+            self, mock_drop_mig_ctx, mock_delete_vols, mock_delete_bindings,
+            mock_delete_allocs, mock_get_bdms):
+        """Happy path test for confirm_snapshot_based_resize_at_source."""
+        self.instance.old_flavor = objects.Flavor()
+        with test.nested(
+            mock.patch.object(self.compute, 'network_api'),
+            mock.patch.object(self.compute.driver, 'cleanup'),
+            mock.patch.object(self.compute.rt, 'drop_move_claim')
+        ) as (
+            mock_network_api, mock_cleanup, mock_drop_claim
+        ):
+            # Run the code.
+            self.compute.confirm_snapshot_based_resize_at_source(
+                self.context, self.instance, self.migration)
+        # Assert the mocks.
+        mock_network_api.get_instance_nw_info.assert_called_once_with(
+            self.context, self.instance)
+        # The guest was cleaned up.
+        mock_cleanup.assert_called_once_with(
+            self.context, self.instance,
+            mock_network_api.get_instance_nw_info.return_value,
+            block_device_info=None, destroy_disks=True, destroy_vifs=False)
+        # Ports and volumes were cleaned up.
+        mock_delete_bindings.assert_called_once_with(
+            self.context, self.instance, self.migration)
+        mock_delete_vols.assert_called_once_with(
+            self.context, mock_get_bdms.return_value)
+        # Move claim and migration context were dropped.
+        mock_drop_claim.assert_called_once_with(
+            self.context, self.instance, self.migration.source_node,
+            self.instance.old_flavor, prefix='old_')
+        mock_drop_mig_ctx.assert_called_once_with()
+        # The migration was updated.
+        self.assertEqual('confirmed', self.migration.status)
+        self.migration.save.assert_called_once_with()
+        mock_delete_allocs.assert_called_once_with(
+            self.context, self.instance, self.migration)
+
+    def test_confirm_snapshot_based_resize_delete_port_bindings(self):
+        """Happy path test for
+        _confirm_snapshot_based_resize_delete_port_bindings.
+        """
+
+        def stub_setup_networks_on_host(ctxt, instance, *args, **kwargs):
+            # Make sure the instance.host was mutated to point at the
+            # dest host.
+            self.assertEqual(self.migration.dest_compute, instance.host)
+
+        with mock.patch.object(
+                self.compute.network_api, 'setup_networks_on_host',
+                side_effect=stub_setup_networks_on_host) as setup_networks:
+            self.compute._confirm_snapshot_based_resize_delete_port_bindings(
+                self.context, self.instance, self.migration)
+        setup_networks.assert_called_once_with(
+            self.context, self.instance, host=self.compute.host,
+            teardown=True)
+
+    def test_confirm_snapshot_based_resize_delete_port_bindings_errors(self):
+        """Tests error handling for
+        _confirm_snapshot_based_resize_delete_port_bindings.
+        """
+        # PortBindingDeletionFailed will be caught and logged.
+        with mock.patch.object(
+                self.compute.network_api, 'setup_networks_on_host',
+                side_effect=exception.PortBindingDeletionFailed(
+                    port_id=uuids.port_id, host=self.compute.host)
+        ) as setup_networks:
+            self.compute._confirm_snapshot_based_resize_delete_port_bindings(
+                self.context, self.instance, self.migration)
+        setup_networks.assert_called_once_with(
+            self.context, self.instance, host=self.compute.host,
+            teardown=True)
+        self.assertIn('Failed to delete port bindings from source host',
+                      self.stdlog.logger.output)
+
+        # Anything else is re-raised.
+        func = (self.compute.
+                _confirm_snapshot_based_resize_delete_port_bindings)
+        with mock.patch.object(
+                self.compute.network_api, 'setup_networks_on_host',
+                side_effect=test.TestingException('neutron down')
+        ) as setup_networks:
+            self.assertRaises(test.TestingException, func,
+                              self.context, self.instance, self.migration)
+        setup_networks.assert_called_once_with(
+            self.context, self.instance, host=self.compute.host,
+            teardown=True)
+
+    def test_delete_volume_attachments(self):
+        """Happy path test for _delete_volume_attachments."""
+        bdms = objects.BlockDeviceMappingList(objects=[
+            objects.BlockDeviceMapping(attachment_id=uuids.attachment1),
+            objects.BlockDeviceMapping(attachment_id=None),  # skipped
+            objects.BlockDeviceMapping(attachment_id=uuids.attachment2)
+        ])
+        with mock.patch.object(self.compute.volume_api,
+                               'attachment_delete') as attachment_delete:
+            self.compute._delete_volume_attachments(self.context, bdms)
+        self.assertEqual(2, attachment_delete.call_count)
+        attachment_delete.assert_has_calls([
+            mock.call(self.context, uuids.attachment1),
+            mock.call(self.context, uuids.attachment2)])
+
+    def test_delete_volume_attachments_errors(self):
+        """Tests error handling for _delete_volume_attachments."""
+        bdms = objects.BlockDeviceMappingList(objects=[
+            objects.BlockDeviceMapping(attachment_id=uuids.attachment1,
+                                       instance_uuid=self.instance.uuid),
+            objects.BlockDeviceMapping(attachment_id=uuids.attachment2)
+        ])
+        # First attachment_delete call fails and is logged, second is OK.
+        errors = [test.TestingException('cinder down'), None]
+        with mock.patch.object(self.compute.volume_api,
+                               'attachment_delete',
+                               side_effect=errors) as attachment_delete:
+            self.compute._delete_volume_attachments(self.context, bdms)
+        self.assertEqual(2, attachment_delete.call_count)
+        attachment_delete.assert_has_calls([
+            mock.call(self.context, uuids.attachment1),
+            mock.call(self.context, uuids.attachment2)])
+        self.assertIn('Failed to delete volume attachment with ID %s' %
+                      uuids.attachment1, self.stdlog.logger.output)
+
 
 class ComputeManagerInstanceUsageAuditTestCase(test.TestCase):
     def setUp(self):
diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py
index 90a50443b925..370862e16ae8 100644
--- a/nova/tests/unit/compute/test_rpcapi.py
+++ b/nova/tests/unit/compute/test_rpcapi.py
@@ -601,6 +601,33 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
             request_spec=objects.RequestSpec())
         self.assertIn('Compute too old', six.text_type(ex))
 
+    def test_confirm_snapshot_based_resize_at_source(self):
+        """Tests happy path for confirm_snapshot_based_resize_at_source."""
+        self.flags(long_rpc_timeout=1234)
+        self._test_compute_api(
+            'confirm_snapshot_based_resize_at_source', 'call',
+            # compute method kwargs
+            instance=self.fake_instance_obj,
+            migration=migration_obj.Migration(source_compute='source'),
+            # client.prepare kwargs
+            version='5.8', prepare_server='source',
+            call_monitor_timeout=60, timeout=1234)
+
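+    # NOTE: call_monitor_timeout=60 in the happy path test above comes from
+    # the rpc_response_timeout option's default of 60 seconds, while
+    # long_rpc_timeout is overridden to 1234 via self.flags() to verify it
+    # is used as the overall timeout for this call.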
+    @mock.patch('nova.rpc.ClientRouter.client')
+    def test_confirm_snapshot_based_resize_at_source_old_compute(
+            self, client):
+        """Tests when the source compute service is too old to call
+        confirm_snapshot_based_resize_at_source so MigrationError is raised.
+        """
+        client.return_value.can_send_version.return_value = False
+        rpcapi = compute_rpcapi.ComputeAPI()
+        ex = self.assertRaises(
+            exception.MigrationError,
+            rpcapi.confirm_snapshot_based_resize_at_source,
+            self.context,
+            instance=self.fake_instance_obj,
+            migration=migration_obj.Migration(source_compute='source'))
+        self.assertIn('Compute too old', six.text_type(ex))
+
     def test_reboot_instance(self):
         self.maxDiff = None
         self._test_compute_api('reboot_instance', 'cast',