Delete allocations even if _confirm_resize raises

When we are confirming a resize, the guest is on the dest
host and the instance host/node values in the database
point at the dest host, so the _confirm_resize method on
the source is really just best-effort cleanup. If something
fails, we should not leak allocations in placement for the
source compute node resource provider, since the instance
is not actually consuming the source node provider's
resources.

This change refactors the error handling around the
_confirm_resize call: the big nesting for
_error_out_instance_on_exception is moved up into
confirm_resize, and a try/finally is added around
_confirm_resize so we are sure to attempt the allocation
cleanup even if _confirm_resize fails in some obscure way.
If _confirm_resize does fail, the error is re-raised along
with a logged traceback and a hint about how to correct the
instance state in the DB by hard rebooting the server on
the dest host.

Change-Id: I29c5f491ec20a71283190a1599e7732541de736f
Closes-Bug: #1821594
Author: Matt Riedemann
Date:   2019-03-25 14:02:17 -04:00
Commit: 03a6d26691
Parent: 28944d05f3

2 changed files with 82 additions and 52 deletions
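The shape of the change can be reduced to a small standalone sketch: log and re-raise any failure from the source-host cleanup, but always run the allocation cleanup in a finally block. The names below (confirm_source_host, delete_allocations) are illustrative stand-ins for the real Nova methods, and plain logging is used in place of oslo's excutils.save_and_reraise_exception.

import logging

LOG = logging.getLogger(__name__)


def confirm_resize(confirm_source_host, delete_allocations):
    """Illustrative sketch of the error handling introduced below.

    confirm_source_host stands in for _confirm_resize and
    delete_allocations for _delete_allocation_after_move.
    """
    try:
        confirm_source_host()
    except Exception:
        # Equivalent in spirit to excutils.save_and_reraise_exception:
        # log the traceback, then let the original exception propagate.
        LOG.exception('Confirm resize failed on the source host; '
                      'allocations will be removed regardless.')
        raise
    finally:
        # Runs on success and on failure, so allocations against the
        # source compute node are never leaked.
        delete_allocations()

For example, confirm_resize(lambda: 1 / 0, cleanup) still invokes cleanup before the ZeroDivisionError propagates to the caller.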


@@ -3973,7 +3973,29 @@ class ComputeManager(manager.Manager):
                          instance=instance)
                 return
-            self._confirm_resize(context, instance, migration=migration)
+            with self._error_out_instance_on_exception(context, instance):
+                try:
+                    self._confirm_resize(
+                        context, instance, migration=migration)
+                except Exception:
+                    # Something failed when cleaning up the source host so
+                    # log a traceback and leave a hint about hard rebooting
+                    # the server to correct its state in the DB.
+                    with excutils.save_and_reraise_exception(logger=LOG):
+                        LOG.exception(
+                            'Confirm resize failed on source host %s. '
+                            'Resource allocations in the placement service '
+                            'will be removed regardless because the instance '
+                            'is now on the destination host %s. You can try '
+                            'hard rebooting the instance to correct its '
+                            'state.', self.host, migration.dest_compute,
+                            instance=instance)
+                finally:
+                    # Whether an error occurred or not, at this point the
+                    # instance is on the dest host so to avoid leaking
+                    # allocations in placement, delete them here.
+                    self._delete_allocation_after_move(
+                        context, instance, migration)
 
         do_confirm_resize(context, instance, migration.id)
@@ -3985,59 +4007,57 @@ class ComputeManager(manager.Manager):
                self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
                phase=fields.NotificationPhase.START)
 
-        with self._error_out_instance_on_exception(context, instance):
-            # NOTE(danms): delete stashed migration information
-            old_instance_type = instance.old_flavor
-            instance.old_flavor = None
-            instance.new_flavor = None
-            instance.system_metadata.pop('old_vm_state', None)
-            instance.save()
+        # NOTE(danms): delete stashed migration information
+        old_instance_type = instance.old_flavor
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
 
-            # NOTE(tr3buchet): tear down networks on source host
-            self.network_api.setup_networks_on_host(context, instance,
-                               migration.source_compute, teardown=True)
+        # NOTE(tr3buchet): tear down networks on source host
+        self.network_api.setup_networks_on_host(context, instance,
+                           migration.source_compute, teardown=True)
 
-            network_info = self.network_api.get_instance_nw_info(context,
-                                                                 instance)
+        network_info = self.network_api.get_instance_nw_info(context,
+                                                             instance)
 
-            # TODO(mriedem): Get BDMs here and pass them to the driver.
-            self.driver.confirm_migration(context, migration, instance,
-                                          network_info)
+        # TODO(mriedem): Get BDMs here and pass them to the driver.
+        self.driver.confirm_migration(context, migration, instance,
+                                      network_info)
 
-            migration.status = 'confirmed'
-            migration.save()
+        migration.status = 'confirmed'
+        migration.save()
 
-            self.rt.drop_move_claim(context, instance, migration.source_node,
-                                    old_instance_type, prefix='old_')
-            self._delete_allocation_after_move(context, instance, migration)
-            instance.drop_migration_context()
+        self.rt.drop_move_claim(context, instance, migration.source_node,
+                                old_instance_type, prefix='old_')
+        instance.drop_migration_context()
 
-            # NOTE(mriedem): The old_vm_state could be STOPPED but the user
-            # might have manually powered up the instance to confirm the
-            # resize/migrate, so we need to check the current power state
-            # on the instance and set the vm_state appropriately. We default
-            # to ACTIVE because if the power state is not SHUTDOWN, we
-            # assume _sync_instance_power_state will clean it up.
-            p_state = instance.power_state
-            vm_state = None
-            if p_state == power_state.SHUTDOWN:
-                vm_state = vm_states.STOPPED
-                LOG.debug("Resized/migrated instance is powered off. "
-                          "Setting vm_state to '%s'.", vm_state,
-                          instance=instance)
-            else:
-                vm_state = vm_states.ACTIVE
+        # NOTE(mriedem): The old_vm_state could be STOPPED but the user
+        # might have manually powered up the instance to confirm the
+        # resize/migrate, so we need to check the current power state
+        # on the instance and set the vm_state appropriately. We default
+        # to ACTIVE because if the power state is not SHUTDOWN, we
+        # assume _sync_instance_power_state will clean it up.
+        p_state = instance.power_state
+        vm_state = None
+        if p_state == power_state.SHUTDOWN:
+            vm_state = vm_states.STOPPED
+            LOG.debug("Resized/migrated instance is powered off. "
+                      "Setting vm_state to '%s'.", vm_state,
+                      instance=instance)
+        else:
+            vm_state = vm_states.ACTIVE
 
-            instance.vm_state = vm_state
-            instance.task_state = None
-            instance.save(expected_task_state=[None, task_states.DELETING,
-                                               task_states.SOFT_DELETING])
+        instance.vm_state = vm_state
+        instance.task_state = None
+        instance.save(expected_task_state=[None, task_states.DELETING,
+                                           task_states.SOFT_DELETING])
 
-            self._notify_about_instance_usage(
-                context, instance, "resize.confirm.end",
-                network_info=network_info)
-            compute_utils.notify_about_instance_action(context, instance,
-                   self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
-                   phase=fields.NotificationPhase.END)
+        self._notify_about_instance_usage(
+            context, instance, "resize.confirm.end",
+            network_info=network_info)
+        compute_utils.notify_about_instance_action(context, instance,
+               self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
+               phase=fields.NotificationPhase.END)
 
     def _delete_allocation_after_move(self, context, instance, migration):
         """Deletes resource allocations held by the migration record against


@@ -7183,6 +7183,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         do_finish_revert_resize()
 
     def test_confirm_resize_deletes_allocations(self):
+        @mock.patch('nova.objects.Instance.get_by_uuid')
+        @mock.patch('nova.objects.Migration.get_by_id')
         @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
@@ -7192,13 +7194,16 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+                              mock_mig_save, mock_mig_get, mock_inst_get):
             self._mock_rt()
             self.instance.migration_context = objects.MigrationContext()
             self.migration.source_compute = self.instance['host']
             self.migration.source_node = self.instance['node']
-            self.compute._confirm_resize(self.context, self.instance,
-                                         self.migration)
+            self.migration.status = 'confirming'
+            mock_mig_get.return_value = self.migration
+            mock_inst_get.return_value = self.instance
+            self.compute.confirm_resize(self.context, self.instance,
+                                        self.migration)
             mock_delete.assert_called_once_with(self.context, self.instance,
                                                 self.migration)
             mock_save.assert_called_with(expected_task_state=
@@ -7229,9 +7234,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         with test.nested(
             mock.patch.object(self.compute, 'network_api'),
             mock.patch.object(self.compute.driver, 'confirm_migration',
-                              side_effect=error)
+                              side_effect=error),
+            mock.patch.object(self.compute, '_delete_allocation_after_move')
         ) as (
-            network_api, confirm_migration
+            network_api, confirm_migration, delete_allocation
         ):
             self.assertRaises(exception.HypervisorUnavailable,
                               self.compute.confirm_resize,
@@ -7245,6 +7251,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         self.assertEqual(2, instance_save.call_count)
         # The migration.status should have been saved.
         self.migration.save.assert_called_once_with()
+        # Allocations should always be cleaned up even if cleaning up the
+        # source host fails.
+        delete_allocation.assert_called_once_with(
+            self.context, self.instance, self.migration)
         # Assert other mocks we care less about.
         notify_usage.assert_called_once()
         notify_action.assert_called_once()
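The assertion pattern in the second test above can also be reproduced outside Nova with plain unittest.mock: patch the cleanup step, force the confirm step to raise, and check that the cleanup mock was still called. The class and method names below are illustrative, not the Nova test code.

import unittest
from unittest import mock


class FakeManager:
    """Minimal stand-in with the same try/finally shape as confirm_resize."""

    def _confirm(self):
        pass

    def _delete_allocations(self):
        pass

    def confirm_resize(self):
        try:
            self._confirm()
        finally:
            self._delete_allocations()


class ConfirmResizeCleanupTest(unittest.TestCase):
    def test_allocations_deleted_even_on_failure(self):
        manager = FakeManager()
        with mock.patch.object(manager, '_confirm',
                               side_effect=RuntimeError('boom')), \
                mock.patch.object(manager, '_delete_allocations') as delete:
            self.assertRaises(RuntimeError, manager.confirm_resize)
        # The cleanup must have run even though _confirm raised.
        delete.assert_called_once_with()


if __name__ == '__main__':
    unittest.main()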