Move revert resize under semaphore

As discussed in change I26b050c402f5721fc490126e9becb643af9279b4, the
resource tracker's periodic task is reliant on the status of migrations
to determine whether to include usage from these migrations in the
total, and races between setting the migration status and decrementing
resource usage via 'drop_move_claim' can result in incorrect usage.
That change tackled the confirm resize operation. This one changes the
revert resize operation, which is a little trickier due to quirks in how
both the same-cell and cross-cell resize revert operations work.
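
To make the race concrete, here is a minimal toy model of it. This is
not nova code: the class, the numbers and the dict-based migration are
all invented for illustration, and the "periodic task" is invoked by
hand at the racy point rather than running on a timer.

    import threading

    COMPUTE_RESOURCE_SEMAPHORE = threading.Lock()

    class ToyResourceTracker:
        """Toy model of the race; every name and number is illustrative."""

        def __init__(self):
            # Usage currently held on this (destination) host by the move claim.
            self.usage = 1

        def update_available_resource(self, migration):
            """Periodic task: rebuild usage from the migration's status."""
            with COMPUTE_RESOURCE_SEMAPHORE:
                # A migration that reads as 'reverted' no longer counts here.
                self.usage = 0 if migration['status'] == 'reverted' else 1

        def racy_revert(self, migration):
            """Old ordering: the status flip is visible before the claim drop."""
            migration['status'] = 'reverted'
            # Simulate the periodic task firing between the two steps...
            self.update_available_resource(migration)
            with COMPUTE_RESOURCE_SEMAPHORE:
                self.usage -= 1  # ...so the claim drop double-accounts the usage

    rt = ToyResourceTracker()
    rt.racy_revert({'status': 'migrating'})
    print(rt.usage)  # -1: the usage was corrected twice, which is the bug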

For same-cell resize revert, the 'ComputeManager.revert_resize'
function, running on the destination host, sets the migration status to
'reverted' before dropping the move claim. This exposes the same race
that we previously saw with the confirm resize operation. It then calls
back to 'ComputeManager.finish_revert_resize' on the source host to boot
up the instance itself. This is kind of weird because, even ignoring
the race, we're marking the migration as 'reverted' before we've done
any of the necessary work on the source host.

The cross-cell resize revert splits dropping of the move claim and
setting of the migration status between the source and destination host
tasks. Specifically, we do cleanup on the destination and drop the move
claim first, via 'ComputeManager.revert_snapshot_based_resize_at_dest'
before resuming the instance and setting the migration status on the
source via
'ComputeManager.finish_revert_snapshot_based_resize_at_source'. This
would appear to avoid the quirk of the same-cell case; however, in
typical cross-cell fashion, the source and destination are actually
working with different instance and migration records.

The solution is once again to move the setting of the migration status
and the dropping of the claim under 'COMPUTE_RESOURCE_SEMAPHORE'. This
introduces the quirk of setting the migration status before the revert
has fully completed to the cross-cell resize case and perpetuates it in
the same-cell case, but this seems like a suitable compromise to avoid
attempts to do things like unplugging already-unplugged PCI devices or
unpinning already-unpinned CPUs. From an end-user perspective, instance
state changes are what really matter, and once a revert has completed on
the destination host and the instance has been marked as having returned
to the source host, a hard reboot can resolve any remaining issues.
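
For contrast, here is the same toy model with the fix applied: the
status flip and the claim drop share one critical section, mirroring
the new 'drop_move_claim_at_dest' helper in the diff below (again, all
names and numbers are illustrative, not nova's real objects).

    import threading

    COMPUTE_RESOURCE_SEMAPHORE = threading.Lock()

    class ToyResourceTracker:
        """Toy counterpart to the racy example above; names are illustrative."""

        def __init__(self):
            self.usage = 1  # usage held by the move claim on the destination

        def update_available_resource(self, migration):
            """Periodic task: rebuild usage from the migration's status."""
            with COMPUTE_RESOURCE_SEMAPHORE:
                self.usage = 0 if migration['status'] == 'reverted' else 1

        def drop_move_claim_at_dest(self, migration):
            """Flip the status and drop the claim in one critical section."""
            with COMPUTE_RESOURCE_SEMAPHORE:
                migration['status'] = 'reverted'
                self.usage -= 1

    rt = ToyResourceTracker()
    migration = {'status': 'migrating'}
    rt.drop_move_claim_at_dest(migration)
    rt.update_available_resource(migration)  # sees both changes or neither
    print(rt.usage)  # 0: no double accounting, whenever the periodic runs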

Change-Id: I29d6f4a78c0206385a550967ce244794e71cef6d
Signed-off-by: Stephen Finucane <stephenfin@redhat.com>
Closes-Bug: #1879878

@@ -4673,10 +4673,7 @@ class ComputeManager(manager.Manager):
self._delete_volume_attachments(ctxt, bdms)
# Free up the new_flavor usage from the resource tracker for this host.
instance.revert_migration_context()
instance.save(expected_task_state=task_states.RESIZE_REVERTING)
self.rt.drop_move_claim(ctxt, instance, instance.node,
instance_type=instance.new_flavor)
self.rt.drop_move_claim_at_dest(ctxt, instance, migration)
def _revert_instance_flavor_host_node(self, instance, migration):
"""Revert host, node and flavor fields after a resize-revert."""
@@ -4873,20 +4870,9 @@
self._terminate_volume_connections(context, instance, bdms)
migration.status = 'reverted'
migration.save()
# NOTE(ndipanov): We need to do this here because dropping the
# claim means we lose the migration_context data. We really should
# fix this by moving the drop_move_claim call to the
# finish_revert_resize method as this is racy (revert is dropped,
# but instance resources will be tracked with the new flavor until
# it gets rolled back in finish_revert_resize, which is
# potentially wrong for a period of time).
instance.revert_migration_context()
instance.save()
self.rt.drop_move_claim(context, instance, instance.node)
# Free up the new_flavor usage from the resource tracker for this
# host.
self.rt.drop_move_claim_at_dest(context, instance, migration)
# RPC cast back to the source host to finish the revert there.
self.compute_rpcapi.finish_revert_resize(context, instance,

@@ -567,6 +567,24 @@ class ResourceTracker(object):
# though.
instance.drop_migration_context()
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def drop_move_claim_at_dest(self, context, instance, migration):
"""Drop a move claim after reverting a resize or cold migration."""
# NOTE(stephenfin): This runs on the destination, before we return to
# the source and resume the instance there. As such, the migration
# isn't really reverted yet, but this status is what we use to
# indicate that we no longer need to account for usage on this host.
migration.status = 'reverted'
migration.save()
self._drop_move_claim(
context, instance, migration.dest_node, instance.new_flavor,
prefix='new_')
instance.revert_migration_context()
instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def drop_move_claim(self, context, instance, nodename,
instance_type=None, prefix='new_'):

@@ -200,15 +200,22 @@ class InstanceHelperMixin:
api = getattr(self, 'admin_api', self.api)
statuses = [status.lower() for status in expected_statuses]
actual_status = None
for attempt in range(10):
migrations = api.api_get('/os-migrations').body['migrations']
for migration in migrations:
if (migration['instance_uuid'] == server['id'] and
migration['status'].lower() in statuses):
if migration['instance_uuid'] == server['id']:
actual_status = migration['status']
if migration['status'].lower() in statuses:
return migration
time.sleep(0.5)
self.fail('Timed out waiting for migration with status "%s" for '
'instance: %s' % (expected_statuses, server['id']))
self.fail(
'Timed out waiting for migration with status for instance %s '
'(expected "%s", got "%s")' % (
server['id'], expected_statuses, actual_status,
))
def _wait_for_log(self, log_line):
for i in range(10):

@@ -114,7 +114,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
self.assertUsage(src_host, 1)
self.assertUsage(dst_host, 0)
orig_drop_claim = rt.ResourceTracker.drop_move_claim
orig_drop_claim = rt.ResourceTracker.drop_move_claim_at_dest
def fake_drop_move_claim(*args, **kwargs):
# run periodics after marking the migration reverted, simulating a
@@ -127,15 +127,14 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
if drop_race:
self._run_periodics()
# FIXME(stephenfin): the periodic should not have dropped the
# records for the src
self.assertUsage(src_host, 0)
self.assertUsage(src_host, 1)
self.assertUsage(dst_host, 1)
return orig_drop_claim(*args, **kwargs)
self.stub_out(
'nova.compute.resource_tracker.ResourceTracker.drop_move_claim',
'nova.compute.resource_tracker.ResourceTracker.'
'drop_move_claim_at_dest',
fake_drop_move_claim,
)
@@ -151,11 +150,8 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
# migration is now reverted so we should once again only have usage on
# one host
# FIXME(stephenfin): Our usage here should always be 1 and 0 for source
# and dest respectively when reverting, but that won't happen until we
# run the periodic and rebuild our inventory from scratch
self.assertUsage(src_host, 0 if drop_race else 1)
self.assertUsage(dst_host, 1 if drop_race else 0)
self.assertUsage(src_host, 1)
self.assertUsage(dst_host, 0)
# running periodics shouldn't change things
self._run_periodics()

@@ -5771,6 +5771,7 @@ class ComputeTestCase(BaseTestCase,
migration.status = 'finished'
migration.migration_type = 'migration'
migration.source_node = NODENAME
migration.dest_node = NODENAME
migration.create()
migration_context = objects.MigrationContext()

@@ -8583,7 +8583,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
request_spec = objects.RequestSpec()
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'drop_move_claim')
'drop_move_claim_at_dest')
@mock.patch('nova.compute.rpcapi.ComputeAPI.finish_revert_resize')
@mock.patch.object(self.instance, 'revert_migration_context')
@mock.patch.object(self.compute.network_api, 'get_instance_nw_info')
@@ -8621,9 +8621,9 @@
migration=self.migration,
instance=self.instance,
request_spec=request_spec)
mock_drop_move_claim.assert_called_once_with(self.context,
self.instance, self.instance.node)
self.assertIsNotNone(self.instance.migration_context)
mock_drop_move_claim.assert_called_once_with(
self.context, self.instance, self.migration)
# Three fake BDMs:
# 1. volume BDM with an attachment_id which will be updated/completed
@@ -11754,11 +11754,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
# Assert _error_out_instance_on_exception wasn't tripped somehow.
self.assertNotEqual(vm_states.ERROR, self.instance.vm_state)
@mock.patch('nova.objects.Instance.save')
@mock.patch('nova.objects.Instance.revert_migration_context')
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
def test_revert_snapshot_based_resize_at_dest(
self, mock_get_bdms, mock_revert_mig_ctxt, mock_inst_save):
def test_revert_snapshot_based_resize_at_dest(self, mock_get_bdms):
"""Happy path test for _revert_snapshot_based_resize_at_dest"""
# Setup more mocks.
def stub_migrate_instance_start(ctxt, instance, migration):
@@ -11771,7 +11768,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
mock.patch.object(self.compute, '_get_instance_block_device_info'),
mock.patch.object(self.compute.driver, 'destroy'),
mock.patch.object(self.compute, '_delete_volume_attachments'),
mock.patch.object(self.compute.rt, 'drop_move_claim')
mock.patch.object(self.compute.rt, 'drop_move_claim_at_dest')
) as (
mock_network_api, mock_get_bdi, mock_destroy,
mock_delete_attachments, mock_drop_claim
@@ -11786,6 +11783,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
# Run the code.
self.compute._revert_snapshot_based_resize_at_dest(
self.context, self.instance, self.migration)
# Assert the calls.
mock_network_api.get_instance_nw_info.assert_called_once_with(
self.context, self.instance)
@@ -11806,12 +11804,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.stdlog.logger.output)
mock_delete_attachments.assert_called_once_with(
self.context, mock_get_bdms.return_value)
mock_revert_mig_ctxt.assert_called_once_with()
mock_inst_save.assert_called_once_with(
expected_task_state=task_states.RESIZE_REVERTING)
mock_drop_claim.assert_called_once_with(
self.context, self.instance, self.instance.node,
instance_type=self.instance.new_flavor)
self.context, self.instance, self.migration)
@mock.patch('nova.compute.manager.ComputeManager.'
'_finish_revert_snapshot_based_resize_at_source')
@@ -11910,9 +11904,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
# Run the code.
self.compute.finish_revert_snapshot_based_resize_at_source(
self.context, self.instance, self.migration)
# Assert the migration status was updated.
self.migration.save.assert_called_once_with()
self.assertEqual('reverted', self.migration.status)
# Assert the instance host/node and flavor info was updated.
self.assertEqual(self.migration.source_compute, self.instance.host)
self.assertEqual(self.migration.source_node, self.instance.node)

@@ -2586,12 +2586,11 @@ class TestResize(BaseTestCase):
with test.nested(
mock.patch('nova.objects.Migration.save'),
mock.patch('nova.objects.Instance.drop_migration_context'),
mock.patch('nova.objects.Instance.save'),
):
if revert:
flavor = new_flavor
prefix = 'new_'
self.rt.drop_move_claim(
ctx, instance, _NODENAME, flavor, prefix=prefix)
self.rt.drop_move_claim_at_dest(ctx, instance, migration)
else: # confirm
flavor = old_flavor
self.rt.drop_move_claim_at_source(ctx, instance, migration)
@@ -2758,32 +2757,40 @@ class TestResize(BaseTestCase):
instance.migration_context.new_pci_devices = objects.PciDeviceList(
objects=pci_devs)
# When reverting a resize and dropping the move claim, the
# destination compute calls drop_move_claim to drop the new_flavor
# When reverting a resize and dropping the move claim, the destination
# compute calls drop_move_claim_at_dest to drop the new_flavor
# usage and the instance should be in tracked_migrations from when
# the resize_claim was made on the dest during prep_resize.
self.rt.tracked_migrations = {
instance.uuid: objects.Migration(migration_type='resize')}
migration = objects.Migration(
dest_node=cn.hypervisor_hostname,
migration_type='resize',
)
self.rt.tracked_migrations = {instance.uuid: migration}
# not using mock.sentinel.ctx because drop_move_claim calls elevated
# not using mock.sentinel.ctx because _drop_move_claim calls elevated
ctx = mock.MagicMock()
with test.nested(
mock.patch.object(self.rt, '_update'),
mock.patch.object(self.rt.pci_tracker, 'free_device'),
mock.patch.object(self.rt, '_get_usage_dict'),
mock.patch.object(self.rt, '_update_usage')
mock.patch.object(self.rt, '_update_usage'),
mock.patch.object(migration, 'save'),
mock.patch.object(instance, 'save'),
) as (
update_mock, mock_pci_free_device, mock_get_usage,
mock_update_usage,
mock_update_usage, mock_migrate_save, mock_instance_save,
):
self.rt.drop_move_claim(ctx, instance, _NODENAME)
self.rt.drop_move_claim_at_dest(ctx, instance, migration)
mock_pci_free_device.assert_called_once_with(
pci_dev, mock.ANY)
mock_get_usage.assert_called_once_with(
instance.new_flavor, instance, numa_topology=None)
mock_update_usage.assert_called_once_with(
mock_get_usage.return_value, _NODENAME, sign=-1)
mock_migrate_save.assert_called_once()
mock_instance_save.assert_called_once()
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'_sync_compute_service_disabled_trait', new=mock.Mock())