Move confirm resize under semaphore

The 'ResourceTracker.update_available_resource' periodic task builds
usage information for the current host by inspecting instances and
in-progress migrations, combining the two. Specifically, it finds all
instances that are not in the 'DELETED' or 'SHELVED_OFFLOADED' state,
calculates the usage from these, then finds all in-progress migrations
for the host that don't have an associated instance (to prevent double
accounting) and includes the usage for these.
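
As a rough sketch only (the function and field names below are invented
for illustration and are not the actual Nova code), the accounting
amounts to something like:

def build_host_usage(instances, migrations):
    """Illustrative sketch of the periodic task's accounting."""
    usage = {'memory_mb': 0, 'vcpus': 0}
    tracked = set()

    # Usage from every instance still occupying resources on this host.
    for inst in instances:
        if inst.vm_state in ('DELETED', 'SHELVED_OFFLOADED'):
            continue
        usage['memory_mb'] += inst.flavor.memory_mb
        usage['vcpus'] += inst.flavor.vcpus
        tracked.add(inst.uuid)

    # Usage from in-progress migrations whose instance wasn't counted
    # above, so the same resources aren't accounted for twice.
    for mig in migrations:
        if mig.instance_uuid in tracked:
            continue
        usage['memory_mb'] += mig.flavor.memory_mb
        usage['vcpus'] += mig.flavor.vcpus

    return usage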

In addition to the periodic task, the 'ResourceTracker' class has a
number of helper functions to make or drop claims for the inventory
generated by the 'update_available_resource' periodic task as part of
the various instance operations. These helpers naturally assume that
when making a claim for a particular instance or migration, there
shouldn't already be resources allocated for it. Conversely, when
dropping claims, the resources should currently be allocated. However,
the check for *active* instances and *in-progress* migrations in the
periodic task means we have to be careful in how we make changes to a
given instance or migration record. Running the periodic task between
such an operation and an attempt to make or drop a claim can result in
TOCTOU-like races.

This generally isn't an issue: we use the 'COMPUTE_RESOURCE_SEMAPHORE'
semaphore to prevent the periodic task running while we're claiming
resources in helpers like 'ResourceTracker.instance_claim' and we make
our changes to the instances and migrations within this context. There
is one exception though: the 'drop_move_claim' helper. This function is
used when dropping a claim for either a cold migration, a resize or a
live migration, and will drop usage from either the source host (based
on the "old" flavor) for a resize confirm or the destination host (based
on the "new" flavor) for a resize revert or live migration rollback.
Unfortunately, while the function itself is wrapped in the semaphore, no
changes to the state of the instance or migration in question are
protected by it.
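
Abridged from the same-cell confirm path changed below, the problematic
shape looks roughly like this: the migration record is updated outside
the lock, and only the usage drop itself takes it internally.

# Pre-change ordering (abridged); only drop_move_claim takes
# COMPUTE_RESOURCE_SEMAPHORE internally, the surrounding updates are
# unprotected.
migration.status = 'confirmed'
migration.save()
# <-- the update_available_resource periodic task can run here; it skips
#     the now-'confirmed' migration, so the source usage has already
#     vanished from its accounting.
self.rt.drop_move_claim(
    context, instance, migration.source_node, instance.old_flavor,
    prefix='old_')  # drops usage the tracker is no longer counting
instance.drop_migration_context()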

Consider the confirm resize case, which we're addressing here. If we
mark the migration as 'confirmed' before running 'drop_move_claim', then
the periodic task running between these steps will not account for the
usage on the source since the migration is allegedly 'confirmed'. The
call to 'drop_move_claim' will then result in the tracker dropping usage
that we're no longer accounting for. This "set migration status before
dropping usage" ordering is the current behaviour for both same-cell and
cross-cell resize, via the 'ComputeManager.confirm_resize' and
'ComputeManager.confirm_snapshot_based_resize_at_source' functions,
respectively. We could reverse those calls and run 'drop_move_claim'
before marking the migration as 'confirmed', but while our usage will be
momentarily correct, the periodic task running between these steps will
re-add the usage we just dropped since the migration isn't yet
'confirmed'. The correct solution is to eliminate the window between
setting the migration status and dropping the move claim. We do this by
putting both operations behind the 'COMPUTE_RESOURCE_SEMAPHORE', just
like the claim operations.
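
The resulting helper (abridged from the 'ResourceTracker' change below)
performs both steps while holding the lock, so the periodic task sees
either the usage still claimed or the migration already confirmed with
the usage dropped, never the in-between state:

@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def drop_move_claim_at_source(self, context, instance, migration):
    """Drop a move claim after confirming a resize or cold migration."""
    # The status change and the usage drop are atomic with respect to
    # the update_available_resource periodic task, which takes the same
    # lock.
    migration.status = 'confirmed'
    migration.save()
    self._drop_move_claim(
        context, instance, migration.source_node, instance.old_flavor,
        prefix='old_')
    instance.drop_migration_context()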

Change-Id: I26b050c402f5721fc490126e9becb643af9279b4
Signed-off-by: Stephen Finucane <stephenfin@redhat.com>
Partial-Bug: #1879878
Author: Stephen Finucane
Date: 2020-08-21 16:54:16 +01:00
Commit: a57800d382 (parent: 779fd5ea3b)
7 changed files with 56 additions and 63 deletions

@@ -4359,6 +4359,8 @@ class ComputeManager(manager.Manager):
# NOTE(tr3buchet): tear down networks on source host
self.network_api.setup_networks_on_host(context, instance,
migration.source_compute, teardown=True)
# TODO(stephenfin): These next three calls should be bundled
network_info = self.network_api.get_instance_nw_info(context,
instance)
@@ -4372,17 +4374,8 @@ class ComputeManager(manager.Manager):
self.driver.confirm_migration(context, migration, instance,
network_info)
migration.status = 'confirmed'
migration.save()
# NOTE(mriedem): drop_move_claim relies on
# instance.migration_context so make sure to not call
# instance.drop_migration_context() until after drop_move_claim
# is called.
self.rt.drop_move_claim(
context, instance, migration.source_node, instance.old_flavor,
prefix='old_')
instance.drop_migration_context()
# Free up the old_flavor usage from the resource tracker for this host.
self.rt.drop_move_claim_at_source(context, instance, migration)
# NOTE(mriedem): The old_vm_state could be STOPPED but the user
# might have manually powered up the instance to confirm the
@@ -4540,13 +4533,7 @@ class ComputeManager(manager.Manager):
self._delete_volume_attachments(ctxt, instance.get_bdms())
# Free up the old_flavor usage from the resource tracker for this host.
self.rt.drop_move_claim(
ctxt, instance, migration.source_node, instance.old_flavor,
prefix='old_')
instance.drop_migration_context()
migration.status = 'confirmed'
migration.save()
self.rt.drop_move_claim_at_source(ctxt, instance, migration)
def _confirm_snapshot_based_resize_delete_port_bindings(
self, ctxt, instance):

@@ -551,9 +551,31 @@ class ResourceTracker(object):
dev_pools_obj = self.pci_tracker.stats.to_device_pools_obj()
self.compute_nodes[nodename].pci_device_pools = dev_pools_obj
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def drop_move_claim_at_source(self, context, instance, migration):
"""Drop a move claim after confirming a resize or cold migration."""
migration.status = 'confirmed'
migration.save()
self._drop_move_claim(
context, instance, migration.source_node, instance.old_flavor,
prefix='old_')
# NOTE(stephenfin): Unsetting this is unnecessary for cross-cell
# resize, since the source and dest instance objects are different and
# the source instance will be deleted soon. It's easier to just do it
# though.
instance.drop_migration_context()
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def drop_move_claim(self, context, instance, nodename,
instance_type=None, prefix='new_'):
self._drop_move_claim(
context, instance, nodename, instance_type, prefix='new_')
def _drop_move_claim(
self, context, instance, nodename, instance_type=None, prefix='new_',
):
"""Remove usage for an incoming/outgoing migration.
:param context: Security context.

@@ -981,7 +981,7 @@ class ConfirmResizeTask(base.TaskBase):
"""
LOG.debug('Updating migration and instance status in target cell DB.',
instance=self.instance)
# Complete the migration confirmation.
# Update the target cell migration.
self.migration.status = 'confirmed'
self.migration.save()
# Update the target cell instance.

@@ -61,7 +61,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
self.assertUsage(src_host, 1)
self.assertUsage(dst_host, 0)
orig_drop_claim = rt.ResourceTracker.drop_move_claim
orig_drop_claim = rt.ResourceTracker.drop_move_claim_at_source
def fake_drop_move_claim(*args, **kwargs):
# run periodics after marking the migration confirmed, simulating a
@@ -74,15 +74,14 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
if drop_race:
self._run_periodics()
# FIXME(stephenfin): the periodic should not have dropped the
# records for the src yet
self.assertUsage(src_host, 0)
self.assertUsage(dst_host, 1)
self.assertUsage(src_host, 1)
self.assertUsage(dst_host, 1)
return orig_drop_claim(*args, **kwargs)
self.stub_out(
'nova.compute.resource_tracker.ResourceTracker.drop_move_claim',
'nova.compute.resource_tracker.ResourceTracker.'
'drop_move_claim_at_source',
fake_drop_move_claim,
)
@@ -98,10 +97,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase):
# migration is now confirmed so we should once again only have usage on
# one host
# FIXME(stephenfin): Our usage here should be 0 and 1 for source and
# dest respectively when confirming, but that won't happen until we run
# the periodic and rebuild our inventory from scratch
self.assertUsage(src_host, -1 if drop_race else 0)
self.assertUsage(src_host, 0)
self.assertUsage(dst_host, 1)
# running periodics shouldn't change things

@@ -8141,14 +8141,10 @@ class ComputeTestCase(BaseTestCase,
instance.new_flavor = new_type
instance.migration_context = objects.MigrationContext()
def fake_drop_move_claim(*args, **kwargs):
pass
def fake_setup_networks_on_host(self, *args, **kwargs):
pass
self._mock_rt(
drop_move_claim=mock.Mock(side_effect=fake_drop_move_claim))
self._mock_rt()
with test.nested(
mock.patch.object(self.compute.network_api,

@@ -8777,14 +8777,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
mock_mig_save, mock_mig_get, mock_inst_get,
mock_delete_scheduler_info):
def fake_drop_move_claim(*args, **kwargs):
# RT.drop_move_claim must be called before
# instance.drop_migration_context.
mock_drop.assert_not_called()
mock_rt = self._mock_rt()
# Enforce order of drop_move_claim/drop_migration_context calls.
mock_rt.drop_move_claim.side_effect = fake_drop_move_claim
self._mock_rt()
self.instance.migration_context = objects.MigrationContext(
new_pci_devices=None,
old_pci_devices=None)
@@ -11569,22 +11562,22 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
'_confirm_snapshot_based_resize_delete_port_bindings')
@mock.patch('nova.compute.manager.ComputeManager.'
'_delete_volume_attachments')
@mock.patch('nova.objects.Instance.drop_migration_context')
def test_confirm_snapshot_based_resize_at_source(
self, mock_drop_mig_ctx, mock_delete_vols, mock_delete_bindings,
self, mock_delete_vols, mock_delete_bindings,
mock_delete_allocs, mock_get_bdms):
"""Happy path test for confirm_snapshot_based_resize_at_source."""
self.instance.old_flavor = objects.Flavor()
with test.nested(
mock.patch.object(self.compute, 'network_api'),
mock.patch.object(self.compute.driver, 'cleanup'),
mock.patch.object(self.compute.rt, 'drop_move_claim')
mock.patch.object(self.compute, 'network_api'),
mock.patch.object(self.compute.driver, 'cleanup'),
mock.patch.object(self.compute.rt, 'drop_move_claim_at_source')
) as (
mock_network_api, mock_cleanup, mock_drop_claim
mock_network_api, mock_cleanup, mock_drop_claim,
):
# Run the code.
self.compute.confirm_snapshot_based_resize_at_source(
self.context, self.instance, self.migration)
# Assert the mocks.
mock_network_api.get_instance_nw_info.assert_called_once_with(
self.context, self.instance)
@@ -11600,12 +11593,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.context, mock_get_bdms.return_value)
# Move claim and migration context were dropped.
mock_drop_claim.assert_called_once_with(
self.context, self.instance, self.migration.source_node,
self.instance.old_flavor, prefix='old_')
mock_drop_mig_ctx.assert_called_once_with()
# The migration was updated.
self.assertEqual('confirmed', self.migration.status)
self.migration.save.assert_called_once_with()
self.context, self.instance, self.migration)
mock_delete_allocs.assert_called_once_with(
self.context, self.instance, self.migration)

@@ -2541,6 +2541,7 @@ class TestResize(BaseTestCase):
mig_context_obj.new_resources = objects.ResourceList(
objects=[self.resource_1, self.resource_2])
instance.migration_context = mig_context_obj
instance.system_metadata = {}
migration = objects.Migration(
id=3,
@@ -2582,15 +2583,18 @@ class TestResize(BaseTestCase):
len(self.rt.assigned_resources[cn.uuid][rc]))
# Confirm or revert resize
if revert:
flavor = new_flavor
prefix = 'new_'
else:
flavor = old_flavor
prefix = 'old_'
self.rt.drop_move_claim(ctx, instance, _NODENAME, flavor,
prefix=prefix)
with test.nested(
mock.patch('nova.objects.Migration.save'),
mock.patch('nova.objects.Instance.drop_migration_context'),
):
if revert:
flavor = new_flavor
prefix = 'new_'
self.rt.drop_move_claim(
ctx, instance, _NODENAME, flavor, prefix=prefix)
else: # confirm
flavor = old_flavor
self.rt.drop_move_claim_at_source(ctx, instance, migration)
expected = compute_update_usage(expected, flavor, sign=-1)
self.assertTrue(obj_base.obj_equal_prims(