From dc9c7a5ebf11253f86127238d33dff7401465155 Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Fri, 21 Aug 2020 17:43:36 +0100 Subject: [PATCH] Move revert resize under semaphore As discussed in change I26b050c402f5721fc490126e9becb643af9279b4, the resource tracker's periodic task is reliant on the status of migrations to determine whether to include usage from these migrations in the total, and races between setting the migration status and decrementing resource usage via 'drop_move_claim' can result in incorrect usage. That change tackled the confirm resize operation. This one changes the revert resize operation, and is a little trickier due to kinks in how both the same-cell and cross-cell resize revert operations work. For same-cell resize revert, the 'ComputeManager.revert_resize' function, running on the destination host, sets the migration status to 'reverted' before dropping the move claim. This exposes the same race that we previously saw with the confirm resize operation. It then calls back to 'ComputeManager.finish_revert_resize' on the source host to boot up the instance itself. This is kind of weird, because, even ignoring the race, we're marking the migration as 'reverted' before we've done any of the necessary work on the source host. The cross-cell resize revert splits dropping of the move claim and setting of the migration status between the source and destination host tasks. Specifically, we do cleanup on the destination and drop the move claim first, via 'ComputeManager.revert_snapshot_based_resize_at_dest' before resuming the instance and setting the migration status on the source via 'ComputeManager.finish_revert_snapshot_based_resize_at_source'. This would appear to avoid the weird quirk of same-cell migration, however, in typical weird cross-cell fashion, these are actually different instances and different migration records. The solution is once again to move the setting of the migration status and the dropping of the claim under 'COMPUTE_RESOURCE_SEMAPHORE'. This introduces the weird setting of migration status before completion to the cross-cell resize case and perpetuates it in the same-cell case, but this seems like a suitable compromise to avoid attempts to do things like unplugging already unplugged PCI devices or unpinning already unpinned CPUs. From an end-user perspective, instance state changes are what really matter and once a revert is completed on the destination host and the instance has been marked as having returned to the source host, hard reboots can help us resolve any remaining issues. Change-Id: I29d6f4a78c0206385a550967ce244794e71cef6d Signed-off-by: Stephen Finucane Closes-Bug: #1879878 --- nova/compute/manager.py | 22 +++----------- nova/compute/resource_tracker.py | 18 ++++++++++++ nova/tests/functional/integrated_helpers.py | 17 +++++++---- .../regressions/test_bug_1879878.py | 18 +++++------- nova/tests/unit/compute/test_compute.py | 1 + nova/tests/unit/compute/test_compute_mgr.py | 26 ++++++----------- .../unit/compute/test_resource_tracker.py | 29 ++++++++++++------- 7 files changed, 69 insertions(+), 62 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index b6d8fe0ee437..7db5662f4e9a 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -4673,10 +4673,7 @@ class ComputeManager(manager.Manager): self._delete_volume_attachments(ctxt, bdms) # Free up the new_flavor usage from the resource tracker for this host. 
- instance.revert_migration_context() - instance.save(expected_task_state=task_states.RESIZE_REVERTING) - self.rt.drop_move_claim(ctxt, instance, instance.node, - instance_type=instance.new_flavor) + self.rt.drop_move_claim_at_dest(ctxt, instance, migration) def _revert_instance_flavor_host_node(self, instance, migration): """Revert host, node and flavor fields after a resize-revert.""" @@ -4873,20 +4870,9 @@ class ComputeManager(manager.Manager): self._terminate_volume_connections(context, instance, bdms) - migration.status = 'reverted' - migration.save() - - # NOTE(ndipanov): We need to do this here because dropping the - # claim means we lose the migration_context data. We really should - # fix this by moving the drop_move_claim call to the - # finish_revert_resize method as this is racy (revert is dropped, - # but instance resources will be tracked with the new flavor until - # it gets rolled back in finish_revert_resize, which is - # potentially wrong for a period of time). - instance.revert_migration_context() - instance.save() - - self.rt.drop_move_claim(context, instance, instance.node) + # Free up the new_flavor usage from the resource tracker for this + # host. + self.rt.drop_move_claim_at_dest(context, instance, migration) # RPC cast back to the source host to finish the revert there. self.compute_rpcapi.finish_revert_resize(context, instance, diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 1037163f0f42..3ea11fc97692 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -567,6 +567,24 @@ class ResourceTracker(object): # though. instance.drop_migration_context() + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def drop_move_claim_at_dest(self, context, instance, migration): + """Drop a move claim after reverting a resize or cold migration.""" + + # NOTE(stephenfin): This runs on the destination, before we return to + # the source and resume the instance there. 
As such, the migration + # isn't really reverted yet, but this status is what we use to + # indicate that we no longer need to account for usage on this host. + migration.status = 'reverted' + migration.save() + + self._drop_move_claim( + context, instance, migration.dest_node, instance.new_flavor, + prefix='new_') + + instance.revert_migration_context() + instance.save(expected_task_state=[task_states.RESIZE_REVERTING]) + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) def drop_move_claim(self, context, instance, nodename, instance_type=None, prefix='new_'): diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index 828734ca935a..0081a551960c 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -200,15 +200,22 @@ class InstanceHelperMixin: api = getattr(self, 'admin_api', self.api) statuses = [status.lower() for status in expected_statuses] + actual_status = None + for attempt in range(10): migrations = api.api_get('/os-migrations').body['migrations'] for migration in migrations: - if (migration['instance_uuid'] == server['id'] and - migration['status'].lower() in statuses): - return migration + if migration['instance_uuid'] == server['id']: + actual_status = migration['status'] + if migration['status'].lower() in statuses: + return migration time.sleep(0.5) - self.fail('Timed out waiting for migration with status "%s" for ' - 'instance: %s' % (expected_statuses, server['id'])) + + self.fail( + 'Timed out waiting for migration with status for instance %s ' + '(expected "%s", got "%s")' % ( + server['id'], expected_statuses, actual_status, + )) def _wait_for_log(self, log_line): for i in range(10): diff --git a/nova/tests/functional/regressions/test_bug_1879878.py b/nova/tests/functional/regressions/test_bug_1879878.py index 5f5f12b90d0a..f9d106fff2b1 100644 --- a/nova/tests/functional/regressions/test_bug_1879878.py +++ b/nova/tests/functional/regressions/test_bug_1879878.py @@ -114,7 +114,7 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase): self.assertUsage(src_host, 1) self.assertUsage(dst_host, 0) - orig_drop_claim = rt.ResourceTracker.drop_move_claim + orig_drop_claim = rt.ResourceTracker.drop_move_claim_at_dest def fake_drop_move_claim(*args, **kwargs): # run periodics after marking the migration reverted, simulating a if drop_race: self._run_periodics() - # FIXME(stephenfin): the periodic should not have dropped the - # records for the src - self.assertUsage(src_host, 0) - self.assertUsage(dst_host, 1) + self.assertUsage(src_host, 1) + self.assertUsage(dst_host, 1) return orig_drop_claim(*args, **kwargs) self.stub_out( - 'nova.compute.resource_tracker.ResourceTracker.drop_move_claim', + 'nova.compute.resource_tracker.ResourceTracker.'
+ 'drop_move_claim_at_dest', fake_drop_move_claim, ) @@ -151,11 +150,8 @@ class TestColdMigrationUsage(integrated_helpers.ProviderUsageBaseTestCase): # migration is now reverted so we should once again only have usage on # one host - # FIXME(stephenfin): Our usage here should always be 1 and 0 for source - # and dest respectively when reverting, but that won't happen until we - # run the periodic and rebuild our inventory from scratch - self.assertUsage(src_host, 0 if drop_race else 1) - self.assertUsage(dst_host, 1 if drop_race else 0) + self.assertUsage(src_host, 1) + self.assertUsage(dst_host, 0) # running periodics shouldn't change things self._run_periodics() diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 69a9e0b97cb2..9adeced30fa3 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -5771,6 +5771,7 @@ class ComputeTestCase(BaseTestCase, migration.status = 'finished' migration.migration_type = 'migration' migration.source_node = NODENAME + migration.dest_node = NODENAME migration.create() migration_context = objects.MigrationContext() diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 7eba83177ed7..8089c919429f 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -8583,7 +8583,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, request_spec = objects.RequestSpec() @mock.patch('nova.compute.resource_tracker.ResourceTracker.' - 'drop_move_claim') + 'drop_move_claim_at_dest') @mock.patch('nova.compute.rpcapi.ComputeAPI.finish_revert_resize') @mock.patch.object(self.instance, 'revert_migration_context') @mock.patch.object(self.compute.network_api, 'get_instance_nw_info') @@ -8621,9 +8621,9 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, migration=self.migration, instance=self.instance, request_spec=request_spec) - mock_drop_move_claim.assert_called_once_with(self.context, - self.instance, self.instance.node) - self.assertIsNotNone(self.instance.migration_context) + + mock_drop_move_claim.assert_called_once_with( + self.context, self.instance, self.migration) # Three fake BDMs: # 1. volume BDM with an attachment_id which will be updated/completed @@ -11754,11 +11754,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, # Assert _error_out_instance_on_exception wasn't tripped somehow. self.assertNotEqual(vm_states.ERROR, self.instance.vm_state) - @mock.patch('nova.objects.Instance.save') - @mock.patch('nova.objects.Instance.revert_migration_context') @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') - def test_revert_snapshot_based_resize_at_dest( - self, mock_get_bdms, mock_revert_mig_ctxt, mock_inst_save): + def test_revert_snapshot_based_resize_at_dest(self, mock_get_bdms): """Happy path test for _revert_snapshot_based_resize_at_dest""" # Setup more mocks. 
def stub_migrate_instance_start(ctxt, instance, migration): @@ -11771,7 +11768,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, mock.patch.object(self.compute, '_get_instance_block_device_info'), mock.patch.object(self.compute.driver, 'destroy'), mock.patch.object(self.compute, '_delete_volume_attachments'), - mock.patch.object(self.compute.rt, 'drop_move_claim') + mock.patch.object(self.compute.rt, 'drop_move_claim_at_dest') ) as ( mock_network_api, mock_get_bdi, mock_destroy, mock_delete_attachments, mock_drop_claim @@ -11786,6 +11783,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, # Run the code. self.compute._revert_snapshot_based_resize_at_dest( self.context, self.instance, self.migration) + # Assert the calls. mock_network_api.get_instance_nw_info.assert_called_once_with( self.context, self.instance) @@ -11806,12 +11804,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.stdlog.logger.output) mock_delete_attachments.assert_called_once_with( self.context, mock_get_bdms.return_value) - mock_revert_mig_ctxt.assert_called_once_with() - mock_inst_save.assert_called_once_with( - expected_task_state=task_states.RESIZE_REVERTING) mock_drop_claim.assert_called_once_with( - self.context, self.instance, self.instance.node, - instance_type=self.instance.new_flavor) + self.context, self.instance, self.migration) @mock.patch('nova.compute.manager.ComputeManager.' '_finish_revert_snapshot_based_resize_at_source') @@ -11910,9 +11904,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, # Run the code. self.compute.finish_revert_snapshot_based_resize_at_source( self.context, self.instance, self.migration) - # Assert the migration status was updated. - self.migration.save.assert_called_once_with() - self.assertEqual('reverted', self.migration.status) + # Assert the instance host/node and flavor info was updated. self.assertEqual(self.migration.source_compute, self.instance.host) self.assertEqual(self.migration.source_node, self.instance.node) diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 3676d4dd4246..c60223c2c60f 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -2586,12 +2586,11 @@ class TestResize(BaseTestCase): with test.nested( mock.patch('nova.objects.Migration.save'), mock.patch('nova.objects.Instance.drop_migration_context'), + mock.patch('nova.objects.Instance.save'), ): if revert: flavor = new_flavor - prefix = 'new_' - self.rt.drop_move_claim( - ctx, instance, _NODENAME, flavor, prefix=prefix) + self.rt.drop_move_claim_at_dest(ctx, instance, migration) else: # confirm flavor = old_flavor self.rt.drop_move_claim_at_source(ctx, instance, migration) @@ -2758,32 +2757,40 @@ class TestResize(BaseTestCase): instance.migration_context.new_pci_devices = objects.PciDeviceList( objects=pci_devs) - # When reverting a resize and dropping the move claim, the - # destination compute calls drop_move_claim to drop the new_flavor + # When reverting a resize and dropping the move claim, the destination + # compute calls drop_move_claim_at_dest to drop the new_flavor # usage and the instance should be in tracked_migrations from when # the resize_claim was made on the dest during prep_resize. 
- self.rt.tracked_migrations = { - instance.uuid: objects.Migration(migration_type='resize')} + migration = objects.Migration( + dest_node=cn.hypervisor_hostname, + migration_type='resize', + ) + self.rt.tracked_migrations = {instance.uuid: migration} - # not using mock.sentinel.ctx because drop_move_claim calls elevated + # not using mock.sentinel.ctx because _drop_move_claim calls elevated ctx = mock.MagicMock() with test.nested( mock.patch.object(self.rt, '_update'), mock.patch.object(self.rt.pci_tracker, 'free_device'), mock.patch.object(self.rt, '_get_usage_dict'), - mock.patch.object(self.rt, '_update_usage') + mock.patch.object(self.rt, '_update_usage'), + mock.patch.object(migration, 'save'), + mock.patch.object(instance, 'save'), ) as ( update_mock, mock_pci_free_device, mock_get_usage, - mock_update_usage, + mock_update_usage, mock_migrate_save, mock_instance_save, ): - self.rt.drop_move_claim(ctx, instance, _NODENAME) + self.rt.drop_move_claim_at_dest(ctx, instance, migration) + mock_pci_free_device.assert_called_once_with( pci_dev, mock.ANY) mock_get_usage.assert_called_once_with( instance.new_flavor, instance, numa_topology=None) mock_update_usage.assert_called_once_with( mock_get_usage.return_value, _NODENAME, sign=-1) + mock_migrate_save.assert_called_once() + mock_instance_save.assert_called_once() @mock.patch('nova.compute.resource_tracker.ResourceTracker.' '_sync_compute_service_disabled_trait', new=mock.Mock())
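
To make the fix easier to reason about, here is a minimal, self-contained sketch of the synchronization pattern the patch relies on: the periodic usage audit and the "drop move claim" step take the same lock, so the audit can never observe a migration marked 'reverted' while the new_flavor usage is still being counted. This is not Nova code; ToyTracker, ToyMigration and periodic_audit are hypothetical stand-ins for the real ResourceTracker, migration record and update_available_resource periodic task, and a plain threading.Lock stands in for COMPUTE_RESOURCE_SEMAPHORE (fairness elided).

    import threading


    class ToyMigration(object):
        """Hypothetical stand-in for a migration record."""
        def __init__(self):
            self.status = 'reverting'


    class ToyTracker(object):
        """Hypothetical stand-in for the resource tracker on the dest host."""

        def __init__(self):
            # stand-in for COMPUTE_RESOURCE_SEMAPHORE
            self._lock = threading.Lock()
            self.new_flavor_usage = 1  # usage claimed for the inbound resize

        def drop_move_claim_at_dest(self, migration):
            # The status flip and the usage drop happen atomically with
            # respect to the periodic audit below.
            with self._lock:
                migration.status = 'reverted'
                self.new_flavor_usage -= 1

        def periodic_audit(self, migration):
            # The audit skips usage for 'reverted' migrations; because it
            # holds the same lock, it can never see status == 'reverted'
            # while new_flavor_usage has not yet been dropped.
            with self._lock:
                if migration.status == 'reverted':
                    return 0
                return self.new_flavor_usage


    if __name__ == '__main__':
        tracker = ToyTracker()
        migration = ToyMigration()
        audit = threading.Thread(
            target=tracker.periodic_audit, args=(migration,))
        audit.start()
        tracker.drop_move_claim_at_dest(migration)
        audit.join()
        assert migration.status == 'reverted'
        assert tracker.new_flavor_usage == 0

The design point mirrored here is that the status flip and the usage decrement form one critical section; outside observers only ever see "reverting with usage counted" or "reverted with usage dropped", never the inconsistent middle state that bug #1879878 describes.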