Don't unset Instance.old_flavor, new_flavor until necessary
Since change Ia6d8a7909081b0b856bd7e290e234af7e42a2b38, the resource tracker's
'drop_move_claim' method has been capable of freeing up resource usage.
However, this relies on accurate resource reporting. It transpires that
there's a race whereby the resource tracker's 'update_available_resource'
periodic task can end up not accounting for usage from migrations that are in
the process of being completed. The root cause is the resource tracker's
reliance on the stashed flavor in a given migration record [1]. Previously,
this information was deleted by the compute manager at the start of the
confirm migration operation [2]. The compute manager would then call the virt
driver [3], which could take a not insignificant amount of time to return,
before finally dropping the move claim. If the periodic task ran between the
clearing of the stashed flavor and the return of the virt driver, it would
find a migration record with no stashed flavor and would therefore ignore
this record for accounting purposes [4], resulting in an incorrect record for
the compute node, and an exception when 'drop_move_claim' attempts to free up
the resources that aren't being tracked.

The solution to this issue is pretty simple. Instead of unsetting the old
flavor record from the migration at the start of the various move operations,
do it afterwards.

Conflicts:
    nova/compute/manager.py

NOTE(stephenfin): Conflicts are due to a number of missing cross-cell resize
changes.

[1] https://github.com/openstack/nova/blob/6557d67/nova/compute/resource_tracker.py#L1288
[2] https://github.com/openstack/nova/blob/6557d67/nova/compute/manager.py#L4310-L4315
[3] https://github.com/openstack/nova/blob/6557d67/nova/compute/manager.py#L4330-L4331
[4] https://github.com/openstack/nova/blob/6557d67/nova/compute/resource_tracker.py#L1300

Change-Id: I4760b01b695c94fa371b72216d398388cf981d28
Signed-off-by: Stephen Finucane <stephenfin@redhat.com>
Partial-Bug: #1879878
Related-Bug: #1834349
Related-Bug: #1818914
(cherry picked from commit 44376d2e21)
(cherry picked from commit ce95af2caf)
parent 6d8d2cb24a
commit 75f9b288f8
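The following is an illustrative sketch only (the class and function names are
hypothetical, not nova's actual API) of the race described in the commit
message and of the ordering this change enforces: usage for an in-flight
migration is only counted while the stashed old flavor is present, so the
cached flavor information must be cleared after the driver cleanup and the
move-claim drop, not before.

from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class Flavor:
    vcpus: int


@dataclass
class Instance:
    old_flavor: Optional[Flavor]


def usage_from_migration(instance: Instance) -> int:
    # Simplified stand-in for the checks referenced in [1]/[4]: a migration
    # whose stashed flavor has already been cleared is ignored, so its usage
    # vanishes from the node totals and a later drop of the move claim has
    # nothing to free.
    if instance.old_flavor is None:
        return 0
    return instance.old_flavor.vcpus


def confirm_resize(instance: Instance, driver_cleanup: Callable[[], None]) -> None:
    # Fixed ordering from this change: run the (potentially slow) driver
    # cleanup first; only then unset the cached flavor information, so a
    # concurrent periodic task still sees it while the resize is in flight.
    try:
        driver_cleanup()
    finally:
        instance.old_flavor = None


if __name__ == '__main__':
    instance = Instance(old_flavor=Flavor(vcpus=2))
    assert usage_from_migration(instance) == 2   # still accounted for mid-resize
    confirm_resize(instance, lambda: None)
    assert usage_from_migration(instance) == 0   # cleared only at the very end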
@@ -4246,14 +4246,17 @@ class ComputeManager(manager.Manager):
                         instance=instance)
             finally:
                 # Whether an error occurred or not, at this point the
-                # instance is on the dest host so to avoid leaking
-                # allocations in placement, delete them here.
+                # instance is on the dest host. Avoid leaking allocations
+                # in placement by deleting them here...
                 self._delete_allocation_after_move(
                     context, instance, migration)
 
-                # Also as the instance is not any more on this host, update
-                # the scheduler about the move
+                # ...inform the scheduler about the move...
                 self._delete_scheduler_instance_info(
                     context, instance.uuid)
+                # ...and unset the cached flavor information (this is done
+                # last since the resource tracker relies on it for its
+                # periodic tasks)
+                self._delete_stashed_flavor_info(instance)
 
         do_confirm_resize(context, instance, migration.id)
@@ -4292,13 +4295,6 @@ class ComputeManager(manager.Manager):
             self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
             phase=fields.NotificationPhase.START)
 
-        # NOTE(danms): delete stashed migration information
-        old_instance_type = instance.old_flavor
-        instance.old_flavor = None
-        instance.new_flavor = None
-        instance.system_metadata.pop('old_vm_state', None)
-        instance.save()
-
         # NOTE(tr3buchet): tear down networks on source host
         self.network_api.setup_networks_on_host(context, instance,
                                                 migration.source_compute, teardown=True)
@@ -4323,8 +4319,9 @@ class ComputeManager(manager.Manager):
         # instance.migration_context so make sure to not call
         # instance.drop_migration_context() until after drop_move_claim
         # is called.
-        self.rt.drop_move_claim(context, instance, migration.source_node,
-                                old_instance_type, prefix='old_')
+        self.rt.drop_move_claim(
+            context, instance, migration.source_node, instance.old_flavor,
+            prefix='old_')
         instance.drop_migration_context()
 
         # NOTE(mriedem): The old_vm_state could be STOPPED but the user
@@ -4375,6 +4372,13 @@ class ComputeManager(manager.Manager):
                        'migration_uuid': migration.uuid})
             raise
 
+    def _delete_stashed_flavor_info(self, instance):
+        """Remove information about the flavor change after a resize."""
+        instance.old_flavor = None
+        instance.new_flavor = None
+        instance.system_metadata.pop('old_vm_state', None)
+        instance.save()
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_event(prefix='compute')
@@ -4492,6 +4496,16 @@ class ComputeManager(manager.Manager):
         revert the resized attributes in the database.
 
         """
+        try:
+            self._finish_revert_resize(
+                context, instance, migration, request_spec)
+        finally:
+            self._delete_stashed_flavor_info(instance)
+
+    def _finish_revert_resize(
+        self, context, instance, migration, request_spec=None,
+    ):
+        """Inner version of finish_revert_resize."""
         with self._error_out_instance_on_exception(context, instance):
             bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                 context, instance.uuid)
@@ -4501,18 +4515,16 @@ class ComputeManager(manager.Manager):
                 self.host, action=fields.NotificationAction.RESIZE_REVERT,
                 phase=fields.NotificationPhase.START, bdms=bdms)
 
-            # NOTE(mriedem): delete stashed old_vm_state information; we
-            # default to ACTIVE for backwards compatibility if old_vm_state
-            # is not set
-            old_vm_state = instance.system_metadata.pop('old_vm_state',
-                                                        vm_states.ACTIVE)
+            # Get stashed old_vm_state information to determine if guest should
+            # be powered on after spawn; we default to ACTIVE for backwards
+            # compatibility if old_vm_state is not set
+            old_vm_state = instance.system_metadata.get(
+                'old_vm_state', vm_states.ACTIVE)
 
             self._set_instance_info(instance, instance.old_flavor)
-            instance.old_flavor = None
-            instance.new_flavor = None
             instance.host = migration.source_compute
             instance.node = migration.source_node
-            instance.save()
+            instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
 
             try:
                 source_allocations = self._revert_allocation(
@@ -679,8 +679,7 @@ class NUMAServersTest(NUMAServersTestBase):
                 self.ctxt, dst_host,
             ).numa_topology,
         )
-        # FIXME(stephenfin): There should still be two pinned cores here
-        self.assertEqual(0, len(src_numa_topology.cells[0].pinned_cpus))
+        self.assertEqual(2, len(src_numa_topology.cells[0].pinned_cpus))
         self.assertEqual(2, len(dst_numa_topology.cells[0].pinned_cpus))
 
         # before continuing with the actualy confirm process
@@ -724,14 +723,10 @@ class NUMAServersTest(NUMAServersTestBase):
 
         # Now confirm the resize
-        # FIXME(stephenfin): This should be successful, but it's failing with a
-        # HTTP 500 due to bug #1879878
         post = {'confirmResize': None}
-        exc = self.assertRaises(
-            client.OpenStackApiException,
-            self.api.post_server_action, server['id'], post)
-        self.assertEqual(500, exc.response.status_code)
-        self.assertIn('CPUUnpinningInvalid', str(exc))
+        self.api.post_server_action(server['id'], post)
 
         server = self._wait_for_state_change(server, 'ACTIVE')
 
 
 class NUMAServerTestWithCountingQuotaFromPlacement(NUMAServersTest):
@@ -8237,9 +8237,14 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
                                            self.migration)
         mock_delete.assert_called_once_with(self.context, self.instance,
                                             self.migration)
-        mock_save.assert_called_with(expected_task_state=
-                                     [None, task_states.DELETING,
-                                      task_states.SOFT_DELETING])
+        mock_save.assert_has_calls([
+            mock.call(
+                expected_task_state=[
+                    None, task_states.DELETING, task_states.SOFT_DELETING,
+                ],
+            ),
+            mock.call(),
+        ])
         mock_delete_scheduler_info.assert_called_once_with(
             self.context, self.instance.uuid)