From 683ecc060e3bca818b9fb514d297e323bc8cb220 Mon Sep 17 00:00:00 2001 From: Bence Romsics Date: Wed, 2 Aug 2023 16:22:55 +0200 Subject: [PATCH] Do not untrack resources of a server being unshelved This patch concerns the time when a VM is being unshelved and the compute manager set the task_state to spawning, claimed resources of the VM and then called driver.spawn(). So the instance is in vm_state SHELVED_OFFLOADED, task_state spawning. If at this point a new update_available_resource periodic job is started that collects all the instances assigned to the node to calculate resource usage. However the calculation assumed that a VM in SHELVED_OFFLOADED state does not need resource allocation on the node (probably being removed from the node as it is offloaded) and deleted the resource claim. Given all this we ended up with the VM spawned successfully but having lost the resource claim on the node. This patch changes what we do in vm_state SHELVED_OFFLOADED, task_state spawning. We no longer delete the resource claim in this state and keep tracking the resource in stats. Change-Id: I8c9944810c09d501a6d3f60f095d9817b756872d Closes-Bug: #2025480 (cherry picked from commit f1dc4ec39bcfda1bd4b97e233a9da498b6378c4f) (cherry picked from commit 4239d1fec2814c074482b740a2fd38a5d5ce6942) --- nova/compute/manager.py | 6 +++--- nova/compute/resource_tracker.py | 7 +++++-- nova/compute/stats.py | 3 ++- nova/compute/vm_states.py | 13 ++++++++++--- .../functional/regressions/test_bug_2025480.py | 5 ++--- nova/tests/unit/compute/test_stats.py | 16 ++++++++++++++++ 6 files changed, 38 insertions(+), 12 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index c887b3ae47ce..e4df90b981ce 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6918,9 +6918,9 @@ class ComputeManager(manager.Manager): instance.power_state = current_power_state # NOTE(mriedem): The vm_state has to be set before updating the - # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node - # values cannot be nulled out until after updating the resource tracker - # though. + # resource tracker, see vm_states.allow_resource_removal(). The + # host/node values cannot be nulled out until after updating the + # resource tracker though. instance.vm_state = vm_states.SHELVED_OFFLOADED instance.task_state = None instance.save(expected_task_state=[task_states.SHELVING, diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 3f911f37084f..a4e7dbacc646 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1539,7 +1539,8 @@ class ResourceTracker(object): # NOTE(sfinucan): Both brand new instances as well as instances that # are being unshelved will have is_new_instance == True is_removed_instance = not is_new_instance and (is_removed or - instance['vm_state'] in vm_states.ALLOW_RESOURCE_REMOVAL) + vm_states.allow_resource_removal( + vm_state=instance['vm_state'], task_state=instance.task_state)) if is_new_instance: self.tracked_instances.add(uuid) @@ -1598,7 +1599,9 @@ class ResourceTracker(object): instance_by_uuid = {} for instance in instances: - if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL: + if not vm_states.allow_resource_removal( + vm_state=instance['vm_state'], + task_state=instance.task_state): self._update_usage_from_instance(context, instance, nodename) instance_by_uuid[instance.uuid] = instance return instance_by_uuid diff --git a/nova/compute/stats.py b/nova/compute/stats.py index cfbee2e6bc17..e9180ec6d6d8 100644 --- a/nova/compute/stats.py +++ b/nova/compute/stats.py @@ -105,7 +105,8 @@ class Stats(dict): (vm_state, task_state, os_type, project_id) = \ self._extract_state_from_instance(instance) - if is_removed or vm_state in vm_states.ALLOW_RESOURCE_REMOVAL: + if is_removed or vm_states.allow_resource_removal( + vm_state=vm_state, task_state=task_state): self._decrement("num_instances") self.states.pop(uuid) else: diff --git a/nova/compute/vm_states.py b/nova/compute/vm_states.py index 1a916ea59a6d..393a3ebbf2bb 100644 --- a/nova/compute/vm_states.py +++ b/nova/compute/vm_states.py @@ -27,6 +27,7 @@ health and progress. See http://wiki.openstack.org/VMState """ +from nova.compute import task_states from nova.objects import fields @@ -74,8 +75,14 @@ ALLOW_HARD_REBOOT = ALLOW_SOFT_REBOOT + [STOPPED, PAUSED, SUSPENDED, ERROR] # states we allow to trigger crash dump ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR] -# states we allow resources to be freed in -ALLOW_RESOURCE_REMOVAL = [DELETED, SHELVED_OFFLOADED] - # states we allow for evacuate instance ALLOW_TARGET_STATES = [STOPPED] + + +def allow_resource_removal(vm_state, task_state=None): + """(vm_state, task_state) combinations we allow resources to be freed in""" + + return ( + vm_state == DELETED or + vm_state == SHELVED_OFFLOADED and task_state != task_states.SPAWNING + ) diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py index f6c87109f79c..c707a40a8469 100644 --- a/nova/tests/functional/regressions/test_bug_2025480.py +++ b/nova/tests/functional/regressions/test_bug_2025480.py @@ -82,6 +82,5 @@ class UnshelveUpdateAvailableResourcesPeriodicRace( node = compute_node.ComputeNode.get_by_nodename( context.get_admin_context(), 'compute1') - # This is the bug, the instance should have resources claimed - # self.assertEqual(1, node.vcpus_used) - self.assertEqual(0, node.vcpus_used) + # After the fix, the instance should have resources claimed + self.assertEqual(1, node.vcpus_used) diff --git a/nova/tests/unit/compute/test_stats.py b/nova/tests/unit/compute/test_stats.py index e713794a19a3..b95475f09db9 100644 --- a/nova/tests/unit/compute/test_stats.py +++ b/nova/tests/unit/compute/test_stats.py @@ -208,6 +208,22 @@ class StatsTestCase(test.NoDBTestCase): self.assertEqual(0, self.stats.num_os_type("Linux")) self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING]) + def test_update_stats_for_instance_being_unshelved(self): + instance = self._create_instance() + self.stats.update_stats_for_instance(instance) + self.assertEqual(1, self.stats.num_instances_for_project("1234")) + + instance["vm_state"] = vm_states.SHELVED_OFFLOADED + instance["task_state"] = task_states.SPAWNING + self.stats.update_stats_for_instance(instance) + + self.assertEqual(1, self.stats.num_instances) + self.assertEqual(1, self.stats.num_instances_for_project(1234)) + self.assertEqual(1, self.stats["num_os_type_Linux"]) + self.assertEqual(1, self.stats["num_vm_%s" % + vm_states.SHELVED_OFFLOADED]) + self.assertEqual(1, self.stats["num_task_%s" % task_states.SPAWNING]) + def test_io_workload(self): vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED] tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING,