Do not untrack resources of a server being unshelved

This patch concerns the time when a VM is being unshelved and the
compute manager set the task_state to spawning, claimed resources of
the VM and then called driver.spawn(). So the instance is in vm_state
SHELVED_OFFLOADED, task_state spawning.

At this point a new update_available_resource periodic job may be
started that collects all the instances assigned to the node to
calculate resource usage. However, the calculation assumed that a
VM in SHELVED_OFFLOADED state does not need resource allocation on
the node (presumably because it is being removed from the node as it
is offloaded) and therefore deleted the resource claim.

Given all this we ended up with the VM spawned successfully but having
lost the resource claim on the node.

This patch changes the behavior for instances in vm_state
SHELVED_OFFLOADED with task_state spawning: we no longer delete the
resource claim in this state, and we keep tracking the instance's
resources in the node's stats.

Change-Id: I8c9944810c09d501a6d3f60f095d9817b756872d
Closes-Bug: #2025480
This commit is contained in:
Bence Romsics 2023-08-02 16:22:55 +02:00
parent 62300d4885
commit f1dc4ec39b
6 changed files with 38 additions and 12 deletions

View File

@ -6948,9 +6948,9 @@ class ComputeManager(manager.Manager):
instance.power_state = current_power_state instance.power_state = current_power_state
# NOTE(mriedem): The vm_state has to be set before updating the # NOTE(mriedem): The vm_state has to be set before updating the
# resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node # resource tracker, see vm_states.allow_resource_removal(). The
# values cannot be nulled out until after updating the resource tracker # host/node values cannot be nulled out until after updating the
# though. # resource tracker though.
instance.vm_state = vm_states.SHELVED_OFFLOADED instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.task_state = None instance.task_state = None
instance.save(expected_task_state=[task_states.SHELVING, instance.save(expected_task_state=[task_states.SHELVING,

View File

@ -1611,7 +1611,8 @@ class ResourceTracker(object):
# NOTE(sfinucan): Both brand new instances as well as instances that # NOTE(sfinucan): Both brand new instances as well as instances that
# are being unshelved will have is_new_instance == True # are being unshelved will have is_new_instance == True
is_removed_instance = not is_new_instance and (is_removed or is_removed_instance = not is_new_instance and (is_removed or
instance['vm_state'] in vm_states.ALLOW_RESOURCE_REMOVAL) vm_states.allow_resource_removal(
vm_state=instance['vm_state'], task_state=instance.task_state))
if is_new_instance: if is_new_instance:
self.tracked_instances.add(uuid) self.tracked_instances.add(uuid)
@ -1670,7 +1671,9 @@ class ResourceTracker(object):
instance_by_uuid = {} instance_by_uuid = {}
for instance in instances: for instance in instances:
if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL: if not vm_states.allow_resource_removal(
vm_state=instance['vm_state'],
task_state=instance.task_state):
self._update_usage_from_instance(context, instance, nodename) self._update_usage_from_instance(context, instance, nodename)
instance_by_uuid[instance.uuid] = instance instance_by_uuid[instance.uuid] = instance
return instance_by_uuid return instance_by_uuid

View File

@ -105,7 +105,8 @@ class Stats(dict):
(vm_state, task_state, os_type, project_id) = \ (vm_state, task_state, os_type, project_id) = \
self._extract_state_from_instance(instance) self._extract_state_from_instance(instance)
if is_removed or vm_state in vm_states.ALLOW_RESOURCE_REMOVAL: if is_removed or vm_states.allow_resource_removal(
vm_state=vm_state, task_state=task_state):
self._decrement("num_instances") self._decrement("num_instances")
self.states.pop(uuid) self.states.pop(uuid)
else: else:

View File

@ -27,6 +27,7 @@ health and progress.
See http://wiki.openstack.org/VMState See http://wiki.openstack.org/VMState
""" """
from nova.compute import task_states
from nova.objects import fields from nova.objects import fields
@ -74,8 +75,14 @@ ALLOW_HARD_REBOOT = ALLOW_SOFT_REBOOT + [STOPPED, PAUSED, SUSPENDED, ERROR]
# states we allow to trigger crash dump # states we allow to trigger crash dump
ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR] ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR]
# states we allow resources to be freed in
ALLOW_RESOURCE_REMOVAL = [DELETED, SHELVED_OFFLOADED]
# states we allow for evacuate instance # states we allow for evacuate instance
ALLOW_TARGET_STATES = [STOPPED] ALLOW_TARGET_STATES = [STOPPED]
def allow_resource_removal(vm_state, task_state=None):
"""(vm_state, task_state) combinations we allow resources to be freed in"""
return (
vm_state == DELETED or
vm_state == SHELVED_OFFLOADED and task_state != task_states.SPAWNING
)

View File

@ -82,6 +82,5 @@ class UnshelveUpdateAvailableResourcesPeriodicRace(
node = compute_node.ComputeNode.get_by_nodename( node = compute_node.ComputeNode.get_by_nodename(
context.get_admin_context(), 'compute1') context.get_admin_context(), 'compute1')
# This is the bug, the instance should have resources claimed # After the fix, the instance should have resources claimed
# self.assertEqual(1, node.vcpus_used) self.assertEqual(1, node.vcpus_used)
self.assertEqual(0, node.vcpus_used)

View File

@ -208,6 +208,22 @@ class StatsTestCase(test.NoDBTestCase):
self.assertEqual(0, self.stats.num_os_type("Linux")) self.assertEqual(0, self.stats.num_os_type("Linux"))
self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING]) self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING])
def test_update_stats_for_instance_being_unshelved(self):
instance = self._create_instance()
self.stats.update_stats_for_instance(instance)
self.assertEqual(1, self.stats.num_instances_for_project("1234"))
instance["vm_state"] = vm_states.SHELVED_OFFLOADED
instance["task_state"] = task_states.SPAWNING
self.stats.update_stats_for_instance(instance)
self.assertEqual(1, self.stats.num_instances)
self.assertEqual(1, self.stats.num_instances_for_project(1234))
self.assertEqual(1, self.stats["num_os_type_Linux"])
self.assertEqual(1, self.stats["num_vm_%s" %
vm_states.SHELVED_OFFLOADED])
self.assertEqual(1, self.stats["num_task_%s" % task_states.SPAWNING])
def test_io_workload(self): def test_io_workload(self):
vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED] vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED]
tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING, tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING,