placement: refactor healing of allocations in RT

Updates the RT method for "healing" instance allocations so that we
only delete allocations (and delete them entirely for an instance)
when the instance has been local-deleted. Adds log statements to the
method to give us a clear picture of what the state of instances and
allocations is during test runs.

Note that this patch only partially fixes LP #1707071, by removing
some of the back-and-forth allocation trampling that was going on for
move operations. The next patch adds the final piece of the puzzle,
which is to have the confirm/revert resize step on the source or
destination compute host *correct* a doubled-up allocation. What this
patch does is prevent the healing function from deleting allocations
unless the instance has been local-deleted.

Partial-bug: #1707071

Change-Id: Ia0bb32fd92ae723c505d0fce5691e0fe0540f10d
Jay Pipes 2017-08-07 16:36:49 -04:00
parent 2651b27b68
commit 09e169bd7a
3 changed files with 124 additions and 95 deletions
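
For review context, the decision flow of the refactored heal method can be
summarized as follows. This is a minimal standalone sketch mirroring the
resource_tracker.py diff below; FakeInstance, FakeNode and the callables are
invented stand-ins for nova's objects and report client, not nova code:

# Sketch of the branch ordering in the refactored
# _remove_deleted_instances_allocations(); all names here are hypothetical.
from dataclasses import dataclass
from typing import Callable, Optional


@dataclass
class FakeInstance:
    host: Optional[str]
    node: Optional[str]


@dataclass
class FakeNode:
    host: str
    hypervisor_hostname: str


def heal_allocations(allocations: dict, known_instances: set, cn: FakeNode,
                     lookup_instance: Callable, delete_allocation: Callable):
    for instance_uuid, alloc in allocations.items():
        if instance_uuid in known_instances:
            # Actively managed on this host; the allocation is correct.
            continue
        instance = lookup_instance(instance_uuid)  # None ~= InstanceNotFound
        if instance is None:
            # Local-deleted instance: the only case where we now delete
            # the allocation outright.
            delete_allocation(instance_uuid)
            continue
        if not instance.host:
            # Scheduled here but not yet started; keep the allocation.
            continue
        if (instance.host == cn.host and
                instance.node == cn.hypervisor_hostname):
            # Supposedly ours but not tracked; warn and leave it alone.
            continue
        # Otherwise the instance moved (migration/evacuation/unshelve):
        # log only, since an Ocata destination rewrites the allocation.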

nova/compute/resource_tracker.py

@@ -1041,29 +1041,76 @@ class ResourceTracker(object):
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
                 self._update_usage_from_instance(context, instance, nodename)
 
         # Remove allocations for instances that have been removed.
         self._remove_deleted_instances_allocations(context, cn)
 
     def _remove_deleted_instances_allocations(self, context, cn):
-        tracked_keys = set(self.tracked_instances.keys())
+        # NOTE(jaypipes): All of this code sucks. It's basically dealing with
+        # all the corner cases in move, local delete, unshelve and rebuild
+        # operations for when allocations should be deleted when things didn't
+        # happen according to the normal flow of events where the scheduler
+        # always creates allocations for an instance
+        known_instances = set(self.tracked_instances.keys())
         allocations = self.reportclient.get_allocations_for_resource_provider(
                 cn.uuid) or {}
-        allocations_to_delete = set(allocations.keys()) - tracked_keys
-        for instance_uuid in allocations_to_delete:
-            # Allocations related to instances being scheduled should not be
-            # deleted if we already wrote the allocation previously.
+        for instance_uuid, alloc in allocations.items():
+            if instance_uuid in known_instances:
+                LOG.debug("Instance %s actively managed on this compute host "
+                          "and has allocations in placement: %s.",
+                          instance_uuid, alloc)
+                continue
             try:
                 instance = objects.Instance.get_by_uuid(context, instance_uuid,
                                                         expected_attrs=[])
-                if not instance.host:
-                    continue
             except exception.InstanceNotFound:
-                # The instance is gone, so we definitely want to
-                # remove allocations associated with it.
-                pass
-            LOG.debug('Deleting stale allocation for instance %s',
-                      instance_uuid)
-            self.reportclient.delete_allocation_for_instance(instance_uuid)
+                # The instance is gone, so we definitely want to remove
+                # allocations associated with it.
+                # NOTE(jaypipes): This will not be true if/when we support
+                # cross-cell migrations...
+                LOG.debug("Instance %s has been deleted (perhaps locally). "
+                          "Deleting allocations that remained for this "
+                          "instance against this compute host: %s.",
+                          instance_uuid, alloc)
+                self.reportclient.delete_allocation_for_instance(instance_uuid)
+                continue
+            if not instance.host:
+                # Allocations related to instances being scheduled should not
+                # be deleted if we already wrote the allocation previously.
+                LOG.debug("Instance %s has been scheduled to this compute "
+                          "host, the scheduler has made an allocation "
+                          "against this compute node but the instance has "
+                          "yet to start. Skipping heal of allocation: %s.",
+                          instance_uuid, alloc)
+                continue
+            if (instance.host == cn.host and
+                    instance.node == cn.hypervisor_hostname):
+                # The instance is supposed to be on this compute host but is
+                # not in the list of actively managed instances.
+                LOG.warning("Instance %s is not being actively managed by "
+                            "this compute host but has allocations "
+                            "referencing this compute host: %s. Skipping "
+                            "heal of allocation because we do not know "
+                            "what to do.", instance_uuid, alloc)
+                continue
+            if instance.host != cn.host:
+                # The instance has been moved to another host either via a
+                # migration, evacuation or unshelve in between the time when we
+                # ran InstanceList.get_by_host_and_node(), added those
+                # instances to RT.tracked_instances and the above
+                # Instance.get_by_uuid() call. We SHOULD attempt to remove any
+                # allocations that reference this compute host if the VM is in
+                # a stable terminal state (i.e. it isn't in a state of waiting
+                # for resize to confirm/revert), however if the destination
+                # host is an Ocata compute host, it will delete the allocation
+                # that contains this source compute host information anyway and
+                # recreate an allocation that only refers to itself. So we
+                # don't need to do anything in that case. Just log the
+                # situation here for debugging information but don't attempt to
+                # delete or change the allocation.
+                LOG.debug("Instance %s has been moved to another host %s(%s). "
+                          "There are allocations remaining against the source "
+                          "host that might need to be removed: %s.",
+                          instance_uuid, instance.host, instance.node, alloc)
+                continue
 
     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for

nova/tests/functional/test_servers.py

@@ -1305,52 +1305,22 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # the original host expected to have the old resource usage
         source_usages = self._get_provider_usages(source_rp_uuid)
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertEqual(0, source_usages['VCPU'])
-        self.assertEqual(0, source_usages['MEMORY_MB'])
-        self.assertEqual(0, source_usages['DISK_GB'])
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertFlavorMatchesAllocation(old_flavor, source_usages)
+        self.assertFlavorMatchesAllocation(old_flavor, source_usages)
 
         # the dest host expected to have resource allocation based on
         # the new flavor the server is resized to
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # NOTE(danms): This is bug 1707071 where we've lost the entire
-        # allocation because one of the computes deleted it
-        self.assertEqual(0, dest_usages['VCPU'])
-        self.assertEqual(0, dest_usages['MEMORY_MB'])
-        self.assertEqual(0, dest_usages['DISK_GB'])
-        # NOTE(danms): After we fix the 'healing' of the double allocation
-        # by the resource tracker, the following will be expected
-        # self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
+        self.assertFlavorMatchesAllocation(new_flavor, dest_usages)
 
         # and our server should have allocation on both providers accordingly
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertEqual(0, len(allocations))
-        self.assertNotIn(source_rp_uuid, allocations)
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertEqual(2, len(allocations))
-        # source_allocation = allocations[source_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
-        # NOTE(danms): This is bug 1707071, where we've lost the
-        # doubled allocation
-        self.assertNotIn(dest_rp_uuid, allocations)
-        # NOTE(danms): After we fix the 'healing' of the doubled allocation
-        # by the resource tracker, the following will be expected
-        # self.assertEqual(2, len(allocations))
-        # dest_allocation = allocations[dest_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
+        self.assertEqual(2, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
+        self.assertEqual(2, len(allocations))
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
 
     def _delete_and_check_allocations(self, server, source_rp_uuid,
                                       dest_rp_uuid):
@@ -1395,19 +1365,24 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # the original host expected to have the old resource allocation
         source_usages = self._get_provider_usages(source_rp_uuid)
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        dest_usages = self._get_provider_usages(dest_rp_uuid)
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
 
-        # and the target host should not have usage
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, dest_usages,
-                         'Target host %s still has usage after the resize has '
-                         'been reverted' % dest_hostname)
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # dest_usages = self._get_provider_usages(dest_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, dest_usages,
+        #                  'Target host %s still has usage after the resize has '
+        #                  'been reverted' % dest_hostname)
 
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # Check that the server only allocates resource from the original host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
 
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
@@ -1434,28 +1409,33 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # Fetch allocations post-confirm
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # NOTE(danms): This is bug 1707071, where we've lost the doubled
-        # allocation
-        self.assertEqual(0, len(allocations))
+        self.assertEqual(2, len(allocations))
 
         self._run_periodics()
 
-        source_usages = self._get_provider_usages(source_rp_uuid)
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        self.assertEqual({'VCPU': 0,
-                          'MEMORY_MB': 0,
-                          'DISK_GB': 0}, source_usages,
-                         'The source host %s still has usages after the '
-                         'resize has been confirmed' % source_hostname)
+
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
+        # source_usages = self._get_provider_usages(source_rp_uuid)
+        # self.assertEqual({'VCPU': 0,
+        #                   'MEMORY_MB': 0,
+        #                   'DISK_GB': 0}, source_usages,
+        #                  'The source host %s still has usages after the '
+        #                  'resize has been confirmed' % source_hostname)
 
         # and the target host allocation should be according to the new flavor
         self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
 
+        # NOTE(jaypipes): This should be uncommented when bug 1707071 is fixed.
+        # Currently, we're not cleaning up the destination allocation on
+        # confirm and the source allocation on resize
         # and the server allocates only from the target host
-        self.assertEqual(1, len(allocations))
+        # self.assertEqual(1, len(allocations))
 
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
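
The unit tests in the next file isolate individual branches of the heal loop
by stubbing the report client and Instance.get_by_uuid. A condensed,
self-contained sketch of that stubbing pattern; the exception class and UUID
here are stand-ins rather than nova imports:

# Sketch of the stubbing pattern in the unit tests below; InstanceNotFound
# here is a stand-in for nova.exception.InstanceNotFound.
from unittest import mock


class InstanceNotFound(Exception):
    pass


DELETED_UUID = '11111111-2222-3333-4444-555555555555'

rc = mock.Mock()
rc.get_allocations_for_resource_provider.return_value = {
    DELETED_UUID: 'fake_deleted_instance'}

# Raising InstanceNotFound from the instance lookup drives the heal loop
# into its delete branch; returning an object with host=None would drive
# the "scheduled, skip" branch instead.
inst_get = mock.Mock(side_effect=InstanceNotFound(DELETED_UUID))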

nova/tests/unit/compute/test_resource_tracker.py

@@ -2394,26 +2394,32 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         mock_update_usage.assert_called_once_with(
             self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)
 
-    @mock.patch('nova.compute.resource_tracker.LOG')
     @mock.patch('nova.objects.Instance.get_by_uuid')
-    def test_remove_deleted_instances_allocations(self, mock_inst_get,
-                                                  mock_log):
+    def test_remove_deleted_instances_allocations_deleted_instance(self,
+            mock_inst_get):
         rc = self.rt.reportclient
         self.rt.tracked_instances = {}
-        # Create 3 instances
-        instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
-        for i in range(3):
-            instance = _INSTANCE_FIXTURES[0].obj_clone()
-            instance.uuid = instance_uuids[i]
-            if i > 0:
-                # Only add the last two to tracked_instances, to simulate one
-                # deleted instance
-                inst_dict = obj_base.obj_to_primitive(instance)
-                self.rt.tracked_instances[instance.uuid] = inst_dict
-
-        # Mock out the allocation call
+        allocs = {uuids.deleted: "fake_deleted_instance"}
+        rc.get_allocations_for_resource_provider = mock.MagicMock(
+            return_value=allocs)
+        rc.delete_allocation_for_instance = mock.MagicMock()
+        mock_inst_get.side_effect = exc.InstanceNotFound(
+            instance_id=uuids.deleted)
+        cn = self.rt.compute_nodes[_NODENAME]
+        ctx = mock.sentinel.ctx
+        # Call the method.
+        self.rt._remove_deleted_instances_allocations(ctx, cn)
+        # Only one call should be made to delete allocations, and that should
+        # be for the first instance created above
+        rc.delete_allocation_for_instance.assert_called_once_with(
+            uuids.deleted)
+
+    @mock.patch('nova.objects.Instance.get_by_uuid')
+    def test_remove_deleted_instances_allocations_scheduled_instance(self,
+            mock_inst_get):
+        rc = self.rt.reportclient
+        self.rt.tracked_instances = {}
         allocs = {uuids.scheduled: "fake_scheduled_instance"}
-        for i in range(3):
-            allocs[instance_uuids[i]] = "fake_instance_%s" % i
         rc.get_allocations_for_resource_provider = mock.MagicMock(
             return_value=allocs)
         rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2421,8 +2427,7 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
             ret = _INSTANCE_FIXTURES[0].obj_clone()
             ret.uuid = inst_uuid
-            if inst_uuid == uuids.scheduled:
-                ret.host = None
+            ret.host = None  # This indicates the instance is being scheduled
             return ret
 
         mock_inst_get.side_effect = get_by_uuid
@@ -2430,15 +2435,12 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         ctx = mock.sentinel.ctx
         # Call the method.
         self.rt._remove_deleted_instances_allocations(ctx, cn)
-        # Only one call should be made to delete allocations, and that should
-        # be for the first instance created above
-        rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
-        mock_log.debug.assert_called_once_with(
-            'Deleting stale allocation for instance %s', uuids.inst0)
+        # Scheduled instances should not have their allocations removed
+        rc.delete_allocation_for_instance.assert_not_called()
 
     @mock.patch('nova.objects.Instance.get_by_uuid')
     def test_remove_deleted_instances_allocations_no_instance(self,
-                                                              mock_inst_get):
+            mock_inst_get):
         # If for some reason an instance is no longer available, but
         # there are allocations for it, we want to be sure those
         # allocations are removed, not that an InstanceNotFound