Don't delete allocation if instance being scheduled
There is a potential race when something other than the compute service creates allocations (say, the scheduler): the self-healing process that runs periodically on the compute host would delete those allocations, because it would not find a related instance for that host. The resource tracker now skips allocations whose instance has no host set yet, i.e. is still being scheduled.

This also removes an unnecessary method from the report client code and instead does the loop in the resource tracker code, calling the report client directly to delete allocations.

Co-Authored-By: Ed Leafe <ed@leafe.com>
Blueprint: placement-claims
Change-Id: I80ba844a6e0fcea89f80aa253d57ac73092773ae
parent: d966d33227
commit: 439f67e7bc
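The race described above, sketched in isolation: the scheduler writes an allocation against the chosen compute node before the instance record has its host field set, so the node's periodic self-healing pass does not yet count the instance as tracked and, before this change, would have deleted that allocation. Below is a minimal, hedged illustration of the guard the resource tracker gains in the diff that follows; FakeInstance and allocations_to_delete are stand-ins for this sketch only, not Nova code.

# Minimal, self-contained sketch (illustrative names, not the real Nova objects).
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeInstance:
    uuid: str
    host: Optional[str]  # None while the scheduler is still placing the instance


def allocations_to_delete(allocations, tracked_instances, get_instance):
    """Return allocation consumer uuids that are safe to remove.

    allocations: dict keyed by instance uuid, one entry per allocation
    tracked_instances: instances the resource tracker knows on this node
    get_instance: lookup callable, standing in for Instance.get_by_uuid
    """
    stale = set(allocations) - set(tracked_instances)
    safe = []
    for instance_uuid in stale:
        instance = get_instance(instance_uuid)
        # The guard this commit adds: an instance with no host yet is still
        # being scheduled, so its allocation was written on purpose (e.g. by
        # the scheduler) and must not be removed by the self-healing pass.
        if not instance.host:
            continue
        safe.append(instance_uuid)
    return safe


# One instance tracked on this node, one truly deleted, one mid-scheduling
# (allocation already written by the scheduler, but no host assigned yet).
instances = {
    "tracked": FakeInstance("tracked", host="compute1"),
    "deleted": FakeInstance("deleted", host="compute1"),
    "scheduling": FakeInstance("scheduling", host=None),
}
allocations = {uuid: {"MEMORY_MB": 1024} for uuid in instances}
tracked = {"tracked": instances["tracked"]}

assert allocations_to_delete(allocations, tracked, instances.get) == ["deleted"]

Only the genuinely deleted instance loses its allocation; the one still being scheduled is left alone, which is exactly what the old remove_deleted_instances helper in the report client could not distinguish.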
@@ -992,11 +992,28 @@ class ResourceTracker(object):
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
                 self._update_usage_from_instance(context, instance, nodename)
 
-        self.reportclient.remove_deleted_instances(cn,
-                                                   self.tracked_instances.values())
+        # Remove allocations for instances that have been removed.
+        self._remove_deleted_instances_allocations(context, cn)
 
         cn.free_ram_mb = max(0, cn.free_ram_mb)
         cn.free_disk_gb = max(0, cn.free_disk_gb)
 
+    def _remove_deleted_instances_allocations(self, context, cn):
+        tracked_keys = set(self.tracked_instances.keys())
+        allocations = self.reportclient.get_allocations_for_resource_provider(
+                cn.uuid) or {}
+        allocations_to_delete = set(allocations.keys()) - tracked_keys
+        for instance_uuid in allocations_to_delete:
+            # Allocations related to instances being scheduled should not be
+            # deleted if we already wrote the allocation previously.
+            instance = objects.Instance.get_by_uuid(context, instance_uuid,
+                                                    expected_attrs=[])
+            if not instance.host:
+                continue
+            LOG.warning('Deleting stale allocation for instance %s',
+                        instance_uuid)
+            self.reportclient.delete_allocation_for_instance(instance_uuid)
+
     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for
         by resource tracker, sanity check the hypervisor to determine
@@ -884,7 +884,7 @@ class SchedulerReportClient(object):
         return r.status_code == 204
 
     @safe_connect
-    def _delete_allocation_for_instance(self, uuid):
+    def delete_allocation_for_instance(self, uuid):
         url = '/allocations/%s' % uuid
         r = self.delete(url)
         if r:
@@ -905,10 +905,10 @@ class SchedulerReportClient(object):
         if sign > 0:
             self._allocate_for_instance(compute_node.uuid, instance)
         else:
-            self._delete_allocation_for_instance(instance.uuid)
+            self.delete_allocation_for_instance(instance.uuid)
 
     @safe_connect
-    def _get_allocations(self, rp_uuid):
+    def get_allocations_for_resource_provider(self, rp_uuid):
         url = '/resource_providers/%s/allocations' % rp_uuid
         resp = self.get(url)
         if not resp:
@@ -916,20 +916,6 @@ class SchedulerReportClient(object):
         else:
             return resp.json()['allocations']
 
-    def remove_deleted_instances(self, compute_node, instance_uuids):
-        allocations = self._get_allocations(compute_node.uuid)
-        if allocations is None:
-            allocations = {}
-
-        instance_dict = {instance['uuid']: instance
-                         for instance in instance_uuids}
-        removed_instances = set(allocations.keys()) - set(instance_dict.keys())
-
-        for uuid in removed_instances:
-            LOG.warning(_LW('Deleting stale allocation for instance %s'),
-                        uuid)
-            self._delete_allocation_for_instance(uuid)
-
     @safe_connect
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
@@ -951,7 +937,7 @@ class SchedulerReportClient(object):
             instances = objects.InstanceList.get_by_host_and_node(context,
                     host, nodename)
            for instance in instances:
-                self._delete_allocation_for_instance(instance.uuid)
+                self.delete_allocation_for_instance(instance.uuid)
         url = "/resource_providers/%s" % rp_uuid
         resp = self.delete(url)
         if resp:
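For orientation, here is a rough usage sketch of the two report client methods made public above, with the client mocked so the snippet is self-contained; the uuids are placeholders and the payload shape simply mirrors the simplified fixtures in the tests further down, not necessarily the full placement API response.

from unittest import mock

# Mocked stand-in for SchedulerReportClient; hypothetical uuids for illustration.
reportclient = mock.Mock()
reportclient.get_allocations_for_resource_provider.return_value = {
    "instance-uuid-1": {"MEMORY_MB": 1024, "VCPU": 2, "DISK_GB": 101},
}

# The resource tracker consumes the result keyed by instance (consumer) uuid
# and deletes stale entries one by one.
allocations = reportclient.get_allocations_for_resource_provider("rp-uuid") or {}
for instance_uuid in allocations:
    reportclient.delete_allocation_for_instance(instance_uuid)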
@@ -2266,6 +2266,44 @@ class TestUpdateUsageFromInstance(BaseTestCase):
         mock_update_usage.assert_called_once_with(
             self.rt._get_usage_dict(self.instance), _NODENAME, sign=-1)
 
+    @mock.patch('nova.objects.Instance.get_by_uuid')
+    def test_remove_deleted_instances_allocations(self, mock_inst_get):
+        rc = self.rt.reportclient
+        self.rt.tracked_instances = {}
+        # Create 3 instances
+        instance_uuids = (uuids.inst0, uuids.inst1, uuids.inst2)
+        for i in range(3):
+            instance = _INSTANCE_FIXTURES[0].obj_clone()
+            instance.uuid = instance_uuids[i]
+            if i > 0:
+                # Only add the last two to tracked_instances, to simulate one
+                # deleted instance
+                inst_dict = obj_base.obj_to_primitive(instance)
+                self.rt.tracked_instances[instance.uuid] = inst_dict
+        # Mock out the allocation call
+        allocs = {uuids.scheduled: "fake_scheduled_instance"}
+        for i in range(3):
+            allocs[instance_uuids[i]] = "fake_instance_%s" % i
+        rc.get_allocations_for_resource_provider = mock.MagicMock(
+            return_value=allocs)
+        rc.delete_allocation_for_instance = mock.MagicMock()
+
+        def get_by_uuid(ctx, inst_uuid, expected_attrs=None):
+            ret = _INSTANCE_FIXTURES[0].obj_clone()
+            ret.uuid = inst_uuid
+            if inst_uuid == uuids.scheduled:
+                ret.host = None
+            return ret
+
+        mock_inst_get.side_effect = get_by_uuid
+        cn = self.rt.compute_nodes[_NODENAME]
+        ctx = mock.sentinel.ctx
+        # Call the method.
+        self.rt._remove_deleted_instances_allocations(ctx, cn)
+        # Only one call should be made to delete allocations, and that should
+        # be for the first instance created above
+        rc.delete_allocation_for_instance.assert_called_once_with(uuids.inst0)
+
 
 class TestInstanceInResizeState(test.NoDBTestCase):
     def test_active_suspending(self):
@@ -1482,41 +1482,6 @@ class TestAllocations(SchedulerReportClientTestCase):
         self.client.update_instance_allocation(cn, inst, -1)
         self.assertTrue(mock_warn.called)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'delete')
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'get')
-    @mock.patch('nova.scheduler.client.report.'
-                '_instance_to_allocations_dict')
-    def test_remove_deleted_instances(self, mock_a, mock_get,
-                                      mock_delete):
-        cn = objects.ComputeNode(uuid=uuids.cn)
-        inst1 = objects.Instance(uuid=uuids.inst1)
-        inst2 = objects.Instance(uuid=uuids.inst2)
-        fake_allocations = {
-            'MEMORY_MB': 1024,
-            'VCPU': 2,
-            'DISK_GB': 101,
-        }
-        mock_get.return_value.json.return_value = {'allocations': {
-                inst1.uuid: fake_allocations,
-                inst2.uuid: fake_allocations,
-            }
-        }
-
-        # One instance still on the node, dict form as the
-        # RT tracks it
-        inst3 = {'uuid': 'foo'}
-
-        mock_delete.return_value = True
-        self.client.remove_deleted_instances(cn, [inst3])
-        mock_get.assert_called_once_with(
-            '/resource_providers/%s/allocations' % cn.uuid)
-        expected_calls = [
-            mock.call('/allocations/%s' % inst1.uuid),
-            mock.call('/allocations/%s' % inst2.uuid)]
-        mock_delete.assert_has_calls(expected_calls, any_order=True)
-
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'delete')
     @mock.patch('nova.scheduler.client.report.LOG')
@@ -1532,7 +1497,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         # py3 uses __bool__
         mock_response.__bool__.return_value = False
         mock_delete.return_value = mock_response
-        self.client._delete_allocation_for_instance(uuids.rp_uuid)
+        self.client.delete_allocation_for_instance(uuids.rp_uuid)
         # make sure we didn't screw up the logic or the mock
         mock_log.info.assert_not_called()
         # make sure warning wasn't called for the 404
@@ -1541,7 +1506,7 @@ class TestAllocations(SchedulerReportClientTestCase):
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
-                "_delete_allocation_for_instance")
+                "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
                                               mock_del_alloc, mock_delete):
@@ -1563,7 +1528,7 @@ class TestAllocations(SchedulerReportClientTestCase):
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
-                "_delete_allocation_for_instance")
+                "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
                                                  mock_del_alloc, mock_delete):