Delete a compute node's resource provider when node is deleted

Currently, when a compute node is deleted, its record in the cell DB is
removed, but its representation as a resource provider in the placement
service remains, along with any associated inventory and allocations.
The placement engine can therefore keep returning that provider even
though the compute node no longer exists. And because the periodic
"healing" by the resource tracker only updates resources for records in
the compute_nodes table, these orphaned providers are never removed.

This patch adds a delete_resource_provider() method to the scheduler
report client, which makes the necessary calls to the placement API,
and calls it from the compute manager when a compute node is deleted.
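
In outline, the new cleanup flow looks like the sketch below, condensed
from the compute manager diff that follows (logging and error handling
omitted):

    # When the update_available_resource() periodic task finds a
    # compute_nodes record whose hypervisor is no longer reported by
    # the driver...
    for cn in compute_nodes_in_db:
        if cn.hypervisor_hostname not in nodenames:
            # ...delete the cell DB record as before...
            cn.destroy()
            # ...and now also delete its placement representation,
            # cascading to any allocations and inventory it holds.
            self.scheduler_client.reportclient.delete_resource_provider(
                context, cn, cascade=True)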

Partial-Bug: #1661258
Closes-Bug: #1661014

Change-Id: I6098d186d05ff8b9a568e23f860295a7bc2e6447
EdLeafe 2017-02-02 18:48:35 +00:00 (committed by Chris Dent)
parent 6f3338612d
commit 965fffc09d
5 changed files with 136 additions and 1 deletions

View File

@@ -6583,6 +6583,11 @@ class ComputeManager(manager.Manager):
                          {'id': cn.id, 'hh': cn.hypervisor_hostname,
                           'nodes': nodenames})
                 cn.destroy()
+                # Delete the corresponding resource provider in placement,
+                # along with any associated allocations and inventory.
+                # TODO(cdent): Move use of reportclient into resource tracker.
+                self.scheduler_client.reportclient.delete_resource_provider(
+                    context, cn, cascade=True)
 
     def _get_compute_nodes_in_db(self, context, use_slave=False):
         try:

View File

@@ -749,3 +749,40 @@ class SchedulerReportClient(object):
             LOG.warning(_LW('Deleting stale allocation for instance %s'),
                         uuid)
             self._delete_allocation_for_instance(uuid)
+
+    @safe_connect
+    def delete_resource_provider(self, context, compute_node, cascade=False):
+        """Deletes the ResourceProvider record for the compute_node.
+
+        :param context: The security context
+        :param compute_node: The nova.objects.ComputeNode object that is the
+                             resource provider being deleted.
+        :param cascade: Boolean value that, when True, will first delete any
+                        associated Allocation and Inventory records for the
+                        compute node
+        """
+        nodename = compute_node.hypervisor_hostname
+        host = compute_node.host
+        rp_uuid = compute_node.uuid
+        if cascade:
+            # Delete any allocations for this resource provider.
+            # Since allocations are by consumer, we get the consumers on this
+            # host, which are its instances.
+            instances = objects.InstanceList.get_by_host_and_node(context,
+                    host, nodename)
+            for instance in instances:
+                self._delete_allocation_for_instance(instance.uuid)
+        url = "/resource_providers/%s" % rp_uuid
+        resp = self.delete(url)
+        if resp:
+            LOG.info(_LI("Deleted resource provider %s"), rp_uuid)
+        else:
+            # Check for 404 since we don't need to log a warning if we
+            # tried to delete something which doesn't actually exist.
+            if resp.status_code != 404:
+                LOG.warning(
+                    _LW("Unable to delete resource provider "
+                        "%(uuid)s: (%(code)i %(text)s)"),
+                    {"uuid": rp_uuid,
+                     "code": resp.status_code,
+                     "text": resp.text})

View File

@@ -210,12 +210,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.assertTrue(log_mock.info.called)
         self.assertIsNone(self.compute._resource_tracker)
 
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'delete_resource_provider')
     @mock.patch.object(manager.ComputeManager,
                        'update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock):
+                                       update_mock, del_rp_mock):
         db_nodes = [self._make_compute_node('node%s' % i, i)
                     for i in range(1, 5)]
         avail_nodes = set(['node2', 'node3', 'node4', 'node5'])
@@ -233,6 +235,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
+                del_rp_mock.assert_called_once_with(self.context, db_node,
+                                                    cascade=True)
             else:
                 self.assertFalse(db_node.destroy.called)

View File

@@ -1301,3 +1301,83 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_log.info.assert_not_called()
         # make sure warning wasn't called for the 404
         mock_log.warning.assert_not_called()
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_delete_allocation_for_instance")
+    @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
+    def test_delete_resource_provider_cascade(self, mock_by_host,
+                                              mock_del_alloc, mock_delete):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname")
+        inst1 = objects.Instance(uuid=uuids.inst1)
+        inst2 = objects.Instance(uuid=uuids.inst2)
+        mock_by_host.return_value = objects.InstanceList(
+            objects=[inst1, inst2])
+        resp_mock = mock.Mock(status_code=204)
+        mock_delete.return_value = resp_mock
+        self.client.delete_resource_provider(self.context, cn, cascade=True)
+        self.assertEqual(2, mock_del_alloc.call_count)
+        exp_url = "/resource_providers/%s" % uuids.cn
+        mock_delete.assert_called_once_with(exp_url)
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_delete_allocation_for_instance")
+    @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
+    def test_delete_resource_provider_no_cascade(self, mock_by_host,
+                                                 mock_del_alloc, mock_delete):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname")
+        inst1 = objects.Instance(uuid=uuids.inst1)
+        inst2 = objects.Instance(uuid=uuids.inst2)
+        mock_by_host.return_value = objects.InstanceList(
+            objects=[inst1, inst2])
+        resp_mock = mock.Mock(status_code=204)
+        mock_delete.return_value = resp_mock
+        self.client.delete_resource_provider(self.context, cn)
+        mock_del_alloc.assert_not_called()
+        exp_url = "/resource_providers/%s" % uuids.cn
+        mock_delete.assert_called_once_with(exp_url)
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "delete")
+    @mock.patch('nova.scheduler.client.report.LOG')
+    def test_delete_resource_provider_log_calls(self, mock_log, mock_delete):
+        # First, check a successful call
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname")
+        resp_mock = mock.MagicMock(status_code=204)
+        try:
+            resp_mock.__nonzero__.return_value = True
+        except AttributeError:
+            # py3 uses __bool__
+            resp_mock.__bool__.return_value = True
+        mock_delete.return_value = resp_mock
+        self.client.delete_resource_provider(self.context, cn)
+        # With a 204, only the info should be called
+        self.assertEqual(1, mock_log.info.call_count)
+        self.assertEqual(0, mock_log.warning.call_count)
+        # Now check a 404 response
+        mock_log.reset_mock()
+        resp_mock.status_code = 404
+        try:
+            resp_mock.__nonzero__.return_value = False
+        except AttributeError:
+            # py3 uses __bool__
+            resp_mock.__bool__.return_value = False
+        self.client.delete_resource_provider(self.context, cn)
+        # With a 404, neither log message should be called
+        self.assertEqual(0, mock_log.info.call_count)
+        self.assertEqual(0, mock_log.warning.call_count)
+        # Finally, check a 409 response
+        mock_log.reset_mock()
+        resp_mock.status_code = 409
+        self.client.delete_resource_provider(self.context, cn)
+        # With a 409, only the warning should be called
+        self.assertEqual(0, mock_log.info.call_count)
+        self.assertEqual(1, mock_log.warning.call_count)
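
The try/except around __nonzero__ in the log-call test above is a
py2/py3 compatibility trick: MagicMock only exposes the boolean
protocol method that the running interpreter actually uses, so touching
the other name raises AttributeError. A standalone sketch of the same
trick (using the standard library's unittest.mock, which behaves like
the external mock package nova used at the time):

    from unittest import mock

    resp = mock.MagicMock(status_code=409)
    try:
        # py2 spelling of the boolean protocol
        resp.__nonzero__.return_value = False
    except AttributeError:
        # py3 spells it __bool__; MagicMock rejects __nonzero__ there
        resp.__bool__.return_value = False

    assert not resp  # now falsy, like a 4xx requests.Response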

View File

@@ -0,0 +1,9 @@
+---
+issues:
+  - |
+    Ironic nodes that were deleted from ironic's database during Newton
+    may result in orphaned resource providers causing incorrect scheduling
+    decisions, leading to a reschedule. If this happens, the orphaned
+    resource providers will need to be identified and removed.
+    See also https://bugs.launchpad.net/nova/+bug/1661258
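
For operators who do need to clean up by hand, the placement API's
GET /resource_providers and DELETE /resource_providers/{uuid} calls are
sufficient. A rough sketch follows; the endpoint, token, and the source
of the live-node UUID set are placeholders, not part of this change:

    import requests

    # Placeholders: in practice the endpoint and token come from
    # keystone, and live_nodes from nova's compute node records.
    PLACEMENT = "http://placement.example.com/placement"
    HEADERS = {"x-auth-token": "ADMIN_TOKEN"}
    live_nodes = set()  # UUIDs of compute nodes that still exist

    resp = requests.get(PLACEMENT + "/resource_providers",
                        headers=HEADERS)
    for rp in resp.json()["resource_providers"]:
        if rp["uuid"] not in live_nodes:
            # Orphaned provider: no matching compute node remains.
            requests.delete(
                PLACEMENT + "/resource_providers/" + rp["uuid"],
                headers=HEADERS)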