From 965fffc09d6fffba7918117e170d5799c69fe99b Mon Sep 17 00:00:00 2001 From: EdLeafe Date: Thu, 2 Feb 2017 18:48:35 +0000 Subject: [PATCH] Delete a compute node's resource provider when node is deleted Currently when a compute node is deleted, its record in the cell DB is deleted, but its representation as a resource provider in the placement service remains, along with any inventory and allocations. This could cause the placement engine to return that provider record, even though the compute node no longer exists. And since the periodic "healing" by the resource tracker only updates compute node resources for records in the compute_nodes table, these old records are never removed. This patch adds a call to delete the resource provider when the compute node is deleted. It also adds a method to the scheduler report client to make these calls to the placement API. Partial-Bug: #1661258 Closes-Bug: #1661014 Change-Id: I6098d186d05ff8b9a568e23f860295a7bc2e6447 --- nova/compute/manager.py | 5 ++ nova/scheduler/client/report.py | 37 +++++++++ nova/tests/unit/compute/test_compute_mgr.py | 6 +- .../unit/scheduler/client/test_report.py | 80 +++++++++++++++++++ .../notes/bug-1661258-ee202843157f6a27.yaml | 9 +++ 5 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/bug-1661258-ee202843157f6a27.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index ded735e9b0a3..3ca534bd2752 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6583,6 +6583,11 @@ class ComputeManager(manager.Manager): {'id': cn.id, 'hh': cn.hypervisor_hostname, 'nodes': nodenames}) cn.destroy() + # Delete the corresponding resource provider in placement, + # along with any associated allocations and inventory. + # TODO(cdent): Move use of reportclient into resource tracker. 
+ self.scheduler_client.reportclient.delete_resource_provider( + context, cn, cascade=True) def _get_compute_nodes_in_db(self, context, use_slave=False): try: diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index e3f11e902551..320cf4d5c847 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -749,3 +749,40 @@ class SchedulerReportClient(object): LOG.warning(_LW('Deleting stale allocation for instance %s'), uuid) self._delete_allocation_for_instance(uuid) + + @safe_connect + def delete_resource_provider(self, context, compute_node, cascade=False): + """Deletes the ResourceProvider record for the compute_node. + + :param context: The security context + :param compute_node: The nova.objects.ComputeNode object that is the + resource provider being deleted. + :param cascade: Boolean value that, when True, will first delete any + associated Allocation and Inventory records for the + compute node + """ + nodename = compute_node.hypervisor_hostname + host = compute_node.host + rp_uuid = compute_node.uuid + if cascade: + # Delete any allocations for this resource provider. + # Since allocations are by consumer, we get the consumers on this + # host, which are its instances. + instances = objects.InstanceList.get_by_host_and_node(context, + host, nodename) + for instance in instances: + self._delete_allocation_for_instance(instance.uuid) + url = "/resource_providers/%s" % rp_uuid + resp = self.delete(url) + if resp: + LOG.info(_LI("Deleted resource provider %s"), rp_uuid) + else: + # Check for 404 since we don't need to log a warning if we tried to + # delete something which doesn't actually exist. 
+ if resp.status_code != 404: + LOG.warning( + _LW("Unable to delete resource provider " + "%(uuid)s: (%(code)i %(text)s)"), + {"uuid": rp_uuid, + "code": resp.status_code, + "text": resp.text}) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 4f3b5787e25a..b7f016ce6d31 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -210,12 +210,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): self.assertTrue(log_mock.info.called) self.assertIsNone(self.compute._resource_tracker) + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'delete_resource_provider') @mock.patch.object(manager.ComputeManager, 'update_available_resource_for_node') @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db') def test_update_available_resource(self, get_db_nodes, get_avail_nodes, - update_mock): + update_mock, del_rp_mock): db_nodes = [self._make_compute_node('node%s' % i, i) for i in range(1, 5)] avail_nodes = set(['node2', 'node3', 'node4', 'node5']) @@ -233,6 +235,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): for db_node in db_nodes: if db_node.hypervisor_hostname == 'node1': db_node.destroy.assert_called_once_with() + del_rp_mock.assert_called_once_with(self.context, db_node, + cascade=True) else: self.assertFalse(db_node.destroy.called) diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py index 8195c78c4794..15a79ee45049 100644 --- a/nova/tests/unit/scheduler/client/test_report.py +++ b/nova/tests/unit/scheduler/client/test_report.py @@ -1301,3 +1301,83 @@ class TestAllocations(SchedulerReportClientTestCase): mock_log.info.assert_not_called() # make sure warning wasn't called for the 404 mock_log.warning.assert_not_called() + + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." 
+ "delete") + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." + "_delete_allocation_for_instance") + @mock.patch("nova.objects.InstanceList.get_by_host_and_node") + def test_delete_resource_provider_cascade(self, mock_by_host, + mock_del_alloc, mock_delete): + cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", + hypervisor_hostname="fake_hostname", ) + inst1 = objects.Instance(uuid=uuids.inst1) + inst2 = objects.Instance(uuid=uuids.inst2) + mock_by_host.return_value = objects.InstanceList( + objects=[inst1, inst2]) + resp_mock = mock.Mock(status_code=204) + mock_delete.return_value = resp_mock + self.client.delete_resource_provider(self.context, cn, cascade=True) + self.assertEqual(2, mock_del_alloc.call_count) + exp_url = "/resource_providers/%s" % uuids.cn + mock_delete.assert_called_once_with(exp_url) + + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." + "delete") + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." + "_delete_allocation_for_instance") + @mock.patch("nova.objects.InstanceList.get_by_host_and_node") + def test_delete_resource_provider_no_cascade(self, mock_by_host, + mock_del_alloc, mock_delete): + cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", + hypervisor_hostname="fake_hostname", ) + inst1 = objects.Instance(uuid=uuids.inst1) + inst2 = objects.Instance(uuid=uuids.inst2) + mock_by_host.return_value = objects.InstanceList( + objects=[inst1, inst2]) + resp_mock = mock.Mock(status_code=204) + mock_delete.return_value = resp_mock + self.client.delete_resource_provider(self.context, cn) + mock_del_alloc.assert_not_called() + exp_url = "/resource_providers/%s" % uuids.cn + mock_delete.assert_called_once_with(exp_url) + + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." 
+ "delete") + @mock.patch('nova.scheduler.client.report.LOG') + def test_delete_resource_provider_log_calls(self, mock_log, mock_delete): + # First, check a successful call + cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", + hypervisor_hostname="fake_hostname", ) + resp_mock = mock.MagicMock(status_code=204) + try: + resp_mock.__nonzero__.return_value = True + except AttributeError: + # py3 uses __bool__ + resp_mock.__bool__.return_value = True + mock_delete.return_value = resp_mock + self.client.delete_resource_provider(self.context, cn) + # With a 204, only the info should be called + self.assertEqual(1, mock_log.info.call_count) + self.assertEqual(0, mock_log.warning.call_count) + + # Now check a 404 response + mock_log.reset_mock() + resp_mock.status_code = 404 + try: + resp_mock.__nonzero__.return_value = False + except AttributeError: + # py3 uses __bool__ + resp_mock.__bool__.return_value = False + self.client.delete_resource_provider(self.context, cn) + # With a 404, neither log message should be called + self.assertEqual(0, mock_log.info.call_count) + self.assertEqual(0, mock_log.warning.call_count) + + # Finally, check a 409 response + mock_log.reset_mock() + resp_mock.status_code = 409 + self.client.delete_resource_provider(self.context, cn) + # With a 409, only the warning should be called + self.assertEqual(0, mock_log.info.call_count) + self.assertEqual(1, mock_log.warning.call_count) diff --git a/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml b/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml new file mode 100644 index 000000000000..ad8591e3328b --- /dev/null +++ b/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml @@ -0,0 +1,9 @@ +--- +issues: + - | + Ironic nodes that were deleted from ironic's database during Newton + may result in orphaned resource providers causing incorrect scheduling + decisions, leading to a reschedule. If this happens, the orphaned + resource providers will need to be identified and removed. 
+ + See also https://bugs.launchpad.net/nova/+bug/1661258