diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index ded735e9b0a3..3ca534bd2752 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -6583,6 +6583,11 @@ class ComputeManager(manager.Manager):
                      {'id': cn.id, 'hh': cn.hypervisor_hostname,
                       'nodes': nodenames})
                 cn.destroy()
+                # Delete the corresponding resource provider in placement,
+                # along with any associated allocations and inventory.
+                # TODO(cdent): Move use of reportclient into resource tracker.
+                self.scheduler_client.reportclient.delete_resource_provider(
+                    context, cn, cascade=True)

     def _get_compute_nodes_in_db(self, context, use_slave=False):
         try:
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index e3f11e902551..320cf4d5c847 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -749,3 +749,40 @@ class SchedulerReportClient(object):
             LOG.warning(_LW('Deleting stale allocation for instance %s'),
                         uuid)
             self._delete_allocation_for_instance(uuid)
+
+    @safe_connect
+    def delete_resource_provider(self, context, compute_node, cascade=False):
+        """Deletes the ResourceProvider record for the compute_node.
+
+        :param context: The security context
+        :param compute_node: The nova.objects.ComputeNode object that is the
+                             resource provider being deleted.
+        :param cascade: Boolean value that, when True, will first delete any
+                        associated Allocation and Inventory records for the
+                        compute node
+        """
+        nodename = compute_node.hypervisor_hostname
+        host = compute_node.host
+        rp_uuid = compute_node.uuid
+        if cascade:
+            # Delete any allocations for this resource provider.
+            # Since allocations are by consumer, we get the consumers on this
+            # host, which are its instances.
+            instances = objects.InstanceList.get_by_host_and_node(context,
+                    host, nodename)
+            for instance in instances:
+                self._delete_allocation_for_instance(instance.uuid)
+        url = "/resource_providers/%s" % rp_uuid
+        resp = self.delete(url)
+        if resp:
+            LOG.info(_LI("Deleted resource provider %s"), rp_uuid)
+        else:
+            # Check for 404 since we don't need to log a warning if we tried to
+            # delete something which doesn't actually exist.
+            if resp.status_code != 404:
+                LOG.warning(
+                    _LW("Unable to delete resource provider "
+                        "%(uuid)s: (%(code)i %(text)s)"),
+                    {"uuid": rp_uuid,
+                     "code": resp.status_code,
+                     "text": resp.text})
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 4f3b5787e25a..b7f016ce6d31 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -210,12 +210,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.assertTrue(log_mock.info.called)
         self.assertIsNone(self.compute._resource_tracker)

+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'delete_resource_provider')
     @mock.patch.object(manager.ComputeManager,
                        'update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock):
+                                       update_mock, del_rp_mock):
         db_nodes = [self._make_compute_node('node%s' % i, i)
                     for i in range(1, 5)]
         avail_nodes = set(['node2', 'node3', 'node4', 'node5'])
@@ -233,6 +235,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         for db_node in db_nodes:
             if db_node.hypervisor_hostname == 'node1':
                 db_node.destroy.assert_called_once_with()
+                del_rp_mock.assert_called_once_with(self.context, db_node,
+                                                    cascade=True)
             else:
                 self.assertFalse(db_node.destroy.called)

diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 8195c78c4794..15a79ee45049 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -1301,3 +1301,83 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_log.info.assert_not_called()
         # make sure warning wasn't called for the 404
         mock_log.warning.assert_not_called()
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_delete_allocation_for_instance")
+    @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
+    def test_delete_resource_provider_cascade(self, mock_by_host,
+                                              mock_del_alloc, mock_delete):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        inst1 = objects.Instance(uuid=uuids.inst1)
+        inst2 = objects.Instance(uuid=uuids.inst2)
+        mock_by_host.return_value = objects.InstanceList(
+            objects=[inst1, inst2])
+        resp_mock = mock.Mock(status_code=204)
+        mock_delete.return_value = resp_mock
+        self.client.delete_resource_provider(self.context, cn, cascade=True)
+        self.assertEqual(2, mock_del_alloc.call_count)
+        exp_url = "/resource_providers/%s" % uuids.cn
+        mock_delete.assert_called_once_with(exp_url)
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_delete_allocation_for_instance")
+    @mock.patch("nova.objects.InstanceList.get_by_host_and_node")
+    def test_delete_resource_provider_no_cascade(self, mock_by_host,
+                                                 mock_del_alloc, mock_delete):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        inst1 = objects.Instance(uuid=uuids.inst1)
+        inst2 = objects.Instance(uuid=uuids.inst2)
+        mock_by_host.return_value = objects.InstanceList(
+            objects=[inst1, inst2])
+        resp_mock = mock.Mock(status_code=204)
+        mock_delete.return_value = resp_mock
+        self.client.delete_resource_provider(self.context, cn)
+        mock_del_alloc.assert_not_called()
+        exp_url = "/resource_providers/%s" % uuids.cn
+        mock_delete.assert_called_once_with(exp_url)
+
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+ "delete") + @mock.patch('nova.scheduler.client.report.LOG') + def test_delete_resource_provider_log_calls(self, mock_log, mock_delete): + # First, check a successful call + cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", + hypervisor_hostname="fake_hostname", ) + resp_mock = mock.MagicMock(status_code=204) + try: + resp_mock.__nonzero__.return_value = True + except AttributeError: + # py3 uses __bool__ + resp_mock.__bool__.return_value = True + mock_delete.return_value = resp_mock + self.client.delete_resource_provider(self.context, cn) + # With a 204, only the info should be called + self.assertEqual(1, mock_log.info.call_count) + self.assertEqual(0, mock_log.warning.call_count) + + # Now check a 404 response + mock_log.reset_mock() + resp_mock.status_code = 404 + try: + resp_mock.__nonzero__.return_value = False + except AttributeError: + # py3 uses __bool__ + resp_mock.__bool__.return_value = False + self.client.delete_resource_provider(self.context, cn) + # With a 404, neither log message should be called + self.assertEqual(0, mock_log.info.call_count) + self.assertEqual(0, mock_log.warning.call_count) + + # Finally, check a 409 response + mock_log.reset_mock() + resp_mock.status_code = 409 + self.client.delete_resource_provider(self.context, cn) + # With a 409, only the warning should be called + self.assertEqual(0, mock_log.info.call_count) + self.assertEqual(1, mock_log.warning.call_count) diff --git a/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml b/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml new file mode 100644 index 000000000000..ad8591e3328b --- /dev/null +++ b/releasenotes/notes/bug-1661258-ee202843157f6a27.yaml @@ -0,0 +1,9 @@ +--- +issues: + - | + Ironic nodes that were deleted from ironic's database during Newton + may result in orphaned resource providers causing incorrect scheduling + decisions, leading to a reschedule. If this happens, the orphaned + resource providers will need to be identified and removed. + + See also https://bugs.launchpad.net/nova/+bug/1661258