From 947bb5f641095e726c80c87d8430c6e99a58ddf0 Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Wed, 28 Jun 2023 10:45:30 +0100
Subject: [PATCH] Make compute node rebalance safer

Many of the bugs around nova-compute rebalancing stem from problems
that arise when the compute node and placement resources are deleted
and sometimes never get re-created.

To limit this class of bugs, we add a check to ensure a compute node
is only ever deleted when it is known to have been deleted in Ironic.

There is a risk this might leave orphaned compute nodes and resource
providers that need manual clean-up, because users do not want to
delete the node in Ironic but are removing it from nova management.
On balance, it seems safer to leave these cases to the operator to
resolve manually, and to collect feedback on how to better help those
users.

blueprint ironic-shards

Change-Id: I2bc77cbb77c2dd5584368563dc4250d71913906b
---
 nova/compute/manager.py                      | 13 +++++++
 .../regressions/test_bug_1839560.py          |  7 +++-
 .../regressions/test_bug_1853009.py          |  6 ++-
 nova/tests/unit/compute/test_compute_mgr.py  | 38 ++++++++++++++++++-
 nova/virt/driver.py                          | 20 ++++++++++
 nova/virt/ironic/driver.py                   |  8 ++++
 6 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 984caeefea4d..87c8b6ac0a91 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
+
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",
diff --git a/nova/tests/functional/regressions/test_bug_1839560.py b/nova/tests/functional/regressions/test_bug_1839560.py
index 54ae3af191f1..005f6537f3b9 100644
--- a/nova/tests/functional/regressions/test_bug_1839560.py
+++ b/nova/tests/functional/regressions/test_bug_1839560.py
@@ -11,6 +11,7 @@
 #    under the License.
 
 from oslo_log import log as logging
+from unittest import mock
 
 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver
 
 LOG = logging.getLogger(__name__)
 
@@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
 
-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver report nodes as deleted, like the ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
index 5266e6166b7e..c3a220ae5872 100644
--- a/nova/tests/functional/regressions/test_bug_1853009.py
+++ b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -15,6 +15,7 @@ from unittest import mock
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver
 
 
 class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()
 
-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver report nodes as deleted, like the ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 332e92bbf72a..249d1a5302d5 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             {'node': mock.sentinel.node}
         )
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
 
         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')
 
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 3c95e17ce6bb..04b512934f31 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -1150,6 +1150,26 @@ class ComputeDriver(object):
         """
         raise NotImplementedError()
 
+    def is_node_deleted(self, nodename):
+        """Check whether this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            the node the caller wants to check for deletion
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task; they are only deleted via the API
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 446b0db60756..ce9e0e29063d 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
         except sdk_exc.ResourceNotFound:
             return False
 
+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         # When requesting a shard, we assume each compute service is
         # targeting a separate shard, so hard code peer_list to
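
The change is easiest to review as a single decision point in the periodic task: a
compute node record the driver no longer reports is only destroyed when the driver
also confirms the node is gone. Below is a minimal standalone sketch of that flow,
assuming hypothetical names (ExampleDriver, cleanup_orphan_nodes, and plain in-memory
sets); the real ComputeManager operates on ComputeNode objects, the database and
placement rather than these collections.

# Standalone illustration only: ExampleDriver and cleanup_orphan_nodes are
# hypothetical names that mirror the shape of the patched code above.

class ExampleDriver:
    """Ironic-like driver: a node is deleted once the backend forgets it."""

    def __init__(self, nodes_in_backend):
        # Node names the backend (e.g. Ironic) still knows about.
        self.nodes_in_backend = set(nodes_in_backend)

    def is_node_deleted(self, nodename):
        # Mirrors IronicDriver.is_node_deleted: only report a node as
        # deleted when the backend no longer has a record of it.
        return nodename not in self.nodes_in_backend


def cleanup_orphan_nodes(driver, db_nodes, reported_nodes):
    """Return the node names whose DB/placement records may be removed.

    db_nodes: node names that have compute node records in the database.
    reported_nodes: node names the driver currently reports as available.
    """
    deletable = []
    for nodename in sorted(db_nodes):
        if nodename in reported_nodes:
            # Still managed by this service; nothing to clean up.
            continue
        if not driver.is_node_deleted(nodename):
            # Probably rebalanced to another nova-compute: keep the records
            # so the new owner can adopt them (the manager logs a warning
            # here instead of destroying the node).
            continue
        deletable.append(nodename)
    return deletable


if __name__ == "__main__":
    driver = ExampleDriver(nodes_in_backend={"node1", "node2"})
    # node2 was rebalanced to another host; node3 was deleted in the backend.
    orphans = cleanup_orphan_nodes(
        driver,
        db_nodes={"node1", "node2", "node3"},
        reported_nodes={"node1"})
    assert orphans == ["node3"]

With the ironic driver, is_node_deleted() only returns True once the node lookup
raises ResourceNotFound, so a node that was merely rebalanced to another nova-compute
keeps its compute node and resource provider records instead of being destroyed and
re-created.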