Make compute node rebalance safer

Many bugs around nova-compute rebalancing are focused around
problems when the compute node and placement resources are
deleted, and sometimes they never get re-created.

To limit this class of bugs, we add a check to ensure a compute
node is only ever deleted when it is known to have been deleted
in Ironic.

There is a risk this might leave orphaned compute nodes and
resource providers that need manual clean up because users
do not want to delete the node in Ironic, but are removing it
from nova management. But on balance, it seems safer to leave
these cases up to the operator to resolve manually, and collect
feedback on how to better help those users.

blueprint ironic-shards

Change-Id: I2bc77cbb77c2dd5584368563dc4250d71913906b
This commit is contained in:
John Garbutt 2023-06-28 10:45:30 +01:00 committed by Jay Faulkner
parent 08d7be1726
commit 947bb5f641
6 changed files with 88 additions and 4 deletions

View File

@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
# if the node could be migrated, we don't delete
# the compute node database records
if not self.driver.is_node_deleted(cn.hypervisor_hostname):
LOG.warning(
"Found orphan compute node %(id)s "
"hypervisor host is %(hh)s, "
"nodes are %(nodes)s. "
"We are not deleting this as the driver "
"says this node has not been deleted.",
{'id': cn.id, 'hh': cn.hypervisor_hostname,
'nodes': nodenames})
continue
LOG.info("Deleting orphan compute node %(id)s "
"hypervisor host is %(hh)s, "
"nodes are %(nodes)s",

View File

@ -11,6 +11,7 @@
# under the License.
from oslo_log import log as logging
from unittest import mock
from nova import context
from nova.db.main import api as db_api
@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import fixtures as func_fixtures
from nova.tests.functional import integrated_helpers
from nova import utils
from nova.virt import fake as fake_driver
LOG = logging.getLogger(__name__)
@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
# for each node.
self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
def test_update_available_resource_node_recreate(self):
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
def test_update_available_resource_node_recreate(self, mock_delete):
# make the fake driver allow nodes to be delete, like ironic driver
mock_delete.return_value = True
# First we create a compute service to manage a couple of fake nodes.
compute = self.start_service('compute', 'node1')
# When start_service runs, it will create the node1 ComputeNode.

View File

@ -15,6 +15,7 @@ from unittest import mock
from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers
from nova.virt import fake as fake_driver
class NodeRebalanceDeletedComputeNodeRaceTestCase(
@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.nodename = 'fake-node'
self.ctxt = context.get_admin_context()
def test_node_rebalance_deleted_compute_node_race(self):
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
# make the fake driver allow nodes to be delete, like ironic driver
mock_delete.return_value = True
# Simulate a service running and then stopping. host_a runs, creates
# fake-node, then is stopped. The fake-node compute node is destroyed.
# This leaves a soft-deleted node in the DB.

View File

@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
{'node': mock.sentinel.node}
)
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
@mock.patch.object(manager, 'LOG')
@mock.patch.object(manager.ComputeManager,
'_update_available_resource_for_node')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
update_mock, mock_log):
update_mock, mock_log, mock_deleted):
mock_rt = self._mock_rt()
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
self.compute, 'reportclient')).mock
@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
get_db_nodes.return_value = db_nodes
get_avail_nodes.return_value = avail_nodes
mock_deleted.return_value = True
self.compute.update_available_resource(self.context, startup=True)
get_db_nodes.assert_called_once_with(self.context, avail_nodes,
use_slave=True, startup=True)
@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
update_mock.assert_not_called()
del_rp_mock.assert_not_called()
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
@mock.patch.object(manager.ComputeManager,
'_update_available_resource_for_node')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
def test_update_available_resource_destroy_rebalance(
self, get_db_nodes, get_avail_nodes, update_mock):
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
mock_rt = self._mock_rt()
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
self.compute, 'reportclient')).mock
@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
action='destroy', reason='host changed')
get_avail_nodes.return_value = set()
mock_deleted.return_value = True
self.compute.update_available_resource(self.context)
get_db_nodes.assert_called_once_with(self.context, set(),
use_slave=True, startup=False)
@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
mock_rt.remove_node.assert_called_once_with('node1')
rc_mock.invalidate_resource_provider.assert_called_once_with(
db_nodes[0].uuid)
mock_deleted.assert_called_once_with('node1')
@mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
@mock.patch.object(manager.ComputeManager,
'_update_available_resource_for_node')
@mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
@mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
def test_update_available_resource_no_destroy_rebalance(
self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
mock_rt = self._mock_rt()
rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
self.compute, 'reportclient')).mock
db_nodes = [self._make_compute_node('node1', 1)]
get_db_nodes.return_value = db_nodes
# Destroy can fail if nodes were rebalanced between getting the node
# list and calling destroy.
db_nodes[0].destroy.side_effect = exception.ObjectActionError(
action='destroy', reason='host changed')
get_avail_nodes.return_value = set()
mock_deleted.return_value = False
self.compute.update_available_resource(self.context)
get_db_nodes.assert_called_once_with(self.context, set(),
use_slave=True, startup=False)
self.assertEqual(0, update_mock.call_count)
db_nodes[0].destroy.assert_not_called()
self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
mock_rt.remove_node.assert_not_called()
rc_mock.invalidate_resource_provider.assert_not_called()
mock_deleted.assert_called_once_with('node1')
@mock.patch('nova.context.get_admin_context')
def test_pre_start_hook(self, get_admin_context):

View File

@ -1150,6 +1150,26 @@ class ComputeDriver(object):
"""
raise NotImplementedError()
def is_node_deleted(self, nodename):
"""Check this compute node has been deleted.
This method is called when the compute manager notices that a
node that was previously reported as available is no longer
available.
In this case, we need to know if the node has actually been
deleted, or if it is simply no longer managed by this
nova-compute service.
If True is returned, the database and placement records will
be removed.
:param nodename:
node which the caller wants to check if its deleted
:returns: True if the node is safe to delete
"""
# For most driver, compute nodes should never get deleted
# during a periodic task, they are only deleted via the API
return False
def pre_live_migration(self, context, instance, block_device_info,
network_info, disk_info, migrate_data):
"""Prepare an instance for live migration

View File

@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
except sdk_exc.ResourceNotFound:
return False
def is_node_deleted(self, nodename):
# check if the node is missing in Ironic
try:
self._get_node(nodename)
return False
except sdk_exc.ResourceNotFound:
return True
def _refresh_hash_ring(self, ctxt):
# When requesting a shard, we assume each compute service is
# targeting a separate shard, so hard code peer_list to