Make compute node rebalance safer
Many of the bugs around nova-compute rebalancing stem from the compute node and placement resources being deleted and, in some cases, never re-created. To limit this class of bugs, add a check so that a compute node is only ever deleted when the driver confirms it has been deleted in Ironic.

There is a risk this leaves orphaned compute nodes and resource providers that need manual clean up, for example when users remove a node from nova management without deleting it in Ironic. On balance it seems safer to leave those cases to the operator to resolve manually, and to collect feedback on how to better help those users.

blueprint ironic-shards

Change-Id: I2bc77cbb77c2dd5584368563dc4250d71913906b
parent 08d7be1726
commit 947bb5f641
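For orientation before the diff: the change introduces a new ComputeDriver.is_node_deleted() hook that the periodic update_available_resource task consults before removing a compute node record the driver no longer reports. The snippet below is an illustrative, self-contained sketch of that decision; FakeNode, FakeDriver and handle_orphans are made-up stand-ins, not nova code, and the real implementation follows in the diff.

# Illustrative sketch only -- all names here are stand-ins for the real
# objects changed in the diff below.

class FakeNode:
    def __init__(self, hypervisor_hostname):
        self.hypervisor_hostname = hypervisor_hostname


class FakeDriver:
    """Stands in for a virt driver; only the Ironic driver ever answers True."""

    def __init__(self, deleted_in_ironic):
        self._deleted = set(deleted_in_ironic)

    def is_node_deleted(self, nodename):
        return nodename in self._deleted


def handle_orphans(driver, db_nodes, reported_nodenames):
    """Condensed version of the orphan handling in update_available_resource."""
    for cn in db_nodes:
        if cn.hypervisor_hostname in reported_nodenames:
            continue  # still reported by the driver, nothing to do
        if not driver.is_node_deleted(cn.hypervisor_hostname):
            # Possibly rebalanced, or only removed from nova management:
            # keep the DB and placement records, leave clean up to the
            # operator.
            print(f"keep {cn.hypervisor_hostname}")
            continue
        # The driver confirms the node is gone, so it is safe to delete the
        # compute node record and its resource provider.
        print(f"delete {cn.hypervisor_hostname}")


handle_orphans(FakeDriver(deleted_in_ironic={'node-b'}),
               [FakeNode('node-a'), FakeNode('node-b')],
               reported_nodenames=set())
# -> keep node-a
# -> delete node-b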
@@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
+
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",
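The hunk above is truncated just before the pre-existing deletion path. The sketch below reconstructs that path in simplified form, inferred from the unit-test assertions later in this diff (destroy, remove_node, delete_resource_provider, and invalidate_resource_provider when a rebalance race makes destroy fail); the collaborators are mocked so the snippet runs on its own, and the exact argument lists in nova may differ.

# Simplified, self-contained sketch (not the real method body): what happens
# once the driver has confirmed the node is deleted.
from unittest import mock


class ObjectActionError(Exception):
    """Stand-in for nova.exception.ObjectActionError."""


def delete_confirmed_orphan(context, cn, rt, reportclient):
    try:
        cn.destroy()
    except ObjectActionError:
        # Another service took the node over in a rebalance between listing
        # and destroying it: drop cached provider data but keep the record.
        rt.remove_node(cn.hypervisor_hostname)
        reportclient.invalidate_resource_provider(cn.uuid)
    else:
        rt.remove_node(cn.hypervisor_hostname)
        reportclient.delete_resource_provider(context, cn)


cn = mock.Mock(hypervisor_hostname='node1', uuid='uuid-1')
rc = mock.Mock()
delete_confirmed_orphan(mock.sentinel.ctxt, cn, mock.Mock(), rc)
rc.delete_resource_provider.assert_called_once()

cn.destroy.side_effect = ObjectActionError()  # simulate the rebalance race
rc_race = mock.Mock()
delete_confirmed_orphan(mock.sentinel.ctxt, cn, mock.Mock(), rc_race)
rc_race.invalidate_resource_provider.assert_called_once_with('uuid-1')
rc_race.delete_resource_provider.assert_not_called()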
@@ -11,6 +11,7 @@
 # under the License.
 
 from oslo_log import log as logging
+from unittest import mock
 
 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver
 
 LOG = logging.getLogger(__name__)
 
@@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
 
-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.
@@ -15,6 +15,7 @@ from unittest import mock
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver
 
 
 class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()
 
-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.
@@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             {'node': mock.sentinel.node}
         )
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
 
         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')
 
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
@@ -1150,6 +1150,26 @@ class ComputeDriver(object):
         """
         raise NotImplementedError()
 
+    def is_node_deleted(self, nodename):
+        """Check if this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            node which the caller wants to check if it is deleted
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task; they are only deleted via the API.
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration
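The default implementation above returns False, so only drivers whose nodes can legitimately disappear out-of-band need to override it; everything else inherits today's behaviour. As a hypothetical illustration of the contract described in the docstring (this driver, its inventory client and its exception are invented for the example; only the Ironic override below and the FakeDriver mocks in the tests exist in this change):

# Hypothetical driver override, for illustration only -- not part of this
# change. ExampleInventoryDriver, its client and NodeGone are invented names.

class NodeGone(Exception):
    """Invented exception raised by the example inventory client."""


class ExampleInventoryDriver(object):
    """Driver whose nodes are owned by an external inventory service.

    A real driver would extend nova.virt.driver.ComputeDriver; kept
    standalone here so the example is self-contained.
    """

    def __init__(self, inventory_client):
        self._inventory = inventory_client

    def is_node_deleted(self, nodename):
        # Answer True only when the external source of truth no longer
        # knows the node; treat anything else as "not deleted" so records
        # are never removed by mistake.
        try:
            self._inventory.get_node(nodename)
            return False
        except NodeGone:
            return True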
@@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
         except sdk_exc.ResourceNotFound:
             return False
 
+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         # When requesting a shard, we assume each compute service is
         # targeting a separate shard, so hard code peer_list to