Merge "Make compute node rebalance safer"
commit d29a9b64ee
@@ -10644,6 +10644,19 @@ class ComputeManager(manager.Manager):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
+
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",
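For context, a minimal standalone sketch of the guard this hunk introduces
(illustrative code, not Nova's; FakeDriver and prune_orphans are made-up
names): an orphan DB record is only removed when the driver confirms the
backing node is really gone, not merely rebalanced to another host.

    class FakeDriver:
        """Toy driver with an Ironic-style is_node_deleted check."""
        def __init__(self, deleted_nodes):
            self._deleted = set(deleted_nodes)

        def is_node_deleted(self, nodename):
            # True only if the backing node no longer exists at all
            return nodename in self._deleted

    def prune_orphans(db_nodes, reported, driver):
        kept, removed = [], []
        for name in db_nodes:
            if name in reported:
                kept.append(name)
            elif driver.is_node_deleted(name):
                removed.append(name)  # really deleted: safe to purge
            else:
                kept.append(name)     # rebalanced: keep the DB record
        return kept, removed

    # node2 was rebalanced to another host, node3 was deleted for real
    assert prune_orphans(
        ['node1', 'node2', 'node3'], reported={'node1'},
        driver=FakeDriver({'node3'})) == (['node1', 'node2'], ['node3'])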
@@ -11,6 +11,7 @@
 # under the License.

 from oslo_log import log as logging
+from unittest import mock

 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@ from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver

 LOG = logging.getLogger(__name__)

@@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')

-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.
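A standalone illustration of the mock.patch.object decorator pattern the
tests above and below rely on (Driver and check are made-up names): the
patch injects a MagicMock as an extra positional argument, and the test
controls its return value.

    from unittest import mock

    class Driver:
        def is_node_deleted(self, nodename):
            return False

    @mock.patch.object(Driver, 'is_node_deleted')
    def check(mock_delete):
        # the patched method now returns whatever the test sets
        mock_delete.return_value = True
        assert Driver().is_node_deleted('fake-node') is True

    check()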
@@ -15,6 +15,7 @@ from unittest import mock
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver


 class NodeRebalanceDeletedComputeNodeRaceTestCase(
@@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()

-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.
@@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             {'node': mock.sentinel.node}
         )

+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,

         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()

+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')

     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
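Aside: a minimal sketch of the destroy-time race the two rebalance tests
above pin down (illustrative helper, not Nova code; try_destroy is a
made-up name). Even when the driver says the node is gone, destroy() can
still lose a race with a rebalance, and that failure must not be fatal.

    from nova import exception

    def try_destroy(cn):
        try:
            cn.destroy()
            return True
        except exception.ObjectActionError:
            # 'host changed': another service claimed the node in the
            # meantime, so leave the record for its new owner.
            return False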
@@ -1156,6 +1156,26 @@ class ComputeDriver(object):
         """
         raise NotImplementedError()

+    def is_node_deleted(self, nodename):
+        """Check if this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            node which the caller wants to check if it is deleted
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task; they are only deleted via the API
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration
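The IronicDriver change below is the one in-tree override of this hook.
An out-of-tree driver with similar rebalance semantics might do the same
(hedged sketch; MyClusteredDriver and _cluster_has_node are hypothetical):

    from nova.virt import driver as virt_driver

    class MyClusteredDriver(virt_driver.ComputeDriver):
        def is_node_deleted(self, nodename):
            # Only report True when the backing node is gone from the
            # cluster inventory, so the manager may purge its records.
            # _cluster_has_node is an assumed, driver-specific helper.
            return not self._cluster_has_node(nodename)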
@@ -697,6 +697,14 @@ class IronicDriver(virt_driver.ComputeDriver):
         except sdk_exc.ResourceNotFound:
             return False

+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         # When requesting a shard, we assume each compute service is
         # targeting a separate shard, so hard code peer_list to