Merge "Detect host renames and abort startup"

Zuul 2023-02-01 07:12:48 +00:00 committed by Gerrit Code Review
commit e9d716f555
3 changed files with 101 additions and 12 deletions

@@ -1536,6 +1536,22 @@ class ComputeManager(manager.Manager):
         nova.virt.node.write_local_node_uuid(db_nodes[0].uuid)
 
+    def _check_for_host_rename(self, nodes_by_uuid):
+        if 'ironic' in CONF.compute_driver.lower():
+            # Ironic (currently) rebalances nodes at various times, and as
+            # such, nodes being discovered as assigned to this host with a
+            # different hostname is not surprising. Skip this check for
+            # ironic.
+            return
+        for node in nodes_by_uuid.values():
+            if node.host != self.host:
+                raise exception.InvalidConfiguration(
+                    'My node %s has host %r but my host is %r; '
+                    'Possible rename detected, refusing to start!' % (
+                        node.uuid, node.host, self.host))
+            LOG.debug('Verified node %s matches my host %s',
+                      node.uuid, self.host)
+
     def init_host(self, service_ref):
         """Initialization for a standalone compute service."""
@@ -1574,15 +1590,6 @@ class ComputeManager(manager.Manager):
                 raise exception.InvalidConfiguration(msg)
 
         self.driver.init_host(host=self.host)
-        context = nova.context.get_admin_context()
-        instances = objects.InstanceList.get_by_host(
-            context, self.host,
-            expected_attrs=['info_cache', 'metadata', 'numa_topology'])
-
-        self.init_virt_events()
-
-        self._validate_pinning_configuration(instances)
-        self._validate_vtpm_configuration(instances)
 
         # NOTE(gibi): At this point the compute_nodes of the resource tracker
         # has not been populated yet so we cannot rely on the resource tracker
@@ -1593,8 +1600,23 @@ class ComputeManager(manager.Manager):
         # _destroy_evacuated_instances and
         # _error_out_instances_whose_build_was_interrupted out in the
         # background on startup
+        context = nova.context.get_admin_context()
         nodes_by_uuid = self._get_nodes(context)
+
+        # NOTE(danms): Check for a possible host rename and abort
+        # startup before we start mucking with instances we think are
+        # ours.
+        self._check_for_host_rename(nodes_by_uuid)
+
+        instances = objects.InstanceList.get_by_host(
+            context, self.host,
+            expected_attrs=['info_cache', 'metadata', 'numa_topology'])
+
+        self.init_virt_events()
+
+        self._validate_pinning_configuration(instances)
+        self._validate_vtpm_configuration(instances)
 
         try:
             # checking that instance was not already evacuated to other host
             evacuated_instances = self._destroy_evacuated_instances(

@@ -934,7 +934,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         inst_list = _make_instance_list(startup_instances)
         mock_host_get.return_value = inst_list
         our_node = objects.ComputeNode(
-            host='fake-host', uuid=uuids.our_node_uuid,
+            host=self.compute.host, uuid=uuids.our_node_uuid,
             hypervisor_hostname='fake-node')
         mock_get_nodes.return_value = {uuids.our_node_uuid: our_node}
@@ -983,7 +983,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         """
         mock_get_nodes.return_value = {
             uuids.cn_uuid1: objects.ComputeNode(
-                uuid=uuids.cn_uuid1, hypervisor_hostname='node1')}
+                uuid=uuids.cn_uuid1, hypervisor_hostname='node1',
+                host=self.compute.host)}
         self.compute.init_host(None)
 
         mock_error_interrupted.assert_called_once_with(
@@ -1148,7 +1149,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             uuids.evac_instance: evacuating_instance
         }
         our_node = objects.ComputeNode(
-            host='fake-host', uuid=uuids.our_node_uuid,
+            host=self.compute.host, uuid=uuids.our_node_uuid,
             hypervisor_hostname='fake-node')
         mock_get_nodes.return_value = {uuids.our_node_uuid: our_node}
@ -1227,6 +1228,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
@mock.patch.object(objects.InstanceList, 'get_by_host',
new=mock.Mock())
@mock.patch('nova.objects.ComputeNodeList.get_all_by_uuids',
new=mock.Mock(return_value=[mock.MagicMock()]))
@mock.patch('nova.compute.manager.ComputeManager.'
'_validate_pinning_configuration')
def test_init_host_pinning_configuration_validation_failure(self,
@@ -1244,6 +1247,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_validate_pinning_configuration',
                 new=mock.Mock())
+    @mock.patch('nova.objects.ComputeNodeList.get_all_by_uuids',
+                new=mock.Mock(return_value=[mock.MagicMock()]))
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_validate_vtpm_configuration')
     def test_init_host_vtpm_configuration_validation_failure(self,
@@ -6434,6 +6439,49 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
                 mock.sentinel.service_ref)
             m.assert_called_once_with(mock.sentinel.service_ref)
 
+    def test_check_for_host_rename_ironic(self):
+        self.flags(compute_driver='ironic')
+        # Passing None here makes sure we take the early exit because of our
+        # virt driver
+        nodes = {uuids.node1: mock.MagicMock(uuid=uuids.node1,
+                                             host='not-this-host')}
+        self.compute._check_for_host_rename(nodes)
+
+    def test_check_for_host_rename_renamed_only(self):
+        nodes = {uuids.node1: mock.MagicMock(uuid=uuids.node1,
+                                             host='not-this-host')}
+        self.assertRaises(exception.InvalidConfiguration,
+                          self.compute._check_for_host_rename, nodes)
+
+    def test_check_for_host_rename_renamed_one(self):
+        nodes = {uuids.node1: mock.MagicMock(uuid=uuids.node1,
+                                             host=self.compute.host),
+                 uuids.node2: mock.MagicMock(uuid=uuids.node2,
+                                             host='not-this-host')}
+        self.assertRaises(exception.InvalidConfiguration,
+                          self.compute._check_for_host_rename, nodes)
+
+    def test_check_for_host_rename_not_renamed(self):
+        nodes = {uuids.node1: mock.MagicMock(uuid=uuids.node1,
+                                             host=self.compute.host)}
+        with mock.patch.object(manager.LOG, 'debug') as mock_debug:
+            self.compute._check_for_host_rename(nodes)
+        mock_debug.assert_called_once_with(
+            'Verified node %s matches my host %s',
+            uuids.node1, self.compute.host)
+
+    @mock.patch('nova.compute.manager.ComputeManager._get_nodes')
+    def test_check_for_host_rename_called_by_init_host(self, mock_nodes):
+        # Since testing init_host() requires a billion mocks, this
+        # tests that we do call it when expected, but make it raise
+        # to avoid running the rest of init_host().
+        with mock.patch.object(self.compute,
+                               '_check_for_host_rename') as m:
+            m.side_effect = test.TestingException
+            self.assertRaises(test.TestingException,
+                              self.compute.init_host, None)
+            m.assert_called_once_with(mock_nodes.return_value)
+
 
 class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def setUp(self):

@@ -0,0 +1,19 @@
+---
+features:
+  - |
+    The compute manager now uses a local file to provide node uuid persistence
+    to guard against problems with renamed services, among other things.
+    Deployers wishing to ensure that *new* compute services get a predictable
+    uuid before initial startup may provision that file and nova will use it,
+    otherwise nova will generate and write one to a `compute_id` file in
+    `CONF.state_path` the first time it starts up. Accidental renames of a
+    compute node's hostname will be detected and the manager will exit to avoid
+    database corruption. Note that none of this applies to Ironic computes, as
+    they manage nodes and uuids differently.
+upgrade:
+  - |
+    Existing compute nodes will, upon upgrade, persist the uuid of the compute
+    node assigned to their hostname at first startup. Since this must match
+    what is currently in the database, it is important to let nova provision
+    this file from its database. Nova will only persist to a `compute_id` file
+    in the `CONF.state_path` directory, which should already be writable.
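
As a rough illustration of the provisioning step described in the feature note, a deployer could pre-seed the node uuid for a brand-new compute service before its first start. This is only a sketch, not nova code: it assumes `CONF.state_path` is the common default of `/var/lib/nova` and that the `compute_id` file holds nothing but the uuid text, so verify both against your deployment before relying on it.

    # Hypothetical provisioning helper, not part of nova.
    # Assumes CONF.state_path == /var/lib/nova and that the
    # compute_id file contains just the uuid as text.
    import os
    import uuid

    state_path = '/var/lib/nova'  # assumed value of CONF.state_path
    compute_id_file = os.path.join(state_path, 'compute_id')

    if os.path.exists(compute_id_file):
        # Never overwrite an existing identity; nova will use what is there.
        print('%s already exists; leaving it alone' % compute_id_file)
    else:
        node_uuid = uuid.uuid4()
        with open(compute_id_file, 'w') as f:
            f.write(str(node_uuid))
        print('Provisioned %s with %s' % (compute_id_file, node_uuid))

For an upgraded node the upgrade note applies instead: the uuid must match the compute node already recorded in the database, so letting nova write the file itself on its first post-upgrade startup is the safer path.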