From abbac59e33e1268bc92c5a4a070d9a2721498dc5 Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Mon, 2 Oct 2023 10:48:28 -0700 Subject: [PATCH] Sanity check that new hosts have no instances If we are starting up and we think we are a new host (i.e. no pre- existing service record was found for us), we expect to have no instances on our hypervisor. If that is not the case, it is likely that we got pointed at a new fresh database or the wrong database (i.e. a different cell from our own). In that case, we should abort startup to avoid creating new service, compute node, and resource provider records. This is a sort of follow-on additional improvement related to work done in blueprint stable-compute-uuid. Change-Id: Id817c51c90485119270f3b7f3c52858f42100637 --- nova/compute/manager.py | 16 ++++++++++++++++ nova/tests/functional/wsgi/test_services.py | 19 ++++++++++++------- nova/tests/unit/compute/test_compute_mgr.py | 17 ++++++++++++++++- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index cdc93af0b17c..59fa18ee53fb 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1570,6 +1570,18 @@ class ComputeManager(manager.Manager): LOG.debug('Verified node %s matches my host %s', node.uuid, self.host) + def _sanity_check_new_host(self): + instances_on_hv = self.driver.list_instance_uuids() + if len(instances_on_hv) > 0: + # This means we have instances on our hypervisor, but we think + # we are a new host (i.e. we created a new service record). That + # likely means we're pointed at an empty database or the wrong + # cell. + raise exception.InvalidConfiguration( + 'My hypervisor has existing instances, but I appear to be ' 'a new service in this database. 
Possible database ' + 'configuration error, refusing to start!') + def init_host(self, service_ref): """Initialization for a standalone compute service.""" @@ -1578,6 +1590,10 @@ class ComputeManager(manager.Manager): # to record a locally-persistent node identity because # we have upgraded from a previous version. self._ensure_existing_node_identity(service_ref) + else: + # If we are a new service (in the database), make sure we have no + # instances on our hypervisor as we would expect. + self._sanity_check_new_host() if CONF.pci.device_spec: # Simply loading the PCI passthrough spec will do a bunch of diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index 8da8ea96aed1..4ec08905d935 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -120,7 +120,8 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): """Tests a scenario where a server is created on a host, the host goes down, the server is evacuated to another host, and then the source host compute service is deleted. After that the deleted - compute service is restarted and starts successfully. + compute service is restarted and refuses to run because it finds its + service record deleted even though it has instances. """ # Create our source host that we will evacuate *from* later. host1 = self._start_compute('host1') @@ -154,12 +155,16 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): # Then the resource provider is also deleted. resp = self.placement.get('/resource_providers/%s' % rp_uuid) self.assertEqual(404, resp.status) - # Try to restart the host1 compute service to create a new service - # and a new resource provider. - self.restart_compute_service(host1) - # Make sure the compute service record for host1 is recreated. 
- service = self.admin_api.get_services( - binary='nova-compute', host='host1')[0] + # Try to restart the host1 compute service and make sure it recognizes + # that its service record has been deleted even though it still has + # instances running. + self.assertRaises(exception.InvalidConfiguration, + self.restart_compute_service, host1) + # Make sure the compute service record for host1 is not recreated + # since we aborted startup. + services = self.admin_api.get_services( + binary='nova-compute', host='host1') + self.assertEqual([], services) def test_migrate_confirm_after_deleted_source_compute(self): """Tests a scenario where a server is cold migrated and while in diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 335f5d5afcff..0ce2ac6840b0 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -916,6 +916,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, return instance_obj._make_instance_list( self.context, objects.InstanceList(), db_list, None) + @mock.patch.object(manager.ComputeManager, + '_sanity_check_new_host') @mock.patch.object(manager.ComputeManager, '_ensure_existing_node_identity') @mock.patch.object(manager.ComputeManager, '_get_nodes') @@ -937,7 +939,7 @@ mock_destroy, mock_admin_ctxt, mock_host_get, mock_init_host, mock_error_interrupted, mock_get_nodes, - mock_existing_node): + mock_existing_node, mock_check_new): mock_admin_ctxt.return_value = self.context inst_list = _make_instance_list(startup_instances) mock_host_get.return_value = inst_list @@ -948,6 +950,7 @@ self.compute.init_host(None) + mock_check_new.assert_called_once_with() mock_existing_node.assert_not_called() mock_validate_pinning.assert_called_once_with(inst_list) mock_validate_vtpm.assert_called_once_with(inst_list) @@ -1001,6 +1004,18 @@ class 
ComputeManagerUnitTestCase(test.NoDBTestCase, mock_get_nodes.assert_called_once_with( test.MatchType(nova.context.RequestContext)) + def test_init_host_new_with_instances(self): + """Tests the case where we start up without an existing service_ref, + indicating that we are a new service, but our hypervisor reports + existing instances. This indicates we were moved to another cell, + our database got wiped, etc. + """ + with mock.patch.object(self.compute.driver, + 'list_instance_uuids') as mock_insts: + mock_insts.return_value = ['foo'] + self.assertRaises(exception.InvalidConfiguration, + self.compute.init_host, None) + @mock.patch('nova.objects.InstanceList') @mock.patch('nova.objects.MigrationList.get_by_filters') @mock.patch('nova.objects.ComputeNodeList.get_all_by_uuids')