diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 0caa276e2825..4b83a5788356 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -36,6 +36,7 @@ from nova import context as nova_context
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils
 
@@ -2248,6 +2249,22 @@ class SchedulerReportClient(object):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}
 
+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
 
@@ -2266,17 +2283,20 @@
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(
                     context, instance_uuid, force=True)
+
+            # When an instance is evacuated, its allocation remains on
+            # the source compute node until the node recovers. If the
+            # broken compute never recovers and is decommissioned
+            # instead, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py
index f04a1d6d9939..39681183b070 100644
--- a/nova/tests/functional/test_nova_manage.py
+++ b/nova/tests/functional/test_nova_manage.py
@@ -1849,71 +1849,6 @@ class TestNovaManagePlacementAudit(
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)
 
-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
- """ - - source_hostname = self.compute1.host - dest_hostname = self.compute2.host - - source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) - dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) - - server = self._boot_and_check_allocations(self.flavor, source_hostname) - - # Stop the service and fake it down - self.compute1.stop() - source_service_id = self.admin_api.get_services( - host=source_hostname, binary='nova-compute')[0]['id'] - self.admin_api.put_service(source_service_id, {'forced_down': 'true'}) - - # evacuate the instance to the target - self._evacuate_server( - server, {'host': dest_hostname}, expected_host=dest_hostname) - - # Now the instance is gone, we can delete the compute service - self.admin_api.api_delete('/os-services/%s' % source_service_id) - - # Since the compute is deleted, we should have in theory a single - # allocation against the destination resource provider, but evacuated - # instances are not having their allocations deleted. See bug #1829479. - # We have two allocations for the same consumer, source and destination - self._check_allocation_during_evacuate( - self.flavor, server['id'], source_rp_uuid, dest_rp_uuid) - - # Now, run the audit command that will find this orphaned allocation - ret = self.cli.audit(verbose=True) - output = self.output.getvalue() - self.assertIn( - 'Allocations for consumer UUID %(consumer_uuid)s on ' - 'Resource Provider %(rp_uuid)s can be deleted' % - {'consumer_uuid': server['id'], - 'rp_uuid': source_rp_uuid}, - output) - self.assertIn('Processed 1 allocation.', output) - self.assertEqual(3, ret) - - # Now we want to delete the orphaned allocation that is duplicate - ret = self.cli.audit(delete=True, verbose=True) - - # We finally should only have the target allocations - self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor) - self.assertRequestMatchesUsage({'VCPU': 0, - 'MEMORY_MB': 0, - 'DISK_GB': 0}, source_rp_uuid) - - output = self.output.getvalue() - self.assertIn( - 'Deleted allocations for consumer UUID %(consumer_uuid)s on ' - 'Resource Provider %(rp_uuid)s' % - {'consumer_uuid': server['id'], - 'rp_uuid': source_rp_uuid}, - output) - self.assertIn('Processed 1 allocation.', output) - self.assertEqual(4, ret) - class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase): """Functional tests for the "nova-manage db archive_deleted_rows" CLI.""" diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index e5132471306e..8da8ea96aed1 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -120,8 +120,7 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): """Tests a scenario where a server is created on a host, the host goes down, the server is evacuated to another host, and then the source host compute service is deleted. After that the deleted - compute service is restarted. Related placement resources are checked - throughout. + compute service is restarted and starts successfully. """ # Create our source host that we will evacuate *from* later. 
         host1 = self._start_compute('host1')
@@ -152,23 +151,15 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        self.assertTrue(self.admin_api.get_services(
+            binary='nova-compute', host='host1'))
 
     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index f8e9812b77d9..c13242b984de 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -3184,15 +3184,41 @@ class TestAssociations(SchedulerReportClientTestCase):
 
 
 class TestAllocations(SchedulerReportClientTestCase):
 
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete_allocation_for_instance") @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node") def test_delete_resource_provider_cascade(self, mock_by_host, - mock_del_alloc, mock_delete, mock_get_rpt): + mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt): cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", hypervisor_hostname="fake_hostname", ) mock_by_host.return_value = [uuids.inst1, uuids.inst2] @@ -3208,6 +3234,7 @@ class TestAllocations(SchedulerReportClientTestCase): mock_by_host.assert_called_once_with( self.context, cn.host, cn.hypervisor_hostname) self.assertEqual(2, mock_del_alloc.call_count) + mock_evacuate.assert_called_once_with(self.context, cn) exp_url = "/resource_providers/%s" % uuids.cn mock_delete.assert_called_once_with( exp_url, global_request_id=self.context.global_id) @@ -3217,11 +3244,13 @@ class TestAllocations(SchedulerReportClientTestCase): 'get_providers_in_tree') @mock.patch("nova.scheduler.client.report.SchedulerReportClient." "delete") + @mock.patch("nova.scheduler.client.report.SchedulerReportClient." + "_remove_allocations_for_evacuated_instances") @mock.patch("nova.scheduler.client.report.SchedulerReportClient." "delete_allocation_for_instance") @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node") def test_delete_resource_provider_no_cascade(self, mock_by_host, - mock_del_alloc, mock_delete, mock_get_rpt): + mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt): self.client._association_refresh_time[uuids.cn] = mock.Mock() cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host", hypervisor_hostname="fake_hostname", ) @@ -3236,6 +3265,7 @@ class TestAllocations(SchedulerReportClientTestCase): mock_delete.return_value = resp_mock self.client.delete_resource_provider(self.context, cn) mock_del_alloc.assert_not_called() + mock_evacuate.assert_not_called() exp_url = "/resource_providers/%s" % uuids.cn mock_delete.assert_called_once_with( exp_url, global_request_id=self.context.global_id) diff --git a/releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml b/releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml new file mode 100644 index 000000000000..5e7a4c51d25e --- /dev/null +++ b/releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + `Bug #1829479 `_: Now + deleting a nova-compute service removes allocations of successfully + evacuated instances. This allows the associated resource provider to be + deleted automatically even if the nova-compute service cannot recover after + all instances on the node have been successfully evacuated.