Merge "Clean up allocations left by evacuation when deleting service" into stable/wallaby
commit 05848fbbb2
@@ -36,6 +36,7 @@ from nova import context as nova_context
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils
 
 
@@ -2218,6 +2219,22 @@ class SchedulerReportClient(object):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}
 
+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
 
@@ -2236,16 +2253,19 @@ class SchedulerReportClient(object):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(context, instance_uuid)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
@@ -1573,71 +1573,6 @@ class TestNovaManagePlacementAudit(
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)
 
-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should have in theory a single
-        # allocation against the destination resource provider, but evacuated
-        # instances are not having their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-
 
 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""
@@ -120,8 +120,7 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')
@@ -152,23 +151,15 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]
 
     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in
@@ -3184,15 +3184,41 @@ class TestAssociations(SchedulerReportClientTestCase):
 
 class TestAllocations(SchedulerReportClientTestCase):
 
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]
@@ -3208,6 +3234,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
@@ -3217,11 +3244,13 @@ class TestAllocations(SchedulerReportClientTestCase):
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
@@ -3236,6 +3265,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml (new file)
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
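
Review note: the following is a condensed sketch, not code from the patch, of what the cascade branch of SchedulerReportClient.delete_resource_provider() does once this change is applied. The function name delete_resource_provider_sketch and the client/context/compute_node parameters are illustrative placeholders; error handling and the subsequent top-down provider-tree deletion are omitted.

from nova import objects

def delete_resource_provider_sketch(client, context, compute_node):
    # 1. Pre-existing behaviour: drop allocations held by instances still
    #    recorded on this host.
    instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
        context, compute_node.host, compute_node.hypervisor_hostname)
    for instance_uuid in instance_uuids:
        client.delete_allocation_for_instance(context, instance_uuid)
    # 2. New with this change: also drop the source-node allocations left
    #    behind by instances that were successfully evacuated off this host,
    #    so the resource provider can actually be deleted afterwards.
    client._remove_allocations_for_evacuated_instances(context, compute_node)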