From 6f1a1f5e8ead9ceb145e13927cd9ae7fdca9add4 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Tue, 14 Aug 2018 10:37:03 +0200
Subject: [PATCH] Consumer gen support for delete instance allocations

The placement API version 1.28 introduced consumer generation as a way
to make updating allocations safe even when it is done from multiple
places.

This patch changes delete_allocation_for_instance to use PUT
/allocations instead of DELETE /allocations to benefit from the
consumer generation handling.

In this patch the report client GETs the current allocation of the
instance, including the consumer generation, and then tries to PUT an
empty allocation with that generation. If this fails due to a consumer
generation conflict, meaning something modified the allocation of the
instance between the GET and the PUT, the report client raises an
AllocationDeleteFailed exception. This causes the instance to go to
ERROR state.

This patch only detects a small portion of the possible cases in which
the allocation is modified outside of the delete code path. The rest
could only be detected if nova cached at least the consumer generation
of the instance.

To be able to put the instance into ERROR state, the instance.destroy()
call is moved to the end of the deletion call path. To keep the
instance.delete.end notification behavior consistent with this move
(e.g. the deleted_at field is filled), the notification sending had to
be moved as well.

Blueprint: use-nested-allocation-candidates
Change-Id: I77f34788dd7ab8fdf60d668a4f76452e03cf9888
---
 doc/source/user/placement.rst                |  2 +
 nova/api/openstack/compute/servers.py        |  3 +-
 nova/compute/manager.py                      | 25 +++---
 nova/exception.py                            |  5 ++
 nova/scheduler/client/report.py              | 64 +++++++++++---
 nova/tests/functional/test_servers.py        | 86 +++++++++++++++++++
 nova/tests/unit/compute/test_compute.py      |  6 ++
 nova/tests/unit/compute/test_compute_mgr.py  | 22 +++--
 .../unit/scheduler/client/test_report.py     | 15 ----
 9 files changed, 184 insertions(+), 44 deletions(-)

diff --git a/doc/source/user/placement.rst b/doc/source/user/placement.rst
index f22421230f90..c20833109dc8 100644
--- a/doc/source/user/placement.rst
+++ b/doc/source/user/placement.rst
@@ -58,12 +58,14 @@ changed or be partially complete at this time.
 * `filter allocation candidates by aggregate membership`_
 * `perform granular allocation candidate requests`_
 * `inventory and allocation data migration`_ (reshaping provider trees)
+* `handle allocation updates in a safe way`_
 
 .. _Nested Resource Providers: http://specs.openstack.org/openstack/nova-specs/specs/queens/approved/nested-resource-providers.html
 .. _Request Traits During Scheduling: https://specs.openstack.org/openstack/nova-specs/specs/queens/approved/request-traits-in-nova.html
 .. _filter allocation candidates by aggregate membership: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/alloc-candidates-member-of.html
 .. _perform granular allocation candidate requests: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/granular-resource-requests.html
 .. _inventory and allocation data migration: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/reshape-provider-tree.html
+.. _handle allocation updates in a safe way: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/add-consumer-generation.html
 
 Deployment
 ==========
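For readers unfamiliar with consumer generations, the GET-then-PUT flow
described in the commit message maps onto the placement REST API as
sketched below. This is a minimal illustration using plain requests
rather than nova's keystoneauth-based report client; the endpoint and
token values are placeholders and the helper is not part of this patch:

    import requests

    PLACEMENT = 'http://placement.example.com'  # placeholder endpoint
    HEADERS = {
        'x-auth-token': '<token>',                  # placeholder auth
        'OpenStack-API-Version': 'placement 1.28',  # consumer generations
    }

    def delete_consumer_allocations(consumer_uuid):
        url = '%s/allocations/%s' % (PLACEMENT, consumer_uuid)
        # 1) GET the current allocations; at microversion 1.28 the body
        #    also carries consumer_generation, project_id and user_id.
        body = requests.get(url, headers=HEADERS).json()
        if not body['allocations']:
            # placement returns an empty allocation set, not a 404, for
            # an unknown consumer
            return False
        # 2) PUT back the same body with the allocations emptied. The
        #    echoed consumer_generation makes the write conditional, and
        #    removing every resource auto-deletes the consumer.
        body['allocations'] = {}
        resp = requests.put(url, json=body, headers=HEADERS)
        if resp.status_code == 409:
            # another writer bumped the generation between GET and PUT
            raise RuntimeError('consumer generation conflict: %s'
                               % resp.text)
        resp.raise_for_status()  # 204 on success
        return True

Echoing the generation read in step 1 is what makes the PUT
conditional: placement rejects the write with 409 if any other writer
updated the allocations in between, which is exactly the window this
patch turns into AllocationDeleteFailed.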
diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py
index e46e9ed329d9..1c2a773ecf22 100644
--- a/nova/api/openstack/compute/servers.py
+++ b/nova/api/openstack/compute/servers.py
@@ -865,7 +865,8 @@ class ServersController(wsgi.Controller):
             raise exc.HTTPNotFound(explanation=msg)
         except exception.InstanceUnknownCell as e:
             raise exc.HTTPNotFound(explanation=e.format_message())
-        except exception.InstanceIsLocked as e:
+        except (exception.InstanceIsLocked,
+                exception.AllocationDeleteFailed) as e:
             raise exc.HTTPConflict(explanation=e.format_message())
         except exception.InstanceInvalidState as state_error:
             common.raise_http_conflict_for_instance_invalid_state(state_error,
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 4dfd6bd4a5c4..f6715dd7fbec 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -749,19 +749,18 @@ class ComputeManager(manager.Manager):
         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                 context, instance.uuid)
         self._complete_deletion(context,
-                                instance,
-                                bdms)
+                                instance)
+        self._notify_about_instance_usage(context, instance, "delete.end")
+        compute_utils.notify_about_instance_action(context, instance,
+                self.host, action=fields.NotificationAction.DELETE,
+                phase=fields.NotificationPhase.END, bdms=bdms)
 
-    def _complete_deletion(self, context, instance, bdms):
+    def _complete_deletion(self, context, instance):
         self._update_resource_tracker(context, instance)
 
         rt = self._get_resource_tracker()
         rt.reportclient.delete_allocation_for_instance(context,
                                                        instance.uuid)
 
-        self._notify_about_instance_usage(context, instance, "delete.end")
-        compute_utils.notify_about_instance_action(context, instance,
-                self.host, action=fields.NotificationAction.DELETE,
-                phase=fields.NotificationPhase.END, bdms=bdms)
         self._clean_instance_console_tokens(context, instance)
         self._delete_scheduler_instance_info(context, instance.uuid)
 
@@ -2624,11 +2623,17 @@ class ComputeManager(manager.Manager):
         instance.power_state = power_state.NOSTATE
         instance.terminated_at = timeutils.utcnow()
         instance.save()
+
+        self._complete_deletion(context, instance)
+        # only destroy the instance in the db if _complete_deletion
+        # doesn't raise, and therefore the allocation was successfully
+        # deleted in placement
        instance.destroy()
 
-        self._complete_deletion(context,
-                                instance,
-                                bdms)
+        self._notify_about_instance_usage(context, instance, "delete.end")
+        compute_utils.notify_about_instance_action(context, instance,
+                self.host, action=fields.NotificationAction.DELETE,
+                phase=fields.NotificationPhase.END, bdms=bdms)
 
     @wrap_exception()
     @reverts_task_state
diff --git a/nova/exception.py b/nova/exception.py
index 4e5a5f79486c..fb5f7785c4a5 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -2322,6 +2322,11 @@ class AllocationUpdateFailed(NovaException):
                 'Error: %(error)s')
 
 
+class AllocationDeleteFailed(NovaException):
+    msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. '
+                'Error: %(error)s')
+
+
 class TooManyComputesForHost(NovaException):
     msg_fmt = _('Unexpected number of compute node records '
                 '(%(num_computes)d) found for host %(host)s. There should '
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 380a1f721148..a6deb80fd839 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -2047,21 +2047,63 @@ class SchedulerReportClient(object):
 
     @safe_connect
     def delete_allocation_for_instance(self, context, uuid):
+        """Delete the instance allocation from placement.
+
+        :param context: The security context
+        :param uuid: the instance UUID which will be used as the consumer
+                     UUID towards placement
+        :return: Returns True if the allocation is successfully deleted by
+                 this call. Returns False if the allocation does not exist.
+        :raises AllocationDeleteFailed: if the allocation cannot be read
+            from placement, or if it was changed by another process while
+            we tried to delete it.
+        """
         url = '/allocations/%s' % uuid
-        r = self.delete(url, global_request_id=context.global_id)
-        if r:
+        # We read the consumer generation and then try to PUT an empty
+        # allocation for that consumer. If the consumer generation changes
+        # between the GET and the PUT we raise AllocationDeleteFailed.
+        # NOTE(gibi): This only detects a small portion of the possible
+        # cases in which the allocation is modified outside of the delete
+        # code path. The rest could only be detected if nova cached at
+        # least the consumer generation of the instance.
+        # NOTE(gibi): placement does not return 404 for a non-existing
+        # consumer but returns an empty allocation set instead. Putting an
+        # empty allocation to that non-existing consumer won't result in a
+        # 404 or any other error either.
+        r = self.get(url, global_request_id=context.global_id,
+                     version=CONSUMER_GENERATION_VERSION)
+        if not r:
+            # at the moment placement has no way to return a failure for
+            # this GET, so this branch is effectively dead code, kept for
+            # safety
+            LOG.warning('Unable to delete allocation for instance '
+                        '%(uuid)s: (%(code)i %(text)s)',
+                        {'uuid': uuid,
+                         'code': r.status_code,
+                         'text': r.text})
+            raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
+                                                   error=r.text)
+        allocations = r.json()
+        if allocations['allocations'] == {}:
+            # the consumer did not exist in the first place
+            LOG.debug('Cannot delete allocation for %s consumer in '
+                      'placement as the consumer does not exist', uuid)
+            return False
+
+        # removing all resources from the allocation will auto delete the
+        # consumer in placement
+        allocations['allocations'] = {}
+        r = self.put(url, allocations, global_request_id=context.global_id,
+                     version=CONSUMER_GENERATION_VERSION)
+        if r.status_code == 204:
             LOG.info('Deleted allocation for instance %s', uuid)
             return True
         else:
-            # Check for 404 since we don't need to log a warning if we tried to
-            # delete something which doesn't actually exist.
-            if r.status_code != 404:
-                LOG.warning('Unable to delete allocation for instance '
-                            '%(uuid)s: (%(code)i %(text)s)',
-                            {'uuid': uuid,
-                             'code': r.status_code,
-                             'text': r.text})
-            return False
+            LOG.warning('Unable to delete allocation for instance '
+                        '%(uuid)s: (%(code)i %(text)s)',
+                        {'uuid': uuid,
+                         'code': r.status_code,
+                         'text': r.text})
+            raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
+                                                   error=r.text)
 
     def get_allocations_for_resource_provider(self, context, rp_uuid):
         """Retrieves the allocations for a specific provider.
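With this change delete_allocation_for_instance has three distinct
outcomes: True when the empty-allocation PUT succeeds, False when
placement reports no allocations for the consumer, and
AllocationDeleteFailed on a consumer generation conflict (or an
unreadable allocation). A hypothetical caller, shown purely to
illustrate the new contract (the helper name and structure are
invented, not part of this patch):

    from nova import exception

    def _cleanup_allocation(context, reportclient, instance_uuid):
        # hypothetical helper; only the report client call below is
        # from this patch
        try:
            if reportclient.delete_allocation_for_instance(
                    context, instance_uuid):
                return  # the PUT returned 204 and the consumer is gone
            # False: placement had no allocations for this consumer, so
            # there was nothing to delete
        except exception.AllocationDeleteFailed:
            # the allocation changed between the GET and the PUT; the
            # compute manager lets this propagate so the instance ends
            # up in ERROR state and the delete can be retried
            raise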
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 9962cdec9dd6..3b57deb1515c 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -20,6 +20,7 @@ import zlib
 import mock
 from oslo_log import log as logging
 from oslo_serialization import base64
+from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids
 from oslo_utils import timeutils
 import six
@@ -40,6 +41,7 @@ from nova.tests.functional import integrated_helpers
 from nova.tests.unit.api.openstack import fakes
 from nova.tests.unit import fake_block_device
 from nova.tests.unit import fake_notifier
+from nova.tests.unit import fake_requests
 import nova.tests.unit.image.fake
 from nova.virt import fake
 from nova import volume
@@ -4529,3 +4531,87 @@ class ServerTestV256RescheduleTestCase(ServerTestV256Common):
         found_server = self.api.get_server(server['id'])
         # Check that rescheduling did not occur.
         self.assertNotEqual(other_host, found_server['OS-EXT-SRV-ATTR:host'])
+
+
+class ConsumerGenerationConflictTest(
+        integrated_helpers.ProviderUsageBaseTestCase):
+
+    # we need the medium fake driver to be able to allocate resources for
+    # more than a single instance
+    compute_driver = 'fake.MediumFakeDriver'
+
+    def setUp(self):
+        super(ConsumerGenerationConflictTest, self).setUp()
+        flavors = self.api.get_flavors()
+        self.flavor = flavors[0]
+        self.other_flavor = flavors[1]
+        self.compute1 = self._start_compute('compute1')
+        self.compute2 = self._start_compute('compute2')
+
+    def test_server_delete_fails_due_to_conflict(self):
+        source_hostname = self.compute1.host
+
+        server = self._boot_and_check_allocations(self.flavor,
+                                                  source_hostname)
+
+        rsp = fake_requests.FakeResponse(
+            409, jsonutils.dumps({'text': 'consumer generation conflict'}))
+
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.return_value = rsp
+
+            self.api.delete_server(server['id'])
+            server = self._wait_for_state_change(self.admin_api, server,
+                                                 'ERROR')
+            self.assertEqual(1, mock_put.call_count)
+
+        # We still have the allocations as the deletion failed
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor, source_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
+
+        # retry the delete to make sure that the allocations are removed
+        # this time
+        self._delete_and_check_allocations(server)
+
+    def test_server_local_delete_fails_due_to_conflict(self):
+        source_hostname = self.compute1.host
+
+        server = self._boot_and_check_allocations(self.flavor,
+                                                  source_hostname)
+        source_compute_id = self.admin_api.get_services(
+            host=self.compute1.host, binary='nova-compute')[0]['id']
+        self.compute1.stop()
+        self.admin_api.put_service(
+            source_compute_id, {'forced_down': 'true'})
+
+        rsp = fake_requests.FakeResponse(
+            409, jsonutils.dumps({'text': 'consumer generation conflict'}))
+
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.return_value = rsp
+
+            ex = self.assertRaises(client.OpenStackApiException,
+                                   self.api.delete_server, server['id'])
+            self.assertEqual(409, ex.response.status_code)
+            self.assertIn('Failed to delete allocations for consumer',
+                          jsonutils.loads(ex.response.content)[
+                              'conflictingRequest']['message'])
+            self.assertEqual(1, mock_put.call_count)
+
+        # We still have the allocations as the deletion failed
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor, source_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
+
+        # retry the delete to make sure that the allocations are removed
+        # this time
+        self._delete_and_check_allocations(server)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 8484973f57be..142cb3b9083e 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -7646,6 +7646,12 @@ class ComputeTestCase(BaseTestCase,
         self.stub_out('nova.objects.quotas.Quotas.reserve',
                       lambda *a, **k: None)
 
+        self.stub_out('nova.compute.utils.notify_about_instance_usage',
+                      lambda *a, **k: None)
+
+        self.stub_out('nova.compute.utils.notify_about_instance_action',
+                      lambda *a, **k: None)
+
         self.compute._complete_partial_deletion(admin_context, instance)
 
         self.assertNotEqual(0, instance.deleted)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index afa0c33b02f0..e9e54a77e3cc 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -232,14 +232,22 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         methods_called = [n for n, a, k in call_tracker.mock_calls]
         self.assertEqual(['clear_events_for_instance',
                           '_notify_about_instance_usage',
-                          '_shutdown_instance'],
+                          '_shutdown_instance',
+                          '_notify_about_instance_usage'],
                          methods_called)
-        mock_notify.assert_called_once_with(self.context,
-                                            mock_inst,
-                                            specd_compute.host,
-                                            action='delete',
-                                            phase='start',
-                                            bdms=mock_bdms)
+        mock_notify.assert_has_calls([
+            mock.call(self.context,
+                      mock_inst,
+                      specd_compute.host,
+                      action='delete',
+                      phase='start',
+                      bdms=mock_bdms),
+            mock.call(self.context,
+                      mock_inst,
+                      specd_compute.host,
+                      action='delete',
+                      phase='end',
+                      bdms=mock_bdms)])
 
     def _make_compute_node(self, hyp_hostname, cn_id):
         cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index e0b389064776..b37b4ad0b76b 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -3222,21 +3222,6 @@ class TestAllocations(SchedulerReportClientTestCase):
         }
         self.assertEqual(expected, result)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'delete')
-    @mock.patch('nova.scheduler.client.report.LOG')
-    def test_delete_allocation_for_instance_ignore_404(self, mock_log,
-                                                       mock_delete):
-        """Tests that we don't log a warning on a 404 response when trying to
-        delete an allocation record.
-        """
-        mock_delete.return_value = fake_requests.FakeResponse(404)
-        self.client.delete_allocation_for_instance(self.context, uuids.rp_uuid)
-        # make sure we didn't screw up the logic or the mock
-        mock_log.info.assert_not_called()
-        # make sure warning wasn't called for the 404
-        mock_log.warning.assert_not_called()
-
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete") @mock.patch("nova.scheduler.client.report.SchedulerReportClient."