From 6f1a1f5e8ead9ceb145e13927cd9ae7fdca9add4 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Tue, 14 Aug 2018 10:37:03 +0200
Subject: [PATCH] Consumer gen support for delete instance allocations

The placement API version 1.28 introduced consumer generation as a way
to make updating allocations safe even when it is done from multiple
places.

This patch changes delete_allocation_for_instance to use PUT
/allocations instead of DELETE /allocations to benefit from the
consumer generation handling.

In this patch the report client GETs the current allocation of the
instance, including the consumer generation, and then tries to PUT an
empty allocation with that generation. If this fails due to a consumer
generation conflict, meaning something modified the allocation of the
instance between the GET and the PUT, the report client raises an
AllocationDeleteFailed exception. This causes the instance to go to
ERROR state.

This patch only detects a small portion of the possible cases in which
the allocation is modified outside of the delete code path. The rest
could only be detected if nova cached at least the consumer generation
of the instance.

To be able to put the instance into ERROR state, the instance.destroy()
call is moved to the end of the deletion call path. To keep the
instance.delete.end notification behavior consistent with this move
(e.g. the deleted_at field is filled), the notification sending had to
be moved as well.

Blueprint: use-nested-allocation-candidates
Change-Id: I77f34788dd7ab8fdf60d668a4f76452e03cf9888
---
 doc/source/user/placement.rst                |  2 +
 nova/api/openstack/compute/servers.py        |  3 +-
 nova/compute/manager.py                      | 25 +++---
 nova/exception.py                            |  5 ++
 nova/scheduler/client/report.py              | 64 +++++++++++---
 nova/tests/functional/test_servers.py        | 86 +++++++++++++++++++
 nova/tests/unit/compute/test_compute.py      |  6 ++
 nova/tests/unit/compute/test_compute_mgr.py  | 22 +++--
 .../unit/scheduler/client/test_report.py     | 15 ----
 9 files changed, 184 insertions(+), 44 deletions(-)

diff --git a/doc/source/user/placement.rst b/doc/source/user/placement.rst
index f22421230f90..c20833109dc8 100644
--- a/doc/source/user/placement.rst
+++ b/doc/source/user/placement.rst
@@ -58,12 +58,14 @@ changed or be partially complete at this time.
 * `filter allocation candidates by aggregate membership`_
 * `perform granular allocation candidate requests`_
 * `inventory and allocation data migration`_ (reshaping provider trees)
+* `handle allocation updates in a safe way`_
 
 .. _Nested Resource Providers: http://specs.openstack.org/openstack/nova-specs/specs/queens/approved/nested-resource-providers.html
 .. _Request Traits During Scheduling: https://specs.openstack.org/openstack/nova-specs/specs/queens/approved/request-traits-in-nova.html
 .. _filter allocation candidates by aggregate membership: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/alloc-candidates-member-of.html
 .. _perform granular allocation candidate requests: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/granular-resource-requests.html
 .. _inventory and allocation data migration: http://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/reshape-provider-tree.html
+.. _handle allocation updates in a safe way: https://specs.openstack.org/openstack/nova-specs/specs/rocky/approved/add-consumer-generation.html
 
 Deployment
 ==========
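For readers unfamiliar with consumer generations, the GET-then-PUT flow
described in the commit message maps onto the placement REST API as
sketched below. This is a minimal illustration using plain requests
rather than nova's keystoneauth-based report client; the endpoint and
token values are placeholders and the helper is not part of this patch:

    import requests

    PLACEMENT = 'http://placement.example.com'  # placeholder endpoint
    HEADERS = {
        'x-auth-token': '<token>',                  # placeholder auth
        'OpenStack-API-Version': 'placement 1.28',  # consumer generations
    }

    def delete_consumer_allocations(consumer_uuid):
        url = '%s/allocations/%s' % (PLACEMENT, consumer_uuid)
        # 1) GET the current allocations; at microversion 1.28 the body
        #    also carries consumer_generation, project_id and user_id.
        body = requests.get(url, headers=HEADERS).json()
        if not body['allocations']:
            # placement returns an empty allocation set, not a 404, for
            # an unknown consumer
            return False
        # 2) PUT back the same body with the allocations emptied. The
        #    echoed consumer_generation makes the write conditional, and
        #    removing every resource auto-deletes the consumer.
        body['allocations'] = {}
        resp = requests.put(url, json=body, headers=HEADERS)
        if resp.status_code == 409:
            # another writer bumped the generation between GET and PUT
            raise RuntimeError('consumer generation conflict: %s'
                               % resp.text)
        resp.raise_for_status()  # 204 on success
        return True

Echoing the generation read in step 1 is what makes the PUT
conditional: placement rejects the write with 409 if any other writer
updated the allocations in between, which is exactly the window this
patch turns into AllocationDeleteFailed.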
diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py
index e46e9ed329d9..1c2a773ecf22 100644
--- a/nova/api/openstack/compute/servers.py
+++ b/nova/api/openstack/compute/servers.py
@@ -865,7 +865,8 @@ class ServersController(wsgi.Controller):
             raise exc.HTTPNotFound(explanation=msg)
         except exception.InstanceUnknownCell as e:
             raise exc.HTTPNotFound(explanation=e.format_message())
-        except exception.InstanceIsLocked as e:
+        except (exception.InstanceIsLocked,
+                exception.AllocationDeleteFailed) as e:
             raise exc.HTTPConflict(explanation=e.format_message())
         except exception.InstanceInvalidState as state_error:
             common.raise_http_conflict_for_instance_invalid_state(state_error,
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 4dfd6bd4a5c4..f6715dd7fbec 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -749,19 +749,18 @@ class ComputeManager(manager.Manager):
         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                 context, instance.uuid)
         self._complete_deletion(context,
-                                instance,
-                                bdms)
+                                instance)
+        self._notify_about_instance_usage(context, instance, "delete.end")
+        compute_utils.notify_about_instance_action(context, instance,
+                self.host, action=fields.NotificationAction.DELETE,
+                phase=fields.NotificationPhase.END, bdms=bdms)
 
-    def _complete_deletion(self, context, instance, bdms):
+    def _complete_deletion(self, context, instance):
         self._update_resource_tracker(context, instance)
 
         rt = self._get_resource_tracker()
         rt.reportclient.delete_allocation_for_instance(context,
                                                        instance.uuid)
 
-        self._notify_about_instance_usage(context, instance, "delete.end")
-        compute_utils.notify_about_instance_action(context, instance,
-                self.host, action=fields.NotificationAction.DELETE,
-                phase=fields.NotificationPhase.END, bdms=bdms)
         self._clean_instance_console_tokens(context, instance)
         self._delete_scheduler_instance_info(context, instance.uuid)
 
@@ -2624,11 +2623,17 @@ class ComputeManager(manager.Manager):
         instance.power_state = power_state.NOSTATE
         instance.terminated_at = timeutils.utcnow()
         instance.save()
+
+        self._complete_deletion(context, instance)
+        # only destroy the instance in the db if _complete_deletion
+        # doesn't raise, and therefore the allocation was successfully
+        # deleted in placement
        instance.destroy()
 
-        self._complete_deletion(context,
-                                instance,
-                                bdms)
+        self._notify_about_instance_usage(context, instance, "delete.end")
+        compute_utils.notify_about_instance_action(context, instance,
+                self.host, action=fields.NotificationAction.DELETE,
+                phase=fields.NotificationPhase.END, bdms=bdms)
 
     @wrap_exception()
     @reverts_task_state
diff --git a/nova/exception.py b/nova/exception.py
index 4e5a5f79486c..fb5f7785c4a5 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -2322,6 +2322,11 @@ class AllocationUpdateFailed(NovaException):
                 'Error: %(error)s')
 
 
+class AllocationDeleteFailed(NovaException):
+    msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. '
+                'Error: %(error)s')
+
+
 class TooManyComputesForHost(NovaException):
     msg_fmt = _('Unexpected number of compute node records '
                 '(%(num_computes)d) found for host %(host)s. There should '
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 380a1f721148..a6deb80fd839 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -2047,21 +2047,63 @@ class SchedulerReportClient(object):
 
     @safe_connect
     def delete_allocation_for_instance(self, context, uuid):
+        """Delete the instance allocation from placement.
+
+        :param context: The security context
+        :param uuid: the instance UUID which will be used as the consumer
+                     UUID towards placement
+        :return: Returns True if the allocation is successfully deleted by
+                 this call. Returns False if the allocation does not exist.
+        :raises AllocationDeleteFailed: if the allocation cannot be read
+            from placement, or if it was changed by another process while
+            we tried to delete it.
+        """
         url = '/allocations/%s' % uuid
-        r = self.delete(url, global_request_id=context.global_id)
-        if r:
+        # We read the consumer generation and then try to PUT an empty
+        # allocation for that consumer. If the consumer generation changes
+        # between the GET and the PUT we raise AllocationDeleteFailed.
+        # NOTE(gibi): This only detects a small portion of the possible
+        # cases in which the allocation is modified outside of the delete
+        # code path. The rest could only be detected if nova cached at
+        # least the consumer generation of the instance.
+        # NOTE(gibi): placement does not return 404 for a non-existing
+        # consumer but returns an empty allocation set instead. Putting an
+        # empty allocation to that non-existing consumer won't result in a
+        # 404 or any other error either.
+        r = self.get(url, global_request_id=context.global_id,
+                     version=CONSUMER_GENERATION_VERSION)
+        if not r:
+            # at the moment placement has no way to return a failure for
+            # this GET, so this branch is effectively dead code, kept for
+            # safety
+            LOG.warning('Unable to delete allocation for instance '
+                        '%(uuid)s: (%(code)i %(text)s)',
+                        {'uuid': uuid,
+                         'code': r.status_code,
+                         'text': r.text})
+            raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
+                                                   error=r.text)
+        allocations = r.json()
+        if allocations['allocations'] == {}:
+            # the consumer did not exist in the first place
+            LOG.debug('Cannot delete allocation for %s consumer in '
+                      'placement as the consumer does not exist', uuid)
+            return False
+
+        # removing all resources from the allocation will auto delete the
+        # consumer in placement
+        allocations['allocations'] = {}
+        r = self.put(url, allocations, global_request_id=context.global_id,
+                     version=CONSUMER_GENERATION_VERSION)
+        if r.status_code == 204:
             LOG.info('Deleted allocation for instance %s', uuid)
             return True
         else:
-            # Check for 404 since we don't need to log a warning if we tried to
-            # delete something which doesn't actually exist.
-            if r.status_code != 404:
-                LOG.warning('Unable to delete allocation for instance '
-                            '%(uuid)s: (%(code)i %(text)s)',
-                            {'uuid': uuid,
-                             'code': r.status_code,
-                             'text': r.text})
-            return False
+            LOG.warning('Unable to delete allocation for instance '
+                        '%(uuid)s: (%(code)i %(text)s)',
+                        {'uuid': uuid,
+                         'code': r.status_code,
+                         'text': r.text})
+            raise exception.AllocationDeleteFailed(consumer_uuid=uuid,
+                                                   error=r.text)
 
     def get_allocations_for_resource_provider(self, context, rp_uuid):
         """Retrieves the allocations for a specific provider.
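With this change delete_allocation_for_instance has three distinct
outcomes: True when the empty-allocation PUT succeeds, False when
placement reports no allocations for the consumer, and
AllocationDeleteFailed on a consumer generation conflict (or an
unreadable allocation). A hypothetical caller, shown purely to
illustrate the new contract (the helper name and structure are
invented, not part of this patch):

    from nova import exception

    def _cleanup_allocation(context, reportclient, instance_uuid):
        # hypothetical helper; only the report client call below is
        # from this patch
        try:
            if reportclient.delete_allocation_for_instance(
                    context, instance_uuid):
                return  # the PUT returned 204 and the consumer is gone
            # False: placement had no allocations for this consumer, so
            # there was nothing to delete
        except exception.AllocationDeleteFailed:
            # the allocation changed between the GET and the PUT; the
            # compute manager lets this propagate so the instance ends
            # up in ERROR state and the delete can be retried
            raise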
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 9962cdec9dd6..3b57deb1515c 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -20,6 +20,7 @@ import zlib
 import mock
 from oslo_log import log as logging
 from oslo_serialization import base64
+from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids
 from oslo_utils import timeutils
 import six
@@ -40,6 +41,7 @@ from nova.tests.functional import integrated_helpers
 from nova.tests.unit.api.openstack import fakes
 from nova.tests.unit import fake_block_device
 from nova.tests.unit import fake_notifier
+from nova.tests.unit import fake_requests
 import nova.tests.unit.image.fake
 from nova.virt import fake
 from nova import volume
@@ -4529,3 +4531,87 @@ class ServerTestV256RescheduleTestCase(ServerTestV256Common):
         found_server = self.api.get_server(server['id'])
         # Check that rescheduling did not occur.
         self.assertNotEqual(other_host, found_server['OS-EXT-SRV-ATTR:host'])
+
+
+class ConsumerGenerationConflictTest(
+        integrated_helpers.ProviderUsageBaseTestCase):
+
+    # we need the medium fake driver to be able to allocate resources for
+    # more than a single instance
+    compute_driver = 'fake.MediumFakeDriver'
+
+    def setUp(self):
+        super(ConsumerGenerationConflictTest, self).setUp()
+        flavors = self.api.get_flavors()
+        self.flavor = flavors[0]
+        self.other_flavor = flavors[1]
+        self.compute1 = self._start_compute('compute1')
+        self.compute2 = self._start_compute('compute2')
+
+    def test_server_delete_fails_due_to_conflict(self):
+        source_hostname = self.compute1.host
+
+        server = self._boot_and_check_allocations(self.flavor,
+                                                  source_hostname)
+
+        rsp = fake_requests.FakeResponse(
+            409, jsonutils.dumps({'text': 'consumer generation conflict'}))
+
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.return_value = rsp
+
+            self.api.delete_server(server['id'])
+            server = self._wait_for_state_change(self.admin_api, server,
+                                                 'ERROR')
+            self.assertEqual(1, mock_put.call_count)
+
+        # We still have the allocations as the deletion failed
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor, source_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
+
+        # retry the delete to make sure that the allocations are removed
+        # this time
+        self._delete_and_check_allocations(server)
+
+    def test_server_local_delete_fails_due_to_conflict(self):
+        source_hostname = self.compute1.host
+
+        server = self._boot_and_check_allocations(self.flavor,
+                                                  source_hostname)
+        source_compute_id = self.admin_api.get_services(
+            host=self.compute1.host, binary='nova-compute')[0]['id']
+        self.compute1.stop()
+        self.admin_api.put_service(
+            source_compute_id, {'forced_down': 'true'})
+
+        rsp = fake_requests.FakeResponse(
+            409, jsonutils.dumps({'text': 'consumer generation conflict'}))
+
+        with mock.patch('keystoneauth1.adapter.Adapter.put',
+                        autospec=True) as mock_put:
+            mock_put.return_value = rsp
+
+            ex = self.assertRaises(client.OpenStackApiException,
+                                   self.api.delete_server, server['id'])
+            self.assertEqual(409, ex.response.status_code)
+            self.assertIn('Failed to delete allocations for consumer',
+                          jsonutils.loads(ex.response.content)[
+                              'conflictingRequest']['message'])
+            self.assertEqual(1, mock_put.call_count)
+
+        # We still have the allocations as the deletion failed
+        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
+        source_usages = self._get_provider_usages(source_rp_uuid)
+        self.assertFlavorMatchesAllocation(self.flavor, source_usages)
+
+        allocations = self._get_allocations_by_server_uuid(server['id'])
+        self.assertEqual(1, len(allocations))
+        source_allocation = allocations[source_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor, source_allocation)
+
+        # retry the delete to make sure that the allocations are removed
+        # this time
+        self._delete_and_check_allocations(server)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 8484973f57be..142cb3b9083e 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -7646,6 +7646,12 @@ class ComputeTestCase(BaseTestCase,
         self.stub_out('nova.objects.quotas.Quotas.reserve',
                       lambda *a, **k: None)
 
+        self.stub_out('nova.compute.utils.notify_about_instance_usage',
+                      lambda *a, **k: None)
+
+        self.stub_out('nova.compute.utils.notify_about_instance_action',
+                      lambda *a, **k: None)
+
         self.compute._complete_partial_deletion(admin_context, instance)
 
         self.assertNotEqual(0, instance.deleted)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index afa0c33b02f0..e9e54a77e3cc 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -232,14 +232,22 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         methods_called = [n for n, a, k in call_tracker.mock_calls]
         self.assertEqual(['clear_events_for_instance',
                           '_notify_about_instance_usage',
-                          '_shutdown_instance'],
+                          '_shutdown_instance',
+                          '_notify_about_instance_usage'],
                          methods_called)
-        mock_notify.assert_called_once_with(self.context,
-                                            mock_inst,
-                                            specd_compute.host,
-                                            action='delete',
-                                            phase='start',
-                                            bdms=mock_bdms)
+        mock_notify.assert_has_calls([
+            mock.call(self.context,
+                      mock_inst,
+                      specd_compute.host,
+                      action='delete',
+                      phase='start',
+                      bdms=mock_bdms),
+            mock.call(self.context,
+                      mock_inst,
+                      specd_compute.host,
+                      action='delete',
+                      phase='end',
+                      bdms=mock_bdms)])
 
     def _make_compute_node(self, hyp_hostname, cn_id):
         cn = mock.Mock(spec_set=['hypervisor_hostname', 'id',
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index e0b389064776..b37b4ad0b76b 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -3222,21 +3222,6 @@ class TestAllocations(SchedulerReportClientTestCase):
         }
         self.assertEqual(expected, result)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
-                'delete')
-    @mock.patch('nova.scheduler.client.report.LOG')
-    def test_delete_allocation_for_instance_ignore_404(self, mock_log,
-                                                       mock_delete):
-        """Tests that we don't log a warning on a 404 response when trying to
-        delete an allocation record.
-        """
-        mock_delete.return_value = fake_requests.FakeResponse(404)
-        self.client.delete_allocation_for_instance(self.context, uuids.rp_uuid)
-        # make sure we didn't screw up the logic or the mock
-        mock_log.info.assert_not_called()
-        # make sure warning wasn't called for the 404
-        mock_log.warning.assert_not_called()
-
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
"delete") @mock.patch("nova.scheduler.client.report.SchedulerReportClient."