diff --git a/nova/api/openstack/compute/migrate_server.py b/nova/api/openstack/compute/migrate_server.py index 43beee991226..a3a82f0b6999 100644 --- a/nova/api/openstack/compute/migrate_server.py +++ b/nova/api/openstack/compute/migrate_server.py @@ -57,7 +57,8 @@ class MigrateServerController(wsgi.Controller): except (exception.TooManyInstances, exception.QuotaError) as e: raise exc.HTTPForbidden(explanation=e.format_message()) except (exception.InstanceIsLocked, - exception.CannotMigrateWithTargetHost) as e: + exception.CannotMigrateWithTargetHost, + exception.AllocationMoveFailed) as e: raise exc.HTTPConflict(explanation=e.format_message()) except exception.InstanceInvalidState as state_error: common.raise_http_conflict_for_instance_invalid_state(state_error, diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py index 1c2a773ecf22..7b67d8318951 100644 --- a/nova/api/openstack/compute/servers.py +++ b/nova/api/openstack/compute/servers.py @@ -830,7 +830,8 @@ class ServersController(wsgi.Controller): except exception.QuotaError as error: raise exc.HTTPForbidden( explanation=error.format_message()) - except exception.InstanceIsLocked as e: + except (exception.InstanceIsLocked, + exception.AllocationMoveFailed) as e: raise exc.HTTPConflict(explanation=e.format_message()) except exception.InstanceInvalidState as state_error: common.raise_http_conflict_for_instance_invalid_state(state_error, diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 67508c27cb78..b0a308c97c94 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -3950,6 +3950,8 @@ class ComputeManager(manager.Manager): # We're reverting (or failed) on the source, so we # need to check if our migration holds a claim and if # so, avoid doing the legacy behavior below. + # NOTE(gibi): We are only hitting this in case of reverting + # a resize-on-same-host operation mig_allocs = ( self.reportclient.get_allocations_for_consumer_by_provider( context, cn_uuid, migration.uuid)) @@ -4098,7 +4100,16 @@ class ComputeManager(manager.Manager): instance.node = migration.source_node instance.save() - self._revert_allocation(context, instance, migration) + try: + self._revert_allocation(context, instance, migration) + except exception.AllocationMoveFailed: + LOG.error('Reverting allocation in placement for migration ' + '%(migration_uuid)s failed. The instance ' + '%(instance_uuid)s will be put into ERROR state but ' + 'the allocation held by the migration is leaked.', + {'instance_uuid': instance.uuid, + 'migration_uuid': migration.uuid}) + raise self.network_api.setup_networks_on_host(context, instance, migration.source_compute) @@ -4191,9 +4202,6 @@ class ComputeManager(manager.Manager): # We only have a claim against one provider, it is the source node cn_uuid = list(orig_alloc.keys())[0] - # Get just the resources part of the one allocation we need below - orig_alloc = orig_alloc[cn_uuid].get('resources', {}) - # FIXME(danms): This method is flawed in that it asssumes allocations # against only one provider. So, this may overwite allocations against # a shared provider, if we had one. @@ -4202,9 +4210,8 @@ class ComputeManager(manager.Manager): {'node': cn_uuid, 'mig': migration.uuid}, instance=instance) # TODO(cdent): Should we be doing anything with return values here? 
- self.reportclient.set_and_clear_allocations( - context, cn_uuid, instance.uuid, orig_alloc, instance.project_id, - instance.user_id, consumer_to_clear=migration.uuid) + self.reportclient.move_allocations(context, migration.uuid, + instance.uuid) return True def _prep_resize(self, context, image, instance, instance_type, diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py index b3c96d38c400..cf99aa46b855 100644 --- a/nova/conductor/tasks/live_migrate.py +++ b/nova/conductor/tasks/live_migrate.py @@ -140,8 +140,7 @@ class LiveMigrationTask(base.TaskBase): migrate.revert_allocation_for_migration(self.context, self._source_cn, self.instance, - self.migration, - self._held_allocations) + self.migration) def _check_instance_is_active(self): if self.instance.power_state not in (power_state.RUNNING, diff --git a/nova/conductor/tasks/migrate.py b/nova/conductor/tasks/migrate.py index b2a8e08f2d90..206c23c39aca 100644 --- a/nova/conductor/tasks/migrate.py +++ b/nova/conductor/tasks/migrate.py @@ -57,9 +57,8 @@ def replace_allocation_with_migration(context, instance, migration): # FIXME(danms): This method is flawed in that it asssumes allocations # against only one provider. So, this may overwite allocations against # a shared provider, if we had one. - success = reportclient.set_and_clear_allocations( - context, source_cn.uuid, migration.uuid, orig_alloc, - instance.project_id, instance.user_id, consumer_to_clear=instance.uuid) + success = reportclient.move_allocations(context, instance.uuid, + migration.uuid) if not success: LOG.error('Unable to replace resource claim on source ' 'host %(host)s node %(node)s for instance', @@ -78,8 +77,7 @@ def replace_allocation_with_migration(context, instance, migration): return source_cn, orig_alloc -def revert_allocation_for_migration(context, source_cn, instance, migration, - orig_alloc): +def revert_allocation_for_migration(context, source_cn, instance, migration): """Revert an allocation made for a migration back to the instance.""" schedclient = scheduler_client.SchedulerClient() @@ -88,10 +86,8 @@ def revert_allocation_for_migration(context, source_cn, instance, migration, # FIXME(danms): This method is flawed in that it asssumes allocations # against only one provider. So, this may overwite allocations against # a shared provider, if we had one. - success = reportclient.set_and_clear_allocations( - context, source_cn.uuid, instance.uuid, orig_alloc, - instance.project_id, instance.user_id, - consumer_to_clear=migration.uuid) + success = reportclient.move_allocations(context, migration.uuid, + instance.uuid) if not success: LOG.error('Unable to replace resource claim on source ' 'host %(host)s node %(node)s for instance', @@ -320,5 +316,4 @@ class MigrationTask(base.TaskBase): # now. revert_allocation_for_migration(self.context, self._source_cn, - self.instance, self._migration, - self._held_allocations) + self.instance, self._migration) diff --git a/nova/exception.py b/nova/exception.py index a3b701d205f2..e6e2048c3d39 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2322,6 +2322,12 @@ class AllocationUpdateFailed(NovaException): 'Error: %(error)s') +class AllocationMoveFailed(NovaException): + msg_fmt = _('Failed to move allocations from consumer %(source_consumer)s ' + 'to consumer %(target_consumer)s. ' + 'Error: %(error)s') + + class AllocationDeleteFailed(NovaException): msg_fmt = _('Failed to delete allocations for consumer %(consumer_uuid)s. 
' 'Error: %(error)s') diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index 859f46325a5e..1a4d31d19bfb 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -1922,68 +1922,81 @@ class SchedulerReportClient(object): @safe_connect @retries - def set_and_clear_allocations(self, context, rp_uuid, consumer_uuid, - alloc_data, project_id, user_id, - consumer_to_clear=None): - """Create allocation records for the supplied consumer UUID while - simultaneously clearing any allocations identified by the uuid - in consumer_to_clear, for example a migration uuid when moving an - instance to another host. This is for atomically managing so-called - "doubled" migration records. + def move_allocations(self, context, source_consumer_uuid, + target_consumer_uuid): + """Move allocations from one consumer to the other - :note Currently we only allocate against a single resource provider. - Once shared storage and things like NUMA allocations are a - reality, this will change to allocate against multiple providers. + Note that this call moves the current allocation from the source + consumer to the target consumer. If parallel update happens on either + or both consumers during this call then Placement will detect that and + this code will re-read the new state of the consumers and retry the + operation. If you want to move a known piece of allocation from source + to target then this function might not be what you want as it always + moves what source has in Placement. :param context: The security context - :param rp_uuid: The UUID of the resource provider to allocate against. - :param consumer_uuid: The consumer UUID for which allocations are - being set. - :param alloc_data: Dict, keyed by resource class, of amounts to - consume. - :param project_id: The project_id associated with the allocations. - :param user_id: The user_id associated with the allocations. - :param consumer_to_clear: A UUID identifying allocations for a - consumer that should be cleared. - :returns: True if the allocations were created, False otherwise. - :raises: Retry if the operation should be retried due to a concurrent - update. + :param source_consumer_uuid: the UUID of the consumer from which + allocations are moving + :param target_consumer_uuid: the UUID of the target consumer for the + allocations + :returns: True if the move was successful False otherwise. + :raises AllocationMoveFailed: If the source or the target consumer has + been modified while this call tries to + move allocations. """ - # FIXME(cdent): Fair amount of duplicate with put in here, but now - # just working things through. 
- payload = { - consumer_uuid: { - 'allocations': { - rp_uuid: { - 'resources': alloc_data - } - }, - 'project_id': project_id, - 'user_id': user_id, + source_alloc = self.get_allocs_for_consumer( + context, source_consumer_uuid) + target_alloc = self.get_allocs_for_consumer( + context, target_consumer_uuid) + + if target_alloc and target_alloc['allocations']: + LOG.warning('Overwriting current allocation %(allocation)s on ' + 'consumer %(consumer)s', + {'allocation': target_alloc, + 'consumer': target_consumer_uuid}) + + new_allocs = { + source_consumer_uuid: { + # 'allocations': {} means we are removing the allocation from + # the source consumer + 'allocations': {}, + 'project_id': source_alloc['project_id'], + 'user_id': source_alloc['user_id'], + 'consumer_generation': source_alloc['consumer_generation']}, + target_consumer_uuid: { + 'allocations': source_alloc['allocations'], + # NOTE(gibi): Is there any case when we need to keep the + # project_id and user_id of the target allocation that we are + # about to overwrite? + 'project_id': source_alloc['project_id'], + 'user_id': source_alloc['user_id'], + 'consumer_generation': target_alloc.get('consumer_generation') } } - if consumer_to_clear: - payload[consumer_to_clear] = { - 'allocations': {}, - 'project_id': project_id, - 'user_id': user_id, - } - r = self.post('/allocations', payload, - version=POST_ALLOCATIONS_API_VERSION, + r = self.post('/allocations', new_allocs, + version=CONSUMER_GENERATION_VERSION, global_request_id=context.global_id) if r.status_code != 204: - # NOTE(jaypipes): Yes, it sucks doing string comparison like this - # but we have no error codes, only error messages. - if 'concurrently updated' in r.text: + err = r.json()['errors'][0] + if err['code'] == 'placement.concurrent_update': + # NOTE(jaypipes): Yes, it sucks doing string comparison like + # this but we have no error codes, only error messages. 
+ # TODO(gibi): Use more granular error codes when available + if 'consumer generation conflict' in err['detail']: + raise exception.AllocationMoveFailed( + source_consumer=source_consumer_uuid, + target_consumer=target_consumer_uuid, + error=r.text) + reason = ('another process changed the resource providers ' 'involved in our attempt to post allocations for ' - 'consumer %s' % consumer_uuid) - raise Retry('set_and_clear_allocations', reason) + 'consumer %s' % target_consumer_uuid) + raise Retry('move_allocations', reason) else: LOG.warning( - 'Unable to post allocations for instance ' + 'Unable to post allocations for consumer ' '%(uuid)s (%(code)i %(text)s)', - {'uuid': consumer_uuid, + {'uuid': target_consumer_uuid, 'code': r.status_code, 'text': r.text}) return r.status_code == 204 diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index 1f2c9758af3d..b7322ecb38fd 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -28,6 +28,7 @@ import nova.conf from nova import context from nova.db import api as db import nova.image.glance +from nova import objects from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client as api_client @@ -643,3 +644,103 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin): compute.manager.host) compute.manager.update_available_resource(ctx) LOG.info('Finished with periodics') + + def _move_and_check_allocations(self, server, request, old_flavor, + new_flavor, source_rp_uuid, dest_rp_uuid): + self.api.post_server_action(server['id'], request) + self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') + + def _check_allocation(): + source_usages = self._get_provider_usages(source_rp_uuid) + self.assertFlavorMatchesAllocation(old_flavor, source_usages) + dest_usages = self._get_provider_usages(dest_rp_uuid) + self.assertFlavorMatchesAllocation(new_flavor, dest_usages) + + # The instance should own the new_flavor allocation against the + # destination host created by the scheduler + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + dest_alloc = allocations[dest_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(new_flavor, dest_alloc) + + # The migration should own the old_flavor allocation against the + # source host created by conductor + migration_uuid = self.get_migration_uuid_for_instance(server['id']) + allocations = self._get_allocations_by_server_uuid(migration_uuid) + source_alloc = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(old_flavor, source_alloc) + + # OK, so the move operation has run, but we have not yet confirmed or + # reverted the move operation. Before we run periodics, make sure + # that we have allocations/usages on BOTH the source and the + # destination hosts. + _check_allocation() + self._run_periodics() + _check_allocation() + + # Make sure the RequestSpec.flavor matches the new_flavor. 
+ ctxt = context.get_admin_context() + reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id']) + self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid) + + def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid, + dest_rp_uuid): + request = { + 'migrate': None + } + self._move_and_check_allocations( + server, request=request, old_flavor=flavor, new_flavor=flavor, + source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) + + def _resize_to_same_host_and_check_allocations(self, server, old_flavor, + new_flavor, rp_uuid): + # Resize the server to the same host and check usages in VERIFY_RESIZE + # state + self.flags(allow_resize_to_same_host=True) + resize_req = { + 'resize': { + 'flavorRef': new_flavor['id'] + } + } + self.api.post_server_action(server['id'], resize_req) + self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') + + usages = self._get_provider_usages(rp_uuid) + self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages) + + # The instance should hold a new_flavor allocation + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + allocation = allocations[rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(new_flavor, allocation) + + # The migration should hold an old_flavor allocation + migration_uuid = self.get_migration_uuid_for_instance(server['id']) + allocations = self._get_allocations_by_server_uuid(migration_uuid) + self.assertEqual(1, len(allocations)) + allocation = allocations[rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(old_flavor, allocation) + + # We've resized to the same host and have doubled allocations for both + # the old and new flavor on the same host. Run the periodic on the + # compute to see if it tramples on what the scheduler did. + self._run_periodics() + + usages = self._get_provider_usages(rp_uuid) + + # In terms of usage, it's still double on the host because the instance + # and the migration each hold an allocation for the new and old + # flavors respectively. 
+ self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages) + + # The instance should hold a new_flavor allocation + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + allocation = allocations[rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(new_flavor, allocation) + + # The migration should hold an old_flavor allocation + allocations = self._get_allocations_by_server_uuid(migration_uuid) + self.assertEqual(1, len(allocations)) + allocation = allocations[rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(old_flavor, allocation) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 3b57deb1515c..fd055bd5d9b9 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -2023,53 +2023,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase): self.compute2.manager.update_available_resource(ctx) LOG.info('Finished with periodics') - def _migrate_and_check_allocations(self, server, flavor, source_rp_uuid, - dest_rp_uuid): - request = { - 'migrate': None - } - self._move_and_check_allocations( - server, request=request, old_flavor=flavor, new_flavor=flavor, - source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) - - def _move_and_check_allocations(self, server, request, old_flavor, - new_flavor, source_rp_uuid, dest_rp_uuid): - self.api.post_server_action(server['id'], request) - self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') - - def _check_allocation(): - source_usages = self._get_provider_usages(source_rp_uuid) - self.assertFlavorMatchesAllocation(old_flavor, source_usages) - dest_usages = self._get_provider_usages(dest_rp_uuid) - self.assertFlavorMatchesAllocation(new_flavor, dest_usages) - - # The instance should own the new_flavor allocation against the - # destination host created by the scheduler - allocations = self._get_allocations_by_server_uuid(server['id']) - self.assertEqual(1, len(allocations)) - dest_alloc = allocations[dest_rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(new_flavor, dest_alloc) - - # The migration should own the old_flavor allocation against the - # source host created by conductor - migration_uuid = self.get_migration_uuid_for_instance(server['id']) - allocations = self._get_allocations_by_server_uuid(migration_uuid) - source_alloc = allocations[source_rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(old_flavor, source_alloc) - - # OK, so the move operation has run, but we have not yet confirmed or - # reverted the move operation. Before we run periodics, make sure - # that we have allocations/usages on BOTH the source and the - # destination hosts. - _check_allocation() - self._run_periodics() - _check_allocation() - - # Make sure the RequestSpec.flavor matches the new_flavor. 
- ctxt = context.get_admin_context() - reqspec = objects.RequestSpec.get_by_instance_uuid(ctxt, server['id']) - self.assertEqual(new_flavor['id'], reqspec.flavor.flavorid) - def test_resize_revert(self): self._test_resize_revert(dest_hostname='host1') @@ -2204,59 +2157,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase): self._delete_and_check_allocations(server) - def _resize_to_same_host_and_check_allocations(self, server, old_flavor, - new_flavor, rp_uuid): - # Resize the server to the same host and check usages in VERIFY_RESIZE - # state - self.flags(allow_resize_to_same_host=True) - resize_req = { - 'resize': { - 'flavorRef': new_flavor['id'] - } - } - self.api.post_server_action(server['id'], resize_req) - self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE') - - usages = self._get_provider_usages(rp_uuid) - self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages) - - # The instance should hold a new_flavor allocation - allocations = self._get_allocations_by_server_uuid(server['id']) - self.assertEqual(1, len(allocations)) - allocation = allocations[rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(new_flavor, allocation) - - # The migration should hold an old_flavor allocation - migration_uuid = self.get_migration_uuid_for_instance(server['id']) - allocations = self._get_allocations_by_server_uuid(migration_uuid) - self.assertEqual(1, len(allocations)) - allocation = allocations[rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(old_flavor, allocation) - - # We've resized to the same host and have doubled allocations for both - # the old and new flavor on the same host. Run the periodic on the - # compute to see if it tramples on what the scheduler did. - self._run_periodics() - - usages = self._get_provider_usages(rp_uuid) - - # In terms of usage, it's still double on the host because the instance - # and the migration each hold an allocation for the new and old - # flavors respectively. 
- self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages) - - # The instance should hold a new_flavor allocation - allocations = self._get_allocations_by_server_uuid(server['id']) - self.assertEqual(1, len(allocations)) - allocation = allocations[rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(new_flavor, allocation) - - # The migration should hold an old_flavor allocation - allocations = self._get_allocations_by_server_uuid(migration_uuid) - self.assertEqual(1, len(allocations)) - allocation = allocations[rp_uuid]['resources'] - self.assertFlavorMatchesAllocation(old_flavor, allocation) - def test_resize_revert_same_host(self): # make sure that the test only uses a single host compute2_service_id = self.admin_api.get_services( @@ -4548,6 +4448,157 @@ class ConsumerGenerationConflictTest( self.compute1 = self._start_compute('compute1') self.compute2 = self._start_compute('compute2') + def test_migrate_move_allocation_fails_due_to_conflict(self): + source_hostname = self.compute1.host + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + + with mock.patch('keystoneauth1.adapter.Adapter.post', + autospec=True) as mock_post: + mock_post.return_value = rsp + + request = {'migrate': None} + exception = self.assertRaises(client.OpenStackApiException, + self.api.post_server_action, + server['id'], request) + + self.assertEqual(1, mock_post.call_count) + + self.assertEqual(409, exception.response.status_code) + self.assertIn('Failed to move allocations', exception.response.text) + + migrations = self.api.get_migrations() + self.assertEqual(1, len(migrations)) + self.assertEqual('migration', migrations[0]['migration_type']) + self.assertEqual(server['id'], migrations[0]['instance_uuid']) + self.assertEqual(source_hostname, migrations[0]['source_compute']) + self.assertEqual('error', migrations[0]['status']) + + # The migration is aborted so the instance is ACTIVE on the source + # host instead of being in VERIFY_RESIZE state. 
+ server = self.api.get_server(server['id']) + self.assertEqual('ACTIVE', server['status']) + self.assertEqual(source_hostname, server['OS-EXT-SRV-ATTR:host']) + + source_usages = self._get_provider_usages(source_rp_uuid) + self.assertFlavorMatchesAllocation(self.flavor, source_usages) + + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual(1, len(allocations)) + alloc = allocations[source_rp_uuid]['resources'] + self.assertFlavorMatchesAllocation(self.flavor, alloc) + + self._delete_and_check_allocations(server) + + def test_revert_migrate_delete_dest_allocation_fails_due_to_conflict(self): + source_hostname = self.compute1.host + dest_hostname = self.compute2.host + source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + server = self._boot_and_check_allocations(self.flavor, source_hostname) + self._migrate_and_check_allocations( + server, self.flavor, source_rp_uuid, dest_rp_uuid) + + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + + with mock.patch('keystoneauth1.adapter.Adapter.post', + autospec=True) as mock_post: + mock_post.return_value = rsp + + post = {'revertResize': None} + self.api.post_server_action(server['id'], post) + server = self._wait_for_state_change(self.api, server, 'ERROR',) + + self.assertEqual(1, mock_post.call_count) + + migrations = self.api.get_migrations() + self.assertEqual(1, len(migrations)) + self.assertEqual('migration', migrations[0]['migration_type']) + self.assertEqual(server['id'], migrations[0]['instance_uuid']) + self.assertEqual(source_hostname, migrations[0]['source_compute']) + self.assertEqual('error', migrations[0]['status']) + + # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even + # after the instance is deleted. At least nova logs a fat ERROR. + self.assertIn('Reverting allocation in placement for migration %s ' + 'failed. The instance %s will be put into ERROR state ' + 'but the allocation held by the migration is leaked.' 
% + (migrations[0]['uuid'], server['id']), + self.stdlog.logger.output) + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + fake_notifier.wait_for_versioned_notifications('instance.delete.end') + + allocations = self._get_allocations_by_server_uuid( + migrations[0]['uuid']) + self.assertEqual(1, len(allocations)) + + def test_revert_resize_same_host_delete_dest_fails_due_to_conflict(self): + # make sure that the test only uses a single host + compute2_service_id = self.admin_api.get_services( + host=self.compute2.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute2_service_id, {'status': 'disabled'}) + + hostname = self.compute1.manager.host + rp_uuid = self._get_provider_uuid_by_host(hostname) + + server = self._boot_and_check_allocations(self.flavor, hostname) + + self._resize_to_same_host_and_check_allocations( + server, self.flavor, self.other_flavor, rp_uuid) + + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + with mock.patch('keystoneauth1.adapter.Adapter.post', + autospec=True) as mock_post: + mock_post.return_value = rsp + + post = {'revertResize': None} + self.api.post_server_action(server['id'], post) + server = self._wait_for_state_change(self.api, server, 'ERROR',) + + self.assertEqual(1, mock_post.call_count) + + migrations = self.api.get_migrations() + self.assertEqual(1, len(migrations)) + self.assertEqual('resize', migrations[0]['migration_type']) + self.assertEqual(server['id'], migrations[0]['instance_uuid']) + self.assertEqual(hostname, migrations[0]['source_compute']) + self.assertEqual('error', migrations[0]['status']) + + # NOTE(gibi): Nova leaks the allocation held by the migration_uuid even + # after the instance is deleted. At least nova logs a fat ERROR. + self.assertIn('Reverting allocation in placement for migration %s ' + 'failed. The instance %s will be put into ERROR state ' + 'but the allocation held by the migration is leaked.' 
% + (migrations[0]['uuid'], server['id']), + self.stdlog.logger.output) + self.api.delete_server(server['id']) + self._wait_until_deleted(server) + fake_notifier.wait_for_versioned_notifications('instance.delete.end') + + allocations = self._get_allocations_by_server_uuid( + migrations[0]['uuid']) + self.assertEqual(1, len(allocations)) + def test_server_delete_fails_due_to_conflict(self): source_hostname = self.compute1.host diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index b97b6f24d928..91a8c68da521 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -6210,11 +6210,8 @@ class ComputeTestCase(BaseTestCase, instance=instance, migration=migration, migrate_data=migrate_data) mock_setup.assert_called_once_with(c, instance, self.compute.host) - mock_client.set_and_clear_allocations.assert_called_once_with( - c, mock.sentinel.source, instance.uuid, - mock.sentinel.allocs, - instance.project_id, instance.user_id, - consumer_to_clear=migration.uuid) + mock_client.move_allocations.assert_called_once_with( + c, migration.uuid, instance.uuid) do_it() diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index a59adce1162e..42be7f27e28a 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7165,10 +7165,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): self.instance, self.migration) self.assertTrue(r) - mock_report.set_and_clear_allocations.assert_called_once_with( - mock.sentinel.ctx, cu, self.instance.uuid, {'DISK_GB': 1}, - self.instance.project_id, self.instance.user_id, - consumer_to_clear=self.migration.uuid) + mock_report.move_allocations.assert_called_once_with( + mock.sentinel.ctx, self.migration.uuid, self.instance.uuid) doit() @@ -7184,7 +7182,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): self.instance, self.migration) self.assertFalse(r) - self.assertFalse(mock_report.set_and_clear_allocations.called) + self.assertFalse(mock_report.move_allocations.called) doit() @@ -7209,7 +7207,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase): self.instance, self.migration) self.assertTrue(r) - self.assertTrue(mock_report.set_and_clear_allocations.called) + self.assertTrue(mock_report.move_allocations.called) doit() diff --git a/nova/tests/unit/conductor/tasks/test_migrate.py b/nova/tests/unit/conductor/tasks/test_migrate.py index 92ffce9e50be..1f6cf95111be 100644 --- a/nova/tests/unit/conductor/tasks/test_migrate.py +++ b/nova/tests/unit/conductor/tasks/test_migrate.py @@ -227,8 +227,7 @@ class MigrationTaskTestCase(test.NoDBTestCase): task._migration.save.assert_called_once_with() self.assertEqual('error', task._migration.status) mock_ra.assert_called_once_with(task.context, task._source_cn, - task.instance, task._migration, - task._held_allocations) + task.instance, task._migration) class MigrationTaskAllocationUtils(test.NoDBTestCase): diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py index afa2365b9d6b..87f8be233a39 100644 --- a/nova/tests/unit/scheduler/client/test_report.py +++ b/nova/tests/unit/scheduler/client/test_report.py @@ -1112,10 +1112,10 @@ class TestPutAllocations(SchedulerReportClientTestCase): self.assertFalse(res) -class TestSetAndClearAllocations(SchedulerReportClientTestCase): +class TestMoveAllocations(SchedulerReportClientTestCase): def setUp(self): - 
super(TestSetAndClearAllocations, self).setUp() + super(TestMoveAllocations, self).setUp() # We want to reuse the mock throughout the class, but with # different return values. patcher = mock.patch( @@ -1126,86 +1126,153 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase): self.rp_uuid = mock.sentinel.rp self.consumer_uuid = mock.sentinel.consumer self.data = {"MEMORY_MB": 1024} + patcher = mock.patch( + 'nova.scheduler.client.report.SchedulerReportClient.get') + self.mock_get = patcher.start() + self.addCleanup(patcher.stop) + self.project_id = mock.sentinel.project_id self.user_id = mock.sentinel.user_id + + self.mock_post.return_value.status_code = 204 + self.rp_uuid = mock.sentinel.rp + self.source_consumer_uuid = mock.sentinel.source_consumer + self.target_consumer_uuid = mock.sentinel.target_consumer + self.source_consumer_data = { + "allocations": { + self.rp_uuid: { + "generation": 1, + "resources": { + "MEMORY_MB": 1024 + } + } + }, + "consumer_generation": 2, + "project_id": self.project_id, + "user_id": self.user_id + } + self.source_rsp = mock.Mock() + self.source_rsp.json.return_value = self.source_consumer_data + self.target_consumer_data = { + "allocations": { + self.rp_uuid: { + "generation": 1, + "resources": { + "MEMORY_MB": 2048 + } + } + }, + "consumer_generation": 1, + "project_id": self.project_id, + "user_id": self.user_id + } + self.target_rsp = mock.Mock() + self.target_rsp.json.return_value = self.target_consumer_data + self.mock_get.side_effect = [self.source_rsp, self.target_rsp] self.expected_url = '/allocations' + self.expected_microversion = '1.28' def test_url_microversion(self): - expected_microversion = '1.13' - - resp = self.client.set_and_clear_allocations( - self.context, self.rp_uuid, self.consumer_uuid, self.data, - self.project_id, self.user_id) + resp = self.client.move_allocations( + self.context, self.source_consumer_uuid, self.target_consumer_uuid) self.assertTrue(resp) self.mock_post.assert_called_once_with( self.expected_url, mock.ANY, - version=expected_microversion, + version=self.expected_microversion, global_request_id=self.context.global_id) - def test_payload_no_clear(self): + def test_move_to_empty_target(self): + self.target_consumer_data = {"allocations": {}} + target_rsp = mock.Mock() + target_rsp.json.return_value = self.target_consumer_data + self.mock_get.side_effect = [self.source_rsp, target_rsp] + expected_payload = { - self.consumer_uuid: { - 'user_id': self.user_id, - 'project_id': self.project_id, - 'allocations': { + self.target_consumer_uuid: { + "allocations": { self.rp_uuid: { - 'resources': { - 'MEMORY_MB': 1024 - } + "resources": { + "MEMORY_MB": 1024 + }, + "generation": 1 } - } - } - } - - resp = self.client.set_and_clear_allocations( - self.context, self.rp_uuid, self.consumer_uuid, self.data, - self.project_id, self.user_id) - - self.assertTrue(resp) - args, kwargs = self.mock_post.call_args - payload = args[1] - self.assertEqual(expected_payload, payload) - - def test_payload_with_clear(self): - expected_payload = { - self.consumer_uuid: { - 'user_id': self.user_id, - 'project_id': self.project_id, - 'allocations': { - self.rp_uuid: { - 'resources': { - 'MEMORY_MB': 1024 - } - } - } + }, + "consumer_generation": None, + "project_id": self.project_id, + "user_id": self.user_id, }, - mock.sentinel.migration_uuid: { - 'user_id': self.user_id, - 'project_id': self.project_id, - 'allocations': {} + self.source_consumer_uuid: { + "allocations": {}, + "consumer_generation": 2, + "project_id": 
self.project_id, + "user_id": self.user_id, } } - resp = self.client.set_and_clear_allocations( - self.context, self.rp_uuid, self.consumer_uuid, self.data, - self.project_id, self.user_id, - consumer_to_clear=mock.sentinel.migration_uuid) + resp = self.client.move_allocations( + self.context, self.source_consumer_uuid, self.target_consumer_uuid) self.assertTrue(resp) - args, kwargs = self.mock_post.call_args - payload = args[1] - self.assertEqual(expected_payload, payload) + self.mock_post.assert_called_once_with( + self.expected_url, expected_payload, + version=self.expected_microversion, + global_request_id=self.context.global_id) + + def test_move_to_non_empty_target(self): + self.mock_get.side_effect = [self.source_rsp, self.target_rsp] + + expected_payload = { + self.target_consumer_uuid: { + "allocations": { + self.rp_uuid: { + "resources": { + "MEMORY_MB": 1024 + }, + "generation": 1 + } + }, + "consumer_generation": 1, + "project_id": self.project_id, + "user_id": self.user_id, + }, + self.source_consumer_uuid: { + "allocations": {}, + "consumer_generation": 2, + "project_id": self.project_id, + "user_id": self.user_id, + } + } + + resp = self.client.move_allocations( + self.context, self.source_consumer_uuid, self.target_consumer_uuid) + + self.assertTrue(resp) + self.mock_post.assert_called_once_with( + self.expected_url, expected_payload, + version=self.expected_microversion, + global_request_id=self.context.global_id) + self.assertIn('Overwriting current allocation', + self.stdlog.logger.output) @mock.patch('time.sleep') - def test_409_concurrent_update(self, mock_sleep): - self.mock_post.return_value.status_code = 409 - self.mock_post.return_value.text = 'concurrently updated' + def test_409_concurrent_provider_update(self, mock_sleep): + # there will be 1 normal call and 3 retries + self.mock_get.side_effect = [self.source_rsp, self.target_rsp, + self.source_rsp, self.target_rsp, + self.source_rsp, self.target_rsp, + self.source_rsp, self.target_rsp] + rsp = fake_requests.FakeResponse( + 409, + jsonutils.dumps( + {'errors': [ + {'code': 'placement.concurrent_update', + 'detail': ''}]})) - resp = self.client.set_and_clear_allocations( - self.context, self.rp_uuid, self.consumer_uuid, self.data, - self.project_id, self.user_id, - consumer_to_clear=mock.sentinel.migration_uuid) + self.mock_post.return_value = rsp + + resp = self.client.move_allocations( + self.context, self.source_consumer_uuid, self.target_consumer_uuid) self.assertFalse(resp) # Post was attempted four times. 
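
Editor's note: the tests above and the report-client change earlier in this patch distinguish two flavors of HTTP 409 from ``POST /allocations``: a resource-provider generation conflict, which is safe to retry after re-reading the allocations, and a consumer generation conflict, which is surfaced to callers as ``AllocationMoveFailed``. The following standalone sketch (not Nova code; ``Retry`` and ``AllocationMoveFailed`` here are simplified stand-ins) shows that classification, assuming the placement error payload shape used in these tests:

```python
# Illustrative sketch of classifying a 409 from placement's POST /allocations.
# Retry and AllocationMoveFailed are simplified stand-ins for Nova's classes.
import json


class Retry(Exception):
    """The whole move should be re-read from placement and re-attempted."""


class AllocationMoveFailed(Exception):
    """A consumer generation conflict; the caller must decide what to do."""


def classify_allocation_conflict(status_code, body):
    """Raise Retry or AllocationMoveFailed for a conflicting POST /allocations."""
    if status_code != 409:
        return
    err = json.loads(body)['errors'][0]
    if err['code'] == 'placement.concurrent_update':
        # Placement exposes no finer-grained error code yet, so the detail
        # text is inspected, as the TODO(gibi) note in the patch points out.
        if 'consumer generation conflict' in err['detail']:
            raise AllocationMoveFailed(body)
        # Otherwise a resource provider generation changed underneath us:
        # re-reading the source/target allocations and retrying is safe.
        raise Retry('provider generation conflict')
```

This mirrors why ``test_409_concurrent_provider_update`` expects four POST attempts (one call plus three retries) while ``test_409_concurrent_consumer_update`` expects ``AllocationMoveFailed``.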
@@ -1217,10 +1284,8 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase): self.mock_post.return_value.status_code = 503 self.mock_post.return_value.text = error_message - resp = self.client.set_and_clear_allocations( - self.context, self.rp_uuid, self.consumer_uuid, self.data, - self.project_id, self.user_id, - consumer_to_clear=mock.sentinel.migration_uuid) + resp = self.client.move_allocations( + self.context, self.source_consumer_uuid, self.target_consumer_uuid) self.assertFalse(resp) args, kwargs = mock_log.call_args @@ -1229,6 +1294,17 @@ class TestSetAndClearAllocations(SchedulerReportClientTestCase): self.assertIn('Unable to post allocations', log_message) self.assertEqual(error_message, log_args['text']) + def test_409_concurrent_consumer_update(self): + self.mock_post.return_value = fake_requests.FakeResponse( + status_code=409, + content=jsonutils.dumps( + {'errors': [{'code': 'placement.concurrent_update', + 'detail': 'consumer generation conflict'}]})) + + self.assertRaises(exception.AllocationMoveFailed, + self.client.move_allocations, self.context, + self.source_consumer_uuid, self.target_consumer_uuid) + class TestProviderOperations(SchedulerReportClientTestCase): @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' diff --git a/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml b/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml new file mode 100644 index 000000000000..7c3af1ed0eb0 --- /dev/null +++ b/releasenotes/notes/leaking_migration_allocations_in_placement-bd0a6f2a30e2e3d2.yaml @@ -0,0 +1,10 @@ +--- +issues: + - | + Nova leaks resource allocations in placement during + ``POST /servers/{server_id}/action (revertResize Action)`` if the + allocation held by the migration_uuid is modified in parallel with the + revert operation. Nova will log an ERROR and will put the server into ERROR + state but will not delete the migration allocation. We assume that this can + only happen if somebody outside of nova is actively changing the migration + allocation in placement. Therefore it is not considered as a bug.
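
Editor's note: as a reading aid for the ``move_allocations`` change above, here is a standalone sketch of the consumer-generation-aware payload it sends to ``POST /allocations`` at placement microversion 1.28. The ``get_allocations`` and ``post_allocations`` callables are hypothetical stand-ins for the report client's HTTP helpers, and the 409 handling shown in the previous sketch is omitted:

```python
# Illustrative sketch only: the payload shape for moving all allocations from
# one consumer to another with consumer generations (placement 1.28). The HTTP
# helpers are hypothetical stand-ins, not SchedulerReportClient methods.
from typing import Any, Callable, Dict


def build_move_payload(source_uuid, target_uuid, source_alloc, target_alloc):
    """Build a POST /allocations body that empties source and fills target.

    'allocations': {} clears the source consumer; copying the source's
    allocations under the target consumer recreates them there. Including the
    consumer generations lets placement reject concurrent updates.
    """
    return {
        source_uuid: {
            'allocations': {},
            'project_id': source_alloc['project_id'],
            'user_id': source_alloc['user_id'],
            'consumer_generation': source_alloc['consumer_generation'],
        },
        target_uuid: {
            'allocations': source_alloc['allocations'],
            'project_id': source_alloc['project_id'],
            'user_id': source_alloc['user_id'],
            # None when the target consumer does not exist in placement yet.
            'consumer_generation': target_alloc.get('consumer_generation'),
        },
    }


def move_allocations(get_allocations: Callable[[str], Dict[str, Any]],
                     post_allocations: Callable[[Dict[str, Any]], int],
                     source_uuid: str,
                     target_uuid: str) -> bool:
    """Move whatever the source consumer currently holds onto the target."""
    source_alloc = get_allocations(source_uuid)
    target_alloc = get_allocations(target_uuid)
    payload = build_move_payload(source_uuid, target_uuid,
                                 source_alloc, target_alloc)
    # 204 means placement accepted both the clear and the re-create atomically.
    return post_allocations(payload) == 204
```

Because the payload is derived from whatever the source consumer currently holds, a caller that needs to move a specific, known set of resources rather than "everything the migration holds right now" would need a different approach, which is exactly the caveat the new docstring spells out.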