diff --git a/doc/source/user/cellsv2-layout.rst b/doc/source/user/cellsv2-layout.rst
index 1346ed0b7cc2..224535828e40 100644
--- a/doc/source/user/cellsv2-layout.rst
+++ b/doc/source/user/cellsv2-layout.rst
@@ -319,7 +319,7 @@ following:
 
 #. Instance reschedules during boot
 #. Instance affinity reporting from the compute nodes to scheduler
-#. The late anti-affinity check
+#. The late anti-affinity check during server create and evacuate
 #. Querying host aggregates from the cell
 #. Attaching a volume and ``[cinder]/cross_az_attach=False``
 
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index edc4e985cdae..8486534adce3 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -485,7 +485,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
 class ComputeManager(manager.Manager):
     """Manages the running instances from creation to destruction."""
 
-    target = messaging.Target(version='4.21')
+    target = messaging.Target(version='4.22')
 
     # How long to wait in seconds before re-issuing a shutdown
     # signal to an instance during power off. The overall
@@ -2804,7 +2804,7 @@ class ComputeManager(manager.Manager):
                          injected_files, new_pass, orig_sys_metadata,
                          bdms, recreate, on_shared_storage=None,
                          preserve_ephemeral=False, migration=None,
-                         scheduled_node=None, limits=None):
+                         scheduled_node=None, limits=None, request_spec=None):
         """Destroy and re-make this instance.
 
         A 'rebuild' effectively purges all existing data from the system and
@@ -2834,6 +2834,8 @@ class ComputeManager(manager.Manager):
                        None
         :param limits: Overcommit limits set by the scheduler. If a host was
                        specified by the user, this will be None
+        :param request_spec: a RequestSpec object used to schedule the instance
+
         """
         context = context.elevated()
 
@@ -2886,11 +2888,22 @@ class ComputeManager(manager.Manager):
                     claim_ctxt, context, instance, orig_image_ref,
                     image_ref, injected_files, new_pass, orig_sys_metadata,
                     bdms, recreate, on_shared_storage, preserve_ephemeral,
-                    migration)
-            except exception.ComputeResourcesUnavailable as e:
-                LOG.debug("Could not rebuild instance on this host, not "
-                          "enough resources available.", instance=instance)
-
+                    migration, request_spec)
+            except (exception.ComputeResourcesUnavailable,
+                    exception.RescheduledException) as e:
+                if isinstance(e, exception.ComputeResourcesUnavailable):
+                    LOG.debug("Could not rebuild instance on this host, not "
+                              "enough resources available.", instance=instance)
+                else:
+                    # RescheduledException is raised by the late server group
+                    # policy check during evacuation if parallel scheduling
+                    # violated the policy.
+                    # We catch the RescheduledException here but we don't have
+                    # the plumbing to do an actual reschedule so we abort the
+                    # operation.
+                    LOG.debug("Could not rebuild instance on this host, "
+                              "late server group check failed.",
+                              instance=instance)
                 # NOTE(ndipanov): We just abort the build for now and leave a
                 # migration record for potential cleanup later
                 self._set_migration_status(migration, 'failed')
@@ -2949,10 +2962,18 @@ class ComputeManager(manager.Manager):
                              image_ref, injected_files, new_pass,
                              orig_sys_metadata, bdms, recreate,
                              on_shared_storage, preserve_ephemeral,
-                             migration):
+                             migration, request_spec):
         orig_vm_state = instance.vm_state
 
         if recreate:
+            if request_spec:
+                # NOTE(gibi): Do a late check of server group policy as
+                # parallel scheduling could violate such policy. This will
+                # cause the evacuate to fail as rebuild does not implement
+                # reschedule.
+                hints = self._get_scheduler_hints({}, request_spec)
+                self._validate_instance_group_policy(context, instance, hints)
+
             if not self.driver.capabilities["supports_recreate"]:
                 raise exception.InstanceRecreateNotSupported
 
diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py
index 7f62e5b341ba..c001921f41ab 100644
--- a/nova/compute/rpcapi.py
+++ b/nova/compute/rpcapi.py
@@ -340,6 +340,7 @@ class ComputeAPI(object):
         * 4.20 - Add multiattach argument to reserve_block_device_name().
         * 4.21 - prep_resize() now gets a 'host_list' parameter representing
                  potential alternate hosts for retries within a cell.
+        * 4.22 - Add request_spec to rebuild_instance()
     '''
 
     VERSION_ALIASES = {
@@ -811,7 +812,7 @@ class ComputeAPI(object):
             image_ref, orig_image_ref, orig_sys_metadata, bdms,
             recreate=False, on_shared_storage=False, host=None, node=None,
             preserve_ephemeral=False, migration=None, limits=None,
-            kwargs=None):
+            request_spec=None, kwargs=None):
         # NOTE(edleafe): compute nodes can only use the dict form of limits.
         if isinstance(limits, objects.SchedulerLimits):
             limits = limits.to_dict()
@@ -820,9 +821,13 @@ class ComputeAPI(object):
         extra = {'preserve_ephemeral': preserve_ephemeral,
                  'migration': migration,
                  'scheduled_node': node,
-                 'limits': limits}
-        version = '4.5'
+                 'limits': limits,
+                 'request_spec': request_spec}
+        version = '4.22'
         client = self.router.client(ctxt)
+        if not client.can_send_version(version):
+            version = '4.5'
+            del extra['request_spec']
         if not client.can_send_version(version):
             version = '4.0'
             extra.pop('migration')
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 6497e0f77e95..2e0fc3e9277f 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -996,7 +996,8 @@ class ComputeTaskManager(base.Base):
                     on_shared_storage=on_shared_storage,
                     preserve_ephemeral=preserve_ephemeral,
                     migration=migration,
-                    host=host, node=node, limits=limits)
+                    host=host, node=node, limits=limits,
+                    request_spec=request_spec)
 
     # TODO(avolkov): move method to bdm
     @staticmethod
diff --git a/nova/objects/service.py b/nova/objects/service.py
index b3587ed0999e..5d28feec623a 100644
--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
 
 
 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 28
+SERVICE_VERSION = 29
 
 
 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -127,6 +127,8 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '4.20'},
     # Version 28: Adds a 'host_list' parameter to prep_resize()
     {'compute_rpc': '4.21'},
+    # Version 29: Compute RPC version 4.22
+    {'compute_rpc': '4.22'},
 )
 
 
diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index 8c0bafaf0231..c97557b62068 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -316,8 +316,8 @@ class InstanceHelperMixin(object):
             self.fail('Timed out waiting for %s failure event. Current '
                       'instance actions: %s' % (event_name, actions))
 
-    def _wait_for_migration_status(self, server, expected_status):
-        """Waits for a migration record with the given status to be found
+    def _wait_for_migration_status(self, server, expected_statuses):
+        """Waits for a migration record with the given statuses to be found
         for the given server, else the test fails. The migration record, if
         found, is returned.
""" @@ -325,13 +325,13 @@ class InstanceHelperMixin(object): if api is None: api = self.api + statuses = [status.lower() for status in expected_statuses] for attempt in range(10): migrations = api.api_get('/os-migrations').body['migrations'] for migration in migrations: if (migration['instance_uuid'] == server['id'] and - migration['status'].lower() == - expected_status.lower()): + migration['status'].lower() in statuses): return migration time.sleep(0.5) self.fail('Timed out waiting for migration with status "%s" for ' - 'instance: %s' % (expected_status, server['id'])) + 'instance: %s' % (expected_statuses, server['id'])) diff --git a/nova/tests/functional/regressions/test_bug_1735407.py b/nova/tests/functional/regressions/test_bug_1735407.py index e222029c92ce..eb321db31d5e 100644 --- a/nova/tests/functional/regressions/test_bug_1735407.py +++ b/nova/tests/functional/regressions/test_bug_1735407.py @@ -148,8 +148,8 @@ class TestParallelEvacuationWithServerGroup( # that. The only thing that happens after the instance.host is set to # the target host is the migration status setting to done. So we have # to wait for that to avoid asserting the wrong host below. - self._wait_for_migration_status(server1, 'done') - self._wait_for_migration_status(server2, 'done') + self._wait_for_migration_status(server1, ['done', 'failed']) + self._wait_for_migration_status(server2, ['done', 'failed']) # get the servers again to have the latest information about their # hosts @@ -158,28 +158,19 @@ class TestParallelEvacuationWithServerGroup( # assert that the anti-affinity policy is enforced during the # evacuation - # NOTE(gibi): This shows bug 1735407 as both instance ends up on the - # same host. - self.assertEqual(server1['OS-EXT-SRV-ATTR:host'], - server2['OS-EXT-SRV-ATTR:host']) - # After the bug 1735407 is fixed the following is expected: - # self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'], - # server2['OS-EXT-SRV-ATTR:host']) + self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'], + server2['OS-EXT-SRV-ATTR:host']) # assert that one of the evacuation was successful and that server is # moved to another host and the evacuation of the other server is # failed - # NOTE(gibi): This shows bug 1735407 as both instance is moved - self.assertNotIn(server1['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'}) - self.assertNotIn(server2['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'}) - # After fixing the bug 1735407 the following is expected - # if server1['status'] == 'ERROR': - # failed_server = server1 - # evacuated_server = server2 - # else: - # failed_server = server2 - # evacuated_server = server1 - # self.assertEqual('ERROR', failed_server['status']) - # self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host']) - # self.assertEqual('ACTIVE', evacuated_server['status']) - # self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host']) + if server1['status'] == 'ERROR': + failed_server = server1 + evacuated_server = server2 + else: + failed_server = server2 + evacuated_server = server1 + self.assertEqual('ERROR', failed_server['status']) + self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host']) + self.assertEqual('ACTIVE', evacuated_server['status']) + self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host']) diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py index 9e8ea76e713d..8fa7546f1470 100644 --- a/nova/tests/functional/test_server_group.py +++ b/nova/tests/functional/test_server_group.py @@ -427,7 +427,7 @@ class 
         post = {'evacuate': {'onSharedStorage': False}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'done')
+        self._wait_for_migration_status(servers[1], ['done'])
         evacuated_server = self._wait_for_state_change(
             self.admin_api, servers[1], 'ACTIVE')
 
@@ -453,7 +453,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
         post = {'evacuate': {'onSharedStorage': False}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'error')
+        self._wait_for_migration_status(servers[1], ['error'])
         server_after_failed_evac = self._wait_for_state_change(
             self.admin_api, servers[1], 'ERROR')
 
@@ -477,7 +477,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
         post = {'evacuate': {'onSharedStorage': False}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'error')
+        self._wait_for_migration_status(servers[1], ['error'])
         server_after_failed_evac = self._wait_for_state_change(
             self.admin_api, servers[1], 'ERROR')
 
@@ -624,7 +624,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
         post = {'evacuate': {}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'done')
+        self._wait_for_migration_status(servers[1], ['done'])
         evacuated_server = self._wait_for_state_change(
             self.admin_api, servers[1], 'ACTIVE')
 
@@ -651,7 +651,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
         post = {'evacuate': {}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'error')
+        self._wait_for_migration_status(servers[1], ['error'])
         server_after_failed_evac = self._wait_for_state_change(
             self.admin_api, servers[1], 'ERROR')
 
@@ -675,7 +675,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
         post = {'evacuate': {}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'error')
+        self._wait_for_migration_status(servers[1], ['error'])
         server_after_failed_evac = self._wait_for_state_change(
             self.admin_api, servers[1], 'ERROR')
 
@@ -817,7 +817,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
         post = {'evacuate': {}}
         self.admin_api.post_server_action(servers[1]['id'], post)
 
-        self._wait_for_migration_status(servers[1], 'done')
+        self._wait_for_migration_status(servers[1], ['done'])
         evacuated_server = self._wait_for_state_change(
             self.admin_api, servers[1], 'ACTIVE')
 
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index eb45ab5a43d9..0829e6eec0b9 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -2531,7 +2531,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
         # clears the task_state. The migration record status is set to
         # 'error', so that's what we need to look for to know when this
         # is done.
-        migration = self._wait_for_migration_status(server, 'error')
+        migration = self._wait_for_migration_status(server, ['error'])
 
         # The source_compute should be set on the migration record, but the
         # destination shouldn't be as we never made it to one.
@@ -2591,7 +2591,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
         self.api.post_server_action(server['id'], post)
         # The compute manager will put the migration record into error status
         # when pre_live_migration fails, so wait for that to happen.
-        migration = self._wait_for_migration_status(server, 'error')
+        migration = self._wait_for_migration_status(server, ['error'])
         # The _rollback_live_migration method in the compute manager will reset
         # the task_state on the instance, so wait for that to happen.
         server = self._wait_for_server_parameter(
@@ -2859,7 +2859,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
         }
         self.api.post_server_action(server['id'], post)
 
-        migration = self._wait_for_migration_status(server, 'running')
+        migration = self._wait_for_migration_status(server, ['running'])
 
         self.api.force_complete_migration(server['id'],
                                           migration['id'])
@@ -2902,7 +2902,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
         }
         self.api.post_server_action(server['id'], post)
 
-        migration = self._wait_for_migration_status(server, 'running')
+        migration = self._wait_for_migration_status(server, ['running'])
 
         self.api.delete_migration(server['id'], migration['id'])
         self._wait_for_server_parameter(self.api, server,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index e915699e3403..d254c25ed035 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -3913,6 +3913,87 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             delete_alloc.assert_called_once_with(instance, 'foo',
                                                  node_type='destination')
 
+    @mock.patch('nova.context.RequestContext.elevated')
+    @mock.patch('nova.objects.instance.Instance.drop_migration_context')
+    @mock.patch('nova.objects.instance.Instance.apply_migration_context')
+    @mock.patch('nova.objects.instance.Instance.mutated_migration_context')
+    @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
+    @mock.patch('nova.network.neutronv2.api.API.'
+                'setup_instance_network_on_host')
+    @mock.patch('nova.network.neutronv2.api.API.setup_networks_on_host')
+    @mock.patch('nova.objects.instance.Instance.save')
+    @mock.patch('nova.compute.utils.notify_about_instance_action')
+    @mock.patch('nova.compute.utils.notify_about_instance_usage')
+    @mock.patch('nova.compute.utils.notify_usage_exists')
+    @mock.patch('nova.objects.instance.Instance.image_meta',
+                new_callable=mock.PropertyMock)
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_validate_instance_group_policy')
+    @mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
+    def test_evacuate_late_server_group_policy_check(
+            self, mock_rebuild_claim, mock_set_migration_status,
+            mock_validate_policy, mock_image_meta, mock_notify_exists,
+            mock_notify_legacy, mock_notify, mock_instance_save,
+            mock_setup_networks, mock_setup_instance_network, mock_get_bdms,
+            mock_mutate_migration, mock_apply_migration, mock_drop_migration,
+            mock_context_elevated):
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.info_cache = None
+        elevated_context = mock.Mock()
+        mock_context_elevated.return_value = elevated_context
+        request_spec = objects.RequestSpec()
+        request_spec.scheduler_hints = {'group': [uuids.group]}
+
+        self.compute.rebuild_instance(
+            self.context, instance, None, None, None, None, None,
+            None, recreate=True, scheduled_node='fake-node',
+            request_spec=request_spec)
+
+        mock_validate_policy.assert_called_once_with(
+            elevated_context, instance, {'group': [uuids.group]})
+
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                'delete_allocation_for_evacuated_instance')
+    @mock.patch('nova.context.RequestContext.elevated')
+    @mock.patch('nova.objects.instance.Instance.save')
+    @mock.patch('nova.compute.utils.notify_about_instance_action')
+    @mock.patch('nova.compute.utils.notify_about_instance_usage')
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_validate_instance_group_policy')
+    @mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
+    def test_evacuate_late_server_group_policy_check_fails(
+            self, mock_rebuild_claim, mock_set_migration_status,
+            mock_validate_policy, mock_notify_legacy, mock_notify,
+            mock_instance_save, mock_context_elevated, mock_delete_allocation,
+            mock_instance_fault):
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.info_cache = None
+        elevated_context = mock.Mock()
+        mock_context_elevated.return_value = elevated_context
+        request_spec = objects.RequestSpec()
+        request_spec.scheduler_hints = {'group': [uuids.group]}
+
+        exc = exception.RescheduledException(
+            instance_uuid=instance.uuid, reason='policy violation')
+        mock_validate_policy.side_effect = exc
+
+        self.assertRaises(
+            exception.BuildAbortException, self.compute.rebuild_instance,
+            self.context, instance, None, None, None, None, None, None,
+            recreate=True, scheduled_node='fake-node',
+            request_spec=request_spec)
+
+        mock_validate_policy.assert_called_once_with(
+            elevated_context, instance, {'group': [uuids.group]})
+        mock_delete_allocation.assert_called_once_with(
+            instance, 'fake-node', node_type='destination')
+        mock_notify.assert_called_once_with(
+            elevated_context, instance, 'fake-mini', action='rebuild',
+            bdms=None, exception=exc, phase='error')
+
     def test_rebuild_node_not_updated_if_not_recreate(self):
         node = uuidutils.generate_uuid()  # ironic node uuid
         instance = fake_instance.fake_instance_obj(self.context, node=node)
diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py
index cb72b22d59bb..1075059156c7 100644
--- a/nova/tests/unit/compute/test_rpcapi.py
+++ b/nova/tests/unit/compute/test_rpcapi.py
@@ -517,6 +517,15 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
                 reboot_type='type')
 
     def test_rebuild_instance(self):
+        self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
+                injected_files='None', image_ref='None', orig_image_ref='None',
+                bdms=[], instance=self.fake_instance_obj, host='new_host',
+                orig_sys_metadata=None, recreate=True, on_shared_storage=True,
+                preserve_ephemeral=True, migration=None, node=None,
+                limits=None, request_spec=None, version='4.22')
+
+    def test_rebuild_instance_remove_request_spec(self):
+        self.flags(group='upgrade_levels', compute='4.21')
         self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
                 injected_files='None', image_ref='None', orig_image_ref='None',
                 bdms=[], instance=self.fake_instance_obj, host='new_host',
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 915422976ed1..8e47cb60bcd4 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -366,9 +366,6 @@ class _BaseTaskTestCase(object):
             compute_rebuild_args['node'] = node
             compute_rebuild_args['limits'] = limits
 
-        # Args that are passed in to the method but don't get passed to RPC
-        compute_rebuild_args.pop('request_spec')
-
         return rebuild_args, compute_rebuild_args
 
     @mock.patch.object(objects.InstanceMapping, 'get_by_instance_uuid')
@@ -1312,6 +1309,7 @@ class _BaseTaskTestCase(object):
         select_dest_mock.assert_called_once_with(self.context, fake_spec,
                 inst_uuids, return_objects=True, return_alternates=False)
         compute_args['host'] = expected_host
+        compute_args['request_spec'] = fake_spec
         rebuild_mock.assert_called_once_with(self.context,
                                              instance=inst_obj,
                                              **compute_args)
@@ -1469,6 +1467,7 @@ class _BaseTaskTestCase(object):
             fake_spec, [inst_obj.uuid], return_objects=True,
             return_alternates=False)
         compute_args['host'] = expected_host
+        compute_args['request_spec'] = fake_spec
         rebuild_mock.assert_called_once_with(self.context,
                                              instance=inst_obj,
                                              **compute_args)
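
Reviewer note on the core of the change: the late check works because _validate_instance_group_policy re-reads the server group on the destination compute, after scheduling but before the rebuild claim, and fails if landing here would now violate the policy. The sketch below illustrates only that re-check pattern for the anti-affinity case; it is not Nova's implementation, and InstanceGroup, RescheduledError and validate_anti_affinity are illustrative stand-in names (the real method also covers the affinity policy and serializes concurrent checks).

import uuid


class RescheduledError(Exception):
    """Stand-in for nova.exception.RescheduledException."""


class InstanceGroup(object):
    """Stand-in for nova.objects.InstanceGroup: the policy plus the
    hosts that the group's other members currently run on."""

    def __init__(self, policy, member_hosts):
        self.policy = policy
        self.member_hosts = set(member_hosts)


def validate_anti_affinity(group, instance_uuid, target_host):
    """Re-check anti-affinity on the destination just before claiming.

    By the time an evacuation reaches the compute host, a parallel
    scheduling decision may already have placed another group member
    here; re-checking with fresh data closes that race.
    """
    if group.policy != 'anti-affinity':
        return
    if target_host in group.member_hosts:
        raise RescheduledError(
            'cannot rebuild %s on %s: another member of the '
            'anti-affinity group is already there'
            % (instance_uuid, target_host))


# Two parallel evacuations picked host3; the first one wins and the
# second must abort instead of silently violating the policy.
group = InstanceGroup('anti-affinity', member_hosts=['host3'])
validate_anti_affinity(group, str(uuid.uuid4()), 'host2')  # passes
try:
    validate_anti_affinity(group, str(uuid.uuid4()), 'host3')
except RescheduledError as err:
    print(err)

Rebuild has no reschedule plumbing, so the compute manager converts the failed check into an aborted operation (migration record set to 'failed', instance to ERROR) rather than retrying on another host, which is exactly what the new functional assertions in test_bug_1735407.py verify.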
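The can_send_version() cascade added to ComputeAPI.rebuild_instance() is the usual rolling-upgrade pattern: start at the newest RPC version and strip the arguments each older interface does not know about, so a compute pinned to (or still running) an older version simply never receives request_spec and skips the late check. A minimal sketch of that degradation logic, assuming a fake client in place of the oslo.messaging one (FakeClient, negotiate_rebuild_args and _as_tuple are illustrative names, and the 4.0 branch is abbreviated relative to the real method):

def _as_tuple(version):
    # RPC versions compare as (major, minor) tuples, so 4.22 > 4.5.
    major, minor = version.split('.')
    return int(major), int(minor)


class FakeClient(object):
    """Stand-in for the RPC client returned by self.router.client()."""

    def __init__(self, pinned_version):
        self.pinned = _as_tuple(pinned_version)

    def can_send_version(self, version):
        return _as_tuple(version) <= self.pinned


def negotiate_rebuild_args(client, migration, limits, request_spec):
    """Newest version first, dropping args as older peers require."""
    extra = {'migration': migration,
             'limits': limits,
             'request_spec': request_spec}
    version = '4.22'
    if not client.can_send_version(version):
        version = '4.5'
        del extra['request_spec']
    if not client.can_send_version(version):
        version = '4.0'
        extra.pop('migration')  # the real method strips more for 4.0
    return version, extra


# A compute pinned to 4.5 never sees request_spec, so its rebuild path
# behaves exactly as before the upgrade.
version, extra = negotiate_rebuild_args(
    FakeClient('4.5'), migration=None, limits=None, request_spec=object())
assert version == '4.5' and 'request_spec' not in extra

The SERVICE_VERSION bump to 29 is what lets conductor discover, via the service version history, whether every compute can accept the 4.22 call before it starts sending request_spec.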