Merge "Add late server group policy check to rebuild"

Zuul 2018-02-07 03:46:34 +00:00 committed by Gerrit Code Review
commit b37aa54153
12 changed files with 165 additions and 56 deletions

View File

@@ -326,7 +326,7 @@ following:
.. note:: This has been resolved in the Queens release [#]_.
#. Instance affinity reporting from the compute nodes to scheduler
#. The late anti-affinity check
#. The late anti-affinity check during server create and evacuate
#. Querying host aggregates from the cell
#. Attaching a volume and ``[cinder]/cross_az_attach=False``

View File

@@ -485,7 +485,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
target = messaging.Target(version='4.21')
target = messaging.Target(version='4.22')
# How long to wait in seconds before re-issuing a shutdown
# signal to an instance during power off. The overall
@@ -2804,7 +2804,7 @@ class ComputeManager(manager.Manager):
injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage=None,
preserve_ephemeral=False, migration=None,
scheduled_node=None, limits=None):
scheduled_node=None, limits=None, request_spec=None):
"""Destroy and re-make this instance.
A 'rebuild' effectively purges all existing data from the system and
@@ -2834,6 +2834,8 @@ class ComputeManager(manager.Manager):
None
:param limits: Overcommit limits set by the scheduler. If a host was
specified by the user, this will be None
:param request_spec: a RequestSpec object used to schedule the instance
"""
context = context.elevated()
@@ -2886,11 +2888,22 @@ class ComputeManager(manager.Manager):
claim_ctxt, context, instance, orig_image_ref,
image_ref, injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage, preserve_ephemeral,
migration)
except exception.ComputeResourcesUnavailable as e:
LOG.debug("Could not rebuild instance on this host, not "
"enough resources available.", instance=instance)
migration, request_spec)
except (exception.ComputeResourcesUnavailable,
exception.RescheduledException) as e:
if isinstance(e, exception.ComputeResourcesUnavailable):
LOG.debug("Could not rebuild instance on this host, not "
"enough resources available.", instance=instance)
else:
# RescheduledException is raised by the late server group
# policy check during evacuation if parallel scheduling
# violated the policy.
# We catch the RescheduledException here, but we don't have
# the plumbing to do an actual reschedule, so we abort the
# operation.
LOG.debug("Could not rebuild instance on this host, "
"late server group check failed.",
instance=instance)
# NOTE(ndipanov): We just abort the build for now and leave a
# migration record for potential cleanup later
self._set_migration_status(migration, 'failed')
@@ -2949,10 +2962,18 @@ class ComputeManager(manager.Manager):
image_ref, injected_files, new_pass,
orig_sys_metadata, bdms, recreate,
on_shared_storage, preserve_ephemeral,
migration):
migration, request_spec):
orig_vm_state = instance.vm_state
if recreate:
if request_spec:
# NOTE(gibi): Do a late check of the server group policy as
# parallel scheduling could violate that policy. This will
# cause the evacuation to fail as rebuild does not implement
# reschedule.
hints = self._get_scheduler_hints({}, request_spec)
self._validate_instance_group_policy(context, instance, hints)
if not self.driver.capabilities["supports_recreate"]:
raise exception.InstanceRecreateNotSupported
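To make the intent of the late check above concrete, here is a minimal, self-contained sketch of the kind of validation _validate_instance_group_policy performs; the check_group_policy helper and its inputs are illustrative assumptions, not Nova's actual implementation or signatures.

# Illustrative sketch only, not Nova's implementation: decide whether placing
# the instance on target_host keeps the group policy, given the hosts that
# currently run the other group members.
def check_group_policy(policy, member_hosts, target_host):
    if policy == 'anti-affinity':
        # No other member of the group may already run on the target host.
        return target_host not in member_hosts
    if policy == 'affinity':
        # Every member must share the same host.
        return all(host == target_host for host in member_hosts)
    return True  # no policy to enforce


# A parallel evacuation already moved another member to 'host3', so a second
# anti-affinity member must not land there as well; the late check catches it.
assert check_group_policy('anti-affinity', {'host3'}, 'host3') is False
assert check_group_policy('anti-affinity', {'host3'}, 'host2') is True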

View File

@@ -340,6 +340,7 @@ class ComputeAPI(object):
* 4.20 - Add multiattach argument to reserve_block_device_name().
* 4.21 - prep_resize() now gets a 'host_list' parameter representing
potential alternate hosts for retries within a cell.
* 4.22 - Add request_spec to rebuild_instance()
'''
VERSION_ALIASES = {
@@ -811,7 +812,7 @@ class ComputeAPI(object):
image_ref, orig_image_ref, orig_sys_metadata, bdms,
recreate=False, on_shared_storage=False, host=None, node=None,
preserve_ephemeral=False, migration=None, limits=None,
kwargs=None):
request_spec=None, kwargs=None):
# NOTE(edleafe): compute nodes can only use the dict form of limits.
if isinstance(limits, objects.SchedulerLimits):
limits = limits.to_dict()
@@ -820,9 +821,13 @@ class ComputeAPI(object):
extra = {'preserve_ephemeral': preserve_ephemeral,
'migration': migration,
'scheduled_node': node,
'limits': limits}
version = '4.5'
'limits': limits,
'request_spec': request_spec}
version = '4.22'
client = self.router.client(ctxt)
if not client.can_send_version(version):
version = '4.5'
del extra['request_spec']
if not client.can_send_version(version):
version = '4.0'
extra.pop('migration')
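The hunk above uses the usual can_send_version() fallback: try the newest RPC version first, then drop the arguments an older compute cannot accept. A small stand-alone sketch of that pattern follows; FakeClient is an illustrative stand-in, not oslo.messaging's real client class.

# Stand-alone sketch of the version-negotiation pattern above. FakeClient is
# a stand-in for illustration, not oslo.messaging's real client class.
class FakeClient(object):
    def __init__(self, max_version):
        self.max_version = max_version

    def can_send_version(self, version):
        # Compare dotted versions numerically so that '4.22' > '4.5'.
        return (tuple(int(p) for p in version.split('.')) <=
                tuple(int(p) for p in self.max_version.split('.')))


def negotiate_rebuild_call(client, request_spec, migration):
    extra = {'migration': migration, 'request_spec': request_spec}
    version = '4.22'
    if not client.can_send_version(version):
        # Older computes do not know about request_spec; drop it.
        version = '4.5'
        del extra['request_spec']
    if not client.can_send_version(version):
        # Even older computes do not accept the migration argument either.
        version = '4.0'
        extra.pop('migration')
    return version, extra


# A cell pinned to 4.5 gets the cast without the new request_spec argument.
assert negotiate_rebuild_call(FakeClient('4.5'), object(), None) == \
    ('4.5', {'migration': None})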

View File

@@ -988,7 +988,8 @@ class ComputeTaskManager(base.Base):
on_shared_storage=on_shared_storage,
preserve_ephemeral=preserve_ephemeral,
migration=migration,
host=host, node=node, limits=limits)
host=host, node=node, limits=limits,
request_spec=request_spec)
# TODO(avolkov): move method to bdm
@staticmethod

View File

@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 28
SERVICE_VERSION = 29
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -127,6 +127,8 @@ SERVICE_VERSION_HISTORY = (
{'compute_rpc': '4.20'},
# Version 28: Adds a 'host_list' parameter to prep_resize()
{'compute_rpc': '4.21'},
# Version 29: Compute RPC version 4.22
{'compute_rpc': '4.22'},
)
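For context, the history tuple bumped above is what lets the rest of Nova translate a reported service version into the compute RPC version that service understands. A simplified, self-contained sketch of that lookup follows; it is not Nova's actual helper, and the BASE_VERSION offset exists only because the sketch trims the history to its last three entries.

# Simplified sketch, not Nova's actual code: look up the compute RPC version
# that a given reported service version supports. The history here is trimmed
# to the last three entries, hence the BASE_VERSION offset.
SKETCH_SERVICE_VERSION_HISTORY = (
    {'compute_rpc': '4.20'},  # service version 27
    {'compute_rpc': '4.21'},  # service version 28: host_list in prep_resize()
    {'compute_rpc': '4.22'},  # service version 29: request_spec in rebuild_instance()
)
BASE_VERSION = 27


def compute_rpc_for(service_version):
    return SKETCH_SERVICE_VERSION_HISTORY[service_version - BASE_VERSION]['compute_rpc']


# A compute still reporting service version 28 only speaks RPC 4.21, so the
# new request_spec argument must not be sent to it.
assert compute_rpc_for(28) == '4.21'
assert compute_rpc_for(29) == '4.22'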

View File

@@ -316,8 +316,8 @@ class InstanceHelperMixin(object):
self.fail('Timed out waiting for %s failure event. Current '
'instance actions: %s' % (event_name, actions))
def _wait_for_migration_status(self, server, expected_status):
"""Waits for a migration record with the given status to be found
def _wait_for_migration_status(self, server, expected_statuses):
"""Waits for a migration record with the given statuses to be found
for the given server, else the test fails. The migration record, if
found, is returned.
"""
@@ -325,13 +325,13 @@ class InstanceHelperMixin(object):
if api is None:
api = self.api
statuses = [status.lower() for status in expected_statuses]
for attempt in range(10):
migrations = api.api_get('/os-migrations').body['migrations']
for migration in migrations:
if (migration['instance_uuid'] == server['id'] and
migration['status'].lower() ==
expected_status.lower()):
migration['status'].lower() in statuses):
return migration
time.sleep(0.5)
self.fail('Timed out waiting for migration with status "%s" for '
'instance: %s' % (expected_status, server['id']))
'instance: %s' % (expected_statuses, server['id']))
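The helper now accepts a set of acceptable statuses rather than a single one, which the parallel-evacuation test below relies on because racing evacuations can legitimately finish as either 'done' or 'failed'. Stripped of the test-framework plumbing, the generalised polling pattern can be sketched on its own; the names below are illustrative, not the test code itself.

import time


# Illustrative sketch of the generalised polling loop: return the first
# migration record for the instance whose status matches any accepted status,
# or give up after a bounded number of attempts.
def wait_for_migration_status(list_migrations, instance_uuid,
                              expected_statuses, attempts=10, delay=0.5):
    statuses = {status.lower() for status in expected_statuses}
    for _ in range(attempts):
        for migration in list_migrations():
            if (migration['instance_uuid'] == instance_uuid and
                    migration['status'].lower() in statuses):
                return migration
        time.sleep(delay)
    raise AssertionError('Timed out waiting for migration with status %s for '
                         'instance: %s' % (sorted(statuses), instance_uuid))


# Example: a racing evacuation may end up either 'done' or 'failed'.
records = [{'instance_uuid': 'uuid-1', 'status': 'Failed'}]
assert wait_for_migration_status(lambda: records, 'uuid-1',
                                 ['done', 'failed'])['status'] == 'Failed'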

View File

@@ -148,8 +148,8 @@ class TestParallelEvacuationWithServerGroup(
# that. The only thing that happens after the instance.host is set to
# the target host is the migration status being set to 'done'. So we
# have to wait for that to avoid asserting the wrong host below.
self._wait_for_migration_status(server1, 'done')
self._wait_for_migration_status(server2, 'done')
self._wait_for_migration_status(server1, ['done', 'failed'])
self._wait_for_migration_status(server2, ['done', 'failed'])
# get the servers again to have the latest information about their
# hosts
@@ -158,28 +158,19 @@ class TestParallelEvacuationWithServerGroup(
# assert that the anti-affinity policy is enforced during the
# evacuation
# NOTE(gibi): This shows bug 1735407 as both instances end up on the
# same host.
self.assertEqual(server1['OS-EXT-SRV-ATTR:host'],
server2['OS-EXT-SRV-ATTR:host'])
# After the bug 1735407 is fixed the following is expected:
# self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'],
# server2['OS-EXT-SRV-ATTR:host'])
self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'],
server2['OS-EXT-SRV-ATTR:host'])
# assert that one of the evacuations was successful and that its server
# is moved to another host, while the evacuation of the other server
# failed
# NOTE(gibi): This shows bug 1735407 as both instances are moved
self.assertNotIn(server1['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'})
self.assertNotIn(server2['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'})
# After fixing the bug 1735407 the following is expected
# if server1['status'] == 'ERROR':
# failed_server = server1
# evacuated_server = server2
# else:
# failed_server = server2
# evacuated_server = server1
# self.assertEqual('ERROR', failed_server['status'])
# self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host'])
# self.assertEqual('ACTIVE', evacuated_server['status'])
# self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host'])
if server1['status'] == 'ERROR':
failed_server = server1
evacuated_server = server2
else:
failed_server = server2
evacuated_server = server1
self.assertEqual('ERROR', failed_server['status'])
self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host'])
self.assertEqual('ACTIVE', evacuated_server['status'])
self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host'])

View File

@@ -427,7 +427,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')
@@ -453,7 +453,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -477,7 +477,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -624,7 +624,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')
@@ -651,7 +651,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -675,7 +675,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -817,7 +817,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')

View File

@@ -2588,7 +2588,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
# clears the task_state. The migration record status is set to
# 'error', so that's what we need to look for to know when this
# is done.
migration = self._wait_for_migration_status(server, 'error')
migration = self._wait_for_migration_status(server, ['error'])
# The source_compute should be set on the migration record, but the
# destination shouldn't be as we never made it to one.
@@ -2648,7 +2648,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
self.api.post_server_action(server['id'], post)
# The compute manager will put the migration record into error status
# when pre_live_migration fails, so wait for that to happen.
migration = self._wait_for_migration_status(server, 'error')
migration = self._wait_for_migration_status(server, ['error'])
# The _rollback_live_migration method in the compute manager will reset
# the task_state on the instance, so wait for that to happen.
server = self._wait_for_server_parameter(
@@ -2916,7 +2916,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
}
self.api.post_server_action(server['id'], post)
migration = self._wait_for_migration_status(server, 'running')
migration = self._wait_for_migration_status(server, ['running'])
self.api.force_complete_migration(server['id'],
migration['id'])
@@ -2959,7 +2959,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
}
self.api.post_server_action(server['id'], post)
migration = self._wait_for_migration_status(server, 'running')
migration = self._wait_for_migration_status(server, ['running'])
self.api.delete_migration(server['id'], migration['id'])
self._wait_for_server_parameter(self.api, server,

View File

@@ -3935,6 +3935,87 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
delete_alloc.assert_called_once_with(instance, 'foo',
node_type='destination')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch('nova.objects.instance.Instance.drop_migration_context')
@mock.patch('nova.objects.instance.Instance.apply_migration_context')
@mock.patch('nova.objects.instance.Instance.mutated_migration_context')
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.network.neutronv2.api.API.'
'setup_instance_network_on_host')
@mock.patch('nova.network.neutronv2.api.API.setup_networks_on_host')
@mock.patch('nova.objects.instance.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.compute.utils.notify_usage_exists')
@mock.patch('nova.objects.instance.Instance.image_meta',
new_callable=mock.PropertyMock)
@mock.patch('nova.compute.manager.ComputeManager.'
'_validate_instance_group_policy')
@mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
def test_evacuate_late_server_group_policy_check(
self, mock_rebuild_claim, mock_set_migration_status,
mock_validate_policy, mock_image_meta, mock_notify_exists,
mock_notify_legacy, mock_notify, mock_instance_save,
mock_setup_networks, mock_setup_instance_network, mock_get_bdms,
mock_mutate_migration, mock_apply_migration, mock_drop_migration,
mock_context_elevated):
instance = fake_instance.fake_instance_obj(self.context)
instance.info_cache = None
elevated_context = mock.Mock()
mock_context_elevated.return_value = elevated_context
request_spec = objects.RequestSpec()
request_spec.scheduler_hints = {'group': [uuids.group]}
self.compute.rebuild_instance(
self.context, instance, None, None, None, None, None,
None, recreate=True, scheduled_node='fake-node',
request_spec=request_spec)
mock_validate_policy.assert_called_once_with(
elevated_context, instance, {'group': [uuids.group]})
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'delete_allocation_for_evacuated_instance')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch('nova.objects.instance.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.compute.manager.ComputeManager.'
'_validate_instance_group_policy')
@mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
def test_evacuate_late_server_group_policy_check_fails(
self, mock_rebuild_claim, mock_set_migration_status,
mock_validate_policy, mock_notify_legacy, mock_notify,
mock_instance_save, mock_context_elevated, mock_delete_allocation,
mock_instance_fault):
instance = fake_instance.fake_instance_obj(self.context)
instance.info_cache = None
elevated_context = mock.Mock()
mock_context_elevated.return_value = elevated_context
request_spec = objects.RequestSpec()
request_spec.scheduler_hints = {'group': [uuids.group]}
exc = exception.RescheduledException(
instance_uuid=instance.uuid, reason='policy violation')
mock_validate_policy.side_effect = exc
self.assertRaises(
exception.BuildAbortException, self.compute.rebuild_instance,
self.context, instance, None, None, None, None, None, None,
recreate=True, scheduled_node='fake-node',
request_spec=request_spec)
mock_validate_policy.assert_called_once_with(
elevated_context, instance, {'group': [uuids.group]})
mock_delete_allocation.assert_called_once_with(
instance, 'fake-node', node_type='destination')
mock_notify.assert_called_once_with(
elevated_context, instance, 'fake-mini', action='rebuild',
bdms=None, exception=exc, phase='error')
def test_rebuild_node_not_updated_if_not_recreate(self):
node = uuidutils.generate_uuid() # ironic node uuid
instance = fake_instance.fake_instance_obj(self.context, node=node)

View File

@@ -517,6 +517,15 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
reboot_type='type')
def test_rebuild_instance(self):
self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
injected_files='None', image_ref='None', orig_image_ref='None',
bdms=[], instance=self.fake_instance_obj, host='new_host',
orig_sys_metadata=None, recreate=True, on_shared_storage=True,
preserve_ephemeral=True, migration=None, node=None,
limits=None, request_spec=None, version='4.22')
def test_rebuild_instance_remove_request_spec(self):
self.flags(group='upgrade_levels', compute='4.21')
self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
injected_files='None', image_ref='None', orig_image_ref='None',
bdms=[], instance=self.fake_instance_obj, host='new_host',

View File

@@ -366,9 +366,6 @@ class _BaseTaskTestCase(object):
compute_rebuild_args['node'] = node
compute_rebuild_args['limits'] = limits
# Args that are passed in to the method but don't get passed to RPC
compute_rebuild_args.pop('request_spec')
return rebuild_args, compute_rebuild_args
@mock.patch.object(objects.InstanceMapping, 'get_by_instance_uuid')
@@ -1312,6 +1309,7 @@ class _BaseTaskTestCase(object):
select_dest_mock.assert_called_once_with(self.context, fake_spec,
inst_uuids, return_objects=True, return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)
@@ -1469,6 +1467,7 @@ class _BaseTaskTestCase(object):
fake_spec, [inst_obj.uuid], return_objects=True,
return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)