Fix leaking exceptions from scheduler utils

scheduler_utils.setup_instance_group() can raise a NoValidHost
exception that the conductor does not handle properly, leaving the
server stuck in the scheduling state instead of going to ERROR.

The function is called in several places in conductor/manager.py. This
patch moves those calls into the try blocks that are already present.
Missing affinity filters are now reported as UnsupportedPolicyException,
a new exception type added here. As a result, an exception raised by
scheduler_utils.setup_instance_group() puts the instance into the ERROR
state instead of leaving it in NOSTATE, as expected. Quota rollback is
also performed where needed.
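
For illustration, the error handling applied in the conductor follows
roughly the sketch below. The names used here (setup_instance_group,
schedule, FakeQuotas, the 'error' state) are simplified stand-ins, not
the patched Nova code itself:

    class UnsupportedPolicyException(Exception):
        """Stand-in for the new nova.exception.UnsupportedPolicyException."""

    def setup_instance_group(request_spec, filter_properties):
        # Stand-in for scheduler_utils.setup_instance_group(); fails as
        # if the required ServerGroup(Anti)AffinityFilter were missing.
        raise UnsupportedPolicyException(
            "ServerGroupAffinityFilter not configured")

    def schedule(instance, quotas, request_spec, filter_properties):
        try:
            setup_instance_group(request_spec, filter_properties)
            # ... select destinations and cast to the compute service ...
        except UnsupportedPolicyException:
            # Record the failure so the server does not stay stuck in a
            # scheduling state, roll back the quota reservation, then
            # re-raise for the RPC layer.
            instance['vm_state'] = 'error'
            instance['task_state'] = None
            quotas.rollback()
            raise

    class FakeQuotas(object):
        rolled_back = False

        def rollback(self):
            self.rolled_back = True

    inst = {'vm_state': 'active', 'task_state': 'migrating'}
    quotas = FakeQuotas()
    try:
        schedule(inst, quotas, request_spec={}, filter_properties={})
    except UnsupportedPolicyException:
        # inst is now {'vm_state': 'error', 'task_state': None} and
        # quotas.rolled_back is True.
        pass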

Closes-bug: #1408326
Change-Id: I3a8ba37ded8ac3fa9b5f3f35b09feb82d822d583
Author:    Balazs Gibizer  2015-01-07 16:01:38 +01:00
Committer: Ildiko Vancsa
Commit:    28aedc604d  (parent 76e4d601f4)
5 changed files with 163 additions and 14 deletions

nova/conductor/manager.py

@@ -479,7 +479,8 @@ class ComputeTaskManager(base.Base):
                                    exception.HypervisorUnavailable,
                                    exception.InstanceNotRunning,
                                    exception.MigrationPreCheckError,
-                                   exception.LiveMigrationWithOldNovaNotSafe)
+                                   exception.LiveMigrationWithOldNovaNotSafe,
+                                   exception.UnsupportedPolicyException)
     def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
             flavor, block_migration, disk_over_commit, reservations=None,
             clean_shutdown=True):
@@ -521,15 +522,15 @@ class ComputeTaskManager(base.Base):
         quotas = objects.Quotas.from_reservations(context,
                                                   reservations,
                                                   instance=instance)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             scheduler_utils.populate_retry(filter_properties, instance['uuid'])
             hosts = self.scheduler_client.select_destinations(
                     context, request_spec, filter_properties)
             host_state = hosts[0]
         except exception.NoValidHost as ex:
-            vm_state = instance['vm_state']
+            vm_state = instance.vm_state
             if not vm_state:
                 vm_state = vm_states.ACTIVE
             updates = {'vm_state': vm_state, 'task_state': None}
@@ -544,6 +545,16 @@ class ComputeTaskManager(base.Base):
             else:
                 msg = _("No valid host found for resize")
             raise exception.NoValidHost(reason=msg)
+        except exception.UnsupportedPolicyException as ex:
+            with excutils.save_and_reraise_exception():
+                vm_state = instance.vm_state
+                if not vm_state:
+                    vm_state = vm_states.ACTIVE
+                updates = {'vm_state': vm_state, 'task_state': None}
+                self._set_vm_state_and_notify(context, instance.uuid,
+                                              'migrate_server',
+                                              updates, ex, request_spec)
+                quotas.rollback()

         try:
             scheduler_utils.populate_filter_properties(filter_properties,
@@ -560,7 +571,7 @@ class ComputeTaskManager(base.Base):
                     clean_shutdown=clean_shutdown)
         except Exception as ex:
             with excutils.save_and_reraise_exception():
-                updates = {'vm_state': instance['vm_state'],
+                updates = {'vm_state': instance.vm_state,
                            'task_state': None}
                 self._set_vm_state_and_notify(context, instance.uuid,
                                               'migrate_server',
@@ -607,7 +618,7 @@ class ComputeTaskManager(base.Base):
                 exception.LiveMigrationWithOldNovaNotSafe) as ex:
             with excutils.save_and_reraise_exception():
                 # TODO(johngarbutt) - eventually need instance actions here
-                _set_vm_state(context, instance, ex, instance['vm_state'])
+                _set_vm_state(context, instance, ex, instance.vm_state)
         except Exception as ex:
             LOG.error(_LE('Migration of instance %(instance_id)s to host'
                           ' %(dest)s unexpectedly failed.'),
@@ -624,8 +635,6 @@ class ComputeTaskManager(base.Base):
         # 2.0 of the RPC API.
         request_spec = scheduler_utils.build_request_spec(context, image,
                                                           instances)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         # TODO(danms): Remove this in version 2.0 of the RPC API
         if (requested_networks and
                 not isinstance(requested_networks,
@@ -642,6 +651,8 @@ class ComputeTaskManager(base.Base):
             filter_properties = dict(filter_properties, instance_type=flavor)

         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             # check retry policy. Rather ugly use of instances[0]...
             # but if we've exceeded max retries... then we really only
             # have a single instance.
@@ -748,7 +759,8 @@ class ComputeTaskManager(base.Base):
                     self.compute_rpcapi.unshelve_instance(
                             context, instance, host, image=image,
                             filter_properties=filter_properties, node=node)
-                except exception.NoValidHost:
+                except (exception.NoValidHost,
+                        exception.UnsupportedPolicyException):
                     instance.task_state = None
                     instance.save()
                     LOG.warning(_LW("No valid host found for unshelve instance"),
@@ -781,9 +793,9 @@ class ComputeTaskManager(base.Base):
             request_spec = scheduler_utils.build_request_spec(context,
                                                               image_ref,
                                                               [instance])
-            scheduler_utils.setup_instance_group(context, request_spec,
-                                                 filter_properties)
             try:
+                scheduler_utils.setup_instance_group(context, request_spec,
+                                                     filter_properties)
                 hosts = self.scheduler_client.select_destinations(context,
                                                                   request_spec,
                                                                   filter_properties)
@@ -796,6 +808,15 @@ class ComputeTaskManager(base.Base):
                              'task_state': None}, ex, request_spec)
                     LOG.warning(_LW("No valid host found for rebuild"),
                                 instance=instance)
+            except exception.UnsupportedPolicyException as ex:
+                with excutils.save_and_reraise_exception():
+                    self._set_vm_state_and_notify(context, instance.uuid,
+                                                  'rebuild_server',
+                                                  {'vm_state': instance.vm_state,
+                                                   'task_state': None}, ex, request_spec)
+                    LOG.warning(_LW("Server with unsupported policy "
+                                    "cannot be rebuilt"),
+                                instance=instance)

         self.compute_rpcapi.rebuild_instance(context,
                 instance=instance,

nova/exception.py

@@ -1838,3 +1838,7 @@ class CPUPinningInvalid(Invalid):
 class ImageCPUPinningForbidden(Invalid):
     msg_fmt = _("Image property 'hw_cpu_policy' is not permitted to override "
                 "CPU pinning policy set against the flavor")
+
+
+class UnsupportedPolicyException(Invalid):
+    msg_fmt = _("ServerGroup policy is not supported: %(reason)s")

nova/scheduler/utils.py

@@ -285,11 +285,11 @@ def _get_group_details(context, instance_uuid, user_group_hosts=None):
     if (not _SUPPORTS_AFFINITY and 'affinity' in group.policies):
         msg = _("ServerGroupAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     if (not _SUPPORTS_ANTI_AFFINITY and 'anti-affinity' in group.policies):
         msg = _("ServerGroupAntiAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     group_hosts = set(group.get_hosts(context))
     user_hosts = set(user_group_hosts) if user_group_hosts else set()
     return GroupDetails(hosts=user_hosts | group_hosts,

nova/tests/unit/conductor/test_conductor.py

@@ -1378,6 +1378,49 @@ class _BaseTaskTestCase(object):
                 block_device_mapping='block_device_mapping',
                 legacy_bdm=False)

+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(scheduler_utils, 'build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_build_instances_scheduler_group_failure(self, state_mock,
+                                                     sig_mock, bs_mock,
+                                                     spawn_mock):
+        instances = [fake_instance.fake_instance_obj(self.context)
+                     for i in range(2)]
+        image = {'fake-data': 'should_pass_silently'}
+        spec = {'fake': 'specs',
+                'instance_properties': instances[0]}
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+        bs_mock.return_value = spec
+        exception = exc.UnsupportedPolicyException(reason='fake-reason')
+        sig_mock.side_effect = exception
+        updates = {'vm_state': vm_states.ERROR, 'task_state': None}
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.conductor.build_instances(
+            context=self.context,
+            instances=instances,
+            image=image,
+            filter_properties={},
+            admin_password='admin_password',
+            injected_files='injected_files',
+            requested_networks=None,
+            security_groups='security_groups',
+            block_device_mapping='block_device_mapping',
+            legacy_bdm=False)
+
+        calls = []
+        for instance in instances:
+            calls.append(mock.call(self.context, instance.uuid,
+                                   'build_instances', updates, exception,
+                                   spec))
+        state_mock.assert_has_calls(calls)
+
     def test_unshelve_instance_on_host(self):
         instance = self._create_fake_instance_obj()
         instance.vm_state = vm_states.SHELVED
@@ -1606,6 +1649,49 @@ class _BaseTaskTestCase(object):
                                                  filter_properties)
         self.assertFalse(rebuild_mock.called)

+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI,
+                       'rebuild_instance')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.scheduler_client.SchedulerClient,
+                       'select_destinations')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_rebuild_instance_with_scheduler_group_failure(self,
+                                                           state_mock,
+                                                           bs_mock,
+                                                           select_dest_mock,
+                                                           sig_mock,
+                                                           rebuild_mock,
+                                                           spawn_mock):
+        inst_obj = self._create_fake_instance_obj()
+        rebuild_args = self._prepare_rebuild_args({'host': None})
+        request_spec = {}
+        bs_mock.return_value = request_spec
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+
+        exception = exc.UnsupportedPolicyException(reason='')
+        sig_mock.side_effect = exception
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor.rebuild_instance,
+                          self.context,
+                          inst_obj,
+                          **rebuild_args)
+        updates = {'vm_state': vm_states.ACTIVE, 'task_state': None}
+        state_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                           'rebuild_server', updates,
+                                           exception, request_spec)
+        self.assertFalse(select_dest_mock.called)
+        self.assertFalse(rebuild_mock.called)
+

 class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
     """ComputeTaskManager Tests."""
@@ -1922,6 +2008,44 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
                           clean_shutdown=True)
         self.assertIn('cold migrate', nvh.message)

+    @mock.patch.object(compute_utils, 'get_image_metadata')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_cold_migrate_no_valid_host_in_group(self,
+                                                 set_vm_mock,
+                                                 sig_mock,
+                                                 brs_mock,
+                                                 image_mock):
+        flavor = flavors.get_flavor_by_name('m1.tiny')
+        inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
+                                              vm_state=vm_states.STOPPED,
+                                              instance_type_id=flavor['id'])
+        inst_obj = objects.Instance._from_db_object(
+            self.context, objects.Instance(), inst,
+            expected_attrs=[])
+        request_spec = dict(instance_type=dict(extra_specs=dict()),
+                            instance_properties=dict())
+        filter_props = dict(context=None)
+        resvs = 'fake-resvs'
+        image = 'fake-image'
+        exception = exc.UnsupportedPolicyException(reason='')
+
+        image_mock.return_value = image
+        brs_mock.return_value = request_spec
+        sig_mock.side_effect = exception
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor._cold_migrate, self.context,
+                          inst_obj, flavor, filter_props, [resvs],
+                          clean_shutdown=True)
+
+        updates = {'vm_state': vm_states.STOPPED, 'task_state': None}
+        set_vm_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                            'migrate_server', updates,
+                                            exception, request_spec)
+
     def test_cold_migrate_exception_host_in_error_state_and_raise(self):
         inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
                                               vm_state=vm_states.STOPPED)

nova/tests/unit/scheduler/test_scheduler_utils.py

@@ -305,7 +305,7 @@ class SchedulerUtilsTestCase(test.NoDBTestCase):
         ) as (get_group, get_hosts):
             scheduler_utils._SUPPORTS_ANTI_AFFINITY = None
             scheduler_utils._SUPPORTS_AFFINITY = None
-            self.assertRaises(exception.NoValidHost,
+            self.assertRaises(exception.UnsupportedPolicyException,
                               scheduler_utils._get_group_details,
                               self.context, 'fake-uuid')