Fix leaking exceptions from scheduler utils

scheduler_utils.setup_instance_group() can raise a NoValidHost
exception that the conductor does not handle properly, which causes
the server to get stuck in the scheduling state instead of going to
ERROR.

The function is called several times in conductor/manager.py. This
patch moves those calls into the try blocks that are already present.
The case of missing affinity filters is now reported as
UnsupportedPolicyException, a new exception type added here. This
way the exception raised by scheduler_utils.setup_instance_group()
puts the VM into the ERROR state instead of NOSTATE, as expected.
Quota rollback is also handled properly where needed.

Closes-bug: #1408326
Change-Id: I3a8ba37ded8ac3fa9b5f3f35b09feb82d822d583
Balazs Gibizer authored on 2015-01-07 16:01:38 +01:00; committed by Ildiko Vancsa
parent 76e4d601f4
commit 28aedc604d
5 changed files with 163 additions and 14 deletions
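
To make the pattern in this change easier to follow, here is a minimal, self-contained Python sketch of the conductor-side handling described in the commit message. It is not Nova code: setup_instance_group(), FakeQuotas and migrate_server() below are simplified stand-ins invented for the example; only the idea mirrors the patch (run the group-policy check inside the try block, translate its failure into an ERROR state, roll back the quota reservation, and re-raise).

    class UnsupportedPolicyException(Exception):
        """Raised when a server group policy has no matching scheduler filter."""


    def setup_instance_group(request_spec, supports_affinity=False):
        # Stand-in for scheduler_utils.setup_instance_group(): fail when an
        # affinity policy is requested but the matching filter is not enabled.
        if request_spec.get("group_policy") == "affinity" and not supports_affinity:
            raise UnsupportedPolicyException(
                "ServerGroupAffinityFilter not configured")


    class FakeQuotas(object):
        """Pretend quota reservation; only rollback() is needed here."""
        def rollback(self):
            print("quota reservation rolled back")


    def migrate_server(instance, request_spec, quotas):
        try:
            # The group policy check sits inside the same try block as the
            # scheduler call, so its failure is handled like NoValidHost.
            setup_instance_group(request_spec)
            # ... select_destinations() and the resize/migrate call would
            # follow here in the real conductor.
        except UnsupportedPolicyException:
            # Put the server into ERROR instead of leaving it stuck in
            # "scheduling", release the quota reservation, and re-raise.
            instance["vm_state"] = "error"
            instance["task_state"] = None
            quotas.rollback()
            raise


    if __name__ == "__main__":
        inst = {"vm_state": "active", "task_state": "migrating"}
        try:
            migrate_server(inst, {"group_policy": "affinity"}, FakeQuotas())
        except UnsupportedPolicyException as exc:
            print("instance ended in %s: %s" % (inst["vm_state"], exc))

Running the sketch prints the quota rollback and shows the instance ending in the error state rather than being left stuck in a scheduling state.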

nova/conductor/manager.py

@@ -479,7 +479,8 @@ class ComputeTaskManager(base.Base):
 exception.HypervisorUnavailable,
 exception.InstanceNotRunning,
 exception.MigrationPreCheckError,
-exception.LiveMigrationWithOldNovaNotSafe)
+exception.LiveMigrationWithOldNovaNotSafe,
+exception.UnsupportedPolicyException)
 def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
 flavor, block_migration, disk_over_commit, reservations=None,
 clean_shutdown=True):
@@ -521,15 +522,15 @@
 quotas = objects.Quotas.from_reservations(context,
 reservations,
 instance=instance)
-scheduler_utils.setup_instance_group(context, request_spec,
-filter_properties)
 try:
+scheduler_utils.setup_instance_group(context, request_spec,
+filter_properties)
 scheduler_utils.populate_retry(filter_properties, instance['uuid'])
 hosts = self.scheduler_client.select_destinations(
 context, request_spec, filter_properties)
 host_state = hosts[0]
 except exception.NoValidHost as ex:
-vm_state = instance['vm_state']
+vm_state = instance.vm_state
 if not vm_state:
 vm_state = vm_states.ACTIVE
 updates = {'vm_state': vm_state, 'task_state': None}
@@ -544,6 +545,16 @@
 else:
 msg = _("No valid host found for resize")
 raise exception.NoValidHost(reason=msg)
+except exception.UnsupportedPolicyException as ex:
+with excutils.save_and_reraise_exception():
+vm_state = instance.vm_state
+if not vm_state:
+vm_state = vm_states.ACTIVE
+updates = {'vm_state': vm_state, 'task_state': None}
+self._set_vm_state_and_notify(context, instance.uuid,
+'migrate_server',
+updates, ex, request_spec)
+quotas.rollback()
 try:
 scheduler_utils.populate_filter_properties(filter_properties,
@@ -560,7 +571,7 @@
 clean_shutdown=clean_shutdown)
 except Exception as ex:
 with excutils.save_and_reraise_exception():
-updates = {'vm_state': instance['vm_state'],
+updates = {'vm_state': instance.vm_state,
 'task_state': None}
 self._set_vm_state_and_notify(context, instance.uuid,
 'migrate_server',
@@ -607,7 +618,7 @@
 exception.LiveMigrationWithOldNovaNotSafe) as ex:
 with excutils.save_and_reraise_exception():
 # TODO(johngarbutt) - eventually need instance actions here
-_set_vm_state(context, instance, ex, instance['vm_state'])
+_set_vm_state(context, instance, ex, instance.vm_state)
 except Exception as ex:
 LOG.error(_LE('Migration of instance %(instance_id)s to host'
 ' %(dest)s unexpectedly failed.'),
@@ -624,8 +635,6 @@
 # 2.0 of the RPC API.
 request_spec = scheduler_utils.build_request_spec(context, image,
 instances)
-scheduler_utils.setup_instance_group(context, request_spec,
-filter_properties)
 # TODO(danms): Remove this in version 2.0 of the RPC API
 if (requested_networks and
 not isinstance(requested_networks,
@@ -642,6 +651,8 @@
 filter_properties = dict(filter_properties, instance_type=flavor)
 try:
+scheduler_utils.setup_instance_group(context, request_spec,
+filter_properties)
 # check retry policy. Rather ugly use of instances[0]...
 # but if we've exceeded max retries... then we really only
 # have a single instance.
@@ -748,7 +759,8 @@
 self.compute_rpcapi.unshelve_instance(
 context, instance, host, image=image,
 filter_properties=filter_properties, node=node)
-except exception.NoValidHost:
+except (exception.NoValidHost,
+exception.UnsupportedPolicyException):
 instance.task_state = None
 instance.save()
 LOG.warning(_LW("No valid host found for unshelve instance"),
@@ -781,9 +793,9 @@
 request_spec = scheduler_utils.build_request_spec(context,
 image_ref,
 [instance])
-scheduler_utils.setup_instance_group(context, request_spec,
-filter_properties)
 try:
+scheduler_utils.setup_instance_group(context, request_spec,
+filter_properties)
 hosts = self.scheduler_client.select_destinations(context,
 request_spec,
 filter_properties)
@@ -796,6 +808,15 @@
 'task_state': None}, ex, request_spec)
 LOG.warning(_LW("No valid host found for rebuild"),
 instance=instance)
+except exception.UnsupportedPolicyException as ex:
+with excutils.save_and_reraise_exception():
+self._set_vm_state_and_notify(context, instance.uuid,
+'rebuild_server',
+{'vm_state': instance.vm_state,
+'task_state': None}, ex, request_spec)
+LOG.warning(_LW("Server with unsupported policy "
+"cannot be rebuilt"),
+instance=instance)
 self.compute_rpcapi.rebuild_instance(context,
 instance=instance,

nova/exception.py

@@ -1838,3 +1838,7 @@ class CPUPinningInvalid(Invalid):
 class ImageCPUPinningForbidden(Invalid):
 msg_fmt = _("Image property 'hw_cpu_policy' is not permitted to override "
 "CPU pinning policy set against the flavor")
+class UnsupportedPolicyException(Invalid):
+msg_fmt = _("ServerGroup policy is not supported: %(reason)s")

nova/scheduler/utils.py

@@ -285,11 +285,11 @@ def _get_group_details(context, instance_uuid, user_group_hosts=None):
 if (not _SUPPORTS_AFFINITY and 'affinity' in group.policies):
 msg = _("ServerGroupAffinityFilter not configured")
 LOG.error(msg)
-raise exception.NoValidHost(reason=msg)
+raise exception.UnsupportedPolicyException(reason=msg)
 if (not _SUPPORTS_ANTI_AFFINITY and 'anti-affinity' in group.policies):
 msg = _("ServerGroupAntiAffinityFilter not configured")
 LOG.error(msg)
-raise exception.NoValidHost(reason=msg)
+raise exception.UnsupportedPolicyException(reason=msg)
 group_hosts = set(group.get_hosts(context))
 user_hosts = set(user_group_hosts) if user_group_hosts else set()
 return GroupDetails(hosts=user_hosts | group_hosts,

nova/tests/unit/conductor/test_conductor.py

@@ -1378,6 +1378,49 @@ class _BaseTaskTestCase(object):
 block_device_mapping='block_device_mapping',
 legacy_bdm=False)
+@mock.patch('nova.utils.spawn_n')
+@mock.patch.object(scheduler_utils, 'build_request_spec')
+@mock.patch.object(scheduler_utils, 'setup_instance_group')
+@mock.patch.object(conductor_manager.ComputeTaskManager,
+'_set_vm_state_and_notify')
+def test_build_instances_scheduler_group_failure(self, state_mock,
+sig_mock, bs_mock,
+spawn_mock):
+instances = [fake_instance.fake_instance_obj(self.context)
+for i in range(2)]
+image = {'fake-data': 'should_pass_silently'}
+spec = {'fake': 'specs',
+'instance_properties': instances[0]}
+# NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+# hard so use direct call instead.
+spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+bs_mock.return_value = spec
+exception = exc.UnsupportedPolicyException(reason='fake-reason')
+sig_mock.side_effect = exception
+updates = {'vm_state': vm_states.ERROR, 'task_state': None}
+# build_instances() is a cast, we need to wait for it to complete
+self.useFixture(cast_as_call.CastAsCall(self.stubs))
+self.conductor.build_instances(
+context=self.context,
+instances=instances,
+image=image,
+filter_properties={},
+admin_password='admin_password',
+injected_files='injected_files',
+requested_networks=None,
+security_groups='security_groups',
+block_device_mapping='block_device_mapping',
+legacy_bdm=False)
+calls = []
+for instance in instances:
+calls.append(mock.call(self.context, instance.uuid,
+'build_instances', updates, exception, spec))
+state_mock.assert_has_calls(calls)
 def test_unshelve_instance_on_host(self):
 instance = self._create_fake_instance_obj()
 instance.vm_state = vm_states.SHELVED
@@ -1606,6 +1649,49 @@ class _BaseTaskTestCase(object):
 filter_properties)
 self.assertFalse(rebuild_mock.called)
+@mock.patch('nova.utils.spawn_n')
+@mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI,
+'rebuild_instance')
+@mock.patch.object(scheduler_utils, 'setup_instance_group')
+@mock.patch.object(conductor_manager.scheduler_client.SchedulerClient,
+'select_destinations')
+@mock.patch('nova.scheduler.utils.build_request_spec')
+@mock.patch.object(conductor_manager.ComputeTaskManager,
+'_set_vm_state_and_notify')
+def test_rebuild_instance_with_scheduler_group_failure(self,
+state_mock,
+bs_mock,
+select_dest_mock,
+sig_mock,
+rebuild_mock,
+spawn_mock):
+inst_obj = self._create_fake_instance_obj()
+rebuild_args = self._prepare_rebuild_args({'host': None})
+request_spec = {}
+bs_mock.return_value = request_spec
+# NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+# hard so use direct call instead.
+spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+exception = exc.UnsupportedPolicyException(reason='')
+sig_mock.side_effect = exception
+# build_instances() is a cast, we need to wait for it to complete
+self.useFixture(cast_as_call.CastAsCall(self.stubs))
+self.assertRaises(exc.UnsupportedPolicyException,
+self.conductor.rebuild_instance,
+self.context,
+inst_obj,
+**rebuild_args)
+updates = {'vm_state': vm_states.ACTIVE, 'task_state': None}
+state_mock.assert_called_once_with(self.context, inst_obj.uuid,
+'rebuild_server', updates,
+exception, request_spec)
+self.assertFalse(select_dest_mock.called)
+self.assertFalse(rebuild_mock.called)
 class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
 """ComputeTaskManager Tests."""
@@ -1922,6 +2008,44 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
 clean_shutdown=True)
 self.assertIn('cold migrate', nvh.message)
+@mock.patch.object(compute_utils, 'get_image_metadata')
+@mock.patch('nova.scheduler.utils.build_request_spec')
+@mock.patch.object(scheduler_utils, 'setup_instance_group')
+@mock.patch.object(conductor_manager.ComputeTaskManager,
+'_set_vm_state_and_notify')
+def test_cold_migrate_no_valid_host_in_group(self,
+set_vm_mock,
+sig_mock,
+brs_mock,
+image_mock):
+flavor = flavors.get_flavor_by_name('m1.tiny')
+inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
+vm_state=vm_states.STOPPED,
+instance_type_id=flavor['id'])
+inst_obj = objects.Instance._from_db_object(
+self.context, objects.Instance(), inst,
+expected_attrs=[])
+request_spec = dict(instance_type=dict(extra_specs=dict()),
+instance_properties=dict())
+filter_props = dict(context=None)
+resvs = 'fake-resvs'
+image = 'fake-image'
+exception = exc.UnsupportedPolicyException(reason='')
+image_mock.return_value = image
+brs_mock.return_value = request_spec
+sig_mock.side_effect = exception
+self.assertRaises(exc.UnsupportedPolicyException,
+self.conductor._cold_migrate, self.context,
+inst_obj, flavor, filter_props, [resvs],
+clean_shutdown=True)
+updates = {'vm_state': vm_states.STOPPED, 'task_state': None}
+set_vm_mock.assert_called_once_with(self.context, inst_obj.uuid,
+'migrate_server', updates,
+exception, request_spec)
 def test_cold_migrate_exception_host_in_error_state_and_raise(self):
 inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
 vm_state=vm_states.STOPPED)

nova/tests/unit/scheduler/test_scheduler_utils.py

@@ -305,7 +305,7 @@ class SchedulerUtilsTestCase(test.NoDBTestCase):
 ) as (get_group, get_hosts):
 scheduler_utils._SUPPORTS_ANTI_AFFINITY = None
 scheduler_utils._SUPPORTS_AFFINITY = None
-self.assertRaises(exception.NoValidHost,
+self.assertRaises(exception.UnsupportedPolicyException,
 scheduler_utils._get_group_details,
 self.context, 'fake-uuid')