Fix leaking exceptions from scheduler utils

scheduler_utils.setup_instance_group() can raise a NoValidHost
exception that the conductor does not handle properly, leaving the
server stuck in the scheduling state instead of going to ERROR.

The function is called in several places in conductor/manager.py. This
patch moves those calls into the try blocks that are already present.
Missing affinity filters are now reported as UnsupportedPolicyException,
a new exception type added here. As a result, an exception raised by
scheduler_utils.setup_instance_group() puts the instance into the ERROR
state instead of leaving it in NOSTATE, as expected. Quota rollback is
also performed where needed.
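
For illustration, the error handling applied in the conductor follows
roughly the sketch below. The names used here (setup_instance_group,
schedule, FakeQuotas, the 'error' state) are simplified stand-ins, not
the patched Nova code itself:

    class UnsupportedPolicyException(Exception):
        """Stand-in for the new nova.exception.UnsupportedPolicyException."""

    def setup_instance_group(request_spec, filter_properties):
        # Stand-in for scheduler_utils.setup_instance_group(); fails as
        # if the required ServerGroup(Anti)AffinityFilter were missing.
        raise UnsupportedPolicyException(
            "ServerGroupAffinityFilter not configured")

    def schedule(instance, quotas, request_spec, filter_properties):
        try:
            setup_instance_group(request_spec, filter_properties)
            # ... select destinations and cast to the compute service ...
        except UnsupportedPolicyException:
            # Record the failure so the server does not stay stuck in a
            # scheduling state, roll back the quota reservation, then
            # re-raise for the RPC layer.
            instance['vm_state'] = 'error'
            instance['task_state'] = None
            quotas.rollback()
            raise

    class FakeQuotas(object):
        rolled_back = False

        def rollback(self):
            self.rolled_back = True

    inst = {'vm_state': 'active', 'task_state': 'migrating'}
    quotas = FakeQuotas()
    try:
        schedule(inst, quotas, request_spec={}, filter_properties={})
    except UnsupportedPolicyException:
        # inst is now {'vm_state': 'error', 'task_state': None} and
        # quotas.rolled_back is True.
        pass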

Closes-bug: #1408326
Change-Id: I3a8ba37ded8ac3fa9b5f3f35b09feb82d822d583
Author:    Balazs Gibizer  2015-01-07 16:01:38 +01:00
Committer: Ildiko Vancsa
Commit:    28aedc604d  (parent 76e4d601f4)
5 changed files with 163 additions and 14 deletions

nova/conductor/manager.py

@@ -479,7 +479,8 @@ class ComputeTaskManager(base.Base):
                                    exception.HypervisorUnavailable,
                                    exception.InstanceNotRunning,
                                    exception.MigrationPreCheckError,
-                                   exception.LiveMigrationWithOldNovaNotSafe)
+                                   exception.LiveMigrationWithOldNovaNotSafe,
+                                   exception.UnsupportedPolicyException)
     def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
             flavor, block_migration, disk_over_commit, reservations=None,
             clean_shutdown=True):
@@ -521,15 +522,15 @@ class ComputeTaskManager(base.Base):
         quotas = objects.Quotas.from_reservations(context,
                                                   reservations,
                                                   instance=instance)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             scheduler_utils.populate_retry(filter_properties, instance['uuid'])
             hosts = self.scheduler_client.select_destinations(
                     context, request_spec, filter_properties)
             host_state = hosts[0]
         except exception.NoValidHost as ex:
-            vm_state = instance['vm_state']
+            vm_state = instance.vm_state
             if not vm_state:
                 vm_state = vm_states.ACTIVE
             updates = {'vm_state': vm_state, 'task_state': None}
@@ -544,6 +545,16 @@ class ComputeTaskManager(base.Base):
             else:
                 msg = _("No valid host found for resize")
             raise exception.NoValidHost(reason=msg)
+        except exception.UnsupportedPolicyException as ex:
+            with excutils.save_and_reraise_exception():
+                vm_state = instance.vm_state
+                if not vm_state:
+                    vm_state = vm_states.ACTIVE
+                updates = {'vm_state': vm_state, 'task_state': None}
+                self._set_vm_state_and_notify(context, instance.uuid,
+                                              'migrate_server',
+                                              updates, ex, request_spec)
+                quotas.rollback()

         try:
             scheduler_utils.populate_filter_properties(filter_properties,
@@ -560,7 +571,7 @@ class ComputeTaskManager(base.Base):
                     clean_shutdown=clean_shutdown)
         except Exception as ex:
             with excutils.save_and_reraise_exception():
-                updates = {'vm_state': instance['vm_state'],
+                updates = {'vm_state': instance.vm_state,
                            'task_state': None}
                 self._set_vm_state_and_notify(context, instance.uuid,
                                               'migrate_server',
@@ -607,7 +618,7 @@ class ComputeTaskManager(base.Base):
                 exception.LiveMigrationWithOldNovaNotSafe) as ex:
             with excutils.save_and_reraise_exception():
                 # TODO(johngarbutt) - eventually need instance actions here
-                _set_vm_state(context, instance, ex, instance['vm_state'])
+                _set_vm_state(context, instance, ex, instance.vm_state)
         except Exception as ex:
             LOG.error(_LE('Migration of instance %(instance_id)s to host'
                           ' %(dest)s unexpectedly failed.'),
@@ -624,8 +635,6 @@ class ComputeTaskManager(base.Base):
         # 2.0 of the RPC API.
         request_spec = scheduler_utils.build_request_spec(context, image,
                                                           instances)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         # TODO(danms): Remove this in version 2.0 of the RPC API
         if (requested_networks and
                 not isinstance(requested_networks,
@@ -642,6 +651,8 @@ class ComputeTaskManager(base.Base):
             filter_properties = dict(filter_properties, instance_type=flavor)

         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             # check retry policy. Rather ugly use of instances[0]...
             # but if we've exceeded max retries... then we really only
             # have a single instance.
@@ -748,7 +759,8 @@ class ComputeTaskManager(base.Base):
                     self.compute_rpcapi.unshelve_instance(
                             context, instance, host, image=image,
                             filter_properties=filter_properties, node=node)
-                except exception.NoValidHost:
+                except (exception.NoValidHost,
+                        exception.UnsupportedPolicyException):
                     instance.task_state = None
                     instance.save()
                     LOG.warning(_LW("No valid host found for unshelve instance"),
@@ -781,9 +793,9 @@ class ComputeTaskManager(base.Base):
             request_spec = scheduler_utils.build_request_spec(context,
                                                               image_ref,
                                                               [instance])
-            scheduler_utils.setup_instance_group(context, request_spec,
-                                                 filter_properties)
             try:
+                scheduler_utils.setup_instance_group(context, request_spec,
+                                                     filter_properties)
                 hosts = self.scheduler_client.select_destinations(context,
                                                                   request_spec,
                                                                   filter_properties)
@@ -796,6 +808,15 @@ class ComputeTaskManager(base.Base):
                              'task_state': None}, ex, request_spec)
                     LOG.warning(_LW("No valid host found for rebuild"),
                                 instance=instance)
+            except exception.UnsupportedPolicyException as ex:
+                with excutils.save_and_reraise_exception():
+                    self._set_vm_state_and_notify(context, instance.uuid,
+                                                  'rebuild_server',
+                                                  {'vm_state': instance.vm_state,
+                                                   'task_state': None}, ex, request_spec)
+                    LOG.warning(_LW("Server with unsupported policy "
+                                    "cannot be rebuilt"),
+                                instance=instance)

         self.compute_rpcapi.rebuild_instance(context,
                 instance=instance,

nova/exception.py

@@ -1838,3 +1838,7 @@ class CPUPinningInvalid(Invalid):
 class ImageCPUPinningForbidden(Invalid):
     msg_fmt = _("Image property 'hw_cpu_policy' is not permitted to override "
                 "CPU pinning policy set against the flavor")
+
+
+class UnsupportedPolicyException(Invalid):
+    msg_fmt = _("ServerGroup policy is not supported: %(reason)s")

nova/scheduler/utils.py

@@ -285,11 +285,11 @@ def _get_group_details(context, instance_uuid, user_group_hosts=None):
     if (not _SUPPORTS_AFFINITY and 'affinity' in group.policies):
         msg = _("ServerGroupAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     if (not _SUPPORTS_ANTI_AFFINITY and 'anti-affinity' in group.policies):
         msg = _("ServerGroupAntiAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     group_hosts = set(group.get_hosts(context))
     user_hosts = set(user_group_hosts) if user_group_hosts else set()
     return GroupDetails(hosts=user_hosts | group_hosts,

nova/tests/unit/conductor/test_conductor.py

@@ -1378,6 +1378,49 @@ class _BaseTaskTestCase(object):
                 block_device_mapping='block_device_mapping',
                 legacy_bdm=False)

+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(scheduler_utils, 'build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_build_instances_scheduler_group_failure(self, state_mock,
+                                                     sig_mock, bs_mock,
+                                                     spawn_mock):
+        instances = [fake_instance.fake_instance_obj(self.context)
+                     for i in range(2)]
+        image = {'fake-data': 'should_pass_silently'}
+        spec = {'fake': 'specs',
+                'instance_properties': instances[0]}
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+        bs_mock.return_value = spec
+        exception = exc.UnsupportedPolicyException(reason='fake-reason')
+        sig_mock.side_effect = exception
+        updates = {'vm_state': vm_states.ERROR, 'task_state': None}
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.conductor.build_instances(
+            context=self.context,
+            instances=instances,
+            image=image,
+            filter_properties={},
+            admin_password='admin_password',
+            injected_files='injected_files',
+            requested_networks=None,
+            security_groups='security_groups',
+            block_device_mapping='block_device_mapping',
+            legacy_bdm=False)
+
+        calls = []
+        for instance in instances:
+            calls.append(mock.call(self.context, instance.uuid,
+                                   'build_instances', updates, exception,
+                                   spec))
+        state_mock.assert_has_calls(calls)
+
     def test_unshelve_instance_on_host(self):
         instance = self._create_fake_instance_obj()
         instance.vm_state = vm_states.SHELVED
@@ -1606,6 +1649,49 @@ class _BaseTaskTestCase(object):
                                                  filter_properties)
         self.assertFalse(rebuild_mock.called)

+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI,
+                       'rebuild_instance')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.scheduler_client.SchedulerClient,
+                       'select_destinations')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_rebuild_instance_with_scheduler_group_failure(self,
+                                                           state_mock,
+                                                           bs_mock,
+                                                           select_dest_mock,
+                                                           sig_mock,
+                                                           rebuild_mock,
+                                                           spawn_mock):
+        inst_obj = self._create_fake_instance_obj()
+        rebuild_args = self._prepare_rebuild_args({'host': None})
+        request_spec = {}
+        bs_mock.return_value = request_spec
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+
+        exception = exc.UnsupportedPolicyException(reason='')
+        sig_mock.side_effect = exception
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor.rebuild_instance,
+                          self.context,
+                          inst_obj,
+                          **rebuild_args)
+        updates = {'vm_state': vm_states.ACTIVE, 'task_state': None}
+        state_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                           'rebuild_server', updates,
+                                           exception, request_spec)
+        self.assertFalse(select_dest_mock.called)
+        self.assertFalse(rebuild_mock.called)
+

 class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
     """ComputeTaskManager Tests."""
@@ -1922,6 +2008,44 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
                           clean_shutdown=True)
         self.assertIn('cold migrate', nvh.message)

+    @mock.patch.object(compute_utils, 'get_image_metadata')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_cold_migrate_no_valid_host_in_group(self,
+                                                 set_vm_mock,
+                                                 sig_mock,
+                                                 brs_mock,
+                                                 image_mock):
+        flavor = flavors.get_flavor_by_name('m1.tiny')
+        inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
+                                              vm_state=vm_states.STOPPED,
+                                              instance_type_id=flavor['id'])
+        inst_obj = objects.Instance._from_db_object(
+            self.context, objects.Instance(), inst,
+            expected_attrs=[])
+        request_spec = dict(instance_type=dict(extra_specs=dict()),
+                            instance_properties=dict())
+        filter_props = dict(context=None)
+        resvs = 'fake-resvs'
+        image = 'fake-image'
+        exception = exc.UnsupportedPolicyException(reason='')
+
+        image_mock.return_value = image
+        brs_mock.return_value = request_spec
+        sig_mock.side_effect = exception
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor._cold_migrate, self.context,
+                          inst_obj, flavor, filter_props, [resvs],
+                          clean_shutdown=True)
+
+        updates = {'vm_state': vm_states.STOPPED, 'task_state': None}
+        set_vm_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                            'migrate_server', updates,
+                                            exception, request_spec)
+
     def test_cold_migrate_exception_host_in_error_state_and_raise(self):
         inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
                                               vm_state=vm_states.STOPPED)

nova/tests/unit/scheduler/test_scheduler_utils.py

@@ -305,7 +305,7 @@ class SchedulerUtilsTestCase(test.NoDBTestCase):
         ) as (get_group, get_hosts):
             scheduler_utils._SUPPORTS_ANTI_AFFINITY = None
             scheduler_utils._SUPPORTS_AFFINITY = None
-            self.assertRaises(exception.NoValidHost,
+            self.assertRaises(exception.UnsupportedPolicyException,
                               scheduler_utils._get_group_details,
                               self.context, 'fake-uuid')