Fix leaking exceptions from scheduler utils
scheduler_utils.setup_instance_group() can raise a NoValidHost exception, and the conductor does not handle it properly, which leaves the server stuck in the scheduling state instead of going to ERROR. The function is called in several places in conductor/manager.py. This patch moves those calls into the try blocks that are already present, and reports a missing affinity filter as UnsupportedPolicyException, a new exception type added here. As a result, an exception raised by scheduler_utils.setup_instance_group() puts the VM into the ERROR state instead of NOSTATE, as expected. Quota rollback is also performed where needed.

Closes-Bug: #1408326
Change-Id: I3a8ba37ded8ac3fa9b5f3f35b09feb82d822d583
Committed by: Ildiko Vancsa
Parent: 76e4d601f4
Commit: 28aedc604d
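Editor's note: the change applies the same error-handling pattern at each conductor call site, and it is easiest to see in isolation. The sketch below is illustrative only; the names (cold_migrate_sketch, set_vm_state_and_notify, quotas, and the injected helpers) are stand-ins, not the actual nova/conductor/manager.py code. It shows the shape of the fix: setup_instance_group() now runs inside the existing try block, and UnsupportedPolicyException is handled like NoValidHost, so the failure is reported through the notify path instead of leaking out of the RPC handler and leaving the server stuck in the scheduling task state.

# Illustrative sketch only: stand-in names, not the real conductor code.
class NoValidHost(Exception):
    pass


class UnsupportedPolicyException(Exception):
    """Raised when a server group policy has no matching scheduler filter."""


def cold_migrate_sketch(instance, request_spec, filter_properties,
                        setup_instance_group, select_destinations,
                        set_vm_state_and_notify, quotas):
    try:
        # Previously called outside the try block, so these exceptions leaked.
        setup_instance_group(request_spec, filter_properties)
        return select_destinations(request_spec, filter_properties)[0]
    except (NoValidHost, UnsupportedPolicyException) as ex:
        # Keep the old vm_state (or fall back to 'active'), clear task_state
        # so the server is no longer stuck in "scheduling", notify, and give
        # back the reserved quota before re-raising.
        vm_state = getattr(instance, 'vm_state', None) or 'active'
        set_vm_state_and_notify(instance,
                                {'vm_state': vm_state, 'task_state': None},
                                ex)
        quotas.rollback()
        raise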
nova/conductor/manager.py

@@ -479,7 +479,8 @@ class ComputeTaskManager(base.Base):
                                    exception.HypervisorUnavailable,
                                    exception.InstanceNotRunning,
                                    exception.MigrationPreCheckError,
-                                   exception.LiveMigrationWithOldNovaNotSafe)
+                                   exception.LiveMigrationWithOldNovaNotSafe,
+                                   exception.UnsupportedPolicyException)
     def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
             flavor, block_migration, disk_over_commit, reservations=None,
             clean_shutdown=True):
@@ -521,15 +522,15 @@ class ComputeTaskManager(base.Base):
         quotas = objects.Quotas.from_reservations(context,
                                                   reservations,
                                                   instance=instance)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             scheduler_utils.populate_retry(filter_properties, instance['uuid'])
             hosts = self.scheduler_client.select_destinations(
                     context, request_spec, filter_properties)
             host_state = hosts[0]
         except exception.NoValidHost as ex:
-            vm_state = instance['vm_state']
+            vm_state = instance.vm_state
             if not vm_state:
                 vm_state = vm_states.ACTIVE
             updates = {'vm_state': vm_state, 'task_state': None}
@@ -544,6 +545,16 @@ class ComputeTaskManager(base.Base):
             else:
                 msg = _("No valid host found for resize")
             raise exception.NoValidHost(reason=msg)
+        except exception.UnsupportedPolicyException as ex:
+            with excutils.save_and_reraise_exception():
+                vm_state = instance.vm_state
+                if not vm_state:
+                    vm_state = vm_states.ACTIVE
+                updates = {'vm_state': vm_state, 'task_state': None}
+                self._set_vm_state_and_notify(context, instance.uuid,
+                                              'migrate_server',
+                                              updates, ex, request_spec)
+                quotas.rollback()
 
         try:
             scheduler_utils.populate_filter_properties(filter_properties,
@@ -560,7 +571,7 @@ class ComputeTaskManager(base.Base):
                 clean_shutdown=clean_shutdown)
         except Exception as ex:
             with excutils.save_and_reraise_exception():
-                updates = {'vm_state': instance['vm_state'],
+                updates = {'vm_state': instance.vm_state,
                            'task_state': None}
                 self._set_vm_state_and_notify(context, instance.uuid,
                                               'migrate_server',
@@ -607,7 +618,7 @@ class ComputeTaskManager(base.Base):
                 exception.LiveMigrationWithOldNovaNotSafe) as ex:
             with excutils.save_and_reraise_exception():
                 # TODO(johngarbutt) - eventually need instance actions here
-                _set_vm_state(context, instance, ex, instance['vm_state'])
+                _set_vm_state(context, instance, ex, instance.vm_state)
         except Exception as ex:
             LOG.error(_LE('Migration of instance %(instance_id)s to host'
                           ' %(dest)s unexpectedly failed.'),
@@ -624,8 +635,6 @@ class ComputeTaskManager(base.Base):
         # 2.0 of the RPC API.
         request_spec = scheduler_utils.build_request_spec(context, image,
                                                           instances)
-        scheduler_utils.setup_instance_group(context, request_spec,
-                                             filter_properties)
         # TODO(danms): Remove this in version 2.0 of the RPC API
         if (requested_networks and
                 not isinstance(requested_networks,
@@ -642,6 +651,8 @@ class ComputeTaskManager(base.Base):
         filter_properties = dict(filter_properties, instance_type=flavor)
 
         try:
+            scheduler_utils.setup_instance_group(context, request_spec,
+                                                 filter_properties)
             # check retry policy. Rather ugly use of instances[0]...
             # but if we've exceeded max retries... then we really only
             # have a single instance.
@@ -748,7 +759,8 @@ class ComputeTaskManager(base.Base):
                     self.compute_rpcapi.unshelve_instance(
                             context, instance, host, image=image,
                             filter_properties=filter_properties, node=node)
-            except exception.NoValidHost:
+            except (exception.NoValidHost,
+                    exception.UnsupportedPolicyException):
                 instance.task_state = None
                 instance.save()
                 LOG.warning(_LW("No valid host found for unshelve instance"),
@@ -781,9 +793,9 @@ class ComputeTaskManager(base.Base):
                 request_spec = scheduler_utils.build_request_spec(context,
                                                                   image_ref,
                                                                   [instance])
-                scheduler_utils.setup_instance_group(context, request_spec,
-                                                     filter_properties)
                 try:
+                    scheduler_utils.setup_instance_group(context, request_spec,
+                                                         filter_properties)
                     hosts = self.scheduler_client.select_destinations(context,
                                                                       request_spec,
                                                                       filter_properties)
@@ -796,6 +808,15 @@ class ComputeTaskManager(base.Base):
                                  'task_state': None}, ex, request_spec)
                         LOG.warning(_LW("No valid host found for rebuild"),
                                     instance=instance)
+                except exception.UnsupportedPolicyException as ex:
+                    with excutils.save_and_reraise_exception():
+                        self._set_vm_state_and_notify(context, instance.uuid,
+                                'rebuild_server',
+                                {'vm_state': instance.vm_state,
+                                 'task_state': None}, ex, request_spec)
+                        LOG.warning(_LW("Server with unsupported policy "
+                                        "cannot be rebuilt"),
+                                    instance=instance)
 
             self.compute_rpcapi.rebuild_instance(context,
                     instance=instance,
nova/exception.py

@@ -1838,3 +1838,7 @@ class CPUPinningInvalid(Invalid):
 class ImageCPUPinningForbidden(Invalid):
     msg_fmt = _("Image property 'hw_cpu_policy' is not permitted to override "
                 "CPU pinning policy set against the flavor")
+
+
+class UnsupportedPolicyException(Invalid):
+    msg_fmt = _("ServerGroup policy is not supported: %(reason)s")
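The new exception type presumably behaves like any other NovaException subclass: keyword arguments supplied at raise time are interpolated into msg_fmt. A rough, self-contained approximation of that behaviour (Nova's real Invalid/NovaException base classes are not reproduced here) is:

# Standalone approximation of the msg_fmt handling the new class inherits
# from nova.exception via Invalid; not the actual Nova class hierarchy.
class UnsupportedPolicyException(Exception):
    msg_fmt = "ServerGroup policy is not supported: %(reason)s"

    def __init__(self, **kwargs):
        super(UnsupportedPolicyException, self).__init__(self.msg_fmt % kwargs)


try:
    raise UnsupportedPolicyException(
        reason="ServerGroupAffinityFilter not configured")
except UnsupportedPolicyException as ex:
    # Prints: ServerGroup policy is not supported:
    #         ServerGroupAffinityFilter not configured
    print(ex)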
nova/scheduler/utils.py

@@ -285,11 +285,11 @@ def _get_group_details(context, instance_uuid, user_group_hosts=None):
     if (not _SUPPORTS_AFFINITY and 'affinity' in group.policies):
         msg = _("ServerGroupAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     if (not _SUPPORTS_ANTI_AFFINITY and 'anti-affinity' in group.policies):
         msg = _("ServerGroupAntiAffinityFilter not configured")
         LOG.error(msg)
-        raise exception.NoValidHost(reason=msg)
+        raise exception.UnsupportedPolicyException(reason=msg)
     group_hosts = set(group.get_hosts(context))
     user_hosts = set(user_group_hosts) if user_group_hosts else set()
     return GroupDetails(hosts=user_hosts | group_hosts,
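The unit tests below reset scheduler_utils._SUPPORTS_AFFINITY and _SUPPORTS_ANTI_AFFINITY to None before calling _get_group_details(). A stripped-down stand-in for the guard they exercise looks roughly like this; the group lookup and the lazy detection of which ServerGroup filters the scheduler has enabled are assumed details that sit outside this diff:

# Stand-in for the policy guard in _get_group_details(); illustrative only.
class UnsupportedPolicyException(Exception):
    pass


def check_group_policies(policies, supports_affinity, supports_anti_affinity):
    if not supports_affinity and 'affinity' in policies:
        raise UnsupportedPolicyException(
            "ServerGroupAffinityFilter not configured")
    if not supports_anti_affinity and 'anti-affinity' in policies:
        raise UnsupportedPolicyException(
            "ServerGroupAntiAffinityFilter not configured")


# With the support flags unset (as the tests force by assigning None), an
# affinity policy now surfaces as UnsupportedPolicyException, not NoValidHost.
try:
    check_group_policies(['anti-affinity'],
                         supports_affinity=None,
                         supports_anti_affinity=None)
except UnsupportedPolicyException as ex:
    print(ex)  # ServerGroupAntiAffinityFilter not configured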
nova/tests/unit/conductor/test_conductor.py

@@ -1378,6 +1378,49 @@ class _BaseTaskTestCase(object):
                           block_device_mapping='block_device_mapping',
                           legacy_bdm=False)
 
+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(scheduler_utils, 'build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_build_instances_scheduler_group_failure(self, state_mock,
+                                                     sig_mock, bs_mock,
+                                                     spawn_mock):
+        instances = [fake_instance.fake_instance_obj(self.context)
+                     for i in range(2)]
+        image = {'fake-data': 'should_pass_silently'}
+        spec = {'fake': 'specs',
+                'instance_properties': instances[0]}
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+        bs_mock.return_value = spec
+        exception = exc.UnsupportedPolicyException(reason='fake-reason')
+        sig_mock.side_effect = exception
+
+        updates = {'vm_state': vm_states.ERROR, 'task_state': None}
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.conductor.build_instances(
+                          context=self.context,
+                          instances=instances,
+                          image=image,
+                          filter_properties={},
+                          admin_password='admin_password',
+                          injected_files='injected_files',
+                          requested_networks=None,
+                          security_groups='security_groups',
+                          block_device_mapping='block_device_mapping',
+                          legacy_bdm=False)
+        calls = []
+        for instance in instances:
+            calls.append(mock.call(self.context, instance.uuid,
+                                   'build_instances', updates, exception, spec))
+        state_mock.assert_has_calls(calls)
+
     def test_unshelve_instance_on_host(self):
         instance = self._create_fake_instance_obj()
         instance.vm_state = vm_states.SHELVED
@@ -1606,6 +1649,49 @@
                                          filter_properties)
         self.assertFalse(rebuild_mock.called)
 
+    @mock.patch('nova.utils.spawn_n')
+    @mock.patch.object(conductor_manager.compute_rpcapi.ComputeAPI,
+                       'rebuild_instance')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.scheduler_client.SchedulerClient,
+                       'select_destinations')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_rebuild_instance_with_scheduler_group_failure(self,
+                                                           state_mock,
+                                                           bs_mock,
+                                                           select_dest_mock,
+                                                           sig_mock,
+                                                           rebuild_mock,
+                                                           spawn_mock):
+        inst_obj = self._create_fake_instance_obj()
+        rebuild_args = self._prepare_rebuild_args({'host': None})
+        request_spec = {}
+        bs_mock.return_value = request_spec
+
+        # NOTE(gibi): LocalComputeTaskAPI use eventlet spawn that makes mocking
+        # hard so use direct call instead.
+        spawn_mock.side_effect = lambda f, *a, **k: f(*a, **k)
+
+        exception = exc.UnsupportedPolicyException(reason='')
+        sig_mock.side_effect = exception
+
+        # build_instances() is a cast, we need to wait for it to complete
+        self.useFixture(cast_as_call.CastAsCall(self.stubs))
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor.rebuild_instance,
+                          self.context,
+                          inst_obj,
+                          **rebuild_args)
+        updates = {'vm_state': vm_states.ACTIVE, 'task_state': None}
+        state_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                           'rebuild_server', updates,
+                                           exception, request_spec)
+        self.assertFalse(select_dest_mock.called)
+        self.assertFalse(rebuild_mock.called)
+
 
 class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
     """ComputeTaskManager Tests."""
@@ -1922,6 +2008,44 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
                                 clean_shutdown=True)
         self.assertIn('cold migrate', nvh.message)
 
+    @mock.patch.object(compute_utils, 'get_image_metadata')
+    @mock.patch('nova.scheduler.utils.build_request_spec')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_set_vm_state_and_notify')
+    def test_cold_migrate_no_valid_host_in_group(self,
+                                                 set_vm_mock,
+                                                 sig_mock,
+                                                 brs_mock,
+                                                 image_mock):
+        flavor = flavors.get_flavor_by_name('m1.tiny')
+        inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
+                                              vm_state=vm_states.STOPPED,
+                                              instance_type_id=flavor['id'])
+        inst_obj = objects.Instance._from_db_object(
+            self.context, objects.Instance(), inst,
+            expected_attrs=[])
+        request_spec = dict(instance_type=dict(extra_specs=dict()),
+                            instance_properties=dict())
+        filter_props = dict(context=None)
+        resvs = 'fake-resvs'
+        image = 'fake-image'
+        exception = exc.UnsupportedPolicyException(reason='')
+
+        image_mock.return_value = image
+        brs_mock.return_value = request_spec
+        sig_mock.side_effect = exception
+
+        self.assertRaises(exc.UnsupportedPolicyException,
+                          self.conductor._cold_migrate, self.context,
+                          inst_obj, flavor, filter_props, [resvs],
+                          clean_shutdown=True)
+
+        updates = {'vm_state': vm_states.STOPPED, 'task_state': None}
+        set_vm_mock.assert_called_once_with(self.context, inst_obj.uuid,
+                                            'migrate_server', updates,
+                                            exception, request_spec)
+
     def test_cold_migrate_exception_host_in_error_state_and_raise(self):
         inst = fake_instance.fake_db_instance(image_ref='fake-image_ref',
                                               vm_state=vm_states.STOPPED)
nova/tests/unit/scheduler/test_scheduler_utils.py

@@ -305,7 +305,7 @@ class SchedulerUtilsTestCase(test.NoDBTestCase):
         ) as (get_group, get_hosts):
             scheduler_utils._SUPPORTS_ANTI_AFFINITY = None
             scheduler_utils._SUPPORTS_AFFINITY = None
-            self.assertRaises(exception.NoValidHost,
+            self.assertRaises(exception.UnsupportedPolicyException,
                               scheduler_utils._get_group_details,
                               self.context, 'fake-uuid')
 