Merge "Fix failed count for anti-affinity check"
commit 1fe8c4becb
@@ -24,3 +24,11 @@ was rescheduled.
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# and was not rescheduled. In this case, the node's failed count won't be
+# increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# but was rescheduled. In this case, the node's failed count won't be
+# increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'
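For orientation, the new constants split build outcomes into those that should and those that should not count against a node. A quick-reference sketch follows (only the constant names and string values come from the hunk above; the dict itself is illustrative, not part of the change):

# Literals mirroring the build_results constants added above.
ACTIVE = 'active'
FAILED = 'failed'
RESCHEDULED = 'rescheduled'
FAILED_BY_POLICY = 'failed_by_policy'
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

# Which results should bump a node's failed_builds stat once the
# ComputeManager hunks below are applied (illustrative summary only).
COUNTS_AGAINST_NODE = {
    FAILED: True,
    RESCHEDULED: True,
    FAILED_BY_POLICY: False,        # late policy violation, not rescheduled
    RESCHEDULED_BY_POLICY: False,   # late policy violation, rescheduled
    ACTIVE: False,                  # success goes through _build_succeeded
}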
@@ -1891,11 +1891,8 @@ class ComputeManager(manager.Manager):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')

             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ class ComputeManager(manager.Manager):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')

         _do_validation(context, instance, group)

@@ -2348,6 +2343,9 @@ class ComputeManager(manager.Manager):
                     self.reportclient.delete_allocation_for_instance(
                         context, instance.uuid, force=True)

+                if result in (build_results.FAILED_BY_POLICY,
+                              build_results.RESCHEDULED_BY_POLICY):
+                    return
                 if result in (build_results.FAILED,
                               build_results.RESCHEDULED):
                     self._build_failed(node)
@@ -2446,6 +2444,8 @@ class ComputeManager(manager.Manager):
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2472,6 +2472,10 @@ class ComputeManager(manager.Manager):
                     injected_files, requested_networks, security_groups,
                     block_device_mapping, request_spec=request_spec,
                     host_lists=[host_list])
+
+            if isinstance(e, exception.RescheduledByPolicyException):
+                return build_results.RESCHEDULED_BY_POLICY
+
             return build_results.RESCHEDULED
         except (exception.InstanceNotFound,
                 exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ class ComputeManager(manager.Manager):
                 bdms=block_device_mapping)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=e.format_message())
+        except exception.GroupAffinityViolation as e:
+            LOG.exception('Failed to build and run instance',
+                          instance=instance)
+            self._notify_about_instance_usage(context, instance,
+                    'create.error', fault=e)
+            compute_utils.notify_about_instance_create(
+                context, instance, self.host,
+                phase=fields.NotificationPhase.ERROR, exception=e,
+                bdms=block_device_mapping)
+            raise exception.RescheduledByPolicyException(
+                instance_uuid=instance.uuid, reason=str(e))
         except Exception as e:
             LOG.exception('Failed to build and run instance',
                           instance=instance)
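Taken together, the manager hunks give a late affinity/anti-affinity violation its own path: the validation raises GroupAffinityViolation, _build_and_run_instance converts it to RescheduledByPolicyException, _do_build_and_run_instance maps that to a *_BY_POLICY result, and build_and_run_instance then returns before touching the node's failed-build counter. A condensed, self-contained sketch of that chain (stand-in classes and names, a hypothetical can_retry flag, not the real method bodies):

# Stand-in exception classes mirroring the hierarchy added in the
# exception module hunk below.
class RescheduledException(Exception):
    pass

class RescheduledByPolicyException(RescheduledException):
    pass

class GroupAffinityViolation(Exception):
    pass

FAILED_BUILDS = {'compute1': 0}  # stand-in for the node's failed_builds stat

def build_and_run(node, can_retry):
    try:
        # _build_and_run_instance: the late policy check fails ...
        try:
            raise GroupAffinityViolation('Anti-affinity violated')
        except GroupAffinityViolation as e:
            # ... and is converted into the reschedulable policy exception.
            raise RescheduledByPolicyException(str(e))
    except RescheduledException as e:
        # _do_build_and_run_instance: pick the *_by_policy result.
        by_policy = isinstance(e, RescheduledByPolicyException)
        result = ('rescheduled' if can_retry else 'failed') + (
            '_by_policy' if by_policy else '')
    # build_and_run_instance: policy results skip the failed count.
    if result in ('failed', 'rescheduled'):
        FAILED_BUILDS[node] += 1
    return result

assert build_and_run('compute1', can_retry=True) == 'rescheduled_by_policy'
assert FAILED_BUILDS['compute1'] == 0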
@@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
                 "%(reason)s")


+class RescheduledByPolicyException(RescheduledException):
+    msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
+                "%(reason)s")
+
+
+class GroupAffinityViolation(NovaException):
+    msg_fmt = _("%(policy)s instance group policy was violated")
+
+
 class InstanceFaultRollback(NovaException):
     def __init__(self, inner_exception=None):
         message = _("Instance rollback performed due to: %s")
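The two new classes carry their message through msg_fmt like any other NovaException, and the subclassing is deliberate: RescheduledByPolicyException is caught by the existing except exception.RescheduledException block, while GroupAffinityViolation is not. A rough, self-contained sketch of the behaviour (FakeNovaException is a simplified stand-in, not the real nova.exception.NovaException):

class FakeNovaException(Exception):
    # Simplified stand-in: the real NovaException also handles missing
    # format keys and logging; here we only interpolate kwargs into msg_fmt.
    msg_fmt = "An unknown exception occurred."

    def __init__(self, **kwargs):
        super().__init__(self.msg_fmt % kwargs)

class RescheduledException(FakeNovaException):
    msg_fmt = "Build of instance %(instance_uuid)s was re-scheduled: %(reason)s"

class RescheduledByPolicyException(RescheduledException):
    msg_fmt = "Build of instance %(instance_uuid)s was re-scheduled: %(reason)s"

class GroupAffinityViolation(FakeNovaException):
    msg_fmt = "%(policy)s instance group policy was violated"

violation = GroupAffinityViolation(policy='Anti-affinity')
print(violation)  # Anti-affinity instance group policy was violated

rescheduled = RescheduledByPolicyException(
    instance_uuid='fake-uuid', reason=str(violation))
print(rescheduled)
# Build of instance fake-uuid was re-scheduled: Anti-affinity instance
# group policy was violated

# The subclassing means the existing reschedule handler catches the new
# exception, while GroupAffinityViolation stays outside that branch.
assert isinstance(rescheduled, RescheduledException)
assert not issubclass(GroupAffinityViolation, RescheduledException)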
@@ -20,6 +20,7 @@ from oslo_config import cfg
 from nova.compute import instance_actions
+from nova import context
 from nova.db.main import api as db
 from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)

+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check failed count not incremented after violation of the late
+        affinity check. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+                host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                       {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
+        """Check failed count after violation of the late affinity check.
+        https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+

 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'
@@ -6962,13 +6962,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.compute = manager.ComputeManager()
         self._test_build_and_run_instance()

+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
     @mock.patch.object(objects.Instance, 'save')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def _test_build_and_run_instance(self, mock_build, mock_save,
-                                     mock_start, mock_finish):
+                                     mock_start, mock_finish, mock_succeeded):
         self._do_build_instance_update(mock_save)

         orig_do_build_and_run = self.compute._do_build_and_run_instance
@@ -7001,6 +7002,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.requested_networks, self.security_groups,
             self.block_device_mapping, self.node, self.limits,
             self.filter_properties, {}, self.accel_uuids)
+        mock_succeeded.assert_called_once_with(self.node)

     # This test when sending an icehouse compatible rpc call to juno compute
     # node, NetworkRequest object can load from three items tuple.
@@ -7028,6 +7030,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.assertEqual('10.0.0.1', str(requested_network.address))
         self.assertEqual(uuids.port_instance, requested_network.port_id)

+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7043,7 +7046,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_build_abort_exception(self, mock_build_run,
                                    mock_build, mock_set, mock_nil, mock_add,
                                    mock_clean_vol, mock_clean_net, mock_save,
-                                   mock_start, mock_finish):
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save)
         mock_build_run.side_effect = exception.BuildAbortException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7086,7 +7089,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             mock.ANY, mock.ANY)
         mock_nil.assert_called_once_with(self.instance)
         mock_set.assert_called_once_with(self.instance, clean_task_state=True)
+        mock_failed.assert_called_once_with(self.node)

+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7097,8 +7102,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def test_rescheduled_exception(self, mock_build_run,
-                                   mock_build, mock_set, mock_nil,
-                                   mock_save, mock_start, mock_finish):
+                                   mock_build, mock_set, mock_nil, mock_save,
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7145,6 +7150,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.admin_pass, self.injected_files, self.requested_networks,
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])
+        mock_failed.assert_called_once_with(self.node)

     @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@@ -7499,6 +7505,139 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])

+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_with_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_nil, mock_save, mock_start, mock_finish,
+    ):
+        """Test retry by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+
+        self._do_build_instance_update(mock_save, reschedule_update=True)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
+            return ret
+
+        with test.nested(
+            mock.patch.object(
+                self.compute, '_do_build_and_run_instance',
+                side_effect=_wrapped_do_build_and_run_instance,
+            ),
+            mock.patch.object(
+                self.compute.network_api, 'get_instance_nw_info',
+            ),
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties=self.filter_properties,
+                accel_uuids=self.accel_uuids,
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save, reschedule_update=True)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_called_once_with(self.context,
+            [self.instance], self.image, self.filter_properties,
+            self.admin_pass, self.injected_files, self.requested_networks,
+            self.security_groups, self.block_device_mapping,
+            request_spec={}, host_lists=[fake_host_list])
+
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
+    @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
+    @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_without_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
+        mock_start, mock_finish,
+    ):
+        """Test failure by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+
+        self._do_build_instance_update(mock_save)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.FAILED_BY_POLICY, ret)
+            return ret
+
+        with mock.patch.object(
+            self.compute, '_do_build_and_run_instance',
+            side_effect=_wrapped_do_build_and_run_instance,
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties={},
+                accel_uuids=[],
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save)
+        mock_clean_net.assert_called_once_with(self.context, self.instance,
+                                               self.requested_networks)
+        mock_add.assert_called_once_with(self.context, self.instance,
+            mock.ANY, mock.ANY, fault_message=mock.ANY)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_not_called()
+        mock_set_state.assert_called_once_with(self.instance,
+                                               clean_task_state=True)
+
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
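Both new unit tests use the same trick: patch _do_build_and_run_instance with a wrapper whose side_effect calls the real method and asserts on its return value, since the public build_and_run_instance entry point discards that result. A stripped-down, self-contained sketch of the pattern (generic class and names, not the Nova test fixtures):

from unittest import mock

class Compute:
    def _do_build_and_run_instance(self):
        return 'rescheduled_by_policy'

    def build_and_run_instance(self):
        # The public entry point discards the return value, which is why
        # the tests above wrap the inner method to assert on it.
        self._do_build_and_run_instance()

def test_inner_result_is_policy_reschedule():
    compute = Compute()
    orig = compute._do_build_and_run_instance

    def _wrapped(*args, **kwargs):
        ret = orig(*args, **kwargs)
        # The assertion lives inside the wrapper, exactly like
        # _wrapped_do_build_and_run_instance in the tests above.
        assert ret == 'rescheduled_by_policy'
        return ret

    # side_effect keeps the real behaviour while letting the test observe
    # the internal return value.
    with mock.patch.object(compute, '_do_build_and_run_instance',
                           side_effect=_wrapped):
        compute.build_and_run_instance()

test_inner_result_is_policy_reschedule()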
@@ -8078,7 +8217,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         nodes.return_value = ['nodename']
         migration_list.return_value = [objects.Migration(
             uuid=uuids.migration, instance_uuid=uuids.instance)]
-        self.assertRaises(exception.RescheduledException,
+        self.assertRaises(exception.GroupAffinityViolation,
                           self.compute._validate_instance_group_policy,
                           self.context, instance, hints)