Merge "Fix failed count for anti-affinity check"

Zuul 2023-06-07 14:35:52 +00:00 committed by Gerrit Code Review
commit 1fe8c4becb
5 changed files with 265 additions and 14 deletions

View File

@@ -24,3 +24,11 @@ was rescheduled.
ACTIVE = 'active' # Instance is running
FAILED = 'failed' # Instance failed to build and was not rescheduled
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
# Instance failed by policy violation (such as affinity or anti-affinity)
# and was not rescheduled. In this case, the node's failed count won't be
# increased.
FAILED_BY_POLICY = 'failed_by_policy'
# Instance failed by policy violation (such as affinity or anti-affinity)
# but was rescheduled. In this case, the node's failed count won't be
# increased.
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'
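
The two *_BY_POLICY values extend the existing ACTIVE / FAILED / RESCHEDULED set so that callers can tell policy-driven outcomes apart from genuine build failures. A minimal sketch, with illustrative names only (the real accounting lives in ComputeManager, shown further down), of how a result consumer keeps the per-node failed_builds counter untouched for policy violations:

ACTIVE = 'active'
FAILED = 'failed'
RESCHEDULED = 'rescheduled'
FAILED_BY_POLICY = 'failed_by_policy'
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

# Only genuine build failures should count against the node.
NODE_FAILURE_RESULTS = {FAILED, RESCHEDULED}

def record_result(result, node_stats):
    """Hypothetical helper: bump failed_builds only for real failures."""
    if result in NODE_FAILURE_RESULTS:
        node_stats['failed_builds'] = node_stats.get('failed_builds', 0) + 1

stats = {}
record_result(FAILED, stats)                 # counted
record_result(RESCHEDULED_BY_POLICY, stats)  # skipped: policy violation
assert stats['failed_builds'] == 1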

View File

@@ -1891,11 +1891,8 @@ class ComputeManager(manager.Manager):
else:
max_server = 1
if len(members_on_host) >= max_server:
msg = _("Anti-affinity instance group policy "
"was violated.")
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
raise exception.GroupAffinityViolation(
instance_uuid=instance.uuid, policy='Anti-affinity')
# NOTE(ganso): The check for affinity below does not work and it
# can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ class ComputeManager(manager.Manager):
elif group.policy and 'affinity' == group.policy:
group_hosts = group.get_hosts(exclude=[instance.uuid])
if group_hosts and self.host not in group_hosts:
msg = _("Affinity instance group policy was violated.")
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
raise exception.GroupAffinityViolation(
instance_uuid=instance.uuid, policy='Affinity')
_do_validation(context, instance, group)
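
For context, a self-contained sketch of the late anti-affinity check that this hunk rewrites (simplified names, not Nova's exact code): the request passes only while the number of group members already on the host stays below the per-host maximum, and a violation now raises a dedicated policy exception instead of a generic RescheduledException:

class GroupAffinityViolation(Exception):
    """Stand-in for nova.exception.GroupAffinityViolation."""

def check_anti_affinity(members_on_host, max_server_per_host=1):
    # Adding one more instance would exceed the per-host maximum allowed
    # by the anti-affinity rule, so reject it with the policy exception.
    if len(members_on_host) >= max_server_per_host:
        raise GroupAffinityViolation('Anti-affinity')

check_anti_affinity(members_on_host=[])              # passes
try:
    check_anti_affinity(members_on_host=['a-uuid'])  # one member already here
except GroupAffinityViolation:
    pass                                             # violation detected
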
@@ -2348,6 +2343,9 @@ class ComputeManager(manager.Manager):
self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True)
if result in (build_results.FAILED_BY_POLICY,
build_results.RESCHEDULED_BY_POLICY):
return
if result in (build_results.FAILED,
build_results.RESCHEDULED):
self._build_failed(node)
@@ -2446,6 +2444,8 @@ class ComputeManager(manager.Manager):
self._nil_out_instance_obj_host_and_node(instance)
self._set_instance_obj_error_state(instance,
clean_task_state=True)
if isinstance(e, exception.RescheduledByPolicyException):
return build_results.FAILED_BY_POLICY
return build_results.FAILED
LOG.debug(e.format_message(), instance=instance)
# This will be used for logging the exception
@@ -2472,6 +2472,10 @@ class ComputeManager(manager.Manager):
injected_files, requested_networks, security_groups,
block_device_mapping, request_spec=request_spec,
host_lists=[host_list])
if isinstance(e, exception.RescheduledByPolicyException):
return build_results.RESCHEDULED_BY_POLICY
return build_results.RESCHEDULED
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ class ComputeManager(manager.Manager):
bdms=block_device_mapping)
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message())
except exception.GroupAffinityViolation as e:
LOG.exception('Failed to build and run instance',
instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping)
raise exception.RescheduledByPolicyException(
instance_uuid=instance.uuid, reason=str(e))
except Exception as e:
LOG.exception('Failed to build and run instance',
instance=instance)
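
Taken together, the manager-side hunks translate the policy violation raised by the validation check into RescheduledByPolicyException, which the outer build loop then maps onto the new *_BY_POLICY results so that the failed-count bookkeeping is skipped. A condensed, illustrative sketch of that flow (not the real Nova method signatures):

class GroupAffinityViolation(Exception):
    pass

class RescheduledByPolicyException(Exception):
    pass

def build_and_run_instance(validate_policy):
    # Inner layer: a policy violation is re-raised in a form the outer
    # layer can distinguish from an ordinary build failure.
    try:
        validate_policy()
    except GroupAffinityViolation as exc:
        raise RescheduledByPolicyException(str(exc)) from exc

def do_build_and_run_instance(build, retry_possible):
    # Outer layer: map the policy exception onto the *_BY_POLICY results,
    # which the caller excludes from the node's failed_builds accounting.
    try:
        build()
    except RescheduledByPolicyException:
        return ('rescheduled_by_policy' if retry_possible
                else 'failed_by_policy')
    return 'active'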

View File

@@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
"%(reason)s")
class RescheduledByPolicyException(RescheduledException):
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
"%(reason)s")
class GroupAffinityViolation(NovaException):
msg_fmt = _("%(policy)s instance group policy was violated")
class InstanceFaultRollback(NovaException):
def __init__(self, inner_exception=None):
message = _("Instance rollback performed due to: %s")

View File

@@ -20,6 +20,7 @@ from oslo_config import cfg
from nova.compute import instance_actions
from nova import context
from nova.db.main import api as db
from nova import objects
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase):
self.assertIn('Invalid input', ex.response.text)
self.assertIn('soft-affinity', ex.response.text)
@mock.patch('nova.scheduler.filters.affinity_filter.'
'ServerGroupAffinityFilter.host_passes', return_value=True)
def test_failed_count_with_affinity_violation(self, mock_host_passes):
"""Check failed count not incremented after violation of the late
affinity check. https://bugs.launchpad.net/nova/+bug/1996732
"""
created_group = self.api.post_server_groups(self.affinity)
flavor = self.api.get_flavors()[2]
# Ensure the first instance is on compute1
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id,
{'status': 'disabled'})
self._boot_a_server_to_group(created_group, flavor=flavor)
# Ensure the second instance is on compute2
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
self.admin_api.put_service(compute2_service_id,
{'status': 'enabled'})
compute1_service_id = self.admin_api.get_services(
host=self.compute.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute1_service_id,
{'status': 'disabled'})
# Expects GroupAffinityViolation exception
failed_server = self._boot_a_server_to_group(created_group,
flavor=flavor,
expected_status='ERROR')
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
'hosts available for retrying build failures for '
'instance %s.' % failed_server['id'],
failed_server['fault']['message'])
ctxt = context.get_admin_context()
computes = objects.ComputeNodeList.get_all(ctxt)
for node in computes:
self.assertEqual(node.stats.get('failed_builds'), '0')
@mock.patch('nova.scheduler.filters.affinity_filter.'
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
"""Check failed count after violation of the late affinity check.
https://bugs.launchpad.net/nova/+bug/1996732
"""
created_group = self.api.post_server_groups(self.anti_affinity)
flavor = self.api.get_flavors()[2]
# Ensure two instances are scheduled on the same host
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id,
{'status': 'disabled'})
self._boot_a_server_to_group(created_group, flavor=flavor)
# Expects GroupAffinityViolation exception
failed_server = self._boot_a_server_to_group(created_group,
flavor=flavor,
expected_status='ERROR')
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
'hosts available for retrying build failures for '
'instance %s.' % failed_server['id'],
failed_server['fault']['message'])
ctxt = context.get_admin_context()
computes = objects.ComputeNodeList.get_all(ctxt)
for node in computes:
self.assertEqual(node.stats.get('failed_builds'), '0')
class ServerGroupAffinityConfTest(ServerGroupTestBase):
api_major_version = 'v2.1'

View File

@@ -6962,13 +6962,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.compute = manager.ComputeManager()
self._test_build_and_run_instance()
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
def _test_build_and_run_instance(self, mock_build, mock_save,
- mock_start, mock_finish):
+ mock_start, mock_finish, mock_succeeded):
self._do_build_instance_update(mock_save)
orig_do_build_and_run = self.compute._do_build_and_run_instance
@@ -7001,6 +7002,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.requested_networks, self.security_groups,
self.block_device_mapping, self.node, self.limits,
self.filter_properties, {}, self.accel_uuids)
mock_succeeded.assert_called_once_with(self.node)
# This test when sending an icehouse compatible rpc call to juno compute
# node, NetworkRequest object can load from three items tuple.
@@ -7028,6 +7030,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.assertEqual('10.0.0.1', str(requested_network.address))
self.assertEqual(uuids.port_instance, requested_network.port_id)
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7043,7 +7046,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_build_abort_exception(self, mock_build_run,
mock_build, mock_set, mock_nil, mock_add,
mock_clean_vol, mock_clean_net, mock_save,
- mock_start, mock_finish):
+ mock_start, mock_finish, mock_failed):
self._do_build_instance_update(mock_save)
mock_build_run.side_effect = exception.BuildAbortException(reason='',
instance_uuid=self.instance.uuid)
@@ -7086,7 +7089,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.ANY, mock.ANY)
mock_nil.assert_called_once_with(self.instance)
mock_set.assert_called_once_with(self.instance, clean_task_state=True)
mock_failed.assert_called_once_with(self.node)
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7097,8 +7102,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
def test_rescheduled_exception(self, mock_build_run,
- mock_build, mock_set, mock_nil,
- mock_save, mock_start, mock_finish):
+ mock_build, mock_set, mock_nil, mock_save,
+ mock_start, mock_finish, mock_failed):
self._do_build_instance_update(mock_save, reschedule_update=True)
mock_build_run.side_effect = exception.RescheduledException(reason='',
instance_uuid=self.instance.uuid)
@@ -7145,6 +7150,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.admin_pass, self.injected_files, self.requested_networks,
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
mock_failed.assert_called_once_with(self.node)
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@@ -7499,6 +7505,139 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
new=mock.MagicMock())
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(manager.ComputeManager,
'_validate_instance_group_policy')
def test_group_affinity_violation_exception_with_retry(
self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
mock_nil, mock_save, mock_start, mock_finish,
):
"""Test retry by affinity or anti-affinity validation check doesn't
increase failed build
"""
self._do_build_instance_update(mock_save, reschedule_update=True)
mock_validate_policy.side_effect = \
exception.GroupAffinityViolation(
instance_uuid=self.instance.uuid, policy="Affinity")
orig_do_build_and_run = self.compute._do_build_and_run_instance
def _wrapped_do_build_and_run_instance(*args, **kwargs):
ret = orig_do_build_and_run(*args, **kwargs)
self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
return ret
with test.nested(
mock.patch.object(
self.compute, '_do_build_and_run_instance',
side_effect=_wrapped_do_build_and_run_instance,
),
mock.patch.object(
self.compute.network_api, 'get_instance_nw_info',
),
):
self.compute.build_and_run_instance(
self.context, self.instance,
self.image, request_spec={},
filter_properties=self.filter_properties,
accel_uuids=self.accel_uuids,
injected_files=self.injected_files,
admin_password=self.admin_pass,
requested_networks=self.requested_networks,
security_groups=self.security_groups,
block_device_mapping=self.block_device_mapping, node=self.node,
limits=self.limits, host_list=fake_host_list)
mock_succeeded.assert_not_called()
mock_failed.assert_not_called()
self._instance_action_events(mock_start, mock_finish)
self._assert_build_instance_update(mock_save, reschedule_update=True)
mock_nil.assert_called_once_with(self.instance)
mock_build.assert_called_once_with(self.context,
[self.instance], self.image, self.filter_properties,
self.admin_pass, self.injected_files, self.requested_networks,
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
new=mock.MagicMock())
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
@mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(manager.ComputeManager,
'_validate_instance_group_policy')
def test_group_affinity_violation_exception_without_retry(
self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
mock_start, mock_finish,
):
"""Test failure by affinity or anti-affinity validation check doesn't
increase failed build
"""
self._do_build_instance_update(mock_save)
mock_validate_policy.side_effect = \
exception.GroupAffinityViolation(
instance_uuid=self.instance.uuid, policy="Affinity")
orig_do_build_and_run = self.compute._do_build_and_run_instance
def _wrapped_do_build_and_run_instance(*args, **kwargs):
ret = orig_do_build_and_run(*args, **kwargs)
self.assertEqual(build_results.FAILED_BY_POLICY, ret)
return ret
with mock.patch.object(
self.compute, '_do_build_and_run_instance',
side_effect=_wrapped_do_build_and_run_instance,
):
self.compute.build_and_run_instance(
self.context, self.instance,
self.image, request_spec={},
filter_properties={},
accel_uuids=[],
injected_files=self.injected_files,
admin_password=self.admin_pass,
requested_networks=self.requested_networks,
security_groups=self.security_groups,
block_device_mapping=self.block_device_mapping, node=self.node,
limits=self.limits, host_list=fake_host_list)
mock_succeeded.assert_not_called()
mock_failed.assert_not_called()
self._instance_action_events(mock_start, mock_finish)
self._assert_build_instance_update(mock_save)
mock_clean_net.assert_called_once_with(self.context, self.instance,
self.requested_networks)
mock_add.assert_called_once_with(self.context, self.instance,
mock.ANY, mock.ANY, fault_message=mock.ANY)
mock_nil.assert_called_once_with(self.instance)
mock_build.assert_not_called()
mock_set_state.assert_called_once_with(self.instance,
clean_task_state=True)
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -8078,7 +8217,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
nodes.return_value = ['nodename']
migration_list.return_value = [objects.Migration(
uuid=uuids.migration, instance_uuid=uuids.instance)]
- self.assertRaises(exception.RescheduledException,
+ self.assertRaises(exception.GroupAffinityViolation,
self.compute._validate_instance_group_policy,
self.context, instance, hints)