Fix failed count for anti-affinity check

The late anti-affinity check runs in the compute manager to prevent
parallel scheduling requests from invalidating the anti-affinity
server group policy. When the check fails, the instance is
re-scheduled. However, this failure was counted as a real instance
boot failure of the compute host and could lead to de-prioritization
of that host in the scheduler via the BuildFailureWeigher. As the late
anti-affinity check does not indicate any fault of the compute host
itself, it should not count towards the host's build failure counter.

This patch adds new build results, FAILED_BY_POLICY and
RESCHEDULED_BY_POLICY, to handle this case.

Closes-Bug: #1996732
Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718
Signed-off-by: Yusuke Okada <okada.yusuke@fujitsu.com>
Author: Yusuke Okada <okada.yusuke@fujitsu.com>
Date:   2023-02-08 22:10:31 -05:00
Committed-by: Balazs Gibizer
Parent: 971809b4d4
Commit: 56d320a203
5 changed files with 265 additions and 14 deletions
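
Note: the de-prioritization mentioned in the commit message comes from
the scheduler's BuildFailureWeigher, which weighs hosts by their
accumulated failed_builds stat. Below is a minimal, self-contained
sketch of that behavior; it mirrors the shape of nova's weigher, but
the constant value and method names are illustrative assumptions, not
nova's exact code.

# Hedged sketch: how a build-failure weigher de-prioritizes a host.
BUILD_FAILURE_WEIGHT_MULTIPLIER = 1000000.0  # illustrative; large on purpose


class BuildFailureWeigher:
    """Hosts with more recorded build failures get a lower weight."""

    def weight_multiplier(self):
        # Negated: each failed build subtracts from the host's score.
        return -1.0 * BUILD_FAILURE_WEIGHT_MULTIPLIER

    def weigh(self, failed_builds):
        return self.weight_multiplier() * failed_builds


# Even a single failed build pushes a host far down the candidate list,
# which is why a mere policy violation must not touch failed_builds.
w = BuildFailureWeigher()
assert w.weigh(failed_builds=0) == 0.0
assert w.weigh(failed_builds=1) < 0.0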

nova/compute/build_results.py

@@ -24,3 +24,11 @@ was rescheduled.
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# and was not rescheduled. In this case, the node's failed count won't be
+# increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# but was rescheduled. In this case, the node's failed count won't be
+# increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

@@ -1891,11 +1891,8 @@ class ComputeManager(manager.Manager):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')
 
             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ class ComputeManager(manager.Manager):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')
 
         _do_validation(context, instance, group)
 
@@ -2348,6 +2343,9 @@ class ComputeManager(manager.Manager):
                 self.reportclient.delete_allocation_for_instance(
                     context, instance.uuid, force=True)
 
+            if result in (build_results.FAILED_BY_POLICY,
+                          build_results.RESCHEDULED_BY_POLICY):
+                return
             if result in (build_results.FAILED,
                           build_results.RESCHEDULED):
                 self._build_failed(node)
@@ -2446,6 +2444,8 @@ class ComputeManager(manager.Manager):
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
 
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2472,6 +2472,10 @@ class ComputeManager(manager.Manager):
                     injected_files, requested_networks, security_groups,
                     block_device_mapping, request_spec=request_spec,
                     host_lists=[host_list])
+
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.RESCHEDULED_BY_POLICY
+
                 return build_results.RESCHEDULED
         except (exception.InstanceNotFound,
                 exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ class ComputeManager(manager.Manager):
                     bdms=block_device_mapping)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=e.format_message())
+        except exception.GroupAffinityViolation as e:
+            LOG.exception('Failed to build and run instance',
+                          instance=instance)
+            self._notify_about_instance_usage(context, instance,
+                    'create.error', fault=e)
+            compute_utils.notify_about_instance_create(
+                context, instance, self.host,
+                phase=fields.NotificationPhase.ERROR, exception=e,
+                bdms=block_device_mapping)
+            raise exception.RescheduledByPolicyException(
+                instance_uuid=instance.uuid, reason=str(e))
         except Exception as e:
             LOG.exception('Failed to build and run instance',
                           instance=instance)
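
Taken together, the manager hunks implement this flow: the late check
raises GroupAffinityViolation, _build_and_run_instance converts it to
RescheduledByPolicyException, _do_build_and_run_instance maps that onto
the new *_BY_POLICY results, and the result dispatch returns early
before _build_failed() can run. A condensed, runnable simulation of the
flow follows; the names mirror the patch, but the bodies are simplified
stand-ins, not ComputeManager's real code.

FAILED = 'failed'
RESCHEDULED = 'rescheduled'
FAILED_BY_POLICY = 'failed_by_policy'
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'


class GroupAffinityViolation(Exception):
    pass


class RescheduledByPolicyException(Exception):
    pass


def validate_instance_group_policy(members_on_host, max_server):
    # The late check now raises a dedicated exception instead of a
    # generic RescheduledException.
    if len(members_on_host) >= max_server:
        raise GroupAffinityViolation('Anti-affinity policy was violated')


def build_and_run_instance(members_on_host, max_server):
    try:
        validate_instance_group_policy(members_on_host, max_server)
    except GroupAffinityViolation as e:
        # Re-raised as a policy-flavored reschedule request.
        raise RescheduledByPolicyException(str(e))


def do_build_and_run_instance(members_on_host, max_server, can_retry):
    try:
        build_and_run_instance(members_on_host, max_server)
        return 'active'
    except RescheduledByPolicyException:
        return RESCHEDULED_BY_POLICY if can_retry else FAILED_BY_POLICY


failed_builds = {'node1': 0}


def dispatch_result(result, node):
    if result in (FAILED_BY_POLICY, RESCHEDULED_BY_POLICY):
        return  # policy violation: leave the counter untouched
    if result in (FAILED, RESCHEDULED):
        failed_builds[node] += 1  # real host fault: count it


dispatch_result(
    do_build_and_run_instance(['a', 'b'], max_server=1, can_retry=True),
    'node1')
assert failed_builds['node1'] == 0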

nova/exception.py

@@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
                 "%(reason)s")
 
 
+class RescheduledByPolicyException(RescheduledException):
+    msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
+                "%(reason)s")
+
+
+class GroupAffinityViolation(NovaException):
+    msg_fmt = _("%(policy)s instance group policy was violated")
+
+
 class InstanceFaultRollback(NovaException):
     def __init__(self, inner_exception=None):
         message = _("Instance rollback performed due to: %s")
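
As a quick illustration of how the two new exceptions compose (assuming
NovaException's standard msg_fmt interpolation; the UUID below is a
placeholder):

# Illustrative only; relies on NovaException's msg_fmt interpolation.
violation = exception.GroupAffinityViolation(
    instance_uuid='00000000-0000-0000-0000-000000000000',
    policy='Anti-affinity')
# str(violation) -> "Anti-affinity instance group policy was violated"

rescheduled = exception.RescheduledByPolicyException(
    instance_uuid='00000000-0000-0000-0000-000000000000',
    reason=str(violation))
# str(rescheduled) -> "Build of instance 00000000-... was re-scheduled:
#                      Anti-affinity instance group policy was violated"

Because RescheduledByPolicyException subclasses RescheduledException,
existing handlers that catch the base class keep working; the
isinstance() checks in the manager simply pick the policy-specific
build result before the generic handling runs.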

nova/tests/functional/test_server_group.py

@@ -20,6 +20,7 @@ from oslo_config import cfg
 from nova.compute import instance_actions
 from nova import context
 from nova.db.main import api as db
+from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)
 
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check failed count not incremented after violation of the late
+        affinity check. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+                host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                       {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAntiAffinityFilter.host_passes',
+                return_value=True)
+    def test_failed_count_with_anti_affinity_violation(
+            self, mock_host_passes):
+        """Check failed count not incremented after violation of the late
+        anti-affinity check. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
 
 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'

nova/tests/unit/compute/test_compute_mgr.py

@@ -6962,13 +6962,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.compute = manager.ComputeManager()
         self._test_build_and_run_instance()
 
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
     @mock.patch.object(objects.Instance, 'save')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def _test_build_and_run_instance(self, mock_build, mock_save,
-                                     mock_start, mock_finish):
+                                     mock_start, mock_finish, mock_succeeded):
         self._do_build_instance_update(mock_save)
 
         orig_do_build_and_run = self.compute._do_build_and_run_instance
@@ -7001,6 +7002,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.requested_networks, self.security_groups,
             self.block_device_mapping, self.node, self.limits,
             self.filter_properties, {}, self.accel_uuids)
+        mock_succeeded.assert_called_once_with(self.node)
 
     # This test when sending an icehouse compatible rpc call to juno compute
     # node, NetworkRequest object can load from three items tuple.
@@ -7028,6 +7030,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.assertEqual('10.0.0.1', str(requested_network.address))
         self.assertEqual(uuids.port_instance, requested_network.port_id)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7043,7 +7046,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_build_abort_exception(self, mock_build_run,
                                    mock_build, mock_set, mock_nil, mock_add,
                                    mock_clean_vol, mock_clean_net, mock_save,
-                                   mock_start, mock_finish):
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save)
         mock_build_run.side_effect = exception.BuildAbortException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7086,7 +7089,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                                          mock.ANY, mock.ANY)
         mock_nil.assert_called_once_with(self.instance)
         mock_set.assert_called_once_with(self.instance, clean_task_state=True)
+        mock_failed.assert_called_once_with(self.node)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7097,8 +7102,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def test_rescheduled_exception(self, mock_build_run,
-                                   mock_build, mock_set, mock_nil,
-                                   mock_save, mock_start, mock_finish):
+                                   mock_build, mock_set, mock_nil, mock_save,
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7145,6 +7150,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.admin_pass, self.injected_files, self.requested_networks,
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])
+        mock_failed.assert_called_once_with(self.node)
 
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@@ -7499,6 +7505,139 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])
 
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_with_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_nil, mock_save, mock_start, mock_finish,
+    ):
+        """Test retry by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+        self._do_build_instance_update(mock_save, reschedule_update=True)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
+            return ret
+
+        with test.nested(
+            mock.patch.object(
+                self.compute, '_do_build_and_run_instance',
+                side_effect=_wrapped_do_build_and_run_instance,
+            ),
+            mock.patch.object(
+                self.compute.network_api, 'get_instance_nw_info',
+            ),
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties=self.filter_properties,
+                accel_uuids=self.accel_uuids,
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save, reschedule_update=True)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_called_once_with(self.context,
+            [self.instance], self.image, self.filter_properties,
+            self.admin_pass, self.injected_files, self.requested_networks,
+            self.security_groups, self.block_device_mapping,
+            request_spec={}, host_lists=[fake_host_list])
+
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
+    @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
+    @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_without_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
+        mock_start, mock_finish,
+    ):
+        """Test failure by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+        self._do_build_instance_update(mock_save)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.FAILED_BY_POLICY, ret)
+            return ret
+
+        with mock.patch.object(
+            self.compute, '_do_build_and_run_instance',
+            side_effect=_wrapped_do_build_and_run_instance,
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties={},
+                accel_uuids=[],
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save)
+        mock_clean_net.assert_called_once_with(self.context, self.instance,
+                                               self.requested_networks)
+        mock_add.assert_called_once_with(self.context, self.instance,
+                                         mock.ANY, mock.ANY,
+                                         fault_message=mock.ANY)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_not_called()
+        mock_set_state.assert_called_once_with(self.instance,
+                                               clean_task_state=True)
+
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -8078,7 +8217,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         nodes.return_value = ['nodename']
         migration_list.return_value = [objects.Migration(
             uuid=uuids.migration, instance_uuid=uuids.instance)]
-        self.assertRaises(exception.RescheduledException,
+        self.assertRaises(exception.GroupAffinityViolation,
                           self.compute._validate_instance_group_policy,
                           self.context, instance, hints)