Merge "Fix failed count for anti-affinity check"

This commit is contained in:
Zuul 2023-06-07 14:35:52 +00:00 committed by Gerrit Code Review
commit 1fe8c4becb
5 changed files with 265 additions and 14 deletions

View File

@ -24,3 +24,11 @@ was rescheduled.
ACTIVE = 'active' # Instance is running ACTIVE = 'active' # Instance is running
FAILED = 'failed' # Instance failed to build and was not rescheduled FAILED = 'failed' # Instance failed to build and was not rescheduled
RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled RESCHEDULED = 'rescheduled' # Instance failed to build, but was rescheduled
# Instance failed to build because of a server group policy violation (such
# as affinity or anti-affinity) and was not rescheduled. The compute manager
# returns early for this result instead of calling _build_failed(), so the
# node's failed build count is not increased.
FAILED_BY_POLICY = 'failed_by_policy'
# Instance failed to build because of a server group policy violation (such
# as affinity or anti-affinity) but was rescheduled. The compute manager
# returns early for this result instead of calling _build_failed(), so the
# node's failed build count is not increased.
RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

View File

@ -1891,11 +1891,8 @@ class ComputeManager(manager.Manager):
else: else:
max_server = 1 max_server = 1
if len(members_on_host) >= max_server: if len(members_on_host) >= max_server:
msg = _("Anti-affinity instance group policy " raise exception.GroupAffinityViolation(
"was violated.") instance_uuid=instance.uuid, policy='Anti-affinity')
raise exception.RescheduledException(
instance_uuid=instance.uuid,
reason=msg)
# NOTE(ganso): The check for affinity below does not work and it # NOTE(ganso): The check for affinity below does not work and it
# can easily be violated because the lock happens in different # can easily be violated because the lock happens in different
@ -1905,10 +1902,8 @@ class ComputeManager(manager.Manager):
elif group.policy and 'affinity' == group.policy: elif group.policy and 'affinity' == group.policy:
group_hosts = group.get_hosts(exclude=[instance.uuid]) group_hosts = group.get_hosts(exclude=[instance.uuid])
if group_hosts and self.host not in group_hosts: if group_hosts and self.host not in group_hosts:
msg = _("Affinity instance group policy was violated.") raise exception.GroupAffinityViolation(
raise exception.RescheduledException( instance_uuid=instance.uuid, policy='Affinity')
instance_uuid=instance.uuid,
reason=msg)
_do_validation(context, instance, group) _do_validation(context, instance, group)
@ -2348,6 +2343,9 @@ class ComputeManager(manager.Manager):
self.reportclient.delete_allocation_for_instance( self.reportclient.delete_allocation_for_instance(
context, instance.uuid, force=True) context, instance.uuid, force=True)
if result in (build_results.FAILED_BY_POLICY,
build_results.RESCHEDULED_BY_POLICY):
return
if result in (build_results.FAILED, if result in (build_results.FAILED,
build_results.RESCHEDULED): build_results.RESCHEDULED):
self._build_failed(node) self._build_failed(node)
@ -2446,6 +2444,8 @@ class ComputeManager(manager.Manager):
self._nil_out_instance_obj_host_and_node(instance) self._nil_out_instance_obj_host_and_node(instance)
self._set_instance_obj_error_state(instance, self._set_instance_obj_error_state(instance,
clean_task_state=True) clean_task_state=True)
if isinstance(e, exception.RescheduledByPolicyException):
return build_results.FAILED_BY_POLICY
return build_results.FAILED return build_results.FAILED
LOG.debug(e.format_message(), instance=instance) LOG.debug(e.format_message(), instance=instance)
# This will be used for logging the exception # This will be used for logging the exception
@ -2472,6 +2472,10 @@ class ComputeManager(manager.Manager):
injected_files, requested_networks, security_groups, injected_files, requested_networks, security_groups,
block_device_mapping, request_spec=request_spec, block_device_mapping, request_spec=request_spec,
host_lists=[host_list]) host_lists=[host_list])
if isinstance(e, exception.RescheduledByPolicyException):
return build_results.RESCHEDULED_BY_POLICY
return build_results.RESCHEDULED return build_results.RESCHEDULED
except (exception.InstanceNotFound, except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError): exception.UnexpectedDeletingTaskStateError):
@ -2691,6 +2695,17 @@ class ComputeManager(manager.Manager):
bdms=block_device_mapping) bdms=block_device_mapping)
raise exception.BuildAbortException(instance_uuid=instance.uuid, raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=e.format_message()) reason=e.format_message())
except exception.GroupAffinityViolation as e:
LOG.exception('Failed to build and run instance',
instance=instance)
self._notify_about_instance_usage(context, instance,
'create.error', fault=e)
compute_utils.notify_about_instance_create(
context, instance, self.host,
phase=fields.NotificationPhase.ERROR, exception=e,
bdms=block_device_mapping)
raise exception.RescheduledByPolicyException(
instance_uuid=instance.uuid, reason=str(e))
except Exception as e: except Exception as e:
LOG.exception('Failed to build and run instance', LOG.exception('Failed to build and run instance',
instance=instance) instance=instance)

View File

@ -1502,6 +1502,15 @@ class RescheduledException(NovaException):
"%(reason)s") "%(reason)s")
# Raised when a build has to be re-scheduled because a server group policy
# (affinity or anti-affinity) check failed on this host. It subclasses
# RescheduledException so existing handlers still catch it, while callers can
# use isinstance() to map it to the *_BY_POLICY build results.
# NOTE(review): msg_fmt appears to repeat the parent's format string; confirm
# against RescheduledException before relying on the override.
class RescheduledByPolicyException(RescheduledException):
msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
"%(reason)s")
# Raised by the late server group policy validation when building the instance
# on this host would violate the group's policy. The 'policy' kwarg is the
# display name of the violated policy ('Affinity' or 'Anti-affinity' at the
# call sites in the compute manager).
class GroupAffinityViolation(NovaException):
msg_fmt = _("%(policy)s instance group policy was violated")
class InstanceFaultRollback(NovaException): class InstanceFaultRollback(NovaException):
def __init__(self, inner_exception=None): def __init__(self, inner_exception=None):
message = _("Instance rollback performed due to: %s") message = _("Instance rollback performed due to: %s")

View File

@ -20,6 +20,7 @@ from oslo_config import cfg
from nova.compute import instance_actions from nova.compute import instance_actions
from nova import context from nova import context
from nova.db.main import api as db from nova.db.main import api as db
from nova import objects
from nova import test from nova import test
from nova.tests import fixtures as nova_fixtures from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client from nova.tests.functional.api import client
@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase):
self.assertIn('Invalid input', ex.response.text) self.assertIn('Invalid input', ex.response.text)
self.assertIn('soft-affinity', ex.response.text) self.assertIn('soft-affinity', ex.response.text)
@mock.patch('nova.scheduler.filters.affinity_filter.'
'ServerGroupAffinityFilter.host_passes', return_value=True)
def test_failed_count_with_affinity_violation(self, mock_host_passes):
"""Check that failed_builds is not incremented after a violation of
the late affinity check.

The scheduler's ServerGroupAffinityFilter is mocked to always pass, so
the violation is expected to surface as GroupAffinityViolation during
the build rather than at scheduling time.

https://bugs.launchpad.net/nova/+bug/1996732
"""
created_group = self.api.post_server_groups(self.affinity)
flavor = self.api.get_flavors()[2]
# Disable compute2 so the first instance can only be placed on compute1
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id,
{'status': 'disabled'})
self._boot_a_server_to_group(created_group, flavor=flavor)
# Re-enable compute2 and disable compute1 so the second instance can
# only be placed on compute2, away from the first group member
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
self.admin_api.put_service(compute2_service_id,
{'status': 'enabled'})
compute1_service_id = self.admin_api.get_services(
host=self.compute.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute1_service_id,
{'status': 'disabled'})
# Expects GroupAffinityViolation exception, leaving the server in ERROR
failed_server = self._boot_a_server_to_group(created_group,
flavor=flavor,
expected_status='ERROR')
# The recorded fault should be the retries-exhausted message
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
'hosts available for retrying build failures for '
'instance %s.' % failed_server['id'],
failed_server['fault']['message'])
ctxt = context.get_admin_context()
computes = objects.ComputeNodeList.get_all(ctxt)
# No compute node may have had its failed_builds stat bumped; the stat
# is compared against the string '0'
for node in computes:
self.assertEqual(node.stats.get('failed_builds'), '0')
@mock.patch('nova.scheduler.filters.affinity_filter.'
'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
"""Check that failed_builds is not incremented after a violation of
the late anti-affinity check.

The scheduler's ServerGroupAntiAffinityFilter is mocked to always pass,
so the violation is expected to surface as GroupAffinityViolation
during the build rather than at scheduling time.

https://bugs.launchpad.net/nova/+bug/1996732
"""
created_group = self.api.post_server_groups(self.anti_affinity)
flavor = self.api.get_flavors()[2]
# Disable compute2 so that both instances are scheduled on the same
# remaining host
with utils.temporary_mutation(self.admin_api, microversion='2.53'):
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id,
{'status': 'disabled'})
self._boot_a_server_to_group(created_group, flavor=flavor)
# Expects GroupAffinityViolation exception, leaving the server in ERROR
failed_server = self._boot_a_server_to_group(created_group,
flavor=flavor,
expected_status='ERROR')
# The recorded fault should be the retries-exhausted message
self.assertEqual('Exceeded maximum number of retries. Exhausted all '
'hosts available for retrying build failures for '
'instance %s.' % failed_server['id'],
failed_server['fault']['message'])
ctxt = context.get_admin_context()
computes = objects.ComputeNodeList.get_all(ctxt)
# No compute node may have had its failed_builds stat bumped; the stat
# is compared against the string '0'
for node in computes:
self.assertEqual(node.stats.get('failed_builds'), '0')
class ServerGroupAffinityConfTest(ServerGroupTestBase): class ServerGroupAffinityConfTest(ServerGroupTestBase):
api_major_version = 'v2.1' api_major_version = 'v2.1'

View File

@ -6962,13 +6962,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.compute = manager.ComputeManager() self.compute = manager.ComputeManager()
self._test_build_and_run_instance() self._test_build_and_run_instance()
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(objects.InstanceActionEvent, @mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure') 'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start') @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save') @mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
def _test_build_and_run_instance(self, mock_build, mock_save, def _test_build_and_run_instance(self, mock_build, mock_save,
mock_start, mock_finish): mock_start, mock_finish, mock_succeeded):
self._do_build_instance_update(mock_save) self._do_build_instance_update(mock_save)
orig_do_build_and_run = self.compute._do_build_and_run_instance orig_do_build_and_run = self.compute._do_build_and_run_instance
@ -7001,6 +7002,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.requested_networks, self.security_groups, self.requested_networks, self.security_groups,
self.block_device_mapping, self.node, self.limits, self.block_device_mapping, self.node, self.limits,
self.filter_properties, {}, self.accel_uuids) self.filter_properties, {}, self.accel_uuids)
mock_succeeded.assert_called_once_with(self.node)
# This test when sending an icehouse compatible rpc call to juno compute # This test when sending an icehouse compatible rpc call to juno compute
# node, NetworkRequest object can load from three items tuple. # node, NetworkRequest object can load from three items tuple.
@ -7028,6 +7030,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.assertEqual('10.0.0.1', str(requested_network.address)) self.assertEqual('10.0.0.1', str(requested_network.address))
self.assertEqual(uuids.port_instance, requested_network.port_id) self.assertEqual(uuids.port_instance, requested_network.port_id)
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(objects.InstanceActionEvent, @mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure') 'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start') @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@ -7043,7 +7046,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def test_build_abort_exception(self, mock_build_run, def test_build_abort_exception(self, mock_build_run,
mock_build, mock_set, mock_nil, mock_add, mock_build, mock_set, mock_nil, mock_add,
mock_clean_vol, mock_clean_net, mock_save, mock_clean_vol, mock_clean_net, mock_save,
mock_start, mock_finish): mock_start, mock_finish, mock_failed):
self._do_build_instance_update(mock_save) self._do_build_instance_update(mock_save)
mock_build_run.side_effect = exception.BuildAbortException(reason='', mock_build_run.side_effect = exception.BuildAbortException(reason='',
instance_uuid=self.instance.uuid) instance_uuid=self.instance.uuid)
@ -7086,7 +7089,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.ANY, mock.ANY) mock.ANY, mock.ANY)
mock_nil.assert_called_once_with(self.instance) mock_nil.assert_called_once_with(self.instance)
mock_set.assert_called_once_with(self.instance, clean_task_state=True) mock_set.assert_called_once_with(self.instance, clean_task_state=True)
mock_failed.assert_called_once_with(self.node)
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(objects.InstanceActionEvent, @mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure') 'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start') @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@ -7097,8 +7102,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance') @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
def test_rescheduled_exception(self, mock_build_run, def test_rescheduled_exception(self, mock_build_run,
mock_build, mock_set, mock_nil, mock_build, mock_set, mock_nil, mock_save,
mock_save, mock_start, mock_finish): mock_start, mock_finish, mock_failed):
self._do_build_instance_update(mock_save, reschedule_update=True) self._do_build_instance_update(mock_save, reschedule_update=True)
mock_build_run.side_effect = exception.RescheduledException(reason='', mock_build_run.side_effect = exception.RescheduledException(reason='',
instance_uuid=self.instance.uuid) instance_uuid=self.instance.uuid)
@ -7145,6 +7150,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.admin_pass, self.injected_files, self.requested_networks, self.admin_pass, self.injected_files, self.requested_networks,
self.security_groups, self.block_device_mapping, self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list]) request_spec={}, host_lists=[fake_host_list])
mock_failed.assert_called_once_with(self.node)
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim') @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance') @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@ -7499,6 +7505,139 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.security_groups, self.block_device_mapping, self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list]) request_spec={}, host_lists=[fake_host_list])
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
new=mock.MagicMock())
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(manager.ComputeManager,
'_validate_instance_group_policy')
def test_group_affinity_violation_exception_with_retry(
self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
mock_nil, mock_save, mock_start, mock_finish,
):
"""Test that a reschedule triggered by the affinity or anti-affinity
validation check does not increase the failed build count.

_validate_instance_group_policy is mocked to raise
GroupAffinityViolation. The build is expected to return
RESCHEDULED_BY_POLICY, hand the instance back to the conductor via
build_instances, and call neither _build_failed nor _build_succeeded.
"""
# NOTE: mock arguments follow the decorators bottom-up; instance_claim
# is patched with new= and therefore injects no argument.
self._do_build_instance_update(mock_save, reschedule_update=True)
mock_validate_policy.side_effect = \
exception.GroupAffinityViolation(
instance_uuid=self.instance.uuid, policy="Affinity")
orig_do_build_and_run = self.compute._do_build_and_run_instance
# Wrap the real method so that its return value can be asserted
def _wrapped_do_build_and_run_instance(*args, **kwargs):
ret = orig_do_build_and_run(*args, **kwargs)
self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
return ret
with test.nested(
mock.patch.object(
self.compute, '_do_build_and_run_instance',
side_effect=_wrapped_do_build_and_run_instance,
),
mock.patch.object(
self.compute.network_api, 'get_instance_nw_info',
),
):
self.compute.build_and_run_instance(
self.context, self.instance,
self.image, request_spec={},
filter_properties=self.filter_properties,
accel_uuids=self.accel_uuids,
injected_files=self.injected_files,
admin_password=self.admin_pass,
requested_networks=self.requested_networks,
security_groups=self.security_groups,
block_device_mapping=self.block_device_mapping, node=self.node,
limits=self.limits, host_list=fake_host_list)
# A policy-driven reschedule must not touch the node's build counters
mock_succeeded.assert_not_called()
mock_failed.assert_not_called()
self._instance_action_events(mock_start, mock_finish)
self._assert_build_instance_update(mock_save, reschedule_update=True)
mock_nil.assert_called_once_with(self.instance)
# The instance is handed back to the conductor for rescheduling
mock_build.assert_called_once_with(self.context,
[self.instance], self.image, self.filter_properties,
self.admin_pass, self.injected_files, self.requested_networks,
self.security_groups, self.block_device_mapping,
request_spec={}, host_lists=[fake_host_list])
@mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
new=mock.MagicMock())
@mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
@mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
@mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
@mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
@mock.patch.object(manager.ComputeManager, '_build_failed')
@mock.patch.object(manager.ComputeManager, '_build_succeeded')
@mock.patch.object(manager.ComputeManager,
'_validate_instance_group_policy')
def test_group_affinity_violation_exception_without_retry(
self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
mock_start, mock_finish,
):
"""Test that a build failure caused by the affinity or anti-affinity
validation check does not increase the failed build count when the
build is not rescheduled.

_validate_instance_group_policy is mocked to raise
GroupAffinityViolation and the build is invoked with empty
filter_properties. The build is expected to return FAILED_BY_POLICY,
put the instance into the error state, and call neither _build_failed
nor _build_succeeded.
"""
# NOTE: mock arguments follow the decorators bottom-up; instance_claim
# is patched with new= and therefore injects no argument.
self._do_build_instance_update(mock_save)
mock_validate_policy.side_effect = \
exception.GroupAffinityViolation(
instance_uuid=self.instance.uuid, policy="Affinity")
orig_do_build_and_run = self.compute._do_build_and_run_instance
# Wrap the real method so that its return value can be asserted
def _wrapped_do_build_and_run_instance(*args, **kwargs):
ret = orig_do_build_and_run(*args, **kwargs)
self.assertEqual(build_results.FAILED_BY_POLICY, ret)
return ret
with mock.patch.object(
self.compute, '_do_build_and_run_instance',
side_effect=_wrapped_do_build_and_run_instance,
):
self.compute.build_and_run_instance(
self.context, self.instance,
self.image, request_spec={},
filter_properties={},
accel_uuids=[],
injected_files=self.injected_files,
admin_password=self.admin_pass,
requested_networks=self.requested_networks,
security_groups=self.security_groups,
block_device_mapping=self.block_device_mapping, node=self.node,
limits=self.limits, host_list=fake_host_list)
# A policy-driven failure must not touch the node's build counters
mock_succeeded.assert_not_called()
mock_failed.assert_not_called()
self._instance_action_events(mock_start, mock_finish)
self._assert_build_instance_update(mock_save)
mock_clean_net.assert_called_once_with(self.context, self.instance,
self.requested_networks)
mock_add.assert_called_once_with(self.context, self.instance,
mock.ANY, mock.ANY, fault_message=mock.ANY)
mock_nil.assert_called_once_with(self.instance)
# No reschedule: the conductor must not be asked to build again
mock_build.assert_not_called()
mock_set_state.assert_called_once_with(self.instance,
clean_task_state=True)
@mock.patch.object(objects.InstanceActionEvent, @mock.patch.object(objects.InstanceActionEvent,
'event_finish_with_failure') 'event_finish_with_failure')
@mock.patch.object(objects.InstanceActionEvent, 'event_start') @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@ -8078,7 +8217,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
nodes.return_value = ['nodename'] nodes.return_value = ['nodename']
migration_list.return_value = [objects.Migration( migration_list.return_value = [objects.Migration(
uuid=uuids.migration, instance_uuid=uuids.instance)] uuid=uuids.migration, instance_uuid=uuids.instance)]
self.assertRaises(exception.RescheduledException, self.assertRaises(exception.GroupAffinityViolation,
self.compute._validate_instance_group_policy, self.compute._validate_instance_group_policy,
self.context, instance, hints) self.context, instance, hints)