From 56d320a203a13f262a2e94e491af222032e453d3 Mon Sep 17 00:00:00 2001
From: Yusuke Okada
Date: Wed, 8 Feb 2023 22:10:31 -0500
Subject: [PATCH] Fix failed count for anti-affinity check

The late anti-affinity check runs in the compute manager so that
parallel scheduling requests cannot violate the anti-affinity server
group policy. When the check fails, the instance is re-scheduled.
However, this failure is counted as a real instance boot failure of
the compute host and can lead to de-prioritization of the compute
host in the scheduler via the BuildFailureWeigher. As the late
anti-affinity check does not indicate any fault of the compute host
itself, it should not be counted towards the build failure counter.
This patch adds new build results to handle this case.

Closes-Bug: #1996732
Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718
Signed-off-by: Yusuke Okada
---
 nova/compute/build_results.py               |   8 ++
 nova/compute/manager.py                     |  33 +++--
 nova/exception.py                           |   9 ++
 nova/tests/functional/test_server_group.py  |  80 +++++++++++
 nova/tests/unit/compute/test_compute_mgr.py | 149 +++++++++++++++++++-
 5 files changed, 265 insertions(+), 14 deletions(-)

diff --git a/nova/compute/build_results.py b/nova/compute/build_results.py
index ca9ed51410f7..a091c89ff653 100644
--- a/nova/compute/build_results.py
+++ b/nova/compute/build_results.py
@@ -24,3 +24,11 @@ was rescheduled.
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed due to a policy violation (such as affinity or
+# anti-affinity) and was not rescheduled. In this case, the node's
+# failed count is not increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed due to a policy violation (such as affinity or
+# anti-affinity) but was rescheduled. In this case, the node's
+# failed count is not increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'
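For context: the counter in question feeds the scheduler's
BuildFailureWeigher, which weighs hosts by the `failed_builds` stat with
a large negative multiplier, so every counted failure pushes a host down
the placement order. Below is a paraphrased sketch of that weigher, not
code from this patch; it is modelled on nova's scheduler weigher
(nova/scheduler/weights/compute.py) and exact details may vary between
releases.

    # Paraphrased sketch of nova's BuildFailureWeigher (not part of this
    # patch). Hosts are weighed by failed_builds with a negated, large
    # multiplier, so counted build failures de-prioritize the host.
    import nova.conf
    from nova.scheduler import utils
    from nova.scheduler import weights

    CONF = nova.conf.CONF


    class BuildFailureWeigher(weights.BaseHostWeigher):
        def weight_multiplier(self, host_state):
            # Negated so that more failures means a lower final weight.
            return -1.0 * utils.get_weight_multiplier(
                host_state, 'build_failure_weight_multiplier',
                CONF.filter_scheduler.build_failure_weight_multiplier)

        def _weigh_object(self, host_state, weight_properties):
            # failed_builds is the stat that _build_failed() and
            # _build_succeeded() maintain on the compute node.
            return host_state.failed_builds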
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 5c42aa4d89a1..e99263a69077 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1891,11 +1891,8 @@ class ComputeManager(manager.Manager):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')
 
             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1905,10 +1902,8 @@ class ComputeManager(manager.Manager):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')
 
         _do_validation(context, instance, group)
 
@@ -2348,6 +2343,9 @@ class ComputeManager(manager.Manager):
                     self.reportclient.delete_allocation_for_instance(
                         context, instance.uuid, force=True)
 
+                    if result in (build_results.FAILED_BY_POLICY,
+                                  build_results.RESCHEDULED_BY_POLICY):
+                        return
                     if result in (build_results.FAILED,
                                   build_results.RESCHEDULED):
                         self._build_failed(node)
@@ -2446,6 +2444,8 @@ class ComputeManager(manager.Manager):
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2472,6 +2472,10 @@ class ComputeManager(manager.Manager):
                     injected_files, requested_networks, security_groups,
                     block_device_mapping, request_spec=request_spec,
                     host_lists=[host_list])
+
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.RESCHEDULED_BY_POLICY
+
                 return build_results.RESCHEDULED
             except (exception.InstanceNotFound,
                     exception.UnexpectedDeletingTaskStateError):
@@ -2691,6 +2695,17 @@ class ComputeManager(manager.Manager):
                     bdms=block_device_mapping)
                 raise exception.BuildAbortException(instance_uuid=instance.uuid,
                         reason=e.format_message())
+            except exception.GroupAffinityViolation as e:
+                LOG.exception('Failed to build and run instance',
+                              instance=instance)
+                self._notify_about_instance_usage(context, instance,
+                        'create.error', fault=e)
+                compute_utils.notify_about_instance_create(
+                    context, instance, self.host,
+                    phase=fields.NotificationPhase.ERROR, exception=e,
+                    bdms=block_device_mapping)
+                raise exception.RescheduledByPolicyException(
+                    instance_uuid=instance.uuid, reason=str(e))
             except Exception as e:
                 LOG.exception('Failed to build and run instance',
                               instance=instance)
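The manager hunks above combine into a simple contract: a late policy
violation surfaces as GroupAffinityViolation, is converted into
RescheduledByPolicyException, and maps to one of the *_BY_POLICY
results, which the build callback then skips when updating the counter.
The following is a self-contained toy model of that bookkeeping, with
stand-in names rather than nova code; it also assumes nova's
reset-on-success behaviour for the counter.

    # Toy model (hypothetical stand-ins, not nova code) of the dispatch
    # this patch introduces: policy-violation results leave the
    # failed-build counter untouched.
    ACTIVE = 'active'
    FAILED = 'failed'
    RESCHEDULED = 'rescheduled'
    FAILED_BY_POLICY = 'failed_by_policy'
    RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'


    class FakeNode:
        def __init__(self):
            self.failed_builds = 0


    def record_build_result(node, result):
        """Mirror of the new 'finally' logic in build_and_run_instance."""
        if result in (FAILED_BY_POLICY, RESCHEDULED_BY_POLICY):
            return  # late policy check failed: not the host's fault
        if result in (FAILED, RESCHEDULED):
            node.failed_builds += 1  # what BuildFailureWeigher later reads
        else:
            node.failed_builds = 0  # a success resets the counter


    node = FakeNode()
    record_build_result(node, RESCHEDULED_BY_POLICY)
    assert node.failed_builds == 0  # unchanged by a policy-driven reschedule
    record_build_result(node, RESCHEDULED)
    assert node.failed_builds == 1  # a genuine failure still counts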
violated") + + class InstanceFaultRollback(NovaException): def __init__(self, inner_exception=None): message = _("Instance rollback performed due to: %s") diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py index 01e3547f7e59..ee608d904410 100644 --- a/nova/tests/functional/test_server_group.py +++ b/nova/tests/functional/test_server_group.py @@ -20,6 +20,7 @@ from oslo_config import cfg from nova.compute import instance_actions from nova import context from nova.db.main import api as db +from nova import objects from nova import test from nova.tests import fixtures as nova_fixtures from nova.tests.functional.api import client @@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase): self.assertIn('Invalid input', ex.response.text) self.assertIn('soft-affinity', ex.response.text) + @mock.patch('nova.scheduler.filters.affinity_filter.' + 'ServerGroupAffinityFilter.host_passes', return_value=True) + def test_failed_count_with_affinity_violation(self, mock_host_passes): + """Check failed count not incremented after violation of the late + affinity check. https://bugs.launchpad.net/nova/+bug/1996732 + """ + + created_group = self.api.post_server_groups(self.affinity) + flavor = self.api.get_flavors()[2] + + # Ensure the first instance is on compute1 + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + compute2_service_id = self.admin_api.get_services( + host=self.compute2.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute2_service_id, + {'status': 'disabled'}) + + self._boot_a_server_to_group(created_group, flavor=flavor) + + # Ensure the second instance is on compute2 + with utils.temporary_mutation(self.admin_api, microversion='2.53'): + self.admin_api.put_service(compute2_service_id, + {'status': 'enabled'}) + compute1_service_id = self.admin_api.get_services( + host=self.compute.host, binary='nova-compute')[0]['id'] + self.admin_api.put_service(compute1_service_id, + {'status': 'disabled'}) + + # Expects GroupAffinityViolation exception + failed_server = self._boot_a_server_to_group(created_group, + flavor=flavor, + expected_status='ERROR') + + self.assertEqual('Exceeded maximum number of retries. Exhausted all ' + 'hosts available for retrying build failures for ' + 'instance %s.' % failed_server['id'], + failed_server['fault']['message']) + + ctxt = context.get_admin_context() + computes = objects.ComputeNodeList.get_all(ctxt) + + for node in computes: + self.assertEqual(node.stats.get('failed_builds'), '0') + + @mock.patch('nova.scheduler.filters.affinity_filter.' + 'ServerGroupAntiAffinityFilter.host_passes', return_value=True) + def test_failed_count_with_anti_affinity_violation(self, mock_host_passes): + """Check failed count after violation of the late affinity check. 
diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py
index 01e3547f7e59..ee608d904410 100644
--- a/nova/tests/functional/test_server_group.py
+++ b/nova/tests/functional/test_server_group.py
@@ -20,6 +20,7 @@ from oslo_config import cfg
 from nova.compute import instance_actions
 from nova import context
 from nova.db.main import api as db
+from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -499,6 +500,85 @@ class ServerGroupTestV21(ServerGroupTestBase):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)
 
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check the failed count stays at zero after a late affinity
+        check violation. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+                host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                       {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
+        """Check the failed count stays at zero after a late anti-affinity
+        check violation. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
 
 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'
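The functional tests read the counter back through ComputeNode.stats,
where values are serialized as strings, hence the comparison with '0'.
A hypothetical helper using the same objects API as the tests above
(illustrative only, not part of the patch):

    # Hypothetical snippet (not part of the patch): reading the same
    # failed_builds stat the functional tests assert on.
    from nova import context
    from nova import objects


    def hosts_with_build_failures():
        """Yield (host, failed_builds) for nodes with a non-zero count."""
        ctxt = context.get_admin_context()
        for node in objects.ComputeNodeList.get_all(ctxt):
            # stats values are strings, so convert before comparing.
            failed = int(node.stats.get('failed_builds', '0'))
            if failed:
                yield node.host, failed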
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 73c9d32197ee..42fc06bd39f0 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -6962,13 +6962,14 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.compute = manager.ComputeManager()
         self._test_build_and_run_instance()
 
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
     @mock.patch.object(objects.Instance, 'save')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def _test_build_and_run_instance(self, mock_build, mock_save,
-                                     mock_start, mock_finish):
+                                     mock_start, mock_finish, mock_succeeded):
         self._do_build_instance_update(mock_save)
 
         orig_do_build_and_run = self.compute._do_build_and_run_instance
@@ -7001,6 +7002,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.requested_networks, self.security_groups,
             self.block_device_mapping, self.node, self.limits,
             self.filter_properties, {}, self.accel_uuids)
+        mock_succeeded.assert_called_once_with(self.node)
 
     # This test when sending an icehouse compatible rpc call to juno compute
     # node, NetworkRequest object can load from three items tuple.
@@ -7028,6 +7030,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         self.assertEqual('10.0.0.1', str(requested_network.address))
         self.assertEqual(uuids.port_instance, requested_network.port_id)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7043,7 +7046,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     def test_build_abort_exception(self, mock_build_run, mock_build,
                                    mock_set, mock_nil, mock_add,
                                    mock_clean_vol, mock_clean_net, mock_save,
-                                   mock_start, mock_finish):
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save)
         mock_build_run.side_effect = exception.BuildAbortException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7086,7 +7089,9 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
                                       mock.ANY, mock.ANY)
         mock_nil.assert_called_once_with(self.instance)
         mock_set.assert_called_once_with(self.instance, clean_task_state=True)
+        mock_failed.assert_called_once_with(self.node)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7097,8 +7102,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def test_rescheduled_exception(self, mock_build_run,
-                                   mock_build, mock_set, mock_nil,
-                                   mock_save, mock_start, mock_finish):
+                                   mock_build, mock_set, mock_nil, mock_save,
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
             instance_uuid=self.instance.uuid)
@@ -7145,6 +7150,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.admin_pass, self.injected_files, self.requested_networks,
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])
+        mock_failed.assert_called_once_with(self.node)
 
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim')
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@@ -7499,6 +7505,139 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             self.security_groups, self.block_device_mapping,
             request_spec={}, host_lists=[fake_host_list])
 
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_with_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_nil, mock_save, mock_start, mock_finish,
+    ):
+        """Test that a retry triggered by the affinity or anti-affinity
+        validation check does not increase the failed build count.
+        """
+
+        self._do_build_instance_update(mock_save, reschedule_update=True)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
+            return ret
+
+        with test.nested(
+            mock.patch.object(
+                self.compute, '_do_build_and_run_instance',
+                side_effect=_wrapped_do_build_and_run_instance,
+            ),
+            mock.patch.object(
+                self.compute.network_api, 'get_instance_nw_info',
+            ),
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties=self.filter_properties,
+                accel_uuids=self.accel_uuids,
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save, reschedule_update=True)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_called_once_with(self.context,
+            [self.instance], self.image, self.filter_properties,
+            self.admin_pass, self.injected_files, self.requested_networks,
+            self.security_groups, self.block_device_mapping,
+            request_spec={}, host_lists=[fake_host_list])
+
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
+    @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
+    @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_without_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
+        mock_start, mock_finish,
+    ):
+        """Test that a failure raised by the affinity or anti-affinity
+        validation check does not increase the failed build count.
+        """
+
+        self._do_build_instance_update(mock_save)
+        mock_validate_policy.side_effect = \
+            exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.FAILED_BY_POLICY, ret)
+            return ret
+
+        with mock.patch.object(
+            self.compute, '_do_build_and_run_instance',
+            side_effect=_wrapped_do_build_and_run_instance,
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties={},
+                accel_uuids=[],
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save)
+        mock_clean_net.assert_called_once_with(self.context, self.instance,
+                                               self.requested_networks)
+        mock_add.assert_called_once_with(self.context, self.instance,
+            mock.ANY, mock.ANY, fault_message=mock.ANY)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_not_called()
+        mock_set_state.assert_called_once_with(self.instance,
+                                               clean_task_state=True)
+
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -8078,7 +8217,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         nodes.return_value = ['nodename']
         migration_list.return_value = [objects.Migration(
             uuid=uuids.migration, instance_uuid=uuids.instance)]
-        self.assertRaises(exception.RescheduledException,
+        self.assertRaises(exception.GroupAffinityViolation,
                           self.compute._validate_instance_group_policy,
                           self.context, instance, hints)