From d94ea23d3d64ecd3f2539a337c066487b938fcad Mon Sep 17 00:00:00 2001 From: Sundar Nadathur Date: Mon, 30 Mar 2020 19:24:30 -0700 Subject: [PATCH] Delete ARQs by UUID if Cyborg ARQ bind fails. During the reivew of the cyborg series it was noted that in some cases ARQs could be leaked during binding. See https://review.opendev.org/#/c/673735/46/nova/conductor/manager.py@1632 This change adds a delete_arqs_by_uuid function that can delete unbound ARQs by instance uuid. This change modifies build_instances and schedule_and_build_instances to handel the AcceleratorRequestBindingFailed exception raised when binding fails and clean up instance arqs. Co-Authored-By: Wenping Song Closes-Bug: #1872730 Change-Id: I86c2f00e2368fe02211175e7328b2cd9c0ebf41b Blueprint: nova-cyborg-interaction --- nova/accelerator/cyborg.py | 30 ++++++++++++++-- nova/conductor/manager.py | 39 ++++++++++++--------- nova/exception.py | 8 +++++ nova/tests/unit/accelerator/test_cyborg.py | 32 +++++++++++++++-- nova/tests/unit/conductor/test_conductor.py | 32 ++++++++++++++--- 5 files changed, 114 insertions(+), 27 deletions(-) diff --git a/nova/accelerator/cyborg.py b/nova/accelerator/cyborg.py index d6b2b0edf912..bf74722a424d 100644 --- a/nova/accelerator/cyborg.py +++ b/nova/accelerator/cyborg.py @@ -212,10 +212,11 @@ class _CyborgClient(object): resp, err_msg = self._call_cyborg(self._client.patch, self.ARQ_URL, json=patch_list) if err_msg: + arq_uuids = bindings.keys() msg = _(' Binding failed for ARQ UUIDs: ') - err_msg = err_msg + msg + ','.join(bindings.keys()) - raise exception.AcceleratorRequestOpFailed( - op=_('bind'), msg=err_msg) + err_msg = err_msg + msg + ','.join(arq_uuids) + raise exception.AcceleratorRequestBindingFailed( + arqs=arq_uuids, msg=err_msg) def get_arqs_for_instance(self, instance_uuid, only_resolved=False): """Get ARQs for the instance. @@ -278,3 +279,26 @@ class _CyborgClient(object): msg = err_msg + _(' Instance %s') % instance_uuid raise exception.AcceleratorRequestOpFailed( op=_('delete'), msg=msg) + + def delete_arqs_by_uuid(self, arq_uuids): + """Delete the specified ARQs, unbinding them if needed. + + This is meant to be used to clean up ARQs that have failed to bind + to an instance. So delete_arqs_for_instance() is not applicable. + + This Cyborg API call is NOT idempotent, i.e., if called more than + once, the 2nd and later calls will throw errors. + + If this fails, an error is logged but no exception is raised + because this cleans up Cyborg resources, but should otherwise + not affect instance spawn. + + :params arq_uuids: dict_keys() of ARQ UUIDs + """ + arq_uuid_str = ','.join(arq_uuids) + params = {'arqs': arq_uuid_str} + resp, err_msg = self._call_cyborg(self._client.delete, + self.ARQ_URL, params=params) + if err_msg: + # No point raising an exception. + LOG.error('Failed to delete ARQs %s', arq_uuid_str) diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 5c6f50724562..1a3be3e84fac 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base): host.service_host, host.nodename, alts, instance=instance) try: - resource_provider_mapping = ( - local_reqspec.get_request_group_mapping()) - accel_uuids = self._create_and_bind_arqs( - context, instance.uuid, instance.flavor.extra_specs, - host.nodename, resource_provider_mapping) + accel_uuids = self._create_and_bind_arq_for_instance( + context, instance, host, local_reqspec) except Exception as exc: LOG.exception('Failed to reschedule. Reason: %s', exc) - self._cleanup_when_reschedule_fails(context, instance, exc, - legacy_request_spec, requested_networks) + self._cleanup_when_reschedule_fails( + context, instance, exc, legacy_request_spec, + requested_networks) continue self.compute_rpcapi.build_and_run_instance(context, @@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base): limits=host.limits, host_list=host_list, accel_uuids=accel_uuids) + def _create_and_bind_arq_for_instance(self, context, instance, host, + request_spec): + try: + resource_provider_mapping = ( + request_spec.get_request_group_mapping()) + # Using nodename instead of hostname. See: + # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa + return self._create_and_bind_arqs( + context, instance.uuid, instance.flavor.extra_specs, + host.nodename, resource_provider_mapping) + except exception.AcceleratorRequestBindingFailed as exc: + # If anything failed here we need to cleanup and bail out. + cyclient = cyborg.get_client(context) + cyclient.delete_arqs_by_uuid(exc.arqs) + raise + def _schedule_instances(self, context, request_spec, instance_uuids=None, return_alternates=False): scheduler_utils.setup_instance_group(context, request_spec) @@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base): # this one. continue - accel_uuids = [] try: - resource_provider_mapping = ( - request_spec.get_request_group_mapping()) - # Using nodename instead of hostname. See: - # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa - accel_uuids = self._create_and_bind_arqs( - context, instance.uuid, instance.flavor.extra_specs, - host.nodename, resource_provider_mapping) + accel_uuids = self._create_and_bind_arq_for_instance( + context, instance, host, request_spec) except Exception as exc: - # If anything failed here we need to cleanup and bail out. with excutils.save_and_reraise_exception(): self._cleanup_build_artifacts( context, exc, instances, build_requests, request_specs, diff --git a/nova/exception.py b/nova/exception.py index 022f13d34862..26ee45d20a54 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2316,6 +2316,14 @@ class AcceleratorRequestOpFailed(NovaException): msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s") +class AcceleratorRequestBindingFailed(NovaException): + msg_fmt = _("Failed to bind accelerator requests: %(msg)s") + + def __init__(self, arqs, **kwargs): + super().__init__(message=self.msg_fmt, **kwargs) + self.arqs = arqs or [] + + class InvalidLibvirtGPUConfig(NovaException): msg_fmt = _('Invalid configuration for GPU devices: %(reason)s') diff --git a/nova/tests/unit/accelerator/test_cyborg.py b/nova/tests/unit/accelerator/test_cyborg.py index 796286136649..f0736b2ccf40 100644 --- a/nova/tests/unit/accelerator/test_cyborg.py +++ b/nova/tests/unit/accelerator/test_cyborg.py @@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase): called_params = mock_cyborg_patch.call_args.kwargs['json'] self.assertEqual(sorted(called_params), sorted(patch_list)) + @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid') @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg') - def test_bind_arqs_exception(self, mock_call_cyborg): + def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs): # If Cyborg returns invalid response, raise exception. bindings, _ = self._get_bound_arqs() mock_call_cyborg.return_value = None, 'Some error' - self.assertRaises(exception.AcceleratorRequestOpFailed, - self.client.bind_arqs, bindings=bindings) + self.assertRaises(exception.AcceleratorRequestBindingFailed, + self.client.bind_arqs, bindings=bindings) + mock_del_arqs.assert_not_called() @mock.patch('keystoneauth1.adapter.Adapter.get') def test_get_arqs_for_instance(self, mock_cyborg_get): @@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase): expected_msg = ('Failed to delete accelerator requests: ' + err_msg + ' Instance ' + instance_uuid) self.assertEqual(expected_msg, exc.format_message()) + + @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg') + def test_delete_arqs_by_uuid(self, mock_call_cyborg): + # Happy path + mock_call_cyborg.return_value = ('Some Value', None) + _, bound_arqs = self._get_bound_arqs() + arq_uuids = [arq['uuid'] for arq in bound_arqs] + arq_uuid_str = ','.join(arq_uuids) + self.client.delete_arqs_by_uuid(arq_uuids) + mock_call_cyborg.assert_called_once_with(mock.ANY, + self.client.ARQ_URL, params={'arqs': arq_uuid_str}) + + @mock.patch('nova.accelerator.cyborg.LOG.error') + @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg') + def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log): + mock_call_cyborg.return_value = (None, 'Some error') + _, bound_arqs = self._get_bound_arqs() + arq_uuids = [arq['uuid'] for arq in bound_arqs] + arq_uuid_str = ','.join(arq_uuids) + self.client.delete_arqs_by_uuid(arq_uuids) + mock_call_cyborg.assert_called_once_with(mock.ANY, + self.client.ARQ_URL, params={'arqs': arq_uuid_str}) + mock_log.assert_called_once_with('Failed to delete ARQs %s', + arq_uuid_str) diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 79e08f2d3250..946685fce810 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -26,6 +26,7 @@ from oslo_utils import timeutils from oslo_versionedobjects import exception as ovo_exc import six +from nova.accelerator import cyborg from nova import block_device from nova.compute import flavors from nova.compute import rpcapi as compute_rpcapi @@ -48,6 +49,7 @@ from nova import objects from nova.objects import base as obj_base from nova.objects import block_device as block_device_obj from nova.objects import fields +from nova.objects import request_spec from nova.scheduler.client import query from nova.scheduler import utils as scheduler_utils from nova import test @@ -583,7 +585,7 @@ class _BaseTaskTestCase(object): mock_getaz.return_value = 'myaz' mock_create_bind_arqs.side_effect = ( - exc.AcceleratorRequestOpFailed(op='', msg='')) + exc.AcceleratorRequestBindingFailed(arqs=[], msg='')) self.conductor.build_instances(self.context, instances=instances, @@ -605,10 +607,10 @@ class _BaseTaskTestCase(object): # in the above flow. So, we compare the fields instead. mock_cleanup.assert_has_calls([ mock.call(self.context, test.MatchType(objects.Instance), - test.MatchType(exc.AcceleratorRequestOpFailed), + test.MatchType(exc.AcceleratorRequestBindingFailed), test.MatchType(dict), None), mock.call(self.context, test.MatchType(objects.Instance), - test.MatchType(exc.AcceleratorRequestOpFailed), + test.MatchType(exc.AcceleratorRequestBindingFailed), test.MatchType(dict), None), ]) call_list = mock_cleanup.call_args_list @@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): self, mock_create_bind_arqs, mock_cleanup): # Exceptions in _create_and_bind_arqs result in cleanup mock_create_bind_arqs.side_effect = ( - exc.AcceleratorRequestOpFailed(op='', msg='')) + exc.AcceleratorRequestBindingFailed(arqs=[], msg='')) try: self._do_schedule_and_build_instances_test(self.params) - except exc.AcceleratorRequestOpFailed: + except exc.AcceleratorRequestBindingFailed: pass mock_cleanup.assert_called_once_with( @@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): self.params['block_device_mapping'], self.params['tags'], mock.ANY) + @mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping") + @mock.patch.object(cyborg, "get_client") + @mock.patch.object( + conductor_manager.ComputeTaskManager, '_create_and_bind_arqs') + def test__create_and_bind_arq_for_instance( + self, mock_create_bind_arqs, mock_client, mock_request_mappings): + # Exceptions in _create_and_bind_arqs result in cleanup + arqs = ["fake-arq-uuid"] + mock_create_bind_arqs.side_effect = ( + exc.AcceleratorRequestBindingFailed(arqs=arqs, msg='')) + mock_client.return_value = mock.Mock() + instance = mock.Mock() + instance.uuid = "fake-uuid" + ex = self.assertRaises(exc.AcceleratorRequestBindingFailed, + self.conductor._create_and_bind_arq_for_instance, + None, instance, mock.Mock(), request_spec.RequestSpec()) + + self.assertIn('Failed to bind accelerator requests', ex.message) + mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs) + def test_map_instance_to_cell_already_mapped(self): """Tests a scenario where an instance is already mapped to a cell during scheduling.