diff --git a/nova/accelerator/cyborg.py b/nova/accelerator/cyborg.py
index d6b2b0edf912..bf74722a424d 100644
--- a/nova/accelerator/cyborg.py
+++ b/nova/accelerator/cyborg.py
@@ -212,10 +212,11 @@ class _CyborgClient(object):
         resp, err_msg = self._call_cyborg(self._client.patch,
                                           self.ARQ_URL, json=patch_list)
         if err_msg:
+            arq_uuids = bindings.keys()
             msg = _(' Binding failed for ARQ UUIDs: ')
-            err_msg = err_msg + msg + ','.join(bindings.keys())
-            raise exception.AcceleratorRequestOpFailed(
-                op=_('bind'), msg=err_msg)
+            err_msg = err_msg + msg + ','.join(arq_uuids)
+            raise exception.AcceleratorRequestBindingFailed(
+                arqs=arq_uuids, msg=err_msg)
 
     def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
         """Get ARQs for the instance.
@@ -278,3 +279,26 @@ class _CyborgClient(object):
             msg = err_msg + _(' Instance %s') % instance_uuid
             raise exception.AcceleratorRequestOpFailed(
                 op=_('delete'), msg=msg)
+
+    def delete_arqs_by_uuid(self, arq_uuids):
+        """Delete the specified ARQs, unbinding them if needed.
+
+        This is meant to be used to clean up ARQs that have failed to bind
+        to an instance. So delete_arqs_for_instance() is not applicable.
+
+        This Cyborg API call is NOT idempotent, i.e., if called more than
+        once, the 2nd and later calls will throw errors.
+
+        If this fails, an error is logged but no exception is raised
+        because this cleans up Cyborg resources, but should otherwise
+        not affect instance spawn.
+
+        :params arq_uuids: dict_keys() of ARQ UUIDs
+        """
+        arq_uuid_str = ','.join(arq_uuids)
+        params = {'arqs': arq_uuid_str}
+        resp, err_msg = self._call_cyborg(self._client.delete,
+                                          self.ARQ_URL, params=params)
+        if err_msg:
+            # No point raising an exception.
+            LOG.error('Failed to delete ARQs %s', arq_uuid_str)
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 5c6f50724562..1a3be3e84fac 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
                     host.service_host, host.nodename, alts, instance=instance)
 
             try:
-                resource_provider_mapping = (
-                    local_reqspec.get_request_group_mapping())
-                accel_uuids = self._create_and_bind_arqs(
-                    context, instance.uuid, instance.flavor.extra_specs,
-                    host.nodename, resource_provider_mapping)
+                accel_uuids = self._create_and_bind_arq_for_instance(
+                    context, instance, host, local_reqspec)
             except Exception as exc:
                 LOG.exception('Failed to reschedule. Reason: %s', exc)
-                self._cleanup_when_reschedule_fails(context, instance, exc,
-                        legacy_request_spec, requested_networks)
+                self._cleanup_when_reschedule_fails(
+                    context, instance, exc, legacy_request_spec,
+                    requested_networks)
                 continue
 
             self.compute_rpcapi.build_and_run_instance(context,
@@ -861,6 +859,22 @@
                     limits=host.limits, host_list=host_list,
                     accel_uuids=accel_uuids)
 
+    def _create_and_bind_arq_for_instance(self, context, instance, host,
+                                          request_spec):
+        try:
+            resource_provider_mapping = (
+                request_spec.get_request_group_mapping())
+            # Using nodename instead of hostname. See:
+            # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
+            return self._create_and_bind_arqs(
+                context, instance.uuid, instance.flavor.extra_specs,
+                host.nodename, resource_provider_mapping)
+        except exception.AcceleratorRequestBindingFailed as exc:
+            # If anything failed here we need to cleanup and bail out.
+            cyclient = cyborg.get_client(context)
+            cyclient.delete_arqs_by_uuid(exc.arqs)
+            raise
+
     def _schedule_instances(self, context, request_spec,
                             instance_uuids=None, return_alternates=False):
         scheduler_utils.setup_instance_group(context, request_spec)
@@ -1619,17 +1633,10 @@
                 # this one.
                 continue
 
-            accel_uuids = []
             try:
-                resource_provider_mapping = (
-                    request_spec.get_request_group_mapping())
-                # Using nodename instead of hostname. See:
-                # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
-                accel_uuids = self._create_and_bind_arqs(
-                    context, instance.uuid, instance.flavor.extra_specs,
-                    host.nodename, resource_provider_mapping)
+                accel_uuids = self._create_and_bind_arq_for_instance(
+                    context, instance, host, request_spec)
             except Exception as exc:
-                # If anything failed here we need to cleanup and bail out.
                 with excutils.save_and_reraise_exception():
                     self._cleanup_build_artifacts(
                         context, exc, instances, build_requests, request_specs,
diff --git a/nova/exception.py b/nova/exception.py
index 022f13d34862..26ee45d20a54 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -2316,6 +2316,14 @@ class AcceleratorRequestOpFailed(NovaException):
     msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
 
 
+class AcceleratorRequestBindingFailed(NovaException):
+    msg_fmt = _("Failed to bind accelerator requests: %(msg)s")
+
+    def __init__(self, arqs, **kwargs):
+        super().__init__(message=self.msg_fmt, **kwargs)
+        self.arqs = arqs or []
+
+
 class InvalidLibvirtGPUConfig(NovaException):
     msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')
 
diff --git a/nova/tests/unit/accelerator/test_cyborg.py b/nova/tests/unit/accelerator/test_cyborg.py
index 796286136649..f0736b2ccf40 100644
--- a/nova/tests/unit/accelerator/test_cyborg.py
+++ b/nova/tests/unit/accelerator/test_cyborg.py
@@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
         called_params = mock_cyborg_patch.call_args.kwargs['json']
         self.assertEqual(sorted(called_params), sorted(patch_list))
 
+    @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
     @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
-    def test_bind_arqs_exception(self, mock_call_cyborg):
+    def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
         # If Cyborg returns invalid response, raise exception.
         bindings, _ = self._get_bound_arqs()
         mock_call_cyborg.return_value = None, 'Some error'
-        self.assertRaises(exception.AcceleratorRequestOpFailed,
-            self.client.bind_arqs, bindings=bindings)
+        self.assertRaises(exception.AcceleratorRequestBindingFailed,
+            self.client.bind_arqs, bindings=bindings)
+        mock_del_arqs.assert_not_called()
 
     @mock.patch('keystoneauth1.adapter.Adapter.get')
     def test_get_arqs_for_instance(self, mock_cyborg_get):
@@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
         expected_msg = ('Failed to delete accelerator requests: ' +
             err_msg + ' Instance ' + instance_uuid)
         self.assertEqual(expected_msg, exc.format_message())
+
+    @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
+    def test_delete_arqs_by_uuid(self, mock_call_cyborg):
+        # Happy path
+        mock_call_cyborg.return_value = ('Some Value', None)
+        _, bound_arqs = self._get_bound_arqs()
+        arq_uuids = [arq['uuid'] for arq in bound_arqs]
+        arq_uuid_str = ','.join(arq_uuids)
+        self.client.delete_arqs_by_uuid(arq_uuids)
+        mock_call_cyborg.assert_called_once_with(mock.ANY,
+            self.client.ARQ_URL, params={'arqs': arq_uuid_str})
+
+    @mock.patch('nova.accelerator.cyborg.LOG.error')
+    @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
+    def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
+        mock_call_cyborg.return_value = (None, 'Some error')
+        _, bound_arqs = self._get_bound_arqs()
+        arq_uuids = [arq['uuid'] for arq in bound_arqs]
+        arq_uuid_str = ','.join(arq_uuids)
+        self.client.delete_arqs_by_uuid(arq_uuids)
+        mock_call_cyborg.assert_called_once_with(mock.ANY,
+            self.client.ARQ_URL, params={'arqs': arq_uuid_str})
+        mock_log.assert_called_once_with('Failed to delete ARQs %s',
+            arq_uuid_str)
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 79e08f2d3250..946685fce810 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -26,6 +26,7 @@ from oslo_utils import timeutils
 from oslo_versionedobjects import exception as ovo_exc
 import six
 
+from nova.accelerator import cyborg
 from nova import block_device
 from nova.compute import flavors
 from nova.compute import rpcapi as compute_rpcapi
@@ -48,6 +49,7 @@ from nova import objects
 from nova.objects import base as obj_base
 from nova.objects import block_device as block_device_obj
 from nova.objects import fields
+from nova.objects import request_spec
 from nova.scheduler.client import query
 from nova.scheduler import utils as scheduler_utils
 from nova import test
@@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
         mock_getaz.return_value = 'myaz'
 
         mock_create_bind_arqs.side_effect = (
-            exc.AcceleratorRequestOpFailed(op='', msg=''))
+            exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
 
         self.conductor.build_instances(self.context,
                           instances=instances,
@@ -605,10 +607,10 @@
         # in the above flow. So, we compare the fields instead.
         mock_cleanup.assert_has_calls([
             mock.call(self.context, test.MatchType(objects.Instance),
-                test.MatchType(exc.AcceleratorRequestOpFailed),
+                test.MatchType(exc.AcceleratorRequestBindingFailed),
                 test.MatchType(dict), None),
             mock.call(self.context, test.MatchType(objects.Instance),
-                test.MatchType(exc.AcceleratorRequestOpFailed),
+                test.MatchType(exc.AcceleratorRequestBindingFailed),
                 test.MatchType(dict), None),
         ])
         call_list = mock_cleanup.call_args_list
@@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
            self, mock_create_bind_arqs, mock_cleanup):
         # Exceptions in _create_and_bind_arqs result in cleanup
         mock_create_bind_arqs.side_effect = (
-            exc.AcceleratorRequestOpFailed(op='', msg=''))
+            exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
 
         try:
             self._do_schedule_and_build_instances_test(self.params)
-        except exc.AcceleratorRequestOpFailed:
+        except exc.AcceleratorRequestBindingFailed:
             pass
 
         mock_cleanup.assert_called_once_with(
@@ -2561,6 +2563,26 @@
             self.params['block_device_mapping'], self.params['tags'],
             mock.ANY)
 
+    @mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
+    @mock.patch.object(cyborg, "get_client")
+    @mock.patch.object(
+        conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
+    def test__create_and_bind_arq_for_instance(
+            self, mock_create_bind_arqs, mock_client, mock_request_mappings):
+        # Exceptions in _create_and_bind_arqs result in cleanup
+        arqs = ["fake-arq-uuid"]
+        mock_create_bind_arqs.side_effect = (
+            exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
+        mock_client.return_value = mock.Mock()
+        instance = mock.Mock()
+        instance.uuid = "fake-uuid"
+        ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
+            self.conductor._create_and_bind_arq_for_instance,
+            None, instance, mock.Mock(), request_spec.RequestSpec())
+
+        self.assertIn('Failed to bind accelerator requests', ex.message)
+        mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
+
     def test_map_instance_to_cell_already_mapped(self):
         """Tests a scenario where an instance is already mapped to a cell
         during scheduling.