Merge "Delete ARQs by UUID if Cyborg ARQ bind fails."

This commit is contained in:
Zuul 2020-07-27 17:52:43 +00:00 committed by Gerrit Code Review
commit ccecb507ff
5 changed files with 114 additions and 27 deletions

View File

@ -212,10 +212,11 @@ class _CyborgClient(object):
resp, err_msg = self._call_cyborg(self._client.patch, resp, err_msg = self._call_cyborg(self._client.patch,
self.ARQ_URL, json=patch_list) self.ARQ_URL, json=patch_list)
if err_msg: if err_msg:
arq_uuids = bindings.keys()
msg = _(' Binding failed for ARQ UUIDs: ') msg = _(' Binding failed for ARQ UUIDs: ')
err_msg = err_msg + msg + ','.join(bindings.keys()) err_msg = err_msg + msg + ','.join(arq_uuids)
raise exception.AcceleratorRequestOpFailed( raise exception.AcceleratorRequestBindingFailed(
op=_('bind'), msg=err_msg) arqs=arq_uuids, msg=err_msg)
def get_arqs_for_instance(self, instance_uuid, only_resolved=False): def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
"""Get ARQs for the instance. """Get ARQs for the instance.
@ -278,3 +279,26 @@ class _CyborgClient(object):
msg = err_msg + _(' Instance %s') % instance_uuid msg = err_msg + _(' Instance %s') % instance_uuid
raise exception.AcceleratorRequestOpFailed( raise exception.AcceleratorRequestOpFailed(
op=_('delete'), msg=msg) op=_('delete'), msg=msg)
def delete_arqs_by_uuid(self, arq_uuids):
"""Delete the specified ARQs, unbinding them if needed.
This is meant to be used to clean up ARQs that have failed to bind
to an instance. So delete_arqs_for_instance() is not applicable.
This Cyborg API call is NOT idempotent, i.e., if called more than
once, the 2nd and later calls will throw errors.
If this fails, an error is logged but no exception is raised
because this cleans up Cyborg resources, but should otherwise
not affect instance spawn.
:params arq_uuids: dict_keys() of ARQ UUIDs
"""
arq_uuid_str = ','.join(arq_uuids)
params = {'arqs': arq_uuid_str}
resp, err_msg = self._call_cyborg(self._client.delete,
self.ARQ_URL, params=params)
if err_msg:
# No point raising an exception.
LOG.error('Failed to delete ARQs %s', arq_uuid_str)

View File

@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
host.service_host, host.nodename, alts, instance=instance) host.service_host, host.nodename, alts, instance=instance)
try: try:
resource_provider_mapping = ( accel_uuids = self._create_and_bind_arq_for_instance(
local_reqspec.get_request_group_mapping()) context, instance, host, local_reqspec)
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
except Exception as exc: except Exception as exc:
LOG.exception('Failed to reschedule. Reason: %s', exc) LOG.exception('Failed to reschedule. Reason: %s', exc)
self._cleanup_when_reschedule_fails(context, instance, exc, self._cleanup_when_reschedule_fails(
legacy_request_spec, requested_networks) context, instance, exc, legacy_request_spec,
requested_networks)
continue continue
self.compute_rpcapi.build_and_run_instance(context, self.compute_rpcapi.build_and_run_instance(context,
@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
limits=host.limits, host_list=host_list, limits=host.limits, host_list=host_list,
accel_uuids=accel_uuids) accel_uuids=accel_uuids)
def _create_and_bind_arq_for_instance(self, context, instance, host,
request_spec):
try:
resource_provider_mapping = (
request_spec.get_request_group_mapping())
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
except exception.AcceleratorRequestBindingFailed as exc:
# If anything failed here we need to cleanup and bail out.
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
raise
def _schedule_instances(self, context, request_spec, def _schedule_instances(self, context, request_spec,
instance_uuids=None, return_alternates=False): instance_uuids=None, return_alternates=False):
scheduler_utils.setup_instance_group(context, request_spec) scheduler_utils.setup_instance_group(context, request_spec)
@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
# this one. # this one.
continue continue
accel_uuids = []
try: try:
resource_provider_mapping = ( accel_uuids = self._create_and_bind_arq_for_instance(
request_spec.get_request_group_mapping()) context, instance, host, request_spec)
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
except Exception as exc: except Exception as exc:
# If anything failed here we need to cleanup and bail out.
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
self._cleanup_build_artifacts( self._cleanup_build_artifacts(
context, exc, instances, build_requests, request_specs, context, exc, instances, build_requests, request_specs,

View File

@ -2320,6 +2320,14 @@ class AcceleratorRequestOpFailed(NovaException):
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s") msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
class AcceleratorRequestBindingFailed(NovaException):
msg_fmt = _("Failed to bind accelerator requests: %(msg)s")
def __init__(self, arqs, **kwargs):
super().__init__(message=self.msg_fmt, **kwargs)
self.arqs = arqs or []
class InvalidLibvirtGPUConfig(NovaException): class InvalidLibvirtGPUConfig(NovaException):
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s') msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')

View File

@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
called_params = mock_cyborg_patch.call_args.kwargs['json'] called_params = mock_cyborg_patch.call_args.kwargs['json']
self.assertEqual(sorted(called_params), sorted(patch_list)) self.assertEqual(sorted(called_params), sorted(patch_list))
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg') @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_bind_arqs_exception(self, mock_call_cyborg): def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
# If Cyborg returns invalid response, raise exception. # If Cyborg returns invalid response, raise exception.
bindings, _ = self._get_bound_arqs() bindings, _ = self._get_bound_arqs()
mock_call_cyborg.return_value = None, 'Some error' mock_call_cyborg.return_value = None, 'Some error'
self.assertRaises(exception.AcceleratorRequestOpFailed, self.assertRaises(exception.AcceleratorRequestBindingFailed,
self.client.bind_arqs, bindings=bindings) self.client.bind_arqs, bindings=bindings)
mock_del_arqs.assert_not_called()
@mock.patch('keystoneauth1.adapter.Adapter.get') @mock.patch('keystoneauth1.adapter.Adapter.get')
def test_get_arqs_for_instance(self, mock_cyborg_get): def test_get_arqs_for_instance(self, mock_cyborg_get):
@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
expected_msg = ('Failed to delete accelerator requests: ' + expected_msg = ('Failed to delete accelerator requests: ' +
err_msg + ' Instance ' + instance_uuid) err_msg + ' Instance ' + instance_uuid)
self.assertEqual(expected_msg, exc.format_message()) self.assertEqual(expected_msg, exc.format_message())
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
# Happy path
mock_call_cyborg.return_value = ('Some Value', None)
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
@mock.patch('nova.accelerator.cyborg.LOG.error')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
mock_call_cyborg.return_value = (None, 'Some error')
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
mock_log.assert_called_once_with('Failed to delete ARQs %s',
arq_uuid_str)

View File

@ -26,6 +26,7 @@ from oslo_utils import timeutils
from oslo_versionedobjects import exception as ovo_exc from oslo_versionedobjects import exception as ovo_exc
import six import six
from nova.accelerator import cyborg
from nova import block_device from nova import block_device
from nova.compute import flavors from nova.compute import flavors
from nova.compute import rpcapi as compute_rpcapi from nova.compute import rpcapi as compute_rpcapi
@ -48,6 +49,7 @@ from nova import objects
from nova.objects import base as obj_base from nova.objects import base as obj_base
from nova.objects import block_device as block_device_obj from nova.objects import block_device as block_device_obj
from nova.objects import fields from nova.objects import fields
from nova.objects import request_spec
from nova.scheduler.client import query from nova.scheduler.client import query
from nova.scheduler import utils as scheduler_utils from nova.scheduler import utils as scheduler_utils
from nova import test from nova import test
@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
mock_getaz.return_value = 'myaz' mock_getaz.return_value = 'myaz'
mock_create_bind_arqs.side_effect = ( mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg='')) exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
self.conductor.build_instances(self.context, self.conductor.build_instances(self.context,
instances=instances, instances=instances,
@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
# in the above flow. So, we compare the fields instead. # in the above flow. So, we compare the fields instead.
mock_cleanup.assert_has_calls([ mock_cleanup.assert_has_calls([
mock.call(self.context, test.MatchType(objects.Instance), mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed), test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None), test.MatchType(dict), None),
mock.call(self.context, test.MatchType(objects.Instance), mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed), test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None), test.MatchType(dict), None),
]) ])
call_list = mock_cleanup.call_args_list call_list = mock_cleanup.call_args_list
@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self, mock_create_bind_arqs, mock_cleanup): self, mock_create_bind_arqs, mock_cleanup):
# Exceptions in _create_and_bind_arqs result in cleanup # Exceptions in _create_and_bind_arqs result in cleanup
mock_create_bind_arqs.side_effect = ( mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg='')) exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
try: try:
self._do_schedule_and_build_instances_test(self.params) self._do_schedule_and_build_instances_test(self.params)
except exc.AcceleratorRequestOpFailed: except exc.AcceleratorRequestBindingFailed:
pass pass
mock_cleanup.assert_called_once_with( mock_cleanup.assert_called_once_with(
@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['block_device_mapping'], self.params['tags'], self.params['block_device_mapping'], self.params['tags'],
mock.ANY) mock.ANY)
@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
@mock.patch.object(cyborg, "get_client")
@mock.patch.object(
conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
def test__create_and_bind_arq_for_instance(
self, mock_create_bind_arqs, mock_client, mock_request_mappings):
# Exceptions in _create_and_bind_arqs result in cleanup
arqs = ["fake-arq-uuid"]
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
mock_client.return_value = mock.Mock()
instance = mock.Mock()
instance.uuid = "fake-uuid"
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
self.conductor._create_and_bind_arq_for_instance,
None, instance, mock.Mock(), request_spec.RequestSpec())
self.assertIn('Failed to bind accelerator requests', ex.message)
mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
def test_map_instance_to_cell_already_mapped(self): def test_map_instance_to_cell_already_mapped(self):
"""Tests a scenario where an instance is already mapped to a cell """Tests a scenario where an instance is already mapped to a cell
during scheduling. during scheduling.