Delete ARQs by UUID if Cyborg ARQ bind fails.

During the reivew of the cyborg series it was noted that
in some cases ARQs could be leaked during binding.
See https://review.opendev.org/#/c/673735/46/nova/conductor/manager.py@1632

This change adds a delete_arqs_by_uuid function that can delete
unbound ARQs by instance uuid.

This change modifies build_instances and schedule_and_build_instances
to handel the AcceleratorRequestBindingFailed exception raised when
binding fails and clean up instance arqs.

Co-Authored-By: Wenping Song <songwenping@inspur.com>

Closes-Bug: #1872730
Change-Id: I86c2f00e2368fe02211175e7328b2cd9c0ebf41b
Blueprint: nova-cyborg-interaction
This commit is contained in:
Sundar Nadathur 2020-03-30 19:24:30 -07:00 committed by zhangbailin
parent a1e392fa90
commit d94ea23d3d
5 changed files with 114 additions and 27 deletions

View File

@ -212,10 +212,11 @@ class _CyborgClient(object):
resp, err_msg = self._call_cyborg(self._client.patch,
self.ARQ_URL, json=patch_list)
if err_msg:
arq_uuids = bindings.keys()
msg = _(' Binding failed for ARQ UUIDs: ')
err_msg = err_msg + msg + ','.join(bindings.keys())
raise exception.AcceleratorRequestOpFailed(
op=_('bind'), msg=err_msg)
err_msg = err_msg + msg + ','.join(arq_uuids)
raise exception.AcceleratorRequestBindingFailed(
arqs=arq_uuids, msg=err_msg)
def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
"""Get ARQs for the instance.
@ -278,3 +279,26 @@ class _CyborgClient(object):
msg = err_msg + _(' Instance %s') % instance_uuid
raise exception.AcceleratorRequestOpFailed(
op=_('delete'), msg=msg)
def delete_arqs_by_uuid(self, arq_uuids):
"""Delete the specified ARQs, unbinding them if needed.
This is meant to be used to clean up ARQs that have failed to bind
to an instance. So delete_arqs_for_instance() is not applicable.
This Cyborg API call is NOT idempotent, i.e., if called more than
once, the 2nd and later calls will throw errors.
If this fails, an error is logged but no exception is raised
because this cleans up Cyborg resources, but should otherwise
not affect instance spawn.
:params arq_uuids: dict_keys() of ARQ UUIDs
"""
arq_uuid_str = ','.join(arq_uuids)
params = {'arqs': arq_uuid_str}
resp, err_msg = self._call_cyborg(self._client.delete,
self.ARQ_URL, params=params)
if err_msg:
# No point raising an exception.
LOG.error('Failed to delete ARQs %s', arq_uuid_str)

View File

@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
host.service_host, host.nodename, alts, instance=instance)
try:
resource_provider_mapping = (
local_reqspec.get_request_group_mapping())
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, local_reqspec)
except Exception as exc:
LOG.exception('Failed to reschedule. Reason: %s', exc)
self._cleanup_when_reschedule_fails(context, instance, exc,
legacy_request_spec, requested_networks)
self._cleanup_when_reschedule_fails(
context, instance, exc, legacy_request_spec,
requested_networks)
continue
self.compute_rpcapi.build_and_run_instance(context,
@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
limits=host.limits, host_list=host_list,
accel_uuids=accel_uuids)
def _create_and_bind_arq_for_instance(self, context, instance, host,
request_spec):
try:
resource_provider_mapping = (
request_spec.get_request_group_mapping())
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
except exception.AcceleratorRequestBindingFailed as exc:
# If anything failed here we need to cleanup and bail out.
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
raise
def _schedule_instances(self, context, request_spec,
instance_uuids=None, return_alternates=False):
scheduler_utils.setup_instance_group(context, request_spec)
@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
# this one.
continue
accel_uuids = []
try:
resource_provider_mapping = (
request_spec.get_request_group_mapping())
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, request_spec)
except Exception as exc:
# If anything failed here we need to cleanup and bail out.
with excutils.save_and_reraise_exception():
self._cleanup_build_artifacts(
context, exc, instances, build_requests, request_specs,

View File

@ -2316,6 +2316,14 @@ class AcceleratorRequestOpFailed(NovaException):
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
class AcceleratorRequestBindingFailed(NovaException):
msg_fmt = _("Failed to bind accelerator requests: %(msg)s")
def __init__(self, arqs, **kwargs):
super().__init__(message=self.msg_fmt, **kwargs)
self.arqs = arqs or []
class InvalidLibvirtGPUConfig(NovaException):
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')

View File

@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
called_params = mock_cyborg_patch.call_args.kwargs['json']
self.assertEqual(sorted(called_params), sorted(patch_list))
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_bind_arqs_exception(self, mock_call_cyborg):
def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
# If Cyborg returns invalid response, raise exception.
bindings, _ = self._get_bound_arqs()
mock_call_cyborg.return_value = None, 'Some error'
self.assertRaises(exception.AcceleratorRequestOpFailed,
self.client.bind_arqs, bindings=bindings)
self.assertRaises(exception.AcceleratorRequestBindingFailed,
self.client.bind_arqs, bindings=bindings)
mock_del_arqs.assert_not_called()
@mock.patch('keystoneauth1.adapter.Adapter.get')
def test_get_arqs_for_instance(self, mock_cyborg_get):
@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
expected_msg = ('Failed to delete accelerator requests: ' +
err_msg + ' Instance ' + instance_uuid)
self.assertEqual(expected_msg, exc.format_message())
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
# Happy path
mock_call_cyborg.return_value = ('Some Value', None)
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
@mock.patch('nova.accelerator.cyborg.LOG.error')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
mock_call_cyborg.return_value = (None, 'Some error')
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
mock_log.assert_called_once_with('Failed to delete ARQs %s',
arq_uuid_str)

View File

@ -26,6 +26,7 @@ from oslo_utils import timeutils
from oslo_versionedobjects import exception as ovo_exc
import six
from nova.accelerator import cyborg
from nova import block_device
from nova.compute import flavors
from nova.compute import rpcapi as compute_rpcapi
@ -48,6 +49,7 @@ from nova import objects
from nova.objects import base as obj_base
from nova.objects import block_device as block_device_obj
from nova.objects import fields
from nova.objects import request_spec
from nova.scheduler.client import query
from nova.scheduler import utils as scheduler_utils
from nova import test
@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
mock_getaz.return_value = 'myaz'
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg=''))
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
self.conductor.build_instances(self.context,
instances=instances,
@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
# in the above flow. So, we compare the fields instead.
mock_cleanup.assert_has_calls([
mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed),
test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None),
mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed),
test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None),
])
call_list = mock_cleanup.call_args_list
@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self, mock_create_bind_arqs, mock_cleanup):
# Exceptions in _create_and_bind_arqs result in cleanup
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg=''))
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
try:
self._do_schedule_and_build_instances_test(self.params)
except exc.AcceleratorRequestOpFailed:
except exc.AcceleratorRequestBindingFailed:
pass
mock_cleanup.assert_called_once_with(
@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['block_device_mapping'], self.params['tags'],
mock.ANY)
@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
@mock.patch.object(cyborg, "get_client")
@mock.patch.object(
conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
def test__create_and_bind_arq_for_instance(
self, mock_create_bind_arqs, mock_client, mock_request_mappings):
# Exceptions in _create_and_bind_arqs result in cleanup
arqs = ["fake-arq-uuid"]
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
mock_client.return_value = mock.Mock()
instance = mock.Mock()
instance.uuid = "fake-uuid"
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
self.conductor._create_and_bind_arq_for_instance,
None, instance, mock.Mock(), request_spec.RequestSpec())
self.assertIn('Failed to bind accelerator requests', ex.message)
mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
def test_map_instance_to_cell_already_mapped(self):
"""Tests a scenario where an instance is already mapped to a cell
during scheduling.