Delete ARQs by UUID if Cyborg ARQ bind fails.
During the reivew of the cyborg series it was noted that in some cases ARQs could be leaked during binding. See https://review.opendev.org/#/c/673735/46/nova/conductor/manager.py@1632 This change adds a delete_arqs_by_uuid function that can delete unbound ARQs by instance uuid. This change modifies build_instances and schedule_and_build_instances to handel the AcceleratorRequestBindingFailed exception raised when binding fails and clean up instance arqs. Co-Authored-By: Wenping Song <songwenping@inspur.com> Closes-Bug: #1872730 Change-Id: I86c2f00e2368fe02211175e7328b2cd9c0ebf41b Blueprint: nova-cyborg-interaction
This commit is contained in:
parent
a1e392fa90
commit
d94ea23d3d
@ -212,10 +212,11 @@ class _CyborgClient(object):
|
||||
resp, err_msg = self._call_cyborg(self._client.patch,
|
||||
self.ARQ_URL, json=patch_list)
|
||||
if err_msg:
|
||||
arq_uuids = bindings.keys()
|
||||
msg = _(' Binding failed for ARQ UUIDs: ')
|
||||
err_msg = err_msg + msg + ','.join(bindings.keys())
|
||||
raise exception.AcceleratorRequestOpFailed(
|
||||
op=_('bind'), msg=err_msg)
|
||||
err_msg = err_msg + msg + ','.join(arq_uuids)
|
||||
raise exception.AcceleratorRequestBindingFailed(
|
||||
arqs=arq_uuids, msg=err_msg)
|
||||
|
||||
def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
|
||||
"""Get ARQs for the instance.
|
||||
@ -278,3 +279,26 @@ class _CyborgClient(object):
|
||||
msg = err_msg + _(' Instance %s') % instance_uuid
|
||||
raise exception.AcceleratorRequestOpFailed(
|
||||
op=_('delete'), msg=msg)
|
||||
|
||||
def delete_arqs_by_uuid(self, arq_uuids):
|
||||
"""Delete the specified ARQs, unbinding them if needed.
|
||||
|
||||
This is meant to be used to clean up ARQs that have failed to bind
|
||||
to an instance. So delete_arqs_for_instance() is not applicable.
|
||||
|
||||
This Cyborg API call is NOT idempotent, i.e., if called more than
|
||||
once, the 2nd and later calls will throw errors.
|
||||
|
||||
If this fails, an error is logged but no exception is raised
|
||||
because this cleans up Cyborg resources, but should otherwise
|
||||
not affect instance spawn.
|
||||
|
||||
:params arq_uuids: dict_keys() of ARQ UUIDs
|
||||
"""
|
||||
arq_uuid_str = ','.join(arq_uuids)
|
||||
params = {'arqs': arq_uuid_str}
|
||||
resp, err_msg = self._call_cyborg(self._client.delete,
|
||||
self.ARQ_URL, params=params)
|
||||
if err_msg:
|
||||
# No point raising an exception.
|
||||
LOG.error('Failed to delete ARQs %s', arq_uuid_str)
|
||||
|
@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
|
||||
host.service_host, host.nodename, alts, instance=instance)
|
||||
|
||||
try:
|
||||
resource_provider_mapping = (
|
||||
local_reqspec.get_request_group_mapping())
|
||||
accel_uuids = self._create_and_bind_arqs(
|
||||
context, instance.uuid, instance.flavor.extra_specs,
|
||||
host.nodename, resource_provider_mapping)
|
||||
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||
context, instance, host, local_reqspec)
|
||||
except Exception as exc:
|
||||
LOG.exception('Failed to reschedule. Reason: %s', exc)
|
||||
self._cleanup_when_reschedule_fails(context, instance, exc,
|
||||
legacy_request_spec, requested_networks)
|
||||
self._cleanup_when_reschedule_fails(
|
||||
context, instance, exc, legacy_request_spec,
|
||||
requested_networks)
|
||||
continue
|
||||
|
||||
self.compute_rpcapi.build_and_run_instance(context,
|
||||
@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
|
||||
limits=host.limits, host_list=host_list,
|
||||
accel_uuids=accel_uuids)
|
||||
|
||||
def _create_and_bind_arq_for_instance(self, context, instance, host,
|
||||
request_spec):
|
||||
try:
|
||||
resource_provider_mapping = (
|
||||
request_spec.get_request_group_mapping())
|
||||
# Using nodename instead of hostname. See:
|
||||
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
|
||||
return self._create_and_bind_arqs(
|
||||
context, instance.uuid, instance.flavor.extra_specs,
|
||||
host.nodename, resource_provider_mapping)
|
||||
except exception.AcceleratorRequestBindingFailed as exc:
|
||||
# If anything failed here we need to cleanup and bail out.
|
||||
cyclient = cyborg.get_client(context)
|
||||
cyclient.delete_arqs_by_uuid(exc.arqs)
|
||||
raise
|
||||
|
||||
def _schedule_instances(self, context, request_spec,
|
||||
instance_uuids=None, return_alternates=False):
|
||||
scheduler_utils.setup_instance_group(context, request_spec)
|
||||
@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
|
||||
# this one.
|
||||
continue
|
||||
|
||||
accel_uuids = []
|
||||
try:
|
||||
resource_provider_mapping = (
|
||||
request_spec.get_request_group_mapping())
|
||||
# Using nodename instead of hostname. See:
|
||||
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
|
||||
accel_uuids = self._create_and_bind_arqs(
|
||||
context, instance.uuid, instance.flavor.extra_specs,
|
||||
host.nodename, resource_provider_mapping)
|
||||
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||
context, instance, host, request_spec)
|
||||
except Exception as exc:
|
||||
# If anything failed here we need to cleanup and bail out.
|
||||
with excutils.save_and_reraise_exception():
|
||||
self._cleanup_build_artifacts(
|
||||
context, exc, instances, build_requests, request_specs,
|
||||
|
@ -2316,6 +2316,14 @@ class AcceleratorRequestOpFailed(NovaException):
|
||||
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
|
||||
|
||||
|
||||
class AcceleratorRequestBindingFailed(NovaException):
|
||||
msg_fmt = _("Failed to bind accelerator requests: %(msg)s")
|
||||
|
||||
def __init__(self, arqs, **kwargs):
|
||||
super().__init__(message=self.msg_fmt, **kwargs)
|
||||
self.arqs = arqs or []
|
||||
|
||||
|
||||
class InvalidLibvirtGPUConfig(NovaException):
|
||||
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')
|
||||
|
||||
|
@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
|
||||
called_params = mock_cyborg_patch.call_args.kwargs['json']
|
||||
self.assertEqual(sorted(called_params), sorted(patch_list))
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||
def test_bind_arqs_exception(self, mock_call_cyborg):
|
||||
def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
|
||||
# If Cyborg returns invalid response, raise exception.
|
||||
bindings, _ = self._get_bound_arqs()
|
||||
mock_call_cyborg.return_value = None, 'Some error'
|
||||
self.assertRaises(exception.AcceleratorRequestOpFailed,
|
||||
self.client.bind_arqs, bindings=bindings)
|
||||
self.assertRaises(exception.AcceleratorRequestBindingFailed,
|
||||
self.client.bind_arqs, bindings=bindings)
|
||||
mock_del_arqs.assert_not_called()
|
||||
|
||||
@mock.patch('keystoneauth1.adapter.Adapter.get')
|
||||
def test_get_arqs_for_instance(self, mock_cyborg_get):
|
||||
@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
|
||||
expected_msg = ('Failed to delete accelerator requests: ' +
|
||||
err_msg + ' Instance ' + instance_uuid)
|
||||
self.assertEqual(expected_msg, exc.format_message())
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
|
||||
# Happy path
|
||||
mock_call_cyborg.return_value = ('Some Value', None)
|
||||
_, bound_arqs = self._get_bound_arqs()
|
||||
arq_uuids = [arq['uuid'] for arq in bound_arqs]
|
||||
arq_uuid_str = ','.join(arq_uuids)
|
||||
self.client.delete_arqs_by_uuid(arq_uuids)
|
||||
mock_call_cyborg.assert_called_once_with(mock.ANY,
|
||||
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
|
||||
|
||||
@mock.patch('nova.accelerator.cyborg.LOG.error')
|
||||
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
|
||||
mock_call_cyborg.return_value = (None, 'Some error')
|
||||
_, bound_arqs = self._get_bound_arqs()
|
||||
arq_uuids = [arq['uuid'] for arq in bound_arqs]
|
||||
arq_uuid_str = ','.join(arq_uuids)
|
||||
self.client.delete_arqs_by_uuid(arq_uuids)
|
||||
mock_call_cyborg.assert_called_once_with(mock.ANY,
|
||||
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
|
||||
mock_log.assert_called_once_with('Failed to delete ARQs %s',
|
||||
arq_uuid_str)
|
||||
|
@ -26,6 +26,7 @@ from oslo_utils import timeutils
|
||||
from oslo_versionedobjects import exception as ovo_exc
|
||||
import six
|
||||
|
||||
from nova.accelerator import cyborg
|
||||
from nova import block_device
|
||||
from nova.compute import flavors
|
||||
from nova.compute import rpcapi as compute_rpcapi
|
||||
@ -48,6 +49,7 @@ from nova import objects
|
||||
from nova.objects import base as obj_base
|
||||
from nova.objects import block_device as block_device_obj
|
||||
from nova.objects import fields
|
||||
from nova.objects import request_spec
|
||||
from nova.scheduler.client import query
|
||||
from nova.scheduler import utils as scheduler_utils
|
||||
from nova import test
|
||||
@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
|
||||
|
||||
mock_getaz.return_value = 'myaz'
|
||||
mock_create_bind_arqs.side_effect = (
|
||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
||||
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||
|
||||
self.conductor.build_instances(self.context,
|
||||
instances=instances,
|
||||
@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
|
||||
# in the above flow. So, we compare the fields instead.
|
||||
mock_cleanup.assert_has_calls([
|
||||
mock.call(self.context, test.MatchType(objects.Instance),
|
||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
||||
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||
test.MatchType(dict), None),
|
||||
mock.call(self.context, test.MatchType(objects.Instance),
|
||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
||||
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||
test.MatchType(dict), None),
|
||||
])
|
||||
call_list = mock_cleanup.call_args_list
|
||||
@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
||||
self, mock_create_bind_arqs, mock_cleanup):
|
||||
# Exceptions in _create_and_bind_arqs result in cleanup
|
||||
mock_create_bind_arqs.side_effect = (
|
||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
||||
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||
|
||||
try:
|
||||
self._do_schedule_and_build_instances_test(self.params)
|
||||
except exc.AcceleratorRequestOpFailed:
|
||||
except exc.AcceleratorRequestBindingFailed:
|
||||
pass
|
||||
|
||||
mock_cleanup.assert_called_once_with(
|
||||
@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
||||
self.params['block_device_mapping'], self.params['tags'],
|
||||
mock.ANY)
|
||||
|
||||
@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
|
||||
@mock.patch.object(cyborg, "get_client")
|
||||
@mock.patch.object(
|
||||
conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
|
||||
def test__create_and_bind_arq_for_instance(
|
||||
self, mock_create_bind_arqs, mock_client, mock_request_mappings):
|
||||
# Exceptions in _create_and_bind_arqs result in cleanup
|
||||
arqs = ["fake-arq-uuid"]
|
||||
mock_create_bind_arqs.side_effect = (
|
||||
exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
|
||||
mock_client.return_value = mock.Mock()
|
||||
instance = mock.Mock()
|
||||
instance.uuid = "fake-uuid"
|
||||
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
|
||||
self.conductor._create_and_bind_arq_for_instance,
|
||||
None, instance, mock.Mock(), request_spec.RequestSpec())
|
||||
|
||||
self.assertIn('Failed to bind accelerator requests', ex.message)
|
||||
mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
|
||||
|
||||
def test_map_instance_to_cell_already_mapped(self):
|
||||
"""Tests a scenario where an instance is already mapped to a cell
|
||||
during scheduling.
|
||||
|
Loading…
Reference in New Issue
Block a user