Merge "Delete ARQs by UUID if Cyborg ARQ bind fails."
This commit is contained in:
commit
ccecb507ff
|
@ -212,10 +212,11 @@ class _CyborgClient(object):
|
||||||
resp, err_msg = self._call_cyborg(self._client.patch,
|
resp, err_msg = self._call_cyborg(self._client.patch,
|
||||||
self.ARQ_URL, json=patch_list)
|
self.ARQ_URL, json=patch_list)
|
||||||
if err_msg:
|
if err_msg:
|
||||||
|
arq_uuids = bindings.keys()
|
||||||
msg = _(' Binding failed for ARQ UUIDs: ')
|
msg = _(' Binding failed for ARQ UUIDs: ')
|
||||||
err_msg = err_msg + msg + ','.join(bindings.keys())
|
err_msg = err_msg + msg + ','.join(arq_uuids)
|
||||||
raise exception.AcceleratorRequestOpFailed(
|
raise exception.AcceleratorRequestBindingFailed(
|
||||||
op=_('bind'), msg=err_msg)
|
arqs=arq_uuids, msg=err_msg)
|
||||||
|
|
||||||
def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
|
def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
|
||||||
"""Get ARQs for the instance.
|
"""Get ARQs for the instance.
|
||||||
|
@ -278,3 +279,26 @@ class _CyborgClient(object):
|
||||||
msg = err_msg + _(' Instance %s') % instance_uuid
|
msg = err_msg + _(' Instance %s') % instance_uuid
|
||||||
raise exception.AcceleratorRequestOpFailed(
|
raise exception.AcceleratorRequestOpFailed(
|
||||||
op=_('delete'), msg=msg)
|
op=_('delete'), msg=msg)
|
||||||
|
|
||||||
|
def delete_arqs_by_uuid(self, arq_uuids):
|
||||||
|
"""Delete the specified ARQs, unbinding them if needed.
|
||||||
|
|
||||||
|
This is meant to be used to clean up ARQs that have failed to bind
|
||||||
|
to an instance. So delete_arqs_for_instance() is not applicable.
|
||||||
|
|
||||||
|
This Cyborg API call is NOT idempotent, i.e., if called more than
|
||||||
|
once, the 2nd and later calls will throw errors.
|
||||||
|
|
||||||
|
If this fails, an error is logged but no exception is raised
|
||||||
|
because this cleans up Cyborg resources, but should otherwise
|
||||||
|
not affect instance spawn.
|
||||||
|
|
||||||
|
:params arq_uuids: dict_keys() of ARQ UUIDs
|
||||||
|
"""
|
||||||
|
arq_uuid_str = ','.join(arq_uuids)
|
||||||
|
params = {'arqs': arq_uuid_str}
|
||||||
|
resp, err_msg = self._call_cyborg(self._client.delete,
|
||||||
|
self.ARQ_URL, params=params)
|
||||||
|
if err_msg:
|
||||||
|
# No point raising an exception.
|
||||||
|
LOG.error('Failed to delete ARQs %s', arq_uuid_str)
|
||||||
|
|
|
@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
|
||||||
host.service_host, host.nodename, alts, instance=instance)
|
host.service_host, host.nodename, alts, instance=instance)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resource_provider_mapping = (
|
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||||
local_reqspec.get_request_group_mapping())
|
context, instance, host, local_reqspec)
|
||||||
accel_uuids = self._create_and_bind_arqs(
|
|
||||||
context, instance.uuid, instance.flavor.extra_specs,
|
|
||||||
host.nodename, resource_provider_mapping)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
LOG.exception('Failed to reschedule. Reason: %s', exc)
|
LOG.exception('Failed to reschedule. Reason: %s', exc)
|
||||||
self._cleanup_when_reschedule_fails(context, instance, exc,
|
self._cleanup_when_reschedule_fails(
|
||||||
legacy_request_spec, requested_networks)
|
context, instance, exc, legacy_request_spec,
|
||||||
|
requested_networks)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.compute_rpcapi.build_and_run_instance(context,
|
self.compute_rpcapi.build_and_run_instance(context,
|
||||||
|
@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
|
||||||
limits=host.limits, host_list=host_list,
|
limits=host.limits, host_list=host_list,
|
||||||
accel_uuids=accel_uuids)
|
accel_uuids=accel_uuids)
|
||||||
|
|
||||||
|
def _create_and_bind_arq_for_instance(self, context, instance, host,
|
||||||
|
request_spec):
|
||||||
|
try:
|
||||||
|
resource_provider_mapping = (
|
||||||
|
request_spec.get_request_group_mapping())
|
||||||
|
# Using nodename instead of hostname. See:
|
||||||
|
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
|
||||||
|
return self._create_and_bind_arqs(
|
||||||
|
context, instance.uuid, instance.flavor.extra_specs,
|
||||||
|
host.nodename, resource_provider_mapping)
|
||||||
|
except exception.AcceleratorRequestBindingFailed as exc:
|
||||||
|
# If anything failed here we need to cleanup and bail out.
|
||||||
|
cyclient = cyborg.get_client(context)
|
||||||
|
cyclient.delete_arqs_by_uuid(exc.arqs)
|
||||||
|
raise
|
||||||
|
|
||||||
def _schedule_instances(self, context, request_spec,
|
def _schedule_instances(self, context, request_spec,
|
||||||
instance_uuids=None, return_alternates=False):
|
instance_uuids=None, return_alternates=False):
|
||||||
scheduler_utils.setup_instance_group(context, request_spec)
|
scheduler_utils.setup_instance_group(context, request_spec)
|
||||||
|
@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
|
||||||
# this one.
|
# this one.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
accel_uuids = []
|
|
||||||
try:
|
try:
|
||||||
resource_provider_mapping = (
|
accel_uuids = self._create_and_bind_arq_for_instance(
|
||||||
request_spec.get_request_group_mapping())
|
context, instance, host, request_spec)
|
||||||
# Using nodename instead of hostname. See:
|
|
||||||
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
|
|
||||||
accel_uuids = self._create_and_bind_arqs(
|
|
||||||
context, instance.uuid, instance.flavor.extra_specs,
|
|
||||||
host.nodename, resource_provider_mapping)
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
# If anything failed here we need to cleanup and bail out.
|
|
||||||
with excutils.save_and_reraise_exception():
|
with excutils.save_and_reraise_exception():
|
||||||
self._cleanup_build_artifacts(
|
self._cleanup_build_artifacts(
|
||||||
context, exc, instances, build_requests, request_specs,
|
context, exc, instances, build_requests, request_specs,
|
||||||
|
|
|
@ -2320,6 +2320,14 @@ class AcceleratorRequestOpFailed(NovaException):
|
||||||
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
|
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")
|
||||||
|
|
||||||
|
|
||||||
|
class AcceleratorRequestBindingFailed(NovaException):
|
||||||
|
msg_fmt = _("Failed to bind accelerator requests: %(msg)s")
|
||||||
|
|
||||||
|
def __init__(self, arqs, **kwargs):
|
||||||
|
super().__init__(message=self.msg_fmt, **kwargs)
|
||||||
|
self.arqs = arqs or []
|
||||||
|
|
||||||
|
|
||||||
class InvalidLibvirtGPUConfig(NovaException):
|
class InvalidLibvirtGPUConfig(NovaException):
|
||||||
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')
|
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')
|
||||||
|
|
||||||
|
|
|
@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
|
||||||
called_params = mock_cyborg_patch.call_args.kwargs['json']
|
called_params = mock_cyborg_patch.call_args.kwargs['json']
|
||||||
self.assertEqual(sorted(called_params), sorted(patch_list))
|
self.assertEqual(sorted(called_params), sorted(patch_list))
|
||||||
|
|
||||||
|
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
|
||||||
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||||
def test_bind_arqs_exception(self, mock_call_cyborg):
|
def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
|
||||||
# If Cyborg returns invalid response, raise exception.
|
# If Cyborg returns invalid response, raise exception.
|
||||||
bindings, _ = self._get_bound_arqs()
|
bindings, _ = self._get_bound_arqs()
|
||||||
mock_call_cyborg.return_value = None, 'Some error'
|
mock_call_cyborg.return_value = None, 'Some error'
|
||||||
self.assertRaises(exception.AcceleratorRequestOpFailed,
|
self.assertRaises(exception.AcceleratorRequestBindingFailed,
|
||||||
self.client.bind_arqs, bindings=bindings)
|
self.client.bind_arqs, bindings=bindings)
|
||||||
|
mock_del_arqs.assert_not_called()
|
||||||
|
|
||||||
@mock.patch('keystoneauth1.adapter.Adapter.get')
|
@mock.patch('keystoneauth1.adapter.Adapter.get')
|
||||||
def test_get_arqs_for_instance(self, mock_cyborg_get):
|
def test_get_arqs_for_instance(self, mock_cyborg_get):
|
||||||
|
@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
|
||||||
expected_msg = ('Failed to delete accelerator requests: ' +
|
expected_msg = ('Failed to delete accelerator requests: ' +
|
||||||
err_msg + ' Instance ' + instance_uuid)
|
err_msg + ' Instance ' + instance_uuid)
|
||||||
self.assertEqual(expected_msg, exc.format_message())
|
self.assertEqual(expected_msg, exc.format_message())
|
||||||
|
|
||||||
|
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||||
|
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
|
||||||
|
# Happy path
|
||||||
|
mock_call_cyborg.return_value = ('Some Value', None)
|
||||||
|
_, bound_arqs = self._get_bound_arqs()
|
||||||
|
arq_uuids = [arq['uuid'] for arq in bound_arqs]
|
||||||
|
arq_uuid_str = ','.join(arq_uuids)
|
||||||
|
self.client.delete_arqs_by_uuid(arq_uuids)
|
||||||
|
mock_call_cyborg.assert_called_once_with(mock.ANY,
|
||||||
|
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
|
||||||
|
|
||||||
|
@mock.patch('nova.accelerator.cyborg.LOG.error')
|
||||||
|
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
|
||||||
|
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
|
||||||
|
mock_call_cyborg.return_value = (None, 'Some error')
|
||||||
|
_, bound_arqs = self._get_bound_arqs()
|
||||||
|
arq_uuids = [arq['uuid'] for arq in bound_arqs]
|
||||||
|
arq_uuid_str = ','.join(arq_uuids)
|
||||||
|
self.client.delete_arqs_by_uuid(arq_uuids)
|
||||||
|
mock_call_cyborg.assert_called_once_with(mock.ANY,
|
||||||
|
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
|
||||||
|
mock_log.assert_called_once_with('Failed to delete ARQs %s',
|
||||||
|
arq_uuid_str)
|
||||||
|
|
|
@ -26,6 +26,7 @@ from oslo_utils import timeutils
|
||||||
from oslo_versionedobjects import exception as ovo_exc
|
from oslo_versionedobjects import exception as ovo_exc
|
||||||
import six
|
import six
|
||||||
|
|
||||||
|
from nova.accelerator import cyborg
|
||||||
from nova import block_device
|
from nova import block_device
|
||||||
from nova.compute import flavors
|
from nova.compute import flavors
|
||||||
from nova.compute import rpcapi as compute_rpcapi
|
from nova.compute import rpcapi as compute_rpcapi
|
||||||
|
@ -48,6 +49,7 @@ from nova import objects
|
||||||
from nova.objects import base as obj_base
|
from nova.objects import base as obj_base
|
||||||
from nova.objects import block_device as block_device_obj
|
from nova.objects import block_device as block_device_obj
|
||||||
from nova.objects import fields
|
from nova.objects import fields
|
||||||
|
from nova.objects import request_spec
|
||||||
from nova.scheduler.client import query
|
from nova.scheduler.client import query
|
||||||
from nova.scheduler import utils as scheduler_utils
|
from nova.scheduler import utils as scheduler_utils
|
||||||
from nova import test
|
from nova import test
|
||||||
|
@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):
|
||||||
|
|
||||||
mock_getaz.return_value = 'myaz'
|
mock_getaz.return_value = 'myaz'
|
||||||
mock_create_bind_arqs.side_effect = (
|
mock_create_bind_arqs.side_effect = (
|
||||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||||
|
|
||||||
self.conductor.build_instances(self.context,
|
self.conductor.build_instances(self.context,
|
||||||
instances=instances,
|
instances=instances,
|
||||||
|
@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
|
||||||
# in the above flow. So, we compare the fields instead.
|
# in the above flow. So, we compare the fields instead.
|
||||||
mock_cleanup.assert_has_calls([
|
mock_cleanup.assert_has_calls([
|
||||||
mock.call(self.context, test.MatchType(objects.Instance),
|
mock.call(self.context, test.MatchType(objects.Instance),
|
||||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||||
test.MatchType(dict), None),
|
test.MatchType(dict), None),
|
||||||
mock.call(self.context, test.MatchType(objects.Instance),
|
mock.call(self.context, test.MatchType(objects.Instance),
|
||||||
test.MatchType(exc.AcceleratorRequestOpFailed),
|
test.MatchType(exc.AcceleratorRequestBindingFailed),
|
||||||
test.MatchType(dict), None),
|
test.MatchType(dict), None),
|
||||||
])
|
])
|
||||||
call_list = mock_cleanup.call_args_list
|
call_list = mock_cleanup.call_args_list
|
||||||
|
@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
||||||
self, mock_create_bind_arqs, mock_cleanup):
|
self, mock_create_bind_arqs, mock_cleanup):
|
||||||
# Exceptions in _create_and_bind_arqs result in cleanup
|
# Exceptions in _create_and_bind_arqs result in cleanup
|
||||||
mock_create_bind_arqs.side_effect = (
|
mock_create_bind_arqs.side_effect = (
|
||||||
exc.AcceleratorRequestOpFailed(op='', msg=''))
|
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._do_schedule_and_build_instances_test(self.params)
|
self._do_schedule_and_build_instances_test(self.params)
|
||||||
except exc.AcceleratorRequestOpFailed:
|
except exc.AcceleratorRequestBindingFailed:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
mock_cleanup.assert_called_once_with(
|
mock_cleanup.assert_called_once_with(
|
||||||
|
@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
|
||||||
self.params['block_device_mapping'], self.params['tags'],
|
self.params['block_device_mapping'], self.params['tags'],
|
||||||
mock.ANY)
|
mock.ANY)
|
||||||
|
|
||||||
|
@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
|
||||||
|
@mock.patch.object(cyborg, "get_client")
|
||||||
|
@mock.patch.object(
|
||||||
|
conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
|
||||||
|
def test__create_and_bind_arq_for_instance(
|
||||||
|
self, mock_create_bind_arqs, mock_client, mock_request_mappings):
|
||||||
|
# Exceptions in _create_and_bind_arqs result in cleanup
|
||||||
|
arqs = ["fake-arq-uuid"]
|
||||||
|
mock_create_bind_arqs.side_effect = (
|
||||||
|
exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
|
||||||
|
mock_client.return_value = mock.Mock()
|
||||||
|
instance = mock.Mock()
|
||||||
|
instance.uuid = "fake-uuid"
|
||||||
|
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
|
||||||
|
self.conductor._create_and_bind_arq_for_instance,
|
||||||
|
None, instance, mock.Mock(), request_spec.RequestSpec())
|
||||||
|
|
||||||
|
self.assertIn('Failed to bind accelerator requests', ex.message)
|
||||||
|
mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)
|
||||||
|
|
||||||
def test_map_instance_to_cell_already_mapped(self):
|
def test_map_instance_to_cell_already_mapped(self):
|
||||||
"""Tests a scenario where an instance is already mapped to a cell
|
"""Tests a scenario where an instance is already mapped to a cell
|
||||||
during scheduling.
|
during scheduling.
|
||||||
|
|
Loading…
Reference in New Issue