Browse Source

Merge "Delete ARQs by UUID if Cyborg ARQ bind fails."

changes/12/741712/7
Zuul 2 weeks ago
committed by Gerrit Code Review
parent
commit
ccecb507ff
5 changed files with 114 additions and 27 deletions
  1. +27
    -3
      nova/accelerator/cyborg.py
  2. +23
    -16
      nova/conductor/manager.py
  3. +8
    -0
      nova/exception.py
  4. +29
    -3
      nova/tests/unit/accelerator/test_cyborg.py
  5. +27
    -5
      nova/tests/unit/conductor/test_conductor.py

+ 27
- 3
nova/accelerator/cyborg.py View File

@@ -212,10 +212,11 @@ class _CyborgClient(object):
resp, err_msg = self._call_cyborg(self._client.patch,
self.ARQ_URL, json=patch_list)
if err_msg:
arq_uuids = bindings.keys()
msg = _(' Binding failed for ARQ UUIDs: ')
err_msg = err_msg + msg + ','.join(bindings.keys())
raise exception.AcceleratorRequestOpFailed(
op=_('bind'), msg=err_msg)
err_msg = err_msg + msg + ','.join(arq_uuids)
raise exception.AcceleratorRequestBindingFailed(
arqs=arq_uuids, msg=err_msg)

def get_arqs_for_instance(self, instance_uuid, only_resolved=False):
"""Get ARQs for the instance.
@@ -278,3 +279,26 @@ class _CyborgClient(object):
msg = err_msg + _(' Instance %s') % instance_uuid
raise exception.AcceleratorRequestOpFailed(
op=_('delete'), msg=msg)

def delete_arqs_by_uuid(self, arq_uuids):
"""Delete the specified ARQs, unbinding them if needed.

This is meant to be used to clean up ARQs that have failed to bind
to an instance. So delete_arqs_for_instance() is not applicable.

This Cyborg API call is NOT idempotent, i.e., if called more than
once, the 2nd and later calls will throw errors.

If this fails, an error is logged but no exception is raised
because this cleans up Cyborg resources, but should otherwise
not affect instance spawn.

:params arq_uuids: dict_keys() of ARQ UUIDs
"""
arq_uuid_str = ','.join(arq_uuids)
params = {'arqs': arq_uuid_str}
resp, err_msg = self._call_cyborg(self._client.delete,
self.ARQ_URL, params=params)
if err_msg:
# No point raising an exception.
LOG.error('Failed to delete ARQs %s', arq_uuid_str)

+ 23
- 16
nova/conductor/manager.py View File

@@ -838,15 +838,13 @@ class ComputeTaskManager(base.Base):
host.service_host, host.nodename, alts, instance=instance)

try:
resource_provider_mapping = (
local_reqspec.get_request_group_mapping())
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, local_reqspec)
except Exception as exc:
LOG.exception('Failed to reschedule. Reason: %s', exc)
self._cleanup_when_reschedule_fails(context, instance, exc,
legacy_request_spec, requested_networks)
self._cleanup_when_reschedule_fails(
context, instance, exc, legacy_request_spec,
requested_networks)
continue

self.compute_rpcapi.build_and_run_instance(context,
@@ -861,6 +859,22 @@ class ComputeTaskManager(base.Base):
limits=host.limits, host_list=host_list,
accel_uuids=accel_uuids)

def _create_and_bind_arq_for_instance(self, context, instance, host,
request_spec):
try:
resource_provider_mapping = (
request_spec.get_request_group_mapping())
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
except exception.AcceleratorRequestBindingFailed as exc:
# If anything failed here we need to cleanup and bail out.
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
raise

def _schedule_instances(self, context, request_spec,
instance_uuids=None, return_alternates=False):
scheduler_utils.setup_instance_group(context, request_spec)
@@ -1619,17 +1633,10 @@ class ComputeTaskManager(base.Base):
# this one.
continue

accel_uuids = []
try:
resource_provider_mapping = (
request_spec.get_request_group_mapping())
# Using nodename instead of hostname. See:
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
accel_uuids = self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, request_spec)
except Exception as exc:
# If anything failed here we need to cleanup and bail out.
with excutils.save_and_reraise_exception():
self._cleanup_build_artifacts(
context, exc, instances, build_requests, request_specs,


+ 8
- 0
nova/exception.py View File

@@ -2320,6 +2320,14 @@ class AcceleratorRequestOpFailed(NovaException):
msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s")


class AcceleratorRequestBindingFailed(NovaException):
msg_fmt = _("Failed to bind accelerator requests: %(msg)s")

def __init__(self, arqs, **kwargs):
super().__init__(message=self.msg_fmt, **kwargs)
self.arqs = arqs or []


class InvalidLibvirtGPUConfig(NovaException):
msg_fmt = _('Invalid configuration for GPU devices: %(reason)s')



+ 29
- 3
nova/tests/unit/accelerator/test_cyborg.py View File

@@ -250,13 +250,15 @@ class CyborgTestCase(test.NoDBTestCase):
called_params = mock_cyborg_patch.call_args.kwargs['json']
self.assertEqual(sorted(called_params), sorted(patch_list))

@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_bind_arqs_exception(self, mock_call_cyborg):
def test_bind_arqs_exception(self, mock_call_cyborg, mock_del_arqs):
# If Cyborg returns invalid response, raise exception.
bindings, _ = self._get_bound_arqs()
mock_call_cyborg.return_value = None, 'Some error'
self.assertRaises(exception.AcceleratorRequestOpFailed,
self.client.bind_arqs, bindings=bindings)
self.assertRaises(exception.AcceleratorRequestBindingFailed,
self.client.bind_arqs, bindings=bindings)
mock_del_arqs.assert_not_called()

@mock.patch('keystoneauth1.adapter.Adapter.get')
def test_get_arqs_for_instance(self, mock_cyborg_get):
@@ -368,3 +370,27 @@ class CyborgTestCase(test.NoDBTestCase):
expected_msg = ('Failed to delete accelerator requests: ' +
err_msg + ' Instance ' + instance_uuid)
self.assertEqual(expected_msg, exc.format_message())

@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid(self, mock_call_cyborg):
# Happy path
mock_call_cyborg.return_value = ('Some Value', None)
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})

@mock.patch('nova.accelerator.cyborg.LOG.error')
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_by_uuid_exception(self, mock_call_cyborg, mock_log):
mock_call_cyborg.return_value = (None, 'Some error')
_, bound_arqs = self._get_bound_arqs()
arq_uuids = [arq['uuid'] for arq in bound_arqs]
arq_uuid_str = ','.join(arq_uuids)
self.client.delete_arqs_by_uuid(arq_uuids)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'arqs': arq_uuid_str})
mock_log.assert_called_once_with('Failed to delete ARQs %s',
arq_uuid_str)

+ 27
- 5
nova/tests/unit/conductor/test_conductor.py View File

@@ -26,6 +26,7 @@ from oslo_utils import timeutils
from oslo_versionedobjects import exception as ovo_exc
import six

from nova.accelerator import cyborg
from nova import block_device
from nova.compute import flavors
from nova.compute import rpcapi as compute_rpcapi
@@ -48,6 +49,7 @@ from nova import objects
from nova.objects import base as obj_base
from nova.objects import block_device as block_device_obj
from nova.objects import fields
from nova.objects import request_spec
from nova.scheduler.client import query
from nova.scheduler import utils as scheduler_utils
from nova import test
@@ -583,7 +585,7 @@ class _BaseTaskTestCase(object):

mock_getaz.return_value = 'myaz'
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg=''))
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))

self.conductor.build_instances(self.context,
instances=instances,
@@ -605,10 +607,10 @@ class _BaseTaskTestCase(object):
# in the above flow. So, we compare the fields instead.
mock_cleanup.assert_has_calls([
mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed),
test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None),
mock.call(self.context, test.MatchType(objects.Instance),
test.MatchType(exc.AcceleratorRequestOpFailed),
test.MatchType(exc.AcceleratorRequestBindingFailed),
test.MatchType(dict), None),
])
call_list = mock_cleanup.call_args_list
@@ -2548,11 +2550,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self, mock_create_bind_arqs, mock_cleanup):
# Exceptions in _create_and_bind_arqs result in cleanup
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestOpFailed(op='', msg=''))
exc.AcceleratorRequestBindingFailed(arqs=[], msg=''))

try:
self._do_schedule_and_build_instances_test(self.params)
except exc.AcceleratorRequestOpFailed:
except exc.AcceleratorRequestBindingFailed:
pass

mock_cleanup.assert_called_once_with(
@@ -2561,6 +2563,26 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['block_device_mapping'], self.params['tags'],
mock.ANY)

@mock.patch.object(request_spec.RequestSpec, "get_request_group_mapping")
@mock.patch.object(cyborg, "get_client")
@mock.patch.object(
conductor_manager.ComputeTaskManager, '_create_and_bind_arqs')
def test__create_and_bind_arq_for_instance(
self, mock_create_bind_arqs, mock_client, mock_request_mappings):
# Exceptions in _create_and_bind_arqs result in cleanup
arqs = ["fake-arq-uuid"]
mock_create_bind_arqs.side_effect = (
exc.AcceleratorRequestBindingFailed(arqs=arqs, msg=''))
mock_client.return_value = mock.Mock()
instance = mock.Mock()
instance.uuid = "fake-uuid"
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
self.conductor._create_and_bind_arq_for_instance,
None, instance, mock.Mock(), request_spec.RequestSpec())

self.assertIn('Failed to bind accelerator requests', ex.message)
mock_client.return_value.delete_arqs_by_uuid.assert_called_with(arqs)

def test_map_instance_to_cell_already_mapped(self):
"""Tests a scenario where an instance is already mapped to a cell
during scheduling.


Loading…
Cancel
Save