smartnic support - cleanup arqs

delete arqs:
        -  delete arq while port unbind
	-  create ops failed and arqs did not bind to instance
	-  arq bind to instance but not bind to port

Implements: blueprint sriov-smartnic-support
Change-Id: Idab0ee38750d018de409699a0dbdff106d9e11fb
This commit is contained in:
Yongli He 2021-06-25 07:24:22 +00:00
parent 120fe8af93
commit e19fa1a199
8 changed files with 113 additions and 16 deletions

View File

@ -2579,7 +2579,11 @@ class ComputeManager(manager.Manager):
spec_arqs, network_arqs)
except (Exception, eventlet.timeout.Timeout) as exc:
LOG.exception(exc)
compute_utils.delete_arqs_if_needed(context, instance)
# ARQs created for instance or ports.
# The port binding isn't done yet.
# Unbind port won't clean port ARQs.
compute_utils.delete_arqs_if_needed(
context, instance, accel_uuids)
msg = _('Failure getting accelerator requests.')
raise exception.BuildAbortException(
reason=msg, instance_uuid=instance.uuid)
@ -2678,7 +2682,12 @@ class ComputeManager(manager.Manager):
reason=str(exc))
finally:
# Call Cyborg to delete accelerator requests
compute_utils.delete_arqs_if_needed(context, instance)
if accel_uuids:
# ARQs created for instance or ports.
# Port bind may not successful.
# unbind port won't clean port ARQs.
compute_utils.delete_arqs_if_needed(
context, instance, accel_uuids)
def _get_bound_arq_resources(self, context, instance, arq_uuids):
"""Get bound accelerator requests.

View File

@ -1537,16 +1537,28 @@ def update_pci_request_spec_with_allocated_interface_name(
spec['parent_ifname'] = rp_name_pieces[2]
def delete_arqs_if_needed(context, instance):
"""Delete Cyborg ARQs for the instance."""
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
if dp_name is None:
return
def delete_arqs_if_needed(context, instance, arq_uuids=None):
"""Delete Cyborg ARQs for the instance.
:param context
:param instance: instance who own the args
:param uuids: delete arqs by uuids while did not bind to instance yet.
"""
cyclient = cyborg.get_client(context)
LOG.debug('Calling Cyborg to delete ARQs for instance %(instance)s',
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
if dp_name:
LOG.debug('Calling Cyborg to delete ARQs for instance %(instance)s',
{'instance': instance.uuid})
try:
cyclient.delete_arqs_for_instance(instance.uuid)
except exception.AcceleratorRequestOpFailed as e:
LOG.exception('Failed to delete accelerator requests for '
try:
cyclient.delete_arqs_for_instance(instance.uuid)
except exception.AcceleratorRequestOpFailed as e:
LOG.exception('Failed to delete accelerator requests for '
'instance %s. Exception: %s', instance.uuid, e)
if arq_uuids:
LOG.debug('Calling Cyborg to delete ARQs by uuids %(uuid)s for'
' instance %(instance)s',
{'instance': instance.uuid,
'uuid': arq_uuids})
cyclient.delete_arqs_by_uuid(arq_uuids)

View File

@ -592,7 +592,13 @@ class ComputeTaskManager:
legacy_request_spec)
self._cleanup_allocated_networks(
context, instance, requested_networks)
compute_utils.delete_arqs_if_needed(context, instance)
arq_uuids = None
# arqs have not bound to port/instance yet
if requested_networks:
arq_uuids = [req.arq_uuid
for req in requested_networks if req.arq_uuid]
compute_utils.delete_arqs_if_needed(context, instance, arq_uuids)
# NOTE(danms): This is never cell-targeted because it is only used for
# n-cpu reschedules which go to the cell conductor and thus are always

View File

@ -653,11 +653,19 @@ class API:
).get('network')
networks[net_id] = network
# Unbind Port device
if port_profile.get('arq_uuid'):
"""Delete device profile by arq uuid."""
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid([port_profile['arq_uuid']])
LOG.debug('Delete ARQs %s for port %s',
port_profile['arq_uuid'], port_id)
# NOTE: We're doing this to remove the binding information
# for the physical device but don't want to overwrite the other
# information in the binding profile.
for profile_key in ('pci_vendor_info', 'pci_slot',
constants.ALLOCATION):
constants.ALLOCATION, 'arq_uuid'):
if profile_key in port_profile:
del port_profile[profile_key]
port_req_body['port'][constants.BINDING_PROFILE] = port_profile

View File

@ -6235,9 +6235,12 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.context, self.instance, arq_uuids)
self.assertEqual(sorted(resources['accel_info']), sorted(arq_list))
@mock.patch.object(compute_utils,
'delete_arqs_if_needed')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')
def test_accel_build_resources_exception(self, mock_get_arqs):
def test_accel_build_resources_exception(self, mock_get_arqs,
mock_clean_arq):
dp_name = "mydp"
self.instance.flavor.extra_specs = {"accel:device_profile": dp_name}
arq_list = fixtures.CyborgFixture.bound_arq_list
@ -6252,6 +6255,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self._test_accel_build_resources,
arq_uuids, None,
self.requested_networks)
mock_clean_arq.assert_called_once_with(
self.context, self.instance, arq_uuids)
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')

View File

@ -1658,3 +1658,22 @@ class AcceleratorRequestTestCase(test.NoDBTestCase):
mock_log_exc.assert_called_once()
self.assertIn('Failed to delete accelerator requests for instance',
mock_log_exc.call_args[0][0])
@mock.patch.object(cyborgclient, 'delete_arqs_by_uuid')
def test_delete_with_arq_uuid(self, mock_del_arq):
flavor = objects.Flavor(**test_flavor.fake_flavor)
instance = fake_instance.fake_instance_obj(self.context, flavor=flavor)
arq_uuids = [uuids.arq1, uuids.arq2]
compute_utils.delete_arqs_if_needed(self.context, instance, arq_uuids)
mock_del_arq.assert_called_once_with(arq_uuids)
@mock.patch.object(cyborgclient, 'delete_arqs_by_uuid')
@mock.patch.object(cyborgclient, 'delete_arqs_for_instance')
def test_delete_with_arq_uuid_and_dp(self, mock_del_inst, mock_del_uuid):
flavor = objects.Flavor(**test_flavor.fake_flavor)
flavor['extra_specs'] = {'accel:device_profile': 'mydp'}
instance = fake_instance.fake_instance_obj(self.context, flavor=flavor)
arq_uuids = [uuids.arq1, uuids.arq2]
compute_utils.delete_arqs_if_needed(self.context, instance, arq_uuids)
mock_del_inst.assert_called_once_with(instance.uuid)
mock_del_uuid.assert_called_once_with(arq_uuids)

View File

@ -4144,7 +4144,25 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.conductor_manager._cleanup_when_reschedule_fails(
self.context, instance, exception=None,
legacy_request_spec=None, requested_networks=None)
mock_del_arqs.assert_called_once_with(self.context, instance)
mock_del_arqs.assert_called_once_with(
self.context, instance, None)
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_cleanup_allocated_networks')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_set_vm_state_and_notify')
@mock.patch.object(compute_utils, 'delete_arqs_if_needed')
def test_cleanup_arqs_on_reschedule_by_uuid(self, mock_del_arqs,
mock_set_vm, mock_clean_net):
instance = fake_instance.fake_instance_obj(self.context)
request_tuples = [('123', '1.2.3.4', uuids.fakeid,
None, uuids.arq_uuid, None)]
requests = objects.NetworkRequestList.from_tuples(request_tuples)
self.conductor_manager._cleanup_when_reschedule_fails(
self.context, instance, exception=None,
legacy_request_spec=None, requested_networks=requests)
mock_del_arqs.assert_called_once_with(self.context,
instance, [uuids.arq_uuid])
def test_cleanup_allocated_networks_none_requested(self):
# Tests that we don't deallocate networks if 'none' were specifically

View File

@ -7065,6 +7065,26 @@ class TestAPIPortbinding(TestAPIBase):
self.api.delete_port_binding(self.context, port_id,
'fake-host')
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.network.neutron.get_binding_profile')
@mock.patch('nova.network.neutron.API._show_port')
@mock.patch('nova.network.neutron.get_client')
def test_unbind_ports_clean_arq(self, mock_neutron, mock_show,
mock_bind, mock_delete_arq):
mock_client = mock.Mock()
mock_ctx = mock.Mock(is_admin=False)
ports = ["1"]
mock_show.return_value = {'id': uuids.port}
mock_bind.return_value = {'arq_uuid': self.arqs[0]['uuid'],
'key': 'val'}
api = neutronapi.API()
api._unbind_ports(mock_ctx, ports, mock_neutron, mock_client)
mock_delete_arq.assert_called_once_with([self.arqs[0]['uuid']])
# verify binding profile key 'arq_uuid' deleted
call_args = mock_client.update_port.call_args[0][1]
self.assertEqual(call_args['port']['binding:profile'],
{'key': 'val'})
class TestAllocateForInstance(test.NoDBTestCase):
def setUp(self):