Delete ARQs for an instance when the instance is deleted.

This patch series now works for many VM operations with libvirt:
* Creation, deletion of VM instances.
* Pause/unpause

The following works but is a no-op:
* Lock/unlock

Hard reboots are taken up in a later patch in this series.
Soft reboots work for accelerators unless some unrelated failure
forces a hard reboot in the libvirt driver.

Suspend is not supported yet. It would fail with this error:
   libvirtError: Requested operation is not valid:
   domain has assigned non-USB host devices

Shelve is not supported yet.
Live migration is not intended to be supported with accelerators now.

Change-Id: Icb95890d8f16cad1f7dc18487a48def2f7c9aec2
Blueprint: nova-cyborg-interaction
This commit is contained in:
Sundar Nadathur 2019-07-30 19:55:11 -07:00
parent faeb39e02d
commit a20aca7f5e
12 changed files with 306 additions and 48 deletions

View File

@ -263,3 +263,18 @@ class _CyborgClient(object):
arqs = [arq for arq in arqs if
arq['state'] in ['Bound', 'BindFailed', 'Deleting']]
return arqs
def delete_arqs_for_instance(self, instance_uuid):
"""Delete ARQs for instance, after unbinding if needed.
:param instance_uuid: Instance UUID
:raises: AcceleratorRequestOpFailed
"""
# Unbind and delete the ARQs
params = {"instance": instance_uuid}
resp, err_msg = self._call_cyborg(self._client.delete,
self.ARQ_URL, params=params)
if err_msg:
msg = err_msg + _(' Instance %s') % instance_uuid
raise exception.AcceleratorRequestOpFailed(
op=_('delete'), msg=msg)

View File

@ -2451,6 +2451,10 @@ class API(base.Base):
# cleanup volumes
self._local_cleanup_bdm_volumes(bdms, instance, context)
# cleanup accelerator requests (ARQs)
compute_utils.delete_arqs_if_needed(context, instance)
# Cleanup allocations in Placement since we can't do it from the
# compute service.
self.placementclient.delete_allocation_for_instance(

View File

@ -2600,6 +2600,7 @@ class ComputeManager(manager.Manager):
except (Exception, eventlet.timeout.Timeout) as exc:
LOG.exception(exc.format_message())
self._build_resources_cleanup(instance, network_info)
compute_utils.delete_arqs_if_needed(context, instance)
msg = _('Failure getting accelerator requests.')
raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg)
@ -2634,6 +2635,9 @@ class ComputeManager(manager.Manager):
raise exception.BuildAbortException(
instance_uuid=instance.uuid,
reason=six.text_type(exc))
finally:
# Call Cyborg to delete accelerator requests
compute_utils.delete_arqs_if_needed(context, instance)
def _get_bound_arq_resources(self, context, dp_name, instance):
"""Get bound accelerator requests.
@ -2973,6 +2977,8 @@ class ComputeManager(manager.Manager):
self._cleanup_volumes(context, instance, bdms,
raise_exc=False, detach=False)
# Delete Cyborg ARQs if the instance has a device profile.
compute_utils.delete_arqs_if_needed(context, instance)
# if a delete task succeeded, always update vm state and task
# state without expecting task state to be DELETING
instance.vm_state = vm_states.DELETED

View File

@ -27,6 +27,7 @@ from oslo_serialization import jsonutils
from oslo_utils import excutils
import six
from nova.accelerator import cyborg
from nova import block_device
from nova.compute import power_state
from nova.compute import task_states
@ -1548,3 +1549,18 @@ def update_pci_request_spec_with_allocated_interface_name(
for spec in pci_request.spec:
spec['parent_ifname'] = rp_name_pieces[2]
def delete_arqs_if_needed(context, instance):
"""Delete Cyborg ARQs for the instance."""
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
if dp_name is None:
return
cyclient = cyborg.get_client(context)
LOG.debug('Calling Cyborg to delete ARQs for instance %(instance)s',
{'instance': instance.uuid})
try:
cyclient.delete_arqs_for_instance(instance.uuid)
except exception.AcceleratorRequestOpFailed as e:
LOG.exception('Failed to delete accelerator requests for '
'instance %s. Exception: %s', instance.uuid, e)

View File

@ -595,6 +595,7 @@ class ComputeTaskManager(base.Base):
legacy_request_spec)
self._cleanup_allocated_networks(
context, instance, requested_networks)
compute_utils.delete_arqs_if_needed(context, instance)
# NOTE(danms): This is never cell-targeted because it is only used for
# n-cpu reschedules which go to the cell conductor and thus are always

View File

@ -2644,6 +2644,10 @@ class CyborgFixture(fixtures.Fixture):
match[0][0], match[0][1], match[0][2])
return bound_arq_list
@staticmethod
def fake_delete_arqs_for_instance(instance_uuid):
return None
def setUp(self):
super(CyborgFixture, self).setUp()
self.mock_get_dp = self.useFixture(fixtures.MockPatch(
@ -2659,3 +2663,7 @@ class CyborgFixture(fixtures.Fixture):
'nova.accelerator.cyborg._CyborgClient.'
'get_arqs_for_instance',
side_effect=self.fake_get_arqs_for_instance)).mock
self.mock_del_arqs = self.useFixture(fixtures.MockPatch(
'nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance',
side_effect=self.fake_delete_arqs_for_instance)).mock

View File

@ -7835,51 +7835,7 @@ class AcceleratorServerBase(integrated_helpers.ProviderUsageBaseTestCase):
extra_spec=extra_specs)
return flavor_id
def _check_allocations_usage(self, server_uuid):
host_rp_uuid = self.compute_rp_uuids[0]
device_rp_uuid = self.device_rp_map[host_rp_uuid]
expected_host_alloc = {
'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20},
}
expected_device_alloc = {'resources': {'FPGA': 1}}
host_alloc = self._get_allocations_by_provider_uuid(host_rp_uuid)
self.assertEqual(expected_host_alloc, host_alloc[server_uuid])
device_alloc = self._get_allocations_by_provider_uuid(device_rp_uuid)
self.assertEqual(expected_device_alloc, device_alloc[server_uuid])
class AcceleratorServerTest(AcceleratorServerBase):
def setUp(self):
self.NUM_HOSTS = 1
super(AcceleratorServerTest, self).setUp()
def test_create_server(self):
flavor_id = self._create_acc_flavor()
server_name = 'accel_server1'
server = self._create_server(
server_name, flavor_id=flavor_id,
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
networks='none', expected_state='ACTIVE')
# Verify that the host name and the device rp UUID are set properly.
# Other fields in the ARQ are hardcoded data from the fixture.
arqs = self.cyborg.fake_get_arqs_for_instance(server['id'])
self.assertEqual(self.device_rp_uuids[0], arqs[0]['device_rp_uuid'])
self.assertEqual(server['OS-EXT-SRV-ATTR:host'], arqs[0]['hostname'])
# Check allocations and usage
self._check_allocations_usage(server['id'])
class AcceleratorServerReschedTest(AcceleratorServerBase):
def setUp(self):
self.NUM_HOSTS = 2
super(AcceleratorServerReschedTest, self).setUp()
def _check_allocations_usage_resched(self, server):
def _check_allocations_usage(self, server):
# Check allocations on host where instance is running
server_uuid = server['id']
@ -7917,15 +7873,87 @@ class AcceleratorServerReschedTest(AcceleratorServerBase):
for arq in arqs}
self.assertSetEqual(expected_arq_bind_info, arq_bind_info)
def _check_no_allocations(self, server_uuid):
def _check_no_allocs_usage(self, server_uuid):
allocs = self._get_allocations_by_server_uuid(server_uuid)
self.assertEqual(allocs, {})
for i in range(self.NUM_HOSTS):
host_alloc = self._get_allocations_by_provider_uuid(
self.compute_rp_uuids[i])
self.assertEqual({}, host_alloc)
device_alloc = self._get_allocations_by_provider_uuid(
self.device_rp_uuids[i])
self.assertEqual({}, device_alloc)
usage = self._get_provider_usages(
self.device_rp_uuids[i]).get('FPGA')
self.assertEqual(usage, 0)
class AcceleratorServerTest(AcceleratorServerBase):
def setUp(self):
self.NUM_HOSTS = 1
super(AcceleratorServerTest, self).setUp()
def _get_server(self, expected_state='ACTIVE'):
flavor_id = self._create_acc_flavor()
server_name = 'accel_server1'
server = self._create_server(
server_name, flavor_id=flavor_id,
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
networks='none', expected_state=expected_state)
return server
def test_create_delete_server_ok(self):
server = self._get_server()
# Verify that the host name and the device rp UUID are set properly.
# Other fields in the ARQ are hardcoded data from the fixture.
arqs = self.cyborg.fake_get_arqs_for_instance(server['id'])
self.assertEqual(self.device_rp_uuids[0], arqs[0]['device_rp_uuid'])
self.assertEqual(server['OS-EXT-SRV-ATTR:host'], arqs[0]['hostname'])
# Check allocations and usage
self._check_allocations_usage(server)
# Delete server and check that ARQs got deleted
self.api.delete_server(server['id'])
self._wait_until_deleted(server)
self.cyborg.mock_del_arqs.assert_called_once_with(server['id'])
# Check that resources are freed
self._check_no_allocs_usage(server['id'])
def test_create_server_with_error(self):
def throw_error(*args, **kwargs):
raise exception.BuildAbortException(reason='',
instance_uuid='fake')
self.stub_out('nova.virt.fake.FakeDriver.spawn', throw_error)
server = self._get_server(expected_state='ERROR')
server_uuid = server['id']
# Check that Cyborg was called to delete ARQs
self.cyborg.mock_del_arqs.assert_called_once_with(server_uuid)
# An instance in error state should consume no resources
self._check_no_allocs_usage(server_uuid)
self.api.delete_server(server_uuid)
self._wait_until_deleted(server)
# Verify that there is one more call to delete ARQs
self.cyborg.mock_del_arqs.assert_has_calls(
[mock.call(server_uuid), mock.call(server_uuid)])
# Verify that no allocations/usages remain after deletion
self._check_no_allocs_usage(server_uuid)
class AcceleratorServerReschedTest(AcceleratorServerBase):
def setUp(self):
self.NUM_HOSTS = 2
super(AcceleratorServerReschedTest, self).setUp()
def test_resched(self):
orig_spawn = fake.FakeDriver.spawn
@ -7947,7 +7975,8 @@ class AcceleratorServerReschedTest(AcceleratorServerBase):
networks='none', expected_state='ACTIVE')
self.assertEqual(2, fake_spawn.count)
self._check_allocations_usage_resched(server)
self._check_allocations_usage(server)
self.cyborg.mock_del_arqs.assert_called_once_with(server['id'])
def test_resched_fails(self):
@ -7964,4 +7993,9 @@ class AcceleratorServerReschedTest(AcceleratorServerBase):
image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
networks='none', expected_state='ERROR')
self._check_no_allocations(server['id'])
server_uuid = server['id']
self._check_no_allocs_usage(server_uuid)
self.cyborg.mock_del_arqs.assert_has_calls(
[mock.call(server_uuid),
mock.call(server_uuid),
mock.call(server_uuid)])

View File

@ -347,3 +347,24 @@ class CyborgTestCase(test.NoDBTestCase):
mock_cyborg_get.assert_called_once_with(
self.client.ARQ_URL, params=query)
self.assertEqual(ret_arqs, [bound_arqs[0]])
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_for_instance(self, mock_call_cyborg):
# Happy path
mock_call_cyborg.return_value = ('Some Value', None)
instance_uuid = 'edbba496-3cc8-4256-94ca-dfe3413348eb'
self.client.delete_arqs_for_instance(instance_uuid)
mock_call_cyborg.assert_called_once_with(mock.ANY,
self.client.ARQ_URL, params={'instance': instance_uuid})
@mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg')
def test_delete_arqs_for_instance_exception(self, mock_call_cyborg):
# If Cyborg returns invalid response, raise exception.
err_msg = 'Some error'
mock_call_cyborg.return_value = (None, err_msg)
instance_uuid = 'edbba496-3cc8-4256-94ca-dfe3413348eb'
exc = self.assertRaises(exception.AcceleratorRequestOpFailed,
self.client.delete_arqs_for_instance, instance_uuid)
expected_msg = ('Failed to delete accelerator requests: ' +
err_msg + ' Instance ' + instance_uuid)
self.assertEqual(expected_msg, exc.format_message())

View File

@ -1296,6 +1296,26 @@ class _ComputeAPIUnitTestMixIn(object):
mock_dealloc.assert_called_once_with(self.context, inst)
@mock.patch.object(compute_utils, 'notify_about_instance_action')
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
@mock.patch.object(cinder.API, 'detach')
@mock.patch.object(compute_utils, 'notify_about_instance_usage')
@mock.patch.object(neutron_api.API, 'deallocate_for_instance')
@mock.patch.object(context.RequestContext, 'elevated')
@mock.patch.object(objects.Instance, 'destroy')
@mock.patch.object(compute_utils, 'delete_arqs_if_needed')
def test_local_delete_for_arqs(
self, mock_del_arqs, mock_inst_destroy, mock_elevated,
mock_dealloc, mock_notify_legacy, mock_detach,
mock_bdm_destroy, mock_notify):
inst = self._create_instance_obj()
inst._context = self.context
mock_elevated.return_value = self.context
bdms = []
self.compute_api._local_delete(self.context, inst, bdms,
'delete', self._fake_do_delete)
mock_del_arqs.assert_called_once_with(self.context, inst)
@mock.patch.object(objects.BlockDeviceMapping, 'destroy')
def test_local_cleanup_bdm_volumes_stashed_connector(self, mock_destroy):
"""Tests that we call volume_api.terminate_connection when we found

View File

@ -264,6 +264,37 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
phase='end',
bdms=mock_bdms)])
@mock.patch.object(objects.Instance, 'destroy')
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(manager.ComputeManager, '_complete_deletion')
@mock.patch.object(manager.ComputeManager, '_cleanup_volumes')
@mock.patch.object(manager.ComputeManager, '_shutdown_instance')
@mock.patch.object(compute_utils, 'notify_about_instance_action')
@mock.patch.object(manager.ComputeManager, '_notify_about_instance_usage')
def _test_delete_instance_with_accels(self, instance, mock_inst_usage,
mock_inst_action, mock_shutdown, mock_cleanup_vols,
mock_complete_del, mock_inst_save, mock_inst_destroy):
self.compute._delete_instance(self.context, instance, bdms=None)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_delete_instance_with_accels_ok(self, mock_del_arqs):
# _delete_instance() calls Cyborg to delete ARQs, if
# the extra specs has a device profile name.
instance = fake_instance.fake_instance_obj(self.context)
instance.flavor.extra_specs = {'accel:device_profile': 'mydp'}
self._test_delete_instance_with_accels(instance)
mock_del_arqs.assert_called_once_with(instance.uuid)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_delete_instance_with_accels_no_dp(self, mock_del_arqs):
# _delete_instance() does not call Cyborg to delete ARQs, if
# the extra specs has no device profile name.
instance = fake_instance.fake_instance_obj(self.context)
self._test_delete_instance_with_accels(instance)
mock_del_arqs.assert_not_called()
def _make_compute_node(self, hyp_hostname, cn_id):
cn = mock.Mock(spec_set=['hypervisor_hostname', 'id', 'uuid',
'destroy'])
@ -6215,6 +6246,55 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.ANY, self.injected_files, self.admin_pass, mock.ANY,
network_info=None, block_device_info=None, accel_info=accel_info)
@mock.patch.object(objects.Instance, 'save')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_build_networks_for_instance')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_default_block_device_names')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_prep_block_device')
@mock.patch.object(virt_driver.ComputeDriver,
'prepare_for_spawn')
@mock.patch.object(virt_driver.ComputeDriver,
'prepare_networks_before_block_device_mapping')
@mock.patch.object(virt_driver.ComputeDriver,
'clean_networks_preparation')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')
def _test_delete_arqs_exception(self, mock_get_arqs,
mock_clean_net, mock_prep_net, mock_prep_spawn, mock_prep_bd,
mock_bdnames, mock_build_net, mock_save):
args = (self.context, self.instance, self.requested_networks,
self.security_groups, self.image, self.block_device_mapping,
self.resource_provider_mapping)
mock_get_arqs.side_effect = (
exception.AcceleratorRequestOpFailed(op='get', msg=''))
with self.compute._build_resources(*args):
raise test.TestingException()
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_delete_arqs_if_build_res_exception(self, mock_del_arqs):
# Cyborg is called to delete ARQs if exception is thrown inside
# the context of # _build_resources().
self.instance.flavor.extra_specs = {'accel:device_profile': 'mydp'}
self.assertRaisesRegex(exception.BuildAbortException,
'Failure getting accelerator requests',
self._test_delete_arqs_exception)
mock_del_arqs.assert_called_once_with(self.instance.uuid)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_delete_arqs_if_build_res_exception_no_dp(self, mock_del_arqs):
# Cyborg is not called to delete ARQs, even if an exception is
# thrown inside the context of _build_resources(), if there is no
# device profile name in the extra specs.
self.instance.flavor.extra_specs = {}
self.assertRaises(exception.BuildAbortException,
self._test_delete_arqs_exception)
mock_del_arqs.assert_not_called()
def test_build_and_run_instance_called_with_proper_args(self):
self._test_build_and_run_instance()

View File

@ -27,6 +27,7 @@ from oslo_utils.fixture import uuidsentinel as uuids
from oslo_utils import uuidutils
import six
from nova.accelerator.cyborg import _CyborgClient as cyborgclient
from nova.compute import manager
from nova.compute import power_state
from nova.compute import task_states
@ -1645,3 +1646,41 @@ class PciRequestUpdateTestCase(test.NoDBTestCase):
self.assertEqual(
'enp0s31f6',
instance.pci_requests.requests[0].spec[0]['parent_ifname'])
class AcceleratorRequestTestCase(test.NoDBTestCase):
def setUp(self):
super(AcceleratorRequestTestCase, self).setUp()
self.context = context.get_admin_context()
@mock.patch.object(cyborgclient, 'delete_arqs_for_instance')
def test_delete_with_device_profile(self, mock_del_arq):
flavor = objects.Flavor(**test_flavor.fake_flavor)
flavor['extra_specs'] = {'accel:device_profile': 'mydp'}
instance = fake_instance.fake_instance_obj(self.context, flavor=flavor)
compute_utils.delete_arqs_if_needed(self.context, instance)
mock_del_arq.assert_called_once_with(instance.uuid)
@mock.patch.object(cyborgclient, 'delete_arqs_for_instance')
def test_delete_with_no_device_profile(self, mock_del_arq):
flavor = objects.Flavor(**test_flavor.fake_flavor)
flavor['extra_specs'] = {}
instance = fake_instance.fake_instance_obj(self.context, flavor=flavor)
compute_utils.delete_arqs_if_needed(self.context, instance)
mock_del_arq.assert_not_called()
@mock.patch('nova.compute.utils.LOG.exception')
@mock.patch.object(cyborgclient, 'delete_arqs_for_instance')
def test_delete_with_device_profile_exception(self, mock_del_arq,
mock_log_exc):
flavor = objects.Flavor(**test_flavor.fake_flavor)
flavor['extra_specs'] = {'accel:device_profile': 'mydp'}
instance = fake_instance.fake_instance_obj(self.context, flavor=flavor)
mock_del_arq.side_effect = exception.AcceleratorRequestOpFailed(
op='', msg='')
compute_utils.delete_arqs_if_needed(self.context, instance)
mock_del_arq.assert_called_once_with(instance.uuid)
mock_log_exc.assert_called_once()
self.assertIn('Failed to delete accelerator requests for instance',
mock_log_exc.call_args[0][0])

View File

@ -30,6 +30,7 @@ from nova import block_device
from nova.compute import flavors
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute import vm_states
from nova.conductor import api as conductor_api
from nova.conductor import manager as conductor_manager
@ -3563,6 +3564,19 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
# handled the error.
mock_save.assert_not_called()
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_cleanup_allocated_networks')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_set_vm_state_and_notify')
@mock.patch.object(compute_utils, 'delete_arqs_if_needed')
def test_cleanup_arqs_on_reschedule(self, mock_del_arqs,
mock_set_vm, mock_clean_net):
instance = fake_instance.fake_instance_obj(self.context)
self.conductor_manager._cleanup_when_reschedule_fails(
self.context, instance, exception=None,
legacy_request_spec=None, requested_networks=None)
mock_del_arqs.assert_called_once_with(self.context, instance)
def test_cleanup_allocated_networks_none_requested(self):
# Tests that we don't deallocate networks if 'none' were specifically
# requested.