Enable start/stop of instances with accelerators.

. Do not delete accelerator requests in stop code paths.
. In the start code path, get the list of accelerator requests from
  Cyborg in the compute manager 'power_on'.
. Pass accel_info (said list) to the virt driver power_on.
. In the libvirt driver, forward that accel_info from power_on to
  _hard_reboot, so the recreated domain picks up the accelerator devices.

Change-Id: I8c94504b87aa4450d163fe2b33f6aa0eb5dae5ff
Blueprint: nova-cyborg-interaction
Author: Sundar Nadathur
Date: 2019-12-14 13:28:13 -08:00
Parent: b5527c07fb
Commit: 536d42d807
12 changed files with 66 additions and 14 deletions

nova/compute/manager.py

@@ -3099,9 +3099,10 @@ class ComputeManager(manager.Manager):
         network_info = self.network_api.get_instance_nw_info(context, instance)
         block_device_info = self._get_instance_block_device_info(context,
                                                                  instance)
+        accel_info = self._get_accel_info(context, instance)
         self.driver.power_on(context, instance,
                              network_info,
-                             block_device_info)
+                             block_device_info, accel_info)
 
     def _delete_snapshot_of_shelved_instance(self, context, instance,
                                              snapshot_id):
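
Note: _get_accel_info is introduced by this change, but its definition is
not among the hunks shown here. A minimal sketch of the expected behavior,
assuming the flavor marks accelerator use via the accel:device_profile
extra spec (as the unit test further down does) and using the
nova.accelerator.cyborg client whose get_arqs_for_instance method is
mocked in that test:

    from nova.accelerator import cyborg

    def _get_accel_info(self, context, instance):
        """Sketch: return the instance's bound accelerator requests (ARQs).

        Instances whose flavor carries no 'accel:device_profile' extra
        spec own no Cyborg devices, so they get an empty list.
        """
        dp_name = instance.flavor.extra_specs.get('accel:device_profile')
        if not dp_name:
            return []
        cyclient = cyborg.get_client(context)
        return cyclient.get_arqs_for_instance(instance.uuid)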

nova/tests/functional/test_servers.py

@@ -7782,8 +7782,10 @@ class AcceleratorServerBase(integrated_helpers.ProviderUsageBaseTestCase):
         self._setup_compute_nodes_and_device_rps()
 
     def _setup_compute_nodes_and_device_rps(self):
+        self.compute_services = []
         for i in range(self.NUM_HOSTS):
-            self._start_compute(host='accel_host' + str(i))
+            svc = self._start_compute(host='accel_host' + str(i))
+            self.compute_services.append(svc)
         self.compute_rp_uuids = [
             rp['uuid'] for rp in self._get_all_providers()
             if rp['uuid'] == rp['root_provider_uuid']]
@@ -7947,6 +7949,30 @@ class AcceleratorServerTest(AcceleratorServerBase):
         # Verify that no allocations/usages remain after deletion
         self._check_no_allocs_usage(server_uuid)
 
+    def test_create_server_with_local_delete(self):
+        """Delete the server when compute service is down."""
+        server = self._get_server()
+        server_uuid = server['id']
+        # Stop the server.
+        self.api.post_server_action(server_uuid, {'os-stop': {}})
+        self._wait_for_state_change(server, 'SHUTOFF')
+        self._check_allocations_usage(server)
+        # Stop and force down the compute service.
+        compute_id = self.admin_api.get_services(
+            host='accel_host0', binary='nova-compute')[0]['id']
+        self.compute_services[0].stop()
+        self.admin_api.put_service(compute_id, {'forced_down': 'true'})
+        # Delete the server with the compute service down.
+        self.api.delete_server(server_uuid)
+        self.cyborg.mock_del_arqs.assert_called_once_with(server_uuid)
+        self._check_no_allocs_usage(server_uuid)
+        # Restart the compute service to see if anything fails.
+        self.admin_api.put_service(compute_id, {'forced_down': 'false'})
+        self.compute_services[0].start()
+
 
 class AcceleratorServerReschedTest(AcceleratorServerBase):
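
The self.cyborg fixture and its mock_del_arqs handle used above are
defined outside these hunks. A rough sketch of such a fixture, assuming it
patches the Cyborg client's ARQ-deletion method (delete_arqs_for_instance
on _CyborgClient; the real fixture presumably stubs more of the client):

    import fixtures

    class CyborgFixture(fixtures.Fixture):
        """Sketch of a fixture faking Nova's Cyborg client in tests."""

        def setUp(self):
            super(CyborgFixture, self).setUp()
            # Record ARQ deletions so tests can assert cleanup, e.g.
            # self.cyborg.mock_del_arqs.assert_called_once_with(uuid)
            self.mock_del_arqs = self.useFixture(fixtures.MockPatch(
                'nova.accelerator.cyborg._CyborgClient'
                '.delete_arqs_for_instance')).mock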

nova/tests/unit/compute/test_compute.py

@@ -2493,7 +2493,7 @@ class ComputeTestCase(BaseTestCase,
         called = {'power_on': False}
 
         def fake_driver_power_on(self, context, instance, network_info,
-                                 block_device_info):
+                                 block_device_info, accel_device_info=None):
             called['power_on'] = True
 
         self.stub_out('nova.virt.fake.FakeDriver.power_on',
@@ -2512,6 +2512,25 @@ class ComputeTestCase(BaseTestCase,
         self.assertTrue(called['power_on'])
         self.compute.terminate_instance(self.context, inst_obj, [])
 
+    @mock.patch.object(compute_manager.ComputeManager,
+                       '_get_instance_block_device_info')
+    @mock.patch('nova.network.neutron.API.get_instance_nw_info')
+    @mock.patch.object(fake.FakeDriver, 'power_on')
+    @mock.patch('nova.accelerator.cyborg._CyborgClient.get_arqs_for_instance')
+    def test_power_on_with_accels(self, mock_get_arqs,
+            mock_power_on, mock_nw_info, mock_blockdev):
+        instance = self._create_fake_instance_obj()
+        instance.flavor.extra_specs = {'accel:device_profile': 'mydp'}
+        accel_info = [{'k1': 'v1', 'k2': 'v2'}]
+        mock_get_arqs.return_value = accel_info
+        mock_nw_info.return_value = 'nw_info'
+        mock_blockdev.return_value = 'blockdev_info'
+
+        self.compute._power_on(self.context, instance)
+
+        mock_get_arqs.assert_called_once_with(instance['uuid'])
+        mock_power_on.assert_called_once_with(self.context,
+            instance, 'nw_info', 'blockdev_info', accel_info)
+
     def test_power_off(self):
         # Ensure instance can be powered off.

nova/virt/driver.py

@@ -889,10 +889,14 @@ class ComputeDriver(object):
         raise NotImplementedError()
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance.
 
         :param instance: nova.objects.instance.Instance
         :param network_info: instance network information
         :param block_device_info: instance volume block device info
+        :param accel_info: List of accelerator request dicts. The exact
+            data struct is doc'd in nova/virt/driver.py::spawn().
         """
         raise NotImplementedError()
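
For orientation, each accel_info element is a Cyborg accelerator request
(ARQ) rendered as a dict. An illustrative entry follows; the values are
made up and the field set only approximates Cyborg's ARQ representation,
since the authoritative description is the spawn() docstring referenced
above:

    # Illustrative ARQ dict; all values are invented for the example.
    arq = {
        'uuid': 'b59d34d3-787b-4fb0-a6b9-019cd81172f8',
        'device_profile_name': 'mydp',
        'device_profile_group_id': 0,
        'state': 'Bound',
        'instance_uuid': '15d3acf8-df76-400b-bfc9-484a5208daa1',
        'attach_handle_type': 'PCI',
        'attach_handle_info': {'bus': '00', 'device': '1f',
                               'domain': '0000', 'function': '0'},
    }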

nova/virt/fake.py

@@ -276,7 +276,7 @@ class FakeDriver(driver.ComputeDriver):
             raise exception.InstanceNotFound(instance_id=instance.uuid)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         if instance.uuid in self.instances:
             self.instances[instance.uuid].state = power_state.RUNNING
         else:

nova/virt/hyperv/driver.py

@@ -224,7 +224,7 @@ class HyperVDriver(driver.ComputeDriver):
         self._vmops.power_off(instance, timeout, retry_interval)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         self._vmops.power_on(instance, block_device_info, network_info)
 
     def resume_state_on_host_boot(self, context, instance, network_info,

nova/virt/ironic/driver.py

@@ -1474,7 +1474,7 @@ class IronicDriver(virt_driver.ComputeDriver):
                   node.uuid, instance=instance)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance.
 
         NOTE: Unlike the libvirt driver, this method does not delete
@@ -1486,7 +1486,8 @@ class IronicDriver(virt_driver.ComputeDriver):
                                   this driver.
         :param block_device_info: Instance block device
                                   information. Ignored by this driver.
+        :param accel_info: List of accelerator requests for this instance.
+                           Ignored by this driver.
         """
         LOG.debug('Power on called for instance', instance=instance)
         node = self._validate_instance_and_node(instance)

nova/virt/libvirt/driver.py

@@ -3297,12 +3297,13 @@ class LibvirtDriver(driver.ComputeDriver):
         self._destroy(instance)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance."""
         # We use _hard_reboot here to ensure that all backing files,
         # network, and block device connections, etc. are established
         # and available before we attempt to start the instance.
-        self._hard_reboot(context, instance, network_info, block_device_info)
+        self._hard_reboot(context, instance, network_info, block_device_info,
+                          accel_info)
 
     def trigger_crash_dump(self, instance):
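
The matching _hard_reboot hunk is elsewhere in this change. Presumably it
grows the same keyword and forwards accel_info into guest XML generation,
so the recreated domain includes the accelerator devices. A sketch under
that assumption (elisions marked; this is not the actual hunk):

    def _hard_reboot(self, context, instance, network_info,
                     block_device_info=None, accel_info=None):
        self._destroy(instance)
        # ... disk_info, backing files and volume connections are
        # re-established here exactly as before (elided) ...
        disk_info = ...
        xml = self._get_guest_xml(context, instance, network_info,
                                  disk_info, instance.image_meta,
                                  block_device_info=block_device_info,
                                  accel_info=accel_info)
        # ... create the domain from xml and wait for boot (elided) ...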

nova/virt/powervm/driver.py

@@ -464,7 +464,7 @@ class PowerVMDriver(driver.ComputeDriver):
                                timeout=timeout)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance.
 
         :param instance: nova.objects.instance.Instance

nova/virt/vmwareapi/driver.py

@@ -658,7 +658,7 @@ class VMwareVCDriver(driver.ComputeDriver):
         self._vmops.power_off(instance, timeout, retry_interval)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance."""
         self._vmops.power_on(instance)

nova/virt/xenapi/driver.py

@@ -331,7 +331,7 @@ class XenAPIDriver(driver.ComputeDriver):
         self._vmops.power_off(instance)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         """Power on the specified instance."""
         self._vmops.power_on(instance)

nova/virt/zvm/driver.py

@@ -395,7 +395,7 @@ class ZVMDriver(driver.ComputeDriver):
         self._hypervisor.guest_softstop(instance.name)
 
     def power_on(self, context, instance, network_info,
-                 block_device_info=None):
+                 block_device_info=None, accel_info=None):
         self._hypervisor.guest_start(instance.name)
 
     def pause(self, instance):
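
Every driver other than libvirt takes the same minimal change here: accept
the new keyword and ignore it. An out-of-tree driver subclassing
ComputeDriver would do the same to stay signature-compatible. A
hypothetical example (MyDriver and _power_on_guest are invented names):

    from nova.virt import driver

    class MyDriver(driver.ComputeDriver):
        """Hypothetical third-party driver, shown for illustration."""

        def power_on(self, context, instance, network_info,
                     block_device_info=None, accel_info=None):
            # accel_info is accepted for interface compatibility and
            # ignored: this driver does not support accelerators.
            self._power_on_guest(instance)  # hypothetical helper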