diff --git a/api-guide/source/accelerator-support.rst b/api-guide/source/accelerator-support.rst index f7c82944edc5..8ad60a0f5669 100644 --- a/api-guide/source/accelerator-support.rst +++ b/api-guide/source/accelerator-support.rst @@ -43,12 +43,12 @@ The lists of supported and unsupported operations are as below: * Rescue and unrescue. * Rebuild. * Evacuate. + * Shelve and unshelve. * Unsupported operations * Resize. * Suspend and resume. - * Shelve and unshelve. * Cold migration. * Live migration. @@ -56,6 +56,10 @@ The lists of supported and unsupported operations are as below: Added support for rebuild and evacuate operations. +.. versionchanged:: 23.0.0(Wallaby) + + Added support for shelve and unshelve operations. + Some operations, such as lock and unlock, work as they are effectively no-ops for accelerators. diff --git a/nova/api/openstack/compute/shelve.py b/nova/api/openstack/compute/shelve.py index bb4a7b92bc6e..acd5b41413f4 100644 --- a/nova/api/openstack/compute/shelve.py +++ b/nova/api/openstack/compute/shelve.py @@ -37,7 +37,7 @@ class ShelveController(wsgi.Controller): self.network_api = neutron.API() @wsgi.response(202) - @wsgi.expected_errors((404, 409)) + @wsgi.expected_errors((404, 403, 409)) @wsgi.action('shelve') def _shelve(self, req, id, body): """Move an instance into shelved mode.""" @@ -55,6 +55,8 @@ class ShelveController(wsgi.Controller): exception.UnexpectedTaskStateError, ) as e: raise exc.HTTPConflict(explanation=e.format_message()) + except exception.ForbiddenWithAccelerators as e: + raise exc.HTTPForbidden(explanation=e.format_message()) except exception.InstanceInvalidState as state_error: common.raise_http_conflict_for_instance_invalid_state(state_error, 'shelve', id) diff --git a/nova/compute/api.py b/nova/compute/api.py index 0b2e8248c840..a03f5f682725 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -4158,7 +4158,7 @@ class API(base.Base): return allow_same_host @reject_vtpm_instances(instance_actions.SHELVE) - @block_accelerators() + @block_accelerators(until_service=54) @check_instance_lock @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED, vm_states.PAUSED, vm_states.SUSPENDED]) @@ -4181,16 +4181,23 @@ class API(base.Base): self._record_action_start(context, instance, instance_actions.SHELVE) + accel_uuids = [] + if instance.flavor.extra_specs.get('accel:device_profile'): + cyclient = cyborg.get_client(context) + accel_uuids = cyclient.get_arq_uuids_for_instance(instance) + if not compute_utils.is_volume_backed_instance(context, instance): name = '%s-shelved' % instance.display_name image_meta = compute_utils.create_image( context, instance, name, 'snapshot', self.image_api) image_id = image_meta['id'] self.compute_rpcapi.shelve_instance(context, instance=instance, - image_id=image_id, clean_shutdown=clean_shutdown) + image_id=image_id, clean_shutdown=clean_shutdown, + accel_uuids=accel_uuids) else: - self.compute_rpcapi.shelve_offload_instance(context, - instance=instance, clean_shutdown=clean_shutdown) + self.compute_rpcapi.shelve_offload_instance( + context, instance=instance, clean_shutdown=clean_shutdown, + accel_uuids=accel_uuids) @check_instance_lock @check_instance_state(vm_state=[vm_states.SHELVED]) @@ -4202,8 +4209,14 @@ class API(base.Base): self._record_action_start(context, instance, instance_actions.SHELVE_OFFLOAD) - self.compute_rpcapi.shelve_offload_instance(context, instance=instance, - clean_shutdown=clean_shutdown) + accel_uuids = [] + if instance.flavor.extra_specs.get('accel:device_profile'): + 
cyclient = cyborg.get_client(context) + accel_uuids = cyclient.get_arq_uuids_for_instance(instance) + + self.compute_rpcapi.shelve_offload_instance( + context, instance=instance, + clean_shutdown=clean_shutdown, accel_uuids=accel_uuids) def _validate_unshelve_az(self, context, instance, availability_zone): """Verify the specified availability_zone during unshelve. diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 7ae5d3531a5c..6a296e15e00a 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -528,7 +528,7 @@ class ComputeVirtAPI(virtapi.VirtAPI): class ComputeManager(manager.Manager): """Manages the running instances from creation to destruction.""" - target = messaging.Target(version='5.12') + target = messaging.Target(version='5.13') def __init__(self, compute_driver=None, *args, **kwargs): """Load configuration options and connect to the hypervisor.""" @@ -6326,7 +6326,7 @@ class ComputeManager(manager.Manager): @wrap_instance_event(prefix='compute') @wrap_instance_fault def shelve_instance(self, context, instance, image_id, - clean_shutdown): + clean_shutdown, accel_uuids=None): """Shelve an instance. This should be used when you want to take a snapshot of the instance. @@ -6337,15 +6337,17 @@ class ComputeManager(manager.Manager): :param instance: an Instance object :param image_id: an image id to snapshot to. :param clean_shutdown: give the GuestOS a chance to stop + :param accel_uuids: the accelerators uuids for the instance """ @utils.synchronized(instance.uuid) def do_shelve_instance(): - self._shelve_instance(context, instance, image_id, clean_shutdown) + self._shelve_instance(context, instance, image_id, clean_shutdown, + accel_uuids) do_shelve_instance() def _shelve_instance(self, context, instance, image_id, - clean_shutdown): + clean_shutdown, accel_uuids=None): LOG.info('Shelving', instance=instance) offload = CONF.shelved_offload_time == 0 if offload: @@ -6400,14 +6402,16 @@ class ComputeManager(manager.Manager): phase=fields.NotificationPhase.END, bdms=bdms) if offload: - self._shelve_offload_instance(context, instance, - clean_shutdown=False, bdms=bdms) + self._shelve_offload_instance( + context, instance, clean_shutdown=False, bdms=bdms, + accel_uuids=accel_uuids) @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') @wrap_instance_fault - def shelve_offload_instance(self, context, instance, clean_shutdown): + def shelve_offload_instance(self, context, instance, clean_shutdown, + accel_uuids=None): """Remove a shelved instance from the hypervisor. 
This frees up those resources for use by other instances, but may lead @@ -6418,15 +6422,17 @@ class ComputeManager(manager.Manager): :param context: request context :param instance: nova.objects.instance.Instance :param clean_shutdown: give the GuestOS a chance to stop + :param accel_uuids: the accelerators uuids for the instance """ @utils.synchronized(instance.uuid) def do_shelve_offload_instance(): - self._shelve_offload_instance(context, instance, clean_shutdown) + self._shelve_offload_instance(context, instance, clean_shutdown, + accel_uuids=accel_uuids) do_shelve_offload_instance() def _shelve_offload_instance(self, context, instance, clean_shutdown, - bdms=None): + bdms=None, accel_uuids=None): LOG.info('Shelve offloading', instance=instance) if bdms is None: bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( @@ -6451,6 +6457,12 @@ class ComputeManager(manager.Manager): # terminate all the connections with the volume server and the host self._terminate_volume_connections(context, instance, bdms) + # NOTE(brinzhang): Free up the accelerator resource occupied + # in the cyborg service. + if accel_uuids: + cyclient = cyborg.get_client(context) + cyclient.delete_arqs_for_instance(instance.uuid) + # Free up the resource allocations in the placement service. # This should happen *before* the vm_state is changed to # SHELVED_OFFLOADED in case client-side code is polling the API to @@ -6490,8 +6502,9 @@ class ComputeManager(manager.Manager): @reverts_task_state @wrap_instance_event(prefix='compute') @wrap_instance_fault - def unshelve_instance(self, context, instance, image, - filter_properties, node, request_spec=None): + def unshelve_instance( + self, context, instance, image, filter_properties, node, + request_spec=None, accel_uuids=None): """Unshelve the instance. 
:param context: request context @@ -6502,6 +6515,7 @@ class ComputeManager(manager.Manager): :param node: target compute node :param request_spec: the RequestSpec object used to schedule the instance + :param accel_uuids: the accelerators uuids for the instance """ if filter_properties is None: filter_properties = {} @@ -6510,7 +6524,7 @@ class ComputeManager(manager.Manager): def do_unshelve_instance(): self._unshelve_instance( context, instance, image, filter_properties, node, - request_spec) + request_spec, accel_uuids) do_unshelve_instance() def _unshelve_instance_key_scrub(self, instance): @@ -6527,7 +6541,7 @@ class ComputeManager(manager.Manager): instance.update(keys) def _unshelve_instance(self, context, instance, image, filter_properties, - node, request_spec): + node, request_spec, accel_uuids): LOG.info('Unshelving', instance=instance) bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) @@ -6574,6 +6588,19 @@ class ComputeManager(manager.Manager): provider_mappings=provider_mappings) network_info = self.network_api.get_instance_nw_info( context, instance) + + accel_info = [] + if accel_uuids: + try: + accel_info = self._get_bound_arq_resources( + context, instance, accel_uuids) + except (Exception, eventlet.timeout.Timeout) as exc: + LOG.exception('Failure getting accelerator requests ' + 'with the exception: %s', exc, + instance=instance) + self._build_resources_cleanup(instance, network_info) + raise + with self.rt.instance_claim(context, instance, node, allocations, limits): self.driver.spawn(context, instance, image_meta, @@ -6581,7 +6608,8 @@ class ComputeManager(manager.Manager): admin_password=None, allocations=allocations, network_info=network_info, - block_device_info=block_device_info) + block_device_info=block_device_info, + accel_info=accel_info) except Exception: with excutils.save_and_reraise_exception(logger=LOG): LOG.exception('Instance failed to spawn', diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 3558c9f1477a..6e9675248cbd 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -380,6 +380,9 @@ class ComputeAPI(object): build_and_run_instance() * 5.12 - Add accel_uuids (accelerator requests) parameter to rebuild_instance() + * 5.13 - Add accel_uuids (accelerator requests) parameter to + shelve_instance(), shelve_offload_instance() and + unshelve_instance() ''' VERSION_ALIASES = { @@ -1350,40 +1353,65 @@ class ComputeAPI(object): cctxt.cast(ctxt, 'restore_instance', instance=instance) def shelve_instance(self, ctxt, instance, image_id=None, - clean_shutdown=True): - version = '5.0' - cctxt = self.router.client(ctxt).prepare( + clean_shutdown=True, accel_uuids=None): + msg_kwargs = { + 'instance': instance, + 'image_id': image_id, + 'clean_shutdown': clean_shutdown, + 'accel_uuids': accel_uuids, + } + client = self.router.client(ctxt) + version = '5.13' + if not client.can_send_version(version): + if accel_uuids: + LOG.error("Shelve with accelerators is not supported as " + "RPC version is too old.") + raise exception.ForbiddenWithAccelerators() + else: + msg_kwargs.pop('accel_uuids') + version = '5.0' + cctxt = client.prepare( server=_compute_host(None, instance), version=version) - cctxt.cast(ctxt, 'shelve_instance', instance=instance, - image_id=image_id, clean_shutdown=clean_shutdown) + cctxt.cast(ctxt, 'shelve_instance', **msg_kwargs) def shelve_offload_instance(self, ctxt, instance, - clean_shutdown=True): - version = '5.0' - cctxt = self.router.client(ctxt).prepare( + clean_shutdown=True, 
accel_uuids=None): + msg_kwargs = { + 'instance': instance, + 'clean_shutdown': clean_shutdown, + 'accel_uuids': accel_uuids, + } + client = self.router.client(ctxt) + version = '5.13' + if not client.can_send_version(version): + msg_kwargs.pop('accel_uuids') + version = '5.0' + cctxt = client.prepare( server=_compute_host(None, instance), version=version) - cctxt.cast(ctxt, 'shelve_offload_instance', instance=instance, - clean_shutdown=clean_shutdown) + cctxt.cast(ctxt, 'shelve_offload_instance', **msg_kwargs) def unshelve_instance(self, ctxt, instance, host, request_spec, image=None, - filter_properties=None, node=None): - version = '5.2' + filter_properties=None, node=None, accel_uuids=None): + version = '5.13' msg_kwargs = { 'instance': instance, 'image': image, 'filter_properties': filter_properties, 'node': node, 'request_spec': request_spec, + 'accel_uuids': accel_uuids, } client = self.router.client(ctxt) + if not client.can_send_version(version): + msg_kwargs.pop('accel_uuids') + version = '5.2' if not client.can_send_version(version): msg_kwargs.pop('request_spec') version = '5.0' - cctxt = client.prepare( - server=host, version=version) + cctxt = client.prepare(server=host, version=version) cctxt.cast(ctxt, 'unshelve_instance', **msg_kwargs) def volume_snapshot_create(self, ctxt, instance, volume_id, diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 4824a1a3c1d5..605608e79f49 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -838,7 +838,7 @@ class ComputeTaskManager(base.Base): try: accel_uuids = self._create_and_bind_arq_for_instance( - context, instance, host, local_reqspec) + context, instance, host.nodename, local_reqspec) except Exception as exc: LOG.exception('Failed to reschedule. Reason: %s', exc) self._cleanup_when_reschedule_fails( @@ -858,7 +858,7 @@ class ComputeTaskManager(base.Base): limits=host.limits, host_list=host_list, accel_uuids=accel_uuids) - def _create_and_bind_arq_for_instance(self, context, instance, host, + def _create_and_bind_arq_for_instance(self, context, instance, hostname, request_spec): try: resource_provider_mapping = ( @@ -867,7 +867,7 @@ class ComputeTaskManager(base.Base): # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa return self._create_and_bind_arqs( context, instance.uuid, instance.flavor.extra_specs, - host.nodename, resource_provider_mapping) + hostname, resource_provider_mapping) except exception.AcceleratorRequestBindingFailed as exc: # If anything failed here we need to cleanup and bail out. cyclient = cyborg.get_client(context) @@ -963,14 +963,19 @@ class ComputeTaskManager(base.Base): filter_properties = request_spec.\ to_legacy_filter_properties_dict() - port_res_req = ( + external_resources = ( self.network_api.get_requested_resource_for_instance( context, instance.uuid)) - # NOTE(gibi): When cyborg or other module wants to handle - # similar non-nova resources then here we have to collect - # all the external resource requests in a single list and + extra_specs = request_spec.flavor.extra_specs + device_profile = extra_specs.get('accel:device_profile') + external_resources.extend( + cyborg.get_device_profile_request_groups( + context, device_profile) if device_profile else []) + # NOTE(gibi): When other modules want to handle similar + # non-nova resources then here we have to collect all + # the external resource requests in a single list and # add them to the RequestSpec. 
- request_spec.requested_resources = port_res_req + request_spec.requested_resources = external_resources # NOTE(cfriesen): Ensure that we restrict the scheduler to # the cell specified by the instance mapping. @@ -996,9 +1001,14 @@ class ComputeTaskManager(base.Base): scheduler_utils.fill_provider_mapping( request_spec, selection) + # NOTE(brinzhang): For unshelve operation we should + # re-create-and-bound the arqs for the instance. + accel_uuids = self._create_and_bind_arq_for_instance( + context, instance, node, request_spec) self.compute_rpcapi.unshelve_instance( context, instance, host, request_spec, image=image, - filter_properties=filter_properties, node=node) + filter_properties=filter_properties, node=node, + accel_uuids=accel_uuids) except (exception.NoValidHost, exception.UnsupportedPolicyException): instance.task_state = None @@ -1006,7 +1016,11 @@ class ComputeTaskManager(base.Base): LOG.warning("No valid host found for unshelve instance", instance=instance) return - except Exception: + except Exception as exc: + if isinstance(exc, exception.AcceleratorRequestBindingFailed): + cyclient = cyborg.get_client(context) + cyclient.delete_arqs_by_uuid(exc.arqs) + LOG.exception('Failed to unshelve. Reason: %s', exc) with excutils.save_and_reraise_exception(): instance.task_state = None instance.save() @@ -1225,12 +1239,24 @@ class ComputeTaskManager(base.Base): instance.availability_zone = ( availability_zones.get_host_availability_zone( context, host)) + accel_uuids = [] try: - accel_uuids = self._rebuild_cyborg_arq( - context, instance, host, request_spec, evacuate) - except exception.AcceleratorRequestBindingFailed as exc: - cyclient = cyborg.get_client(context) - cyclient.delete_arqs_by_uuid(exc.arqs) + if instance.flavor.extra_specs.get('accel:device_profile'): + cyclient = cyborg.get_client(context) + if evacuate: + # NOTE(brinzhang): For evacuate operation we should + # delete the bound arqs, then re-create-and-bound the + # arqs for the instance. + cyclient.delete_arqs_for_instance(instance.uuid) + accel_uuids = self._create_and_bind_arq_for_instance( + context, instance, node, request_spec) + else: + accel_uuids = cyclient.get_arq_uuids_for_instance( + instance) + except Exception as exc: + if isinstance(exc, exception.AcceleratorRequestBindingFailed): + cyclient = cyborg.get_client(context) + cyclient.delete_arqs_by_uuid(exc.arqs) LOG.exception('Failed to rebuild. 
Reason: %s', exc) raise exc @@ -1253,22 +1279,6 @@ class ComputeTaskManager(base.Base): request_spec=request_spec, accel_uuids=accel_uuids) - def _rebuild_cyborg_arq( - self, context, instance, host, request_spec, evacuate): - dp_name = instance.flavor.extra_specs.get('accel:device_profile') - if not dp_name: - return [] - - cyclient = cyborg.get_client(context) - if not evacuate: - return cyclient.get_arq_uuids_for_instance(instance) - - cyclient.delete_arqs_for_instance(instance.uuid) - resource_provider_mapping = request_spec.get_request_group_mapping() - return self._create_and_bind_arqs( - context, instance.uuid, instance.flavor.extra_specs, - host, resource_provider_mapping) - def _validate_image_traits_for_rebuild(self, context, instance, image_ref): """Validates that the traits specified in the image can be satisfied by the providers of the current allocations for the instance during @@ -1668,7 +1678,7 @@ class ComputeTaskManager(base.Base): try: accel_uuids = self._create_and_bind_arq_for_instance( - context, instance, host, request_spec) + context, instance, host.nodename, request_spec) except Exception as exc: with excutils.save_and_reraise_exception(): self._cleanup_build_artifacts( diff --git a/nova/objects/service.py b/nova/objects/service.py index 5ce447f5368c..d314fa3aa1ca 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 53 +SERVICE_VERSION = 54 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -190,6 +190,10 @@ SERVICE_VERSION_HISTORY = ( # Version 53: Compute RPC v5.12: # Add accel_uuids (accelerator requests) param to rebuild_instance {'compute_rpc': '5.12'}, + # Version 54: Compute RPC v5.13: + # Add accel_uuids (accelerator requests) param to shelve_instance and + # shelve_offload_instance and unshelve_instance + {'compute_rpc': '5.13'}, ) # This is used to raise an error at service startup if older than N-1 computes diff --git a/nova/tests/fixtures.py b/nova/tests/fixtures.py index 931f5f497bdd..58fe876b4a88 100644 --- a/nova/tests/fixtures.py +++ b/nova/tests/fixtures.py @@ -3095,7 +3095,10 @@ class CyborgFixture(fixtures.Fixture): This function uses bindings indexed by instance UUID to populate the bound ARQ templates in CyborgFixture.bound_arq_list. 
""" - arq_host_rp_list = CyborgFixture.bindings_by_instance[instance_uuid] + arq_host_rp_list = CyborgFixture.bindings_by_instance.get( + instance_uuid) + if not arq_host_rp_list: + return [] # The above looks like: # [{'hostname': $hostname, # 'device_rp_uuid': $device_rp_uuid, @@ -3117,7 +3120,7 @@ class CyborgFixture(fixtures.Fixture): @staticmethod def fake_delete_arqs_for_instance(instance_uuid): - return None + CyborgFixture.bindings_by_instance.pop(instance_uuid, None) def setUp(self): super(CyborgFixture, self).setUp() diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index c5207e680150..49440010a67a 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -486,9 +486,17 @@ class InstanceHelperMixin: self.api.post_server_action(server['id'], {'shelve': {}}) return self._wait_for_state_change(server, expected_state) + def _shelve_offload_server(self, server): + """Shelve offload a server.""" + self.api.post_server_action(server['id'], {'shelveOffload': {}}) + self._wait_for_server_parameter(server, + {'status': 'SHELVED_OFFLOADED', + 'OS-EXT-SRV-ATTR:host': None, + 'OS-EXT-AZ:availability_zone': ''}) + def _unshelve_server(self, server, expected_state='ACTIVE'): """Unshelve a server.""" - self.api.post_server_action(server['id'], {'unshelve': {}}) + self.api.post_server_action(server['id'], {'unshelve': None}) return self._wait_for_state_change(server, expected_state) def _evacuate_server( diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 3f23b8ed923e..dd7c3418c9d0 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -33,6 +33,7 @@ from oslo_utils import timeutils from nova.compute import api as compute_api from nova.compute import instance_actions from nova.compute import manager as compute_manager +from nova.compute import rpcapi as compute_rpcapi from nova import context from nova import exception from nova.network import constants @@ -7890,6 +7891,29 @@ class AcceleratorServerOpsTest(AcceleratorServerBase): self._wait_for_state_change(self.server, 'ACTIVE') self._check_allocations_usage(self.server) + def test_shelve_and_unshelve_ok(self): + self.flags(shelved_offload_time=1) + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 1) + self._shelve_server(self.server, 'SHELVED') + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 1) + self._unshelve_server(self.server) + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 1) + + def test_shelve_offload_and_unshelve_ok(self): + self.flags(shelved_offload_time=-1) + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 1) + self._shelve_server(self.server, 'SHELVED') + self._shelve_offload_server(self.server) + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 0) + self._unshelve_server(self.server) + arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id']) + self.assertEqual(len(arqs), 1) + def test_resize_fails(self): ex = self.assertRaises(client.OpenStackApiException, self.api.post_server_action, self.server['id'], @@ -7949,6 +7973,81 @@ class AcceleratorServerOpsTest(AcceleratorServerBase): self.assertEqual(403, ex.response.status_code) self._check_allocations_usage(self.server) + 
@mock.patch.object(objects.service, 'get_minimum_version_all_cells') + def test_shelve_old_compute(self, old_compute_version): + """Tests when the source compute service is too old to call + shelve so OpenStackApiException is raised. + """ + old_compute_version.return_value = 53 + ex = self.assertRaises(client.OpenStackApiException, + self.api.post_server_action, self.server['id'], + {'shelve': {}}) + self.assertEqual(403, ex.response.status_code) + self._check_allocations_usage(self.server) + + def _test_shelve_instance_with_compute_rpc_pin( + self, version_cap, body=None): + self.flags(compute=version_cap, group='upgrade_levels') + + self.flags(shelved_offload_time=-1) + self.api.post_server_action(self.server['id'], body) + self._wait_for_state_change(self.server, 'SHELVED') + + def test_shelve_with_compute_rpc_pin_5_0(self): + self.flags(compute=5.0, group='upgrade_levels') + compute_rpcapi.reset_globals() + ex = self.assertRaises( + client.OpenStackApiException, self.api.post_server_action, + self.server['id'], {'shelve': {}}) + self.assertEqual(403, ex.response.status_code) + + def test_shelve_instance_5_13(self): + body = {'shelve': {}} + self._test_shelve_instance_with_compute_rpc_pin( + '5.13', body=body) + + def _test_shelve_offload_instance_with_compute_rpc_pin(self, version_cap): + self.flags(compute=version_cap, group='upgrade_levels') + + self.flags(shelved_offload_time=2) + self.api.post_server_action(self.server['id'], {'shelve': {}}) + self._wait_for_state_change(self.server, 'SHELVED') + + self.api.post_server_action(self.server['id'], {'shelveOffload': {}}) + self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED') + + def test_shelve_offload_instance_5_0(self): + self._test_shelve_offload_instance_with_compute_rpc_pin('5.0') + + def test_shelve_offload_instance_5_13(self): + self._test_shelve_offload_instance_with_compute_rpc_pin('5.13') + + def _test_unshelve_instance_with_compute_rpc_pin( + self, version_cap, body=None): + self.flags(compute=version_cap, group='upgrade_levels') + + self.api.microversion = '2.87' + self.api.post_server_action(self.server['id'], {'shelve': {}}) + self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED') + + self.api.post_server_action(self.server['id'], body) + self._wait_for_state_change(self.server, 'ACTIVE') + + def test_unshelve_instance_5_0(self): + body = {'unshelve': None} + self._test_unshelve_instance_with_compute_rpc_pin( + '5.0', body=body) + + def test_unshelve_instance_5_2(self): + body = {'unshelve': None} + self._test_unshelve_instance_with_compute_rpc_pin( + '5.2', body=body) + + def test_unshelve_instance_5_13(self): + body = {'unshelve': None} + self._test_unshelve_instance_with_compute_rpc_pin( + '5.13', body=body) + class CrossCellResizeWithQoSPort(PortResourceRequestBasedSchedulingTestBase): NUMBER_OF_CELLS = 2 diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py index 36aa81f2a172..f450fa60ed62 100644 --- a/nova/tests/unit/compute/test_api.py +++ b/nova/tests/unit/compute/test_api.py @@ -7637,6 +7637,127 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase): # myfunc was not called self.assertEqual({}, args_info) + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arq_uuids_for_instance') + @mock.patch.object(compute_utils, 'create_image') + def test_get_arqs_in_shelve(self, mock_create_img, mock_get_arq_uuids): + """Test get_arq_uuids_for_instance() was called if the + 'accel:device_profile' exist in the flavor for shelve operation. 
+ """ + extra_specs = {'accel:device_profile': 'mydp'} + flavor = self._create_flavor(extra_specs=extra_specs) + params = dict(display_name="vm1") + instance = self._create_instance_obj(params=params, flavor=flavor) + mock_create_img.return_value = dict(id='fake-image-id') + mock_get_arq_uuids.return_value = ( + ['983d1742-c8ca-4f8a-9f3c-3f54bbafaa7d']) + accel_uuids = mock_get_arq_uuids.return_value + + with test.nested( + mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=54), + mock.patch('nova.compute.utils.is_volume_backed_instance', + return_value=False), + mock.patch.object(objects.Instance, 'save'), + mock.patch.object(self.compute_api, '_record_action_start'), + mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'), + ) as ( + mock_get_min, mock_volume_backend, + mock_instance_save, mock_record_action, mock_shelve_service + ): + self.compute_api.shelve(self.context, instance) + mock_get_arq_uuids.assert_called_once() + mock_shelve_service.assert_called_once_with( + self.context, instance=instance, image_id='fake-image-id', + clean_shutdown=True, accel_uuids=accel_uuids) + + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arq_uuids_for_instance') + @mock.patch.object(compute_utils, 'create_image') + def test_shelve_with_unsupport_accelerators( + self, mock_create_img, mock_get_arq_uuids): + """Test get_arq_uuids_for_instance() was called if the + 'accel:device_profile' exist in the flavor for shelve operation. + """ + extra_specs = {'accel:device_profile': 'mydp'} + flavor = self._create_flavor(extra_specs=extra_specs) + params = dict(display_name="vm1") + instance = self._create_instance_obj(params=params, flavor=flavor) + + with test.nested( + mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=54), + mock.patch('nova.compute.utils.is_volume_backed_instance', + return_value=False), + mock.patch.object(objects.Instance, 'save'), + mock.patch.object(self.compute_api, '_record_action_start'), + mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'), + ) as ( + mock_get_min, mock_volume_backend, + mock_instance_save, mock_record_action, mock_shelve_service + ): + mock_shelve_service.side_effect = ( + exception.ForbiddenWithAccelerators( + oldest_supported_version=54, + scope=mock.ANY, + min_service_level=54, + oldest_supported_service=54)) + self.assertRaises( + exception.ForbiddenWithAccelerators, + self.compute_api.shelve, + self.context, + instance) + + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arq_uuids_for_instance') + @mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=53) + def test_shelve_until_service_forbidden(self, mock_get_min, + mock_get_arq_uuuids): + """Ensure a 'ForbiddenWithAccelerators' exception raises if any + compute service less than the version of 54. + """ + extra_specs = {'accel:device_profile': 'mydp'} + flavor = self._create_flavor(extra_specs=extra_specs) + instance = self._create_instance_obj(flavor=flavor) + self.assertRaisesRegex(exception.ForbiddenWithAccelerators, + 'Forbidden with instances that have accelerators.', + self.compute_api.shelve, self.context, instance) + mock_get_arq_uuuids.assert_not_called() + + @mock.patch('nova.accelerator.cyborg._CyborgClient.' 
+ 'get_arq_uuids_for_instance') + @mock.patch.object(compute_utils, 'create_image') + def test_get_arqs_in_shelve_offload(self, mock_create_img, + mock_get_arq_uuids): + """Test get_arq_uuids_for_instance() was called if the + 'accel:device_profile' exist in the flavor for + shelve offload operation. + """ + extra_specs = {'accel:device_profile': 'mydp'} + flavor = self._create_flavor(extra_specs=extra_specs) + params = {'vm_state': vm_states.SHELVED} + instance = self._create_instance_obj(params=params, flavor=flavor) + mock_create_img.return_value = dict(id='fake-image-id') + mock_get_arq_uuids.return_value = [uuids.fake] + accel_uuids = mock_get_arq_uuids.return_value + with test.nested( + mock.patch('nova.objects.service.get_minimum_version_all_cells', + return_value=54), + mock.patch.object(objects.Instance, 'save'), + mock.patch.object(self.compute_api, '_record_action_start'), + mock.patch.object(compute_rpcapi.ComputeAPI, + 'shelve_offload_instance'), + ) as ( + mock_get_min, mock_instance_save, + mock_record_action, mock_shelve_service + ): + self.compute_api.shelve_offload(self.context, instance) + mock_get_arq_uuids.assert_called_once() + mock_shelve_service.assert_called_once_with( + self.context, instance=instance, clean_shutdown=True, + accel_uuids=accel_uuids) + # TODO(huaqiang): Remove in Wallaby @mock.patch('nova.objects.service.get_minimum_version_all_cells') def test__check_compute_service_for_mixed_instance(self, mock_ver): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index f2655eddfc9e..24ae515822fa 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -897,19 +897,67 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): def test_shelve_instance(self): self._test_compute_api('shelve_instance', 'cast', instance=self.fake_instance_obj, image_id='image_id', - clean_shutdown=True, version='5.0') + clean_shutdown=True, accel_uuids=None, version='5.13') + + def test_shelve_instance_old_rpcapi(self): + # With rpcapi < 5.13, accel_uuids must be dropped in the client call. + ctxt = context.RequestContext('fake_user', 'fake_project') + compute_api = compute_rpcapi.ComputeAPI() + compute_api.router.client = mock.Mock() + mock_client = mock.MagicMock() + compute_api.router.client.return_value = mock_client + # Force can_send_version to False, so that 5.0 version is used. + mock_client.can_send_version.return_value = False + mock_cctx = mock.MagicMock() + mock_client.prepare.return_value = mock_cctx + compute_api.shelve_instance( + ctxt, instance=self.fake_instance_obj, + accel_uuids=[], + image_id='image_id', clean_shutdown=True) + + mock_client.can_send_version.assert_called_once_with('5.13') + mock_client.prepare.assert_called_with( + server=self.fake_instance_obj.host, version='5.0') + mock_cctx.cast.assert_called_with( # No accel_uuids + ctxt, 'shelve_instance', + instance=self.fake_instance_obj, + image_id='image_id', clean_shutdown=True) def test_shelve_offload_instance(self): self._test_compute_api('shelve_offload_instance', 'cast', instance=self.fake_instance_obj, - clean_shutdown=True, version='5.0') + clean_shutdown=True, accel_uuids=None, version='5.13') + + def test_shelve_offload_instance_old_rpcapi(self): + # With rpcapi < 5.13, accel_uuids must be dropped in the client call. 
+ ctxt = context.RequestContext('fake_user', 'fake_project') + compute_api = compute_rpcapi.ComputeAPI() + compute_api.router.client = mock.Mock() + mock_client = mock.MagicMock() + compute_api.router.client.return_value = mock_client + # Force can_send_version to False, so that 5.0 version is used. + mock_client.can_send_version.return_value = False + mock_cctx = mock.MagicMock() + mock_client.prepare.return_value = mock_cctx + compute_api.shelve_offload_instance( + ctxt, instance=self.fake_instance_obj, + accel_uuids=['938af7f9-f136-4e5a-bdbe-3b6feab54311'], + clean_shutdown=True,) + + mock_client.can_send_version.assert_called_once_with('5.13') + mock_client.prepare.assert_called_with( + server=self.fake_instance_obj.host, version='5.0') + mock_cctx.cast.assert_called_with( # No accel_uuids + ctxt, 'shelve_offload_instance', + instance=self.fake_instance_obj, + clean_shutdown=True) def test_unshelve_instance(self): self._test_compute_api('unshelve_instance', 'cast', instance=self.fake_instance_obj, host='host', image='image', filter_properties={'fakeprop': 'fakeval'}, node='node', - request_spec=self.fake_request_spec_obj, - version='5.2') + request_spec=self.fake_request_spec_obj, accel_uuids=None, + version='5.13') def test_cache_image(self): self._test_compute_api('cache_images', 'call', @@ -943,7 +991,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): host='host', request_spec=self.fake_request_spec_obj, image='image') - mock_client.can_send_version.assert_called_once_with('5.2') + mock_client.can_send_version.assert_has_calls([ + mock.call('5.13'), + mock.call('5.2'), + ]) mock_client.prepare.assert_called_with( server='host', version='5.0') mock_cctx.cast.assert_called_with( diff --git a/nova/tests/unit/compute/test_shelve.py b/nova/tests/unit/compute/test_shelve.py index fba5a8876ea6..eeda476c6273 100644 --- a/nova/tests/unit/compute/test_shelve.py +++ b/nova/tests/unit/compute/test_shelve.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. +import eventlet import mock from oslo_utils import fixture as utils_fixture from oslo_utils.fixture import uuidsentinel as uuids @@ -63,7 +64,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): mock_notify_instance_usage, mock_get_power_state, mock_snapshot, mock_power_off, mock_terminate, mock_get_bdms, clean_shutdown=True, - guest_power_state=power_state.RUNNING): + guest_power_state=power_state.RUNNING, + accel_uuids=None): mock_get_power_state.return_value = 123 CONF.set_override('shelved_offload_time', shelved_offload_time) @@ -127,7 +129,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): mock_save.side_effect = check_save self.compute.shelve_instance(self.context, instance, image_id=image_id, - clean_shutdown=clean_shutdown) + clean_shutdown=clean_shutdown, + accel_uuids=accel_uuids) mock_notify.assert_has_calls([ mock.call(self.context, instance, 'fake-mini', action='shelve', phase='start', bdms=fake_bdms), @@ -181,6 +184,18 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): def test_shelve_paused_instance(self): self._shelve_instance(-1, guest_power_state=power_state.PAUSED) + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'delete_arqs_for_instance') + def test_shelve_and_offload_with_accel_uuids(self, mock_del_arqs): + self._shelve_instance(0, accel_uuids=[uuids.fake]) + mock_del_arqs.assert_called_once() + + @mock.patch('nova.accelerator.cyborg._CyborgClient.' 
+ 'delete_arqs_for_instance') + def test_shelve_and_with_accel_uuids(self, mock_del_arqs): + self._shelve_instance(-1, accel_uuids=[uuids.fake]) + mock_del_arqs.assert_not_called() + @mock.patch.object(nova.virt.fake.SmallFakeDriver, 'power_off') def test_shelve_offload(self, mock_power_off): instance = self._shelve_offload() @@ -279,10 +294,12 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): mock_get_power_state, mock_spawn, mock_prep_block_device, mock_notify_instance_usage, mock_notify_instance_action, - mock_get_bdms): + mock_get_bdms, + accel_uuids=None, + instance=None): mock_bdms = mock.Mock() mock_get_bdms.return_value = mock_bdms - instance = self._create_fake_instance_obj() + instance = instance or self._create_fake_instance_obj() instance.task_state = task_states.UNSHELVING instance.save() image = {'id': uuids.image_id} @@ -340,7 +357,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): self.compute.unshelve_instance( self.context, instance, image=image, filter_properties=filter_properties, - node=node, request_spec=objects.RequestSpec()) + node=node, request_spec=objects.RequestSpec(), + accel_uuids=accel_uuids) mock_notify_instance_action.assert_has_calls([ mock.call(self.context, instance, 'fake-mini', @@ -362,7 +380,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): mock_spawn.assert_called_once_with(self.context, instance, test.MatchType(objects.ImageMeta), injected_files=[], admin_password=None, allocations={}, network_info=[], - block_device_info='fake_bdm') + block_device_info='fake_bdm', accel_info=mock.ANY) self.mock_get_allocations.assert_called_once_with(self.context, instance.uuid) mock_get_power_state.assert_called_once_with(instance) @@ -380,6 +398,57 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): self.assertEqual(self.compute.host, instance.host) self.assertFalse(instance.auto_disk_config) + @mock.patch.object(nova.compute.manager.ComputeManager, + '_get_bound_arq_resources') + def test_unshelve_with_arqs(self, mock_get_arqs): + self.test_unshelve(accel_uuids=[uuids.fake]) + mock_get_arqs.assert_called_once() + + @mock.patch.object(nova.compute.manager.ComputeManager, + '_nil_out_instance_obj_host_and_node') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_terminate_volume_connections') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_build_resources_cleanup') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_get_bound_arq_resources') + def test_unshelve_with_arqs_failes_cleanup_resources( + self, mock_get_arqs, mock_cleanup_resources, + mock_bdms, mock_nil): + instance = self._create_fake_instance_obj() + + mock_get_arqs.side_effect = exception.AcceleratorRequestOpFailed( + instance_uuid=instance.uuid, op='get', + msg='Failure getting accelerator requests.') + + exc = self.assertRaises(exception.AcceleratorRequestOpFailed, + self.test_unshelve, + accel_uuids=[uuids.fake], + instance=instance) + self.assertIn('Failure getting accelerator requests.', str(exc)) + mock_cleanup_resources.assert_called_once() + + @mock.patch.object(nova.compute.manager.ComputeManager, + '_nil_out_instance_obj_host_and_node') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_terminate_volume_connections') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_build_resources_cleanup') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_get_bound_arq_resources') + def test_unshelve_with_arqs_failes_cleanup_resources_timeout( + self, mock_get_arqs, 
mock_cleanup_resources, + mock_bdms, mock_nil): + instance = self._create_fake_instance_obj() + + mock_get_arqs.side_effect = eventlet.timeout.Timeout() + + self.assertRaises(eventlet.timeout.Timeout, + self.test_unshelve, + accel_uuids=[uuids.fake], + instance=instance) + mock_cleanup_resources.assert_called_once() + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') @mock.patch('nova.compute.utils.notify_about_instance_action') @mock.patch.object(nova.compute.resource_tracker.ResourceTracker, @@ -464,7 +533,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): {}, limits) mock_spawn.assert_called_once_with(self.context, instance, test.MatchType(objects.ImageMeta), - injected_files=[], admin_password=None, + injected_files=[], admin_password=None, accel_info=[], allocations={}, network_info=[], block_device_info='fake_bdm') self.mock_get_allocations.assert_called_once_with(self.context, instance.uuid) @@ -551,7 +620,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase): {}, limits) mock_spawn.assert_called_once_with( self.context, instance, test.MatchType(objects.ImageMeta), - injected_files=[], admin_password=None, + injected_files=[], admin_password=None, accel_info=[], allocations={}, network_info=[], block_device_info='fake_bdm') mock_terminate_volume_connections.assert_called_once_with( self.context, instance, mock_bdms) @@ -767,11 +836,11 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase): if boot_from_volume: rpcapi_shelve_offload_instance.assert_called_once_with( self.context, instance=instance, - clean_shutdown=clean_shutdown) + clean_shutdown=clean_shutdown, accel_uuids=[]) else: rpcapi_shelve_instance.assert_called_once_with( self.context, instance=instance, image_id='fake-image-id', - clean_shutdown=clean_shutdown) + clean_shutdown=clean_shutdown, accel_uuids=[]) db.instance_destroy(self.context, instance['uuid']) @@ -831,7 +900,7 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase): instance_save.assert_called_once_with(expected_task_state=[None]) rpcapi_shelve_offload_instance.assert_called_once_with( self.context, instance=fake_instance, - clean_shutdown=clean_shutdown) + clean_shutdown=clean_shutdown, accel_uuids=[]) record.assert_called_once_with(self.context, fake_instance, instance_actions.SHELVE_OFFLOAD) diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 9dd46a493280..485ae166e0e4 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -1340,6 +1340,7 @@ class _BaseTaskTestCase(object): system_metadata['shelved_host'] = 'fake-mini' fake_spec = fake_request_spec.fake_spec_obj() + fake_spec.flavor = instance.flavor # FIXME(sbauza): Modify the fake RequestSpec object to either add a # non-empty SchedulerRetries object or nullify the field fake_spec.retry = None @@ -1409,6 +1410,7 @@ class _BaseTaskTestCase(object): unshelve_instance.assert_called_once_with( self.context, mock.ANY, host['host'], test.MatchType(objects.RequestSpec), image=mock.ANY, + accel_uuids=[], filter_properties=filter_properties, node=host['nodename'] ) @@ -1450,6 +1452,7 @@ class _BaseTaskTestCase(object): instance.task_state = task_states.UNSHELVING # 'shelved_image_id' is None for volumebacked instance instance.system_metadata['shelved_image_id'] = None + self.request_spec.flavor = instance.flavor with test.nested( mock.patch.object(self.conductor_manager, @@ -1487,6 +1490,7 @@ class _BaseTaskTestCase(object): 
mock_im.return_value = objects.InstanceMapping( cell_mapping=cell_mapping) instance = self._create_fake_instance_obj() + fake_spec.flavor = instance.flavor instance.vm_state = vm_states.SHELVED_OFFLOADED instance.save() system_metadata = instance.system_metadata @@ -1506,6 +1510,7 @@ class _BaseTaskTestCase(object): filter_properties=dict( # populate_filter_properties adds limits={} fake_spec.to_legacy_filter_properties_dict(), limits={}), + accel_uuids=[], node='fake_node') def test_unshelve_instance_schedule_and_rebuild_novalid_host(self): @@ -1513,6 +1518,7 @@ class _BaseTaskTestCase(object): instance.vm_state = vm_states.SHELVED_OFFLOADED instance.save() system_metadata = instance.system_metadata + self.request_spec.flavor = instance.flavor def fake_schedule_instances(context, request_spec, *instances, **kwargs): @@ -1559,6 +1565,7 @@ class _BaseTaskTestCase(object): system_metadata['shelved_at'] = timeutils.utcnow() system_metadata['shelved_image_id'] = 'fake_image_id' system_metadata['shelved_host'] = 'fake-mini' + self.request_spec.flavor = instance.flavor self.assertRaises(messaging.MessagingTimeout, self.conductor_manager.unshelve_instance, self.context, instance, self.request_spec) @@ -1582,6 +1589,7 @@ class _BaseTaskTestCase(object): cell_mapping=objects.CellMapping.get_by_uuid(self.context, uuids.cell1)) instance = self._create_fake_instance_obj() + fake_spec.flavor = instance.flavor instance.vm_state = vm_states.SHELVED_OFFLOADED instance.save() system_metadata = instance.system_metadata @@ -1594,7 +1602,7 @@ class _BaseTaskTestCase(object): self.context, fake_spec, [instance.uuid], return_alternates=False) mock_unshelve.assert_called_once_with( self.context, instance, 'fake_host', fake_spec, image=None, - filter_properties={'limits': {}}, node='fake_node') + accel_uuids=[], filter_properties={'limits': {}}, node='fake_node') @mock.patch('nova.scheduler.utils.fill_provider_mapping') @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance') @@ -1607,6 +1615,7 @@ class _BaseTaskTestCase(object): instance.save() request_spec = objects.RequestSpec() + request_spec.flavor = instance.flavor selection = objects.Selection( service_host='fake_host', @@ -1629,6 +1638,87 @@ class _BaseTaskTestCase(object): mock_fill_provider_mapping.assert_called_once_with( request_spec, selection) + @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arq_for_instance', ) + @mock.patch('nova.scheduler.utils.fill_provider_mapping') + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_schedule_instances', ) + def test_unshelve_instance_resource_request_with_device_profile( + self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping, + mock_get_arq, mock_get_dp): + instance = self._create_fake_instance_obj() + instance.vm_state = vm_states.SHELVED_OFFLOADED + instance.save() + + request_spec = objects.RequestSpec() + request_spec.flavor = instance.flavor + request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'} + + selection = objects.Selection( + service_host='fake_host', + nodename='fake_node', + limits=None) + mock_schedule.return_value = [[selection]] + + mock_get_res_req.return_value = [] + + dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'), + objects.RequestGroup(requester_id='deviceprofile3')] + mock_get_dp.return_value = dp_groups + mock_get_arq.return_value = 
[] + + self.conductor_manager.unshelve_instance( + self.context, instance, request_spec) + + self.assertEqual(dp_groups, request_spec.requested_resources) + + mock_get_res_req.assert_called_once_with(self.context, instance.uuid) + mock_schedule.assert_called_once_with( + self.context, request_spec, [instance.uuid], + return_alternates=False) + mock_fill_provider_mapping.assert_called_once_with( + request_spec, selection) + + @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid') + @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arq_for_instance', ) + @mock.patch('nova.scheduler.utils.fill_provider_mapping') + @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_schedule_instances', ) + def test_unshelve_instance_resource_request_with_arq_bind_fail( + self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping, + mock_get_arqs, mock_get_dp, mock_del_arqs): + instance = self._create_fake_instance_obj() + instance.vm_state = vm_states.SHELVED_OFFLOADED + instance.save() + + request_spec = objects.RequestSpec() + request_spec.flavor = instance.flavor + request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'} + + selection = objects.Selection( + service_host='fake_host', + nodename='fake_node', + limits=None) + mock_schedule.return_value = [[selection]] + dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'), + objects.RequestGroup(requester_id='deviceprofile3')] + mock_get_dp.return_value = dp_groups + mock_get_res_req.return_value = [] + arqs = ["fake-arq-uuid"] + mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed( + arqs=arqs, msg='') + ex = self.assertRaises(exc.AcceleratorRequestBindingFailed, + self.conductor_manager.unshelve_instance, + context=self.context, instance=instance, + request_spec=request_spec) + self.assertIn('Failed to bind accelerator requests', ex.message) + mock_del_arqs.assert_called_with(arqs) + def test_rebuild_instance(self): inst_obj = self._create_fake_instance_obj() rebuild_args, compute_args = self._prepare_rebuild_args( @@ -1698,6 +1788,57 @@ class _BaseTaskTestCase(object): self.context, inst_obj, 'thebesthost', action='rebuild_scheduled', source='nova-conductor') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' 
+ 'get_arq_uuids_for_instance') + @mock.patch('nova.compute.utils.notify_about_instance_rebuild') + def test_rebuild_instance_with_device_profile(self, mock_notify, + mock_get_arqs): + inst_obj = self._create_fake_instance_obj() + inst_obj.flavor.extra_specs = {'accel:device_profile': 'mydp'} + inst_obj.host = 'noselect' + expected_host = 'thebesthost' + expected_node = 'thebestnode' + expected_limits = None + fake_selection = objects.Selection(service_host=expected_host, + nodename=expected_node, limits=None) + rebuild_args, compute_args = self._prepare_rebuild_args( + {'host': None, 'node': expected_node, 'limits': expected_limits}) + fake_spec = objects.RequestSpec() + rebuild_args['request_spec'] = fake_spec + inst_uuids = [inst_obj.uuid] + arqs = ['fake-arq-uuid'] + mock_get_arqs.return_value = arqs + with test.nested( + mock.patch.object(self.conductor_manager.compute_rpcapi, + 'rebuild_instance'), + mock.patch.object(scheduler_utils, 'setup_instance_group', + return_value=False), + mock.patch.object(self.conductor_manager.query_client, + 'select_destinations', + return_value=[[fake_selection]]) + ) as (rebuild_mock, sig_mock, select_dest_mock): + self.conductor_manager.rebuild_instance(context=self.context, + instance=inst_obj, + **rebuild_args) + self.ensure_network_metadata_mock.assert_called_once_with( + inst_obj) + self.heal_reqspec_is_bfv_mock.assert_called_once_with( + self.context, fake_spec, inst_obj) + select_dest_mock.assert_called_once_with(self.context, fake_spec, + inst_uuids, return_objects=True, return_alternates=False) + compute_args['host'] = expected_host + compute_args['request_spec'] = fake_spec + compute_args['accel_uuids'] = arqs + rebuild_mock.assert_called_once_with(self.context, + instance=inst_obj, + **compute_args) + self.assertEqual(inst_obj.project_id, fake_spec.project_id) + self.assertEqual('compute.instance.rebuild.scheduled', + fake_notifier.NOTIFICATIONS[0].event_type) + mock_notify.assert_called_once_with( + self.context, inst_obj, 'thebesthost', action='rebuild_scheduled', + source='nova-conductor') + def test_rebuild_instance_with_scheduler_no_host(self): inst_obj = self._create_fake_instance_obj() inst_obj.host = 'noselect' @@ -1896,6 +2037,151 @@ class _BaseTaskTestCase(object): self.context, inst_obj, 'thebesthost', action='rebuild_scheduled', source='nova-conductor') + @mock.patch( + 'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arq_for_instance', ) + @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups') + @mock.patch('nova.compute.utils.notify_about_instance_rebuild') + def test_evacuate_instance_with_request_spec_device_profile( + self, mock_notify, mock_get_dp, mock_get_arqs, mock_del_arqs): + inst_obj = self._create_fake_instance_obj() + inst_obj.host = 'noselect' + expected_host = 'thebesthost' + expected_node = 'thebestnode' + expected_limits = None + fake_selection = objects.Selection(service_host=expected_host, + nodename=expected_node, limits=None) + fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host]) + fake_spec.flavor = inst_obj.flavor + fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'} + rebuild_args, compute_args = self._prepare_rebuild_args( + {'host': None, 'node': expected_node, 'limits': expected_limits, + 'request_spec': fake_spec, 'recreate': True}) + dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'), + objects.RequestGroup(requester_id='deviceprofile3')] + 
mock_get_dp.return_value = dp_groups + arqs = ['fake-arq-uuid'] + mock_get_arqs.return_value = arqs + with test.nested( + mock.patch.object(self.conductor_manager.compute_rpcapi, + 'rebuild_instance'), + mock.patch.object(scheduler_utils, 'setup_instance_group', + return_value=False), + mock.patch.object(self.conductor_manager.query_client, + 'select_destinations', + return_value=[[fake_selection]]), + mock.patch.object(fake_spec, 'reset_forced_destinations'), + mock.patch('nova.scheduler.utils.fill_provider_mapping'), + mock.patch('nova.network.neutron.API.' + 'get_requested_resource_for_instance', + return_value=[]) + ) as (rebuild_mock, sig_mock, select_dest_mock, reset_fd, + fill_rp_mapping_mock, get_req_res_mock): + self.conductor_manager.rebuild_instance(context=self.context, + instance=inst_obj, + **rebuild_args) + reset_fd.assert_called_once_with() + # The RequestSpec.ignore_hosts field should be overwritten. + self.assertEqual([inst_obj.host], fake_spec.ignore_hosts) + # The RequestSpec.requested_destination.cell field should be set. + self.assertIn('requested_destination', fake_spec) + self.assertIn('cell', fake_spec.requested_destination) + self.assertIsNotNone(fake_spec.requested_destination.cell) + select_dest_mock.assert_called_once_with(self.context, + fake_spec, [inst_obj.uuid], return_objects=True, + return_alternates=False) + compute_args['host'] = expected_host + compute_args['request_spec'] = fake_spec + compute_args['accel_uuids'] = arqs + rebuild_mock.assert_called_once_with(self.context, + instance=inst_obj, + **compute_args) + get_req_res_mock.assert_called_once_with( + self.context, inst_obj.uuid) + fill_rp_mapping_mock.assert_called_once_with( + fake_spec, fake_selection) + mock_del_arqs.assert_called_once() + + self.assertEqual('compute.instance.rebuild.scheduled', + fake_notifier.NOTIFICATIONS[0].event_type) + mock_notify.assert_called_once_with( + self.context, inst_obj, 'thebesthost', action='rebuild_scheduled', + source='nova-conductor') + + @mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid') + @mock.patch( + 'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arq_for_instance', ) + @mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups') + @mock.patch('nova.compute.utils.notify_about_instance_rebuild') + def test_evacuate_instance_with_request_spec_arq_bind_fail( + self, mock_notify, mock_get_dp, mock_get_arqs, + mock_del_arqs_instance, mock_del_arqs): + inst_obj = self._create_fake_instance_obj() + inst_obj.host = 'noselect' + expected_host = 'thebesthost' + expected_node = 'thebestnode' + expected_limits = None + fake_selection = objects.Selection(service_host=expected_host, + nodename=expected_node, limits=None) + fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host]) + fake_spec.flavor = inst_obj.flavor + fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'} + dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'), + objects.RequestGroup(requester_id='deviceprofile3')] + mock_get_dp.return_value = dp_groups + fake_spec.requested_resources = dp_groups + rebuild_args, compute_args = self._prepare_rebuild_args( + {'host': None, 'node': expected_node, 'limits': expected_limits, + 'request_spec': fake_spec, 'recreate': True}) + arqs = ['fake-arq-uuid'] + mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed( + arqs=arqs, msg='') + with test.nested( + mock.patch.object(scheduler_utils, 
'setup_instance_group', + return_value=False), + mock.patch.object(self.conductor_manager.query_client, + 'select_destinations', + return_value=[[fake_selection]]), + mock.patch.object(fake_spec, 'reset_forced_destinations'), + mock.patch('nova.scheduler.utils.fill_provider_mapping'), + mock.patch('nova.network.neutron.API.' + 'get_requested_resource_for_instance', + return_value=[]) + ) as (sig_mock, select_dest_mock, reset_fd, + fill_rp_mapping_mock, get_req_res_mock): + ex = self.assertRaises(exc.AcceleratorRequestBindingFailed, + self.conductor_manager.rebuild_instance, + context=self.context, instance=inst_obj, + **rebuild_args) + reset_fd.assert_called_once_with() + # The RequestSpec.ignore_hosts field should be overwritten. + self.assertEqual([inst_obj.host], fake_spec.ignore_hosts) + # The RequestSpec.requested_destination.cell field should be set. + self.assertIn('requested_destination', fake_spec) + self.assertIn('cell', fake_spec.requested_destination) + self.assertIsNotNone(fake_spec.requested_destination.cell) + select_dest_mock.assert_called_once_with(self.context, + fake_spec, [inst_obj.uuid], return_objects=True, + return_alternates=False) + compute_args['host'] = expected_host + compute_args['request_spec'] = fake_spec + get_req_res_mock.assert_called_once_with( + self.context, inst_obj.uuid) + fill_rp_mapping_mock.assert_called_once_with( + fake_spec, fake_selection) + self.assertIn('Failed to bind accelerator requests', ex.message) + mock_del_arqs.assert_called_with(arqs) + mock_del_arqs_instance.assert_called_once() + + self.assertEqual('compute.instance.rebuild.scheduled', + fake_notifier.NOTIFICATIONS[0].event_type) + mock_notify.assert_called_once_with( + self.context, inst_obj, 'thebesthost', action='rebuild_scheduled', + source='nova-conductor') + @mock.patch( 'nova.conductor.tasks.cross_cell_migrate.ConfirmResizeTask.execute') def test_confirm_snapshot_based_resize(self, mock_execute): diff --git a/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml b/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml new file mode 100644 index 000000000000..7f318117607d --- /dev/null +++ b/releasenotes/notes/bp-cyborg-rebuild-and-evacuate-97bba59988b8b072.yaml @@ -0,0 +1,12 @@ +--- +features: + - | + Add Cyborg shelve/unshelve support. + + After shelve, the ARQs are still kept bound to the instance. + + After shelve offload, the ARQs of the instance will be freed + in Cyborg. + + During unshelve, the ARQs will be reallocated and bound to + the instance if needed.