Merge "Cyborg shelve/unshelve support"

Zuul 2021-01-16 12:13:37 +00:00 committed by Gerrit Code Review
commit 941f462463
15 changed files with 827 additions and 89 deletions

View File

@ -43,12 +43,12 @@ The lists of supported and unsupported operations are as below:
* Rescue and unrescue.
* Rebuild.
* Evacuate.
* Shelve and unshelve.
* Unsupported operations
* Resize.
* Suspend and resume.
* Shelve and unshelve.
* Cold migration.
* Live migration.
@ -56,6 +56,10 @@ The lists of supported and unsupported operations are as below:
Added support for rebuild and evacuate operations.
.. versionchanged:: 23.0.0 (Wallaby)
Added support for shelve and unshelve operations.
Some operations, such as lock and unlock, work as they are effectively
no-ops for accelerators.
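For operators the workflow itself is unchanged; Nova handles the Cyborg ARQs
behind the scenes. A minimal sketch with openstacksdk, assuming a cloud entry
named ``devstack``, a server ``accel-vm1`` booted from a flavor that carries
an ``accel:device_profile`` extra spec, and the default
``shelved_offload_time`` of 0 (the cloud and server names are placeholders)::

    import openstack

    conn = openstack.connect(cloud='devstack')
    server = conn.compute.find_server('accel-vm1')

    # Shelving is now allowed for accelerator-backed servers; the bound
    # ARQs are kept until the instance is offloaded.
    conn.compute.shelve_server(server)
    conn.compute.wait_for_server(server, status='SHELVED_OFFLOADED',
                                 wait=300)

    # Unshelving re-creates and binds the ARQs before the instance is
    # spawned again.
    conn.compute.unshelve_server(server)
    conn.compute.wait_for_server(server, status='ACTIVE', wait=300)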

View File

@ -37,7 +37,7 @@ class ShelveController(wsgi.Controller):
self.network_api = neutron.API()
@wsgi.response(202)
@wsgi.expected_errors((404, 409))
@wsgi.expected_errors((404, 403, 409))
@wsgi.action('shelve')
def _shelve(self, req, id, body):
"""Move an instance into shelved mode."""
@ -55,6 +55,8 @@ class ShelveController(wsgi.Controller):
exception.UnexpectedTaskStateError,
) as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.ForbiddenWithAccelerators as e:
raise exc.HTTPForbidden(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
'shelve', id)

View File

@ -4158,7 +4158,7 @@ class API(base.Base):
return allow_same_host
@reject_vtpm_instances(instance_actions.SHELVE)
@block_accelerators()
@block_accelerators(until_service=54)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
vm_states.PAUSED, vm_states.SUSPENDED])
@ -4181,16 +4181,23 @@ class API(base.Base):
self._record_action_start(context, instance, instance_actions.SHELVE)
accel_uuids = []
if instance.flavor.extra_specs.get('accel:device_profile'):
cyclient = cyborg.get_client(context)
accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
if not compute_utils.is_volume_backed_instance(context, instance):
name = '%s-shelved' % instance.display_name
image_meta = compute_utils.create_image(
context, instance, name, 'snapshot', self.image_api)
image_id = image_meta['id']
self.compute_rpcapi.shelve_instance(context, instance=instance,
image_id=image_id, clean_shutdown=clean_shutdown)
image_id=image_id, clean_shutdown=clean_shutdown,
accel_uuids=accel_uuids)
else:
self.compute_rpcapi.shelve_offload_instance(context,
instance=instance, clean_shutdown=clean_shutdown)
self.compute_rpcapi.shelve_offload_instance(
context, instance=instance, clean_shutdown=clean_shutdown,
accel_uuids=accel_uuids)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.SHELVED])
@ -4202,8 +4209,14 @@ class API(base.Base):
self._record_action_start(context, instance,
instance_actions.SHELVE_OFFLOAD)
self.compute_rpcapi.shelve_offload_instance(context, instance=instance,
clean_shutdown=clean_shutdown)
accel_uuids = []
if instance.flavor.extra_specs.get('accel:device_profile'):
cyclient = cyborg.get_client(context)
accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
self.compute_rpcapi.shelve_offload_instance(
context, instance=instance,
clean_shutdown=clean_shutdown, accel_uuids=accel_uuids)
def _validate_unshelve_az(self, context, instance, availability_zone):
"""Verify the specified availability_zone during unshelve.

View File

@ -528,7 +528,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
target = messaging.Target(version='5.12')
target = messaging.Target(version='5.13')
def __init__(self, compute_driver=None, *args, **kwargs):
"""Load configuration options and connect to the hypervisor."""
@ -6326,7 +6326,7 @@ class ComputeManager(manager.Manager):
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def shelve_instance(self, context, instance, image_id,
clean_shutdown):
clean_shutdown, accel_uuids=None):
"""Shelve an instance.
This should be used when you want to take a snapshot of the instance.
@ -6337,15 +6337,17 @@ class ComputeManager(manager.Manager):
:param instance: an Instance object
:param image_id: an image id to snapshot to.
:param clean_shutdown: give the GuestOS a chance to stop
:param accel_uuids: the accelerator UUIDs for the instance
"""
@utils.synchronized(instance.uuid)
def do_shelve_instance():
self._shelve_instance(context, instance, image_id, clean_shutdown)
self._shelve_instance(context, instance, image_id, clean_shutdown,
accel_uuids)
do_shelve_instance()
def _shelve_instance(self, context, instance, image_id,
clean_shutdown):
clean_shutdown, accel_uuids=None):
LOG.info('Shelving', instance=instance)
offload = CONF.shelved_offload_time == 0
if offload:
@ -6400,14 +6402,16 @@ class ComputeManager(manager.Manager):
phase=fields.NotificationPhase.END, bdms=bdms)
if offload:
self._shelve_offload_instance(context, instance,
clean_shutdown=False, bdms=bdms)
self._shelve_offload_instance(
context, instance, clean_shutdown=False, bdms=bdms,
accel_uuids=accel_uuids)
@wrap_exception()
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def shelve_offload_instance(self, context, instance, clean_shutdown):
def shelve_offload_instance(self, context, instance, clean_shutdown,
accel_uuids=None):
"""Remove a shelved instance from the hypervisor.
This frees up those resources for use by other instances, but may lead
@ -6418,15 +6422,17 @@ class ComputeManager(manager.Manager):
:param context: request context
:param instance: nova.objects.instance.Instance
:param clean_shutdown: give the GuestOS a chance to stop
:param accel_uuids: the accelerator UUIDs for the instance
"""
@utils.synchronized(instance.uuid)
def do_shelve_offload_instance():
self._shelve_offload_instance(context, instance, clean_shutdown)
self._shelve_offload_instance(context, instance, clean_shutdown,
accel_uuids=accel_uuids)
do_shelve_offload_instance()
def _shelve_offload_instance(self, context, instance, clean_shutdown,
bdms=None):
bdms=None, accel_uuids=None):
LOG.info('Shelve offloading', instance=instance)
if bdms is None:
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
@ -6451,6 +6457,12 @@ class ComputeManager(manager.Manager):
# terminate all the connections with the volume server and the host
self._terminate_volume_connections(context, instance, bdms)
# NOTE(brinzhang): Free up the accelerator resources occupied
# in the Cyborg service.
if accel_uuids:
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_for_instance(instance.uuid)
# Free up the resource allocations in the placement service.
# This should happen *before* the vm_state is changed to
# SHELVED_OFFLOADED in case client-side code is polling the API to
@ -6490,8 +6502,9 @@ class ComputeManager(manager.Manager):
@reverts_task_state
@wrap_instance_event(prefix='compute')
@wrap_instance_fault
def unshelve_instance(self, context, instance, image,
filter_properties, node, request_spec=None):
def unshelve_instance(
self, context, instance, image, filter_properties, node,
request_spec=None, accel_uuids=None):
"""Unshelve the instance.
:param context: request context
@ -6502,6 +6515,7 @@ class ComputeManager(manager.Manager):
:param node: target compute node
:param request_spec: the RequestSpec object used to schedule the
instance
:param accel_uuids: the accelerator UUIDs for the instance
"""
if filter_properties is None:
filter_properties = {}
@ -6510,7 +6524,7 @@ class ComputeManager(manager.Manager):
def do_unshelve_instance():
self._unshelve_instance(
context, instance, image, filter_properties, node,
request_spec)
request_spec, accel_uuids)
do_unshelve_instance()
def _unshelve_instance_key_scrub(self, instance):
@ -6527,7 +6541,7 @@ class ComputeManager(manager.Manager):
instance.update(keys)
def _unshelve_instance(self, context, instance, image, filter_properties,
node, request_spec):
node, request_spec, accel_uuids):
LOG.info('Unshelving', instance=instance)
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
@ -6574,6 +6588,19 @@ class ComputeManager(manager.Manager):
provider_mappings=provider_mappings)
network_info = self.network_api.get_instance_nw_info(
context, instance)
accel_info = []
if accel_uuids:
try:
accel_info = self._get_bound_arq_resources(
context, instance, accel_uuids)
except (Exception, eventlet.timeout.Timeout) as exc:
LOG.exception('Failure getting accelerator requests '
'with the exception: %s', exc,
instance=instance)
self._build_resources_cleanup(instance, network_info)
raise
with self.rt.instance_claim(context, instance, node, allocations,
limits):
self.driver.spawn(context, instance, image_meta,
@ -6581,7 +6608,8 @@ class ComputeManager(manager.Manager):
admin_password=None,
allocations=allocations,
network_info=network_info,
block_device_info=block_device_info)
block_device_info=block_device_info,
accel_info=accel_info)
except Exception:
with excutils.save_and_reraise_exception(logger=LOG):
LOG.exception('Instance failed to spawn',

View File

@ -380,6 +380,9 @@ class ComputeAPI(object):
build_and_run_instance()
* 5.12 - Add accel_uuids (accelerator requests) parameter to
rebuild_instance()
* 5.13 - Add accel_uuids (accelerator requests) parameter to
shelve_instance(), shelve_offload_instance() and
unshelve_instance()
'''
VERSION_ALIASES = {
@ -1350,40 +1353,65 @@ class ComputeAPI(object):
cctxt.cast(ctxt, 'restore_instance', instance=instance)
def shelve_instance(self, ctxt, instance, image_id=None,
clean_shutdown=True):
version = '5.0'
cctxt = self.router.client(ctxt).prepare(
clean_shutdown=True, accel_uuids=None):
msg_kwargs = {
'instance': instance,
'image_id': image_id,
'clean_shutdown': clean_shutdown,
'accel_uuids': accel_uuids,
}
client = self.router.client(ctxt)
version = '5.13'
if not client.can_send_version(version):
if accel_uuids:
LOG.error("Shelve with accelerators is not supported as "
"RPC version is too old.")
raise exception.ForbiddenWithAccelerators()
else:
msg_kwargs.pop('accel_uuids')
version = '5.0'
cctxt = client.prepare(
server=_compute_host(None, instance), version=version)
cctxt.cast(ctxt, 'shelve_instance', instance=instance,
image_id=image_id, clean_shutdown=clean_shutdown)
cctxt.cast(ctxt, 'shelve_instance', **msg_kwargs)
def shelve_offload_instance(self, ctxt, instance,
clean_shutdown=True):
version = '5.0'
cctxt = self.router.client(ctxt).prepare(
clean_shutdown=True, accel_uuids=None):
msg_kwargs = {
'instance': instance,
'clean_shutdown': clean_shutdown,
'accel_uuids': accel_uuids,
}
client = self.router.client(ctxt)
version = '5.13'
if not client.can_send_version(version):
msg_kwargs.pop('accel_uuids')
version = '5.0'
cctxt = client.prepare(
server=_compute_host(None, instance), version=version)
cctxt.cast(ctxt, 'shelve_offload_instance', instance=instance,
clean_shutdown=clean_shutdown)
cctxt.cast(ctxt, 'shelve_offload_instance', **msg_kwargs)
def unshelve_instance(self, ctxt, instance, host, request_spec, image=None,
filter_properties=None, node=None):
version = '5.2'
filter_properties=None, node=None, accel_uuids=None):
version = '5.13'
msg_kwargs = {
'instance': instance,
'image': image,
'filter_properties': filter_properties,
'node': node,
'request_spec': request_spec,
'accel_uuids': accel_uuids,
}
client = self.router.client(ctxt)
if not client.can_send_version(version):
msg_kwargs.pop('accel_uuids')
version = '5.2'
if not client.can_send_version(version):
msg_kwargs.pop('request_spec')
version = '5.0'
cctxt = client.prepare(
server=host, version=version)
cctxt = client.prepare(server=host, version=version)
cctxt.cast(ctxt, 'unshelve_instance', **msg_kwargs)
def volume_snapshot_create(self, ctxt, instance, volume_id,

View File

@ -838,7 +838,7 @@ class ComputeTaskManager(base.Base):
try:
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, local_reqspec)
context, instance, host.nodename, local_reqspec)
except Exception as exc:
LOG.exception('Failed to reschedule. Reason: %s', exc)
self._cleanup_when_reschedule_fails(
@ -858,7 +858,7 @@ class ComputeTaskManager(base.Base):
limits=host.limits, host_list=host_list,
accel_uuids=accel_uuids)
def _create_and_bind_arq_for_instance(self, context, instance, host,
def _create_and_bind_arq_for_instance(self, context, instance, hostname,
request_spec):
try:
resource_provider_mapping = (
@ -867,7 +867,7 @@ class ComputeTaskManager(base.Base):
# http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host.nodename, resource_provider_mapping)
hostname, resource_provider_mapping)
except exception.AcceleratorRequestBindingFailed as exc:
# If anything failed here we need to cleanup and bail out.
cyclient = cyborg.get_client(context)
@ -963,14 +963,19 @@ class ComputeTaskManager(base.Base):
filter_properties = request_spec.\
to_legacy_filter_properties_dict()
port_res_req = (
external_resources = (
self.network_api.get_requested_resource_for_instance(
context, instance.uuid))
# NOTE(gibi): When cyborg or other module wants to handle
# similar non-nova resources then here we have to collect
# all the external resource requests in a single list and
extra_specs = request_spec.flavor.extra_specs
device_profile = extra_specs.get('accel:device_profile')
external_resources.extend(
cyborg.get_device_profile_request_groups(
context, device_profile) if device_profile else [])
# NOTE(gibi): When other modules want to handle similar
# non-nova resources then here we have to collect all
# the external resource requests in a single list and
# add them to the RequestSpec.
request_spec.requested_resources = port_res_req
request_spec.requested_resources = external_resources
# NOTE(cfriesen): Ensure that we restrict the scheduler to
# the cell specified by the instance mapping.
@ -996,9 +1001,14 @@ class ComputeTaskManager(base.Base):
scheduler_utils.fill_provider_mapping(
request_spec, selection)
# NOTE(brinzhang): For the unshelve operation we should
# re-create and bind the ARQs for the instance.
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, node, request_spec)
self.compute_rpcapi.unshelve_instance(
context, instance, host, request_spec, image=image,
filter_properties=filter_properties, node=node)
filter_properties=filter_properties, node=node,
accel_uuids=accel_uuids)
except (exception.NoValidHost,
exception.UnsupportedPolicyException):
instance.task_state = None
@ -1006,7 +1016,11 @@ class ComputeTaskManager(base.Base):
LOG.warning("No valid host found for unshelve instance",
instance=instance)
return
except Exception:
except Exception as exc:
if isinstance(exc, exception.AcceleratorRequestBindingFailed):
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
LOG.exception('Failed to unshelve. Reason: %s', exc)
with excutils.save_and_reraise_exception():
instance.task_state = None
instance.save()
@ -1225,12 +1239,24 @@ class ComputeTaskManager(base.Base):
instance.availability_zone = (
availability_zones.get_host_availability_zone(
context, host))
accel_uuids = []
try:
accel_uuids = self._rebuild_cyborg_arq(
context, instance, host, request_spec, evacuate)
except exception.AcceleratorRequestBindingFailed as exc:
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
if instance.flavor.extra_specs.get('accel:device_profile'):
cyclient = cyborg.get_client(context)
if evacuate:
# NOTE(brinzhang): For the evacuate operation we should
# delete the bound ARQs, then re-create and bind the
# ARQs for the instance.
cyclient.delete_arqs_for_instance(instance.uuid)
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, node, request_spec)
else:
accel_uuids = cyclient.get_arq_uuids_for_instance(
instance)
except Exception as exc:
if isinstance(exc, exception.AcceleratorRequestBindingFailed):
cyclient = cyborg.get_client(context)
cyclient.delete_arqs_by_uuid(exc.arqs)
LOG.exception('Failed to rebuild. Reason: %s', exc)
raise exc
@ -1253,22 +1279,6 @@ class ComputeTaskManager(base.Base):
request_spec=request_spec,
accel_uuids=accel_uuids)
def _rebuild_cyborg_arq(
self, context, instance, host, request_spec, evacuate):
dp_name = instance.flavor.extra_specs.get('accel:device_profile')
if not dp_name:
return []
cyclient = cyborg.get_client(context)
if not evacuate:
return cyclient.get_arq_uuids_for_instance(instance)
cyclient.delete_arqs_for_instance(instance.uuid)
resource_provider_mapping = request_spec.get_request_group_mapping()
return self._create_and_bind_arqs(
context, instance.uuid, instance.flavor.extra_specs,
host, resource_provider_mapping)
def _validate_image_traits_for_rebuild(self, context, instance, image_ref):
"""Validates that the traits specified in the image can be satisfied
by the providers of the current allocations for the instance during
@ -1668,7 +1678,7 @@ class ComputeTaskManager(base.Base):
try:
accel_uuids = self._create_and_bind_arq_for_instance(
context, instance, host, request_spec)
context, instance, host.nodename, request_spec)
except Exception as exc:
with excutils.save_and_reraise_exception():
self._cleanup_build_artifacts(

View File

@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 53
SERVICE_VERSION = 54
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@ -190,6 +190,10 @@ SERVICE_VERSION_HISTORY = (
# Version 53: Compute RPC v5.12:
# Add accel_uuids (accelerator requests) param to rebuild_instance
{'compute_rpc': '5.12'},
# Version 54: Compute RPC v5.13:
# Add accel_uuids (accelerator requests) param to shelve_instance,
# shelve_offload_instance and unshelve_instance
{'compute_rpc': '5.13'},
)
# This is used to raise an error at service startup if older than N-1 computes

View File

@ -3095,7 +3095,10 @@ class CyborgFixture(fixtures.Fixture):
This function uses bindings indexed by instance UUID to
populate the bound ARQ templates in CyborgFixture.bound_arq_list.
"""
arq_host_rp_list = CyborgFixture.bindings_by_instance[instance_uuid]
arq_host_rp_list = CyborgFixture.bindings_by_instance.get(
instance_uuid)
if not arq_host_rp_list:
return []
# The above looks like:
# [{'hostname': $hostname,
# 'device_rp_uuid': $device_rp_uuid,
@ -3117,7 +3120,7 @@ class CyborgFixture(fixtures.Fixture):
@staticmethod
def fake_delete_arqs_for_instance(instance_uuid):
return None
CyborgFixture.bindings_by_instance.pop(instance_uuid, None)
def setUp(self):
super(CyborgFixture, self).setUp()

View File

@ -486,9 +486,17 @@ class InstanceHelperMixin:
self.api.post_server_action(server['id'], {'shelve': {}})
return self._wait_for_state_change(server, expected_state)
def _shelve_offload_server(self, server):
"""Shelve offload a server."""
self.api.post_server_action(server['id'], {'shelveOffload': {}})
self._wait_for_server_parameter(server,
{'status': 'SHELVED_OFFLOADED',
'OS-EXT-SRV-ATTR:host': None,
'OS-EXT-AZ:availability_zone': ''})
def _unshelve_server(self, server, expected_state='ACTIVE'):
"""Unshelve a server."""
self.api.post_server_action(server['id'], {'unshelve': {}})
self.api.post_server_action(server['id'], {'unshelve': None})
return self._wait_for_state_change(server, expected_state)
def _evacuate_server(

View File

@ -33,6 +33,7 @@ from oslo_utils import timeutils
from nova.compute import api as compute_api
from nova.compute import instance_actions
from nova.compute import manager as compute_manager
from nova.compute import rpcapi as compute_rpcapi
from nova import context
from nova import exception
from nova.network import constants
@ -7894,6 +7895,29 @@ class AcceleratorServerOpsTest(AcceleratorServerBase):
self._wait_for_state_change(self.server, 'ACTIVE')
self._check_allocations_usage(self.server)
def test_shelve_and_unshelve_ok(self):
self.flags(shelved_offload_time=1)
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 1)
self._shelve_server(self.server, 'SHELVED')
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 1)
self._unshelve_server(self.server)
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 1)
def test_shelve_offload_and_unshelve_ok(self):
self.flags(shelved_offload_time=-1)
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 1)
self._shelve_server(self.server, 'SHELVED')
self._shelve_offload_server(self.server)
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 0)
self._unshelve_server(self.server)
arqs = self.cyborg.fake_get_arqs_for_instance(self.server['id'])
self.assertEqual(len(arqs), 1)
def test_resize_fails(self):
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action, self.server['id'],
@ -7953,6 +7977,81 @@ class AcceleratorServerOpsTest(AcceleratorServerBase):
self.assertEqual(403, ex.response.status_code)
self._check_allocations_usage(self.server)
@mock.patch.object(objects.service, 'get_minimum_version_all_cells')
def test_shelve_old_compute(self, old_compute_version):
"""Tests when the source compute service is too old to call
shelve so OpenStackApiException is raised.
"""
old_compute_version.return_value = 53
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action, self.server['id'],
{'shelve': {}})
self.assertEqual(403, ex.response.status_code)
self._check_allocations_usage(self.server)
def _test_shelve_instance_with_compute_rpc_pin(
self, version_cap, body=None):
self.flags(compute=version_cap, group='upgrade_levels')
self.flags(shelved_offload_time=-1)
self.api.post_server_action(self.server['id'], body)
self._wait_for_state_change(self.server, 'SHELVED')
def test_shelve_with_compute_rpc_pin_5_0(self):
self.flags(compute='5.0', group='upgrade_levels')
compute_rpcapi.reset_globals()
ex = self.assertRaises(
client.OpenStackApiException, self.api.post_server_action,
self.server['id'], {'shelve': {}})
self.assertEqual(403, ex.response.status_code)
def test_shelve_instance_5_13(self):
body = {'shelve': {}}
self._test_shelve_instance_with_compute_rpc_pin(
'5.13', body=body)
def _test_shelve_offload_instance_with_compute_rpc_pin(self, version_cap):
self.flags(compute=version_cap, group='upgrade_levels')
self.flags(shelved_offload_time=2)
self.api.post_server_action(self.server['id'], {'shelve': {}})
self._wait_for_state_change(self.server, 'SHELVED')
self.api.post_server_action(self.server['id'], {'shelveOffload': {}})
self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED')
def test_shelve_offload_instance_5_0(self):
self._test_shelve_offload_instance_with_compute_rpc_pin('5.0')
def test_shelve_offload_instance_5_13(self):
self._test_shelve_offload_instance_with_compute_rpc_pin('5.13')
def _test_unshelve_instance_with_compute_rpc_pin(
self, version_cap, body=None):
self.flags(compute=version_cap, group='upgrade_levels')
self.api.microversion = '2.87'
self.api.post_server_action(self.server['id'], {'shelve': {}})
self._wait_for_state_change(self.server, 'SHELVED_OFFLOADED')
self.api.post_server_action(self.server['id'], body)
self._wait_for_state_change(self.server, 'ACTIVE')
def test_unshelve_instance_5_0(self):
body = {'unshelve': None}
self._test_unshelve_instance_with_compute_rpc_pin(
'5.0', body=body)
def test_unshelve_instance_5_2(self):
body = {'unshelve': None}
self._test_unshelve_instance_with_compute_rpc_pin(
'5.2', body=body)
def test_unshelve_instance_5_13(self):
body = {'unshelve': None}
self._test_unshelve_instance_with_compute_rpc_pin(
'5.13', body=body)
class CrossCellResizeWithQoSPort(PortResourceRequestBasedSchedulingTestBase):
NUMBER_OF_CELLS = 2

View File

@ -7637,6 +7637,127 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
# myfunc was not called
self.assertEqual({}, args_info)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'get_arq_uuids_for_instance')
@mock.patch.object(compute_utils, 'create_image')
def test_get_arqs_in_shelve(self, mock_create_img, mock_get_arq_uuids):
"""Test get_arq_uuids_for_instance() was called if the
'accel:device_profile' exist in the flavor for shelve operation.
"""
extra_specs = {'accel:device_profile': 'mydp'}
flavor = self._create_flavor(extra_specs=extra_specs)
params = dict(display_name="vm1")
instance = self._create_instance_obj(params=params, flavor=flavor)
mock_create_img.return_value = dict(id='fake-image-id')
mock_get_arq_uuids.return_value = (
['983d1742-c8ca-4f8a-9f3c-3f54bbafaa7d'])
accel_uuids = mock_get_arq_uuids.return_value
with test.nested(
mock.patch('nova.objects.service.get_minimum_version_all_cells',
return_value=54),
mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False),
mock.patch.object(objects.Instance, 'save'),
mock.patch.object(self.compute_api, '_record_action_start'),
mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'),
) as (
mock_get_min, mock_volume_backend,
mock_instance_save, mock_record_action, mock_shelve_service
):
self.compute_api.shelve(self.context, instance)
mock_get_arq_uuids.assert_called_once()
mock_shelve_service.assert_called_once_with(
self.context, instance=instance, image_id='fake-image-id',
clean_shutdown=True, accel_uuids=accel_uuids)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'get_arq_uuids_for_instance')
@mock.patch.object(compute_utils, 'create_image')
def test_shelve_with_unsupport_accelerators(
self, mock_create_img, mock_get_arq_uuids):
"""Test get_arq_uuids_for_instance() was called if the
'accel:device_profile' exist in the flavor for shelve operation.
"""
extra_specs = {'accel:device_profile': 'mydp'}
flavor = self._create_flavor(extra_specs=extra_specs)
params = dict(display_name="vm1")
instance = self._create_instance_obj(params=params, flavor=flavor)
with test.nested(
mock.patch('nova.objects.service.get_minimum_version_all_cells',
return_value=54),
mock.patch('nova.compute.utils.is_volume_backed_instance',
return_value=False),
mock.patch.object(objects.Instance, 'save'),
mock.patch.object(self.compute_api, '_record_action_start'),
mock.patch.object(compute_rpcapi.ComputeAPI, 'shelve_instance'),
) as (
mock_get_min, mock_volume_backend,
mock_instance_save, mock_record_action, mock_shelve_service
):
mock_shelve_service.side_effect = (
exception.ForbiddenWithAccelerators(
oldest_supported_version=54,
scope=mock.ANY,
min_service_level=54,
oldest_supported_service=54))
self.assertRaises(
exception.ForbiddenWithAccelerators,
self.compute_api.shelve,
self.context,
instance)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'get_arq_uuids_for_instance')
@mock.patch('nova.objects.service.get_minimum_version_all_cells',
return_value=53)
def test_shelve_until_service_forbidden(self, mock_get_min,
mock_get_arq_uuuids):
"""Ensure a 'ForbiddenWithAccelerators' exception raises if any
compute service less than the version of 54.
"""
extra_specs = {'accel:device_profile': 'mydp'}
flavor = self._create_flavor(extra_specs=extra_specs)
instance = self._create_instance_obj(flavor=flavor)
self.assertRaisesRegex(exception.ForbiddenWithAccelerators,
'Forbidden with instances that have accelerators.',
self.compute_api.shelve, self.context, instance)
mock_get_arq_uuuids.assert_not_called()
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'get_arq_uuids_for_instance')
@mock.patch.object(compute_utils, 'create_image')
def test_get_arqs_in_shelve_offload(self, mock_create_img,
mock_get_arq_uuids):
"""Test get_arq_uuids_for_instance() was called if the
'accel:device_profile' exist in the flavor for
shelve offload operation.
"""
extra_specs = {'accel:device_profile': 'mydp'}
flavor = self._create_flavor(extra_specs=extra_specs)
params = {'vm_state': vm_states.SHELVED}
instance = self._create_instance_obj(params=params, flavor=flavor)
mock_create_img.return_value = dict(id='fake-image-id')
mock_get_arq_uuids.return_value = [uuids.fake]
accel_uuids = mock_get_arq_uuids.return_value
with test.nested(
mock.patch('nova.objects.service.get_minimum_version_all_cells',
return_value=54),
mock.patch.object(objects.Instance, 'save'),
mock.patch.object(self.compute_api, '_record_action_start'),
mock.patch.object(compute_rpcapi.ComputeAPI,
'shelve_offload_instance'),
) as (
mock_get_min, mock_instance_save,
mock_record_action, mock_shelve_service
):
self.compute_api.shelve_offload(self.context, instance)
mock_get_arq_uuids.assert_called_once()
mock_shelve_service.assert_called_once_with(
self.context, instance=instance, clean_shutdown=True,
accel_uuids=accel_uuids)
# TODO(huaqiang): Remove in Wallaby
@mock.patch('nova.objects.service.get_minimum_version_all_cells')
def test__check_compute_service_for_mixed_instance(self, mock_ver):

View File

@ -897,19 +897,67 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
def test_shelve_instance(self):
self._test_compute_api('shelve_instance', 'cast',
instance=self.fake_instance_obj, image_id='image_id',
clean_shutdown=True, version='5.0')
clean_shutdown=True, accel_uuids=None, version='5.13')
def test_shelve_instance_old_rpcapi(self):
# With rpcapi < 5.13, accel_uuids must be dropped in the client call.
ctxt = context.RequestContext('fake_user', 'fake_project')
compute_api = compute_rpcapi.ComputeAPI()
compute_api.router.client = mock.Mock()
mock_client = mock.MagicMock()
compute_api.router.client.return_value = mock_client
# Force can_send_version to False, so that 5.0 version is used.
mock_client.can_send_version.return_value = False
mock_cctx = mock.MagicMock()
mock_client.prepare.return_value = mock_cctx
compute_api.shelve_instance(
ctxt, instance=self.fake_instance_obj,
accel_uuids=[],
image_id='image_id', clean_shutdown=True)
mock_client.can_send_version.assert_called_once_with('5.13')
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
mock_cctx.cast.assert_called_with( # No accel_uuids
ctxt, 'shelve_instance',
instance=self.fake_instance_obj,
image_id='image_id', clean_shutdown=True)
def test_shelve_offload_instance(self):
self._test_compute_api('shelve_offload_instance', 'cast',
instance=self.fake_instance_obj,
clean_shutdown=True, version='5.0')
clean_shutdown=True, accel_uuids=None, version='5.13')
def test_shelve_offload_instance_old_rpcapi(self):
# With rpcapi < 5.13, accel_uuids must be dropped in the client call.
ctxt = context.RequestContext('fake_user', 'fake_project')
compute_api = compute_rpcapi.ComputeAPI()
compute_api.router.client = mock.Mock()
mock_client = mock.MagicMock()
compute_api.router.client.return_value = mock_client
# Force can_send_version to False, so that 5.0 version is used.
mock_client.can_send_version.return_value = False
mock_cctx = mock.MagicMock()
mock_client.prepare.return_value = mock_cctx
compute_api.shelve_offload_instance(
ctxt, instance=self.fake_instance_obj,
accel_uuids=['938af7f9-f136-4e5a-bdbe-3b6feab54311'],
clean_shutdown=True,)
mock_client.can_send_version.assert_called_once_with('5.13')
mock_client.prepare.assert_called_with(
server=self.fake_instance_obj.host, version='5.0')
mock_cctx.cast.assert_called_with( # No accel_uuids
ctxt, 'shelve_offload_instance',
instance=self.fake_instance_obj,
clean_shutdown=True)
def test_unshelve_instance(self):
self._test_compute_api('unshelve_instance', 'cast',
instance=self.fake_instance_obj, host='host', image='image',
filter_properties={'fakeprop': 'fakeval'}, node='node',
request_spec=self.fake_request_spec_obj,
version='5.2')
request_spec=self.fake_request_spec_obj, accel_uuids=None,
version='5.13')
def test_cache_image(self):
self._test_compute_api('cache_images', 'call',
@ -943,7 +991,10 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
host='host', request_spec=self.fake_request_spec_obj,
image='image')
mock_client.can_send_version.assert_called_once_with('5.2')
mock_client.can_send_version.assert_has_calls([
mock.call('5.13'),
mock.call('5.2'),
])
mock_client.prepare.assert_called_with(
server='host', version='5.0')
mock_cctx.cast.assert_called_with(

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import eventlet
import mock
from oslo_utils import fixture as utils_fixture
from oslo_utils.fixture import uuidsentinel as uuids
@ -63,7 +64,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_notify_instance_usage, mock_get_power_state,
mock_snapshot, mock_power_off, mock_terminate,
mock_get_bdms, clean_shutdown=True,
guest_power_state=power_state.RUNNING):
guest_power_state=power_state.RUNNING,
accel_uuids=None):
mock_get_power_state.return_value = 123
CONF.set_override('shelved_offload_time', shelved_offload_time)
@ -127,7 +129,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_save.side_effect = check_save
self.compute.shelve_instance(self.context, instance,
image_id=image_id,
clean_shutdown=clean_shutdown)
clean_shutdown=clean_shutdown,
accel_uuids=accel_uuids)
mock_notify.assert_has_calls([
mock.call(self.context, instance, 'fake-mini',
action='shelve', phase='start', bdms=fake_bdms),
@ -181,6 +184,18 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
def test_shelve_paused_instance(self):
self._shelve_instance(-1, guest_power_state=power_state.PAUSED)
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_shelve_and_offload_with_accel_uuids(self, mock_del_arqs):
self._shelve_instance(0, accel_uuids=[uuids.fake])
mock_del_arqs.assert_called_once()
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'delete_arqs_for_instance')
def test_shelve_and_with_accel_uuids(self, mock_del_arqs):
self._shelve_instance(-1, accel_uuids=[uuids.fake])
mock_del_arqs.assert_not_called()
@mock.patch.object(nova.virt.fake.SmallFakeDriver, 'power_off')
def test_shelve_offload(self, mock_power_off):
instance = self._shelve_offload()
@ -279,10 +294,12 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_get_power_state, mock_spawn,
mock_prep_block_device, mock_notify_instance_usage,
mock_notify_instance_action,
mock_get_bdms):
mock_get_bdms,
accel_uuids=None,
instance=None):
mock_bdms = mock.Mock()
mock_get_bdms.return_value = mock_bdms
instance = self._create_fake_instance_obj()
instance = instance or self._create_fake_instance_obj()
instance.task_state = task_states.UNSHELVING
instance.save()
image = {'id': uuids.image_id}
@ -340,7 +357,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.compute.unshelve_instance(
self.context, instance, image=image,
filter_properties=filter_properties,
node=node, request_spec=objects.RequestSpec())
node=node, request_spec=objects.RequestSpec(),
accel_uuids=accel_uuids)
mock_notify_instance_action.assert_has_calls([
mock.call(self.context, instance, 'fake-mini',
@ -362,7 +380,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
mock_spawn.assert_called_once_with(self.context, instance,
test.MatchType(objects.ImageMeta), injected_files=[],
admin_password=None, allocations={}, network_info=[],
block_device_info='fake_bdm')
block_device_info='fake_bdm', accel_info=mock.ANY)
self.mock_get_allocations.assert_called_once_with(self.context,
instance.uuid)
mock_get_power_state.assert_called_once_with(instance)
@ -380,6 +398,57 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
self.assertEqual(self.compute.host, instance.host)
self.assertFalse(instance.auto_disk_config)
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')
def test_unshelve_with_arqs(self, mock_get_arqs):
self.test_unshelve(accel_uuids=[uuids.fake])
mock_get_arqs.assert_called_once()
@mock.patch.object(nova.compute.manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_terminate_volume_connections')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_build_resources_cleanup')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')
def test_unshelve_with_arqs_failes_cleanup_resources(
self, mock_get_arqs, mock_cleanup_resources,
mock_bdms, mock_nil):
instance = self._create_fake_instance_obj()
mock_get_arqs.side_effect = exception.AcceleratorRequestOpFailed(
instance_uuid=instance.uuid, op='get',
msg='Failure getting accelerator requests.')
exc = self.assertRaises(exception.AcceleratorRequestOpFailed,
self.test_unshelve,
accel_uuids=[uuids.fake],
instance=instance)
self.assertIn('Failure getting accelerator requests.', str(exc))
mock_cleanup_resources.assert_called_once()
@mock.patch.object(nova.compute.manager.ComputeManager,
'_nil_out_instance_obj_host_and_node')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_terminate_volume_connections')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_build_resources_cleanup')
@mock.patch.object(nova.compute.manager.ComputeManager,
'_get_bound_arq_resources')
def test_unshelve_with_arqs_failes_cleanup_resources_timeout(
self, mock_get_arqs, mock_cleanup_resources,
mock_bdms, mock_nil):
instance = self._create_fake_instance_obj()
mock_get_arqs.side_effect = eventlet.timeout.Timeout()
self.assertRaises(eventlet.timeout.Timeout,
self.test_unshelve,
accel_uuids=[uuids.fake],
instance=instance)
mock_cleanup_resources.assert_called_once()
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch.object(nova.compute.resource_tracker.ResourceTracker,
@ -464,7 +533,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
{}, limits)
mock_spawn.assert_called_once_with(self.context, instance,
test.MatchType(objects.ImageMeta),
injected_files=[], admin_password=None,
injected_files=[], admin_password=None, accel_info=[],
allocations={}, network_info=[], block_device_info='fake_bdm')
self.mock_get_allocations.assert_called_once_with(self.context,
instance.uuid)
@ -551,7 +620,7 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
{}, limits)
mock_spawn.assert_called_once_with(
self.context, instance, test.MatchType(objects.ImageMeta),
injected_files=[], admin_password=None,
injected_files=[], admin_password=None, accel_info=[],
allocations={}, network_info=[], block_device_info='fake_bdm')
mock_terminate_volume_connections.assert_called_once_with(
self.context, instance, mock_bdms)
@ -767,11 +836,11 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase):
if boot_from_volume:
rpcapi_shelve_offload_instance.assert_called_once_with(
self.context, instance=instance,
clean_shutdown=clean_shutdown)
clean_shutdown=clean_shutdown, accel_uuids=[])
else:
rpcapi_shelve_instance.assert_called_once_with(
self.context, instance=instance, image_id='fake-image-id',
clean_shutdown=clean_shutdown)
clean_shutdown=clean_shutdown, accel_uuids=[])
db.instance_destroy(self.context, instance['uuid'])
@ -831,7 +900,7 @@ class ShelveComputeAPITestCase(test_compute.BaseTestCase):
instance_save.assert_called_once_with(expected_task_state=[None])
rpcapi_shelve_offload_instance.assert_called_once_with(
self.context, instance=fake_instance,
clean_shutdown=clean_shutdown)
clean_shutdown=clean_shutdown, accel_uuids=[])
record.assert_called_once_with(self.context, fake_instance,
instance_actions.SHELVE_OFFLOAD)

View File

@ -1340,6 +1340,7 @@ class _BaseTaskTestCase(object):
system_metadata['shelved_host'] = 'fake-mini'
fake_spec = fake_request_spec.fake_spec_obj()
fake_spec.flavor = instance.flavor
# FIXME(sbauza): Modify the fake RequestSpec object to either add a
# non-empty SchedulerRetries object or nullify the field
fake_spec.retry = None
@ -1409,6 +1410,7 @@ class _BaseTaskTestCase(object):
unshelve_instance.assert_called_once_with(
self.context, mock.ANY, host['host'],
test.MatchType(objects.RequestSpec), image=mock.ANY,
accel_uuids=[],
filter_properties=filter_properties, node=host['nodename']
)
@ -1450,6 +1452,7 @@ class _BaseTaskTestCase(object):
instance.task_state = task_states.UNSHELVING
# 'shelved_image_id' is None for volumebacked instance
instance.system_metadata['shelved_image_id'] = None
self.request_spec.flavor = instance.flavor
with test.nested(
mock.patch.object(self.conductor_manager,
@ -1487,6 +1490,7 @@ class _BaseTaskTestCase(object):
mock_im.return_value = objects.InstanceMapping(
cell_mapping=cell_mapping)
instance = self._create_fake_instance_obj()
fake_spec.flavor = instance.flavor
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
@ -1506,6 +1510,7 @@ class _BaseTaskTestCase(object):
filter_properties=dict(
# populate_filter_properties adds limits={}
fake_spec.to_legacy_filter_properties_dict(), limits={}),
accel_uuids=[],
node='fake_node')
def test_unshelve_instance_schedule_and_rebuild_novalid_host(self):
@ -1513,6 +1518,7 @@ class _BaseTaskTestCase(object):
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
self.request_spec.flavor = instance.flavor
def fake_schedule_instances(context, request_spec, *instances,
**kwargs):
@ -1559,6 +1565,7 @@ class _BaseTaskTestCase(object):
system_metadata['shelved_at'] = timeutils.utcnow()
system_metadata['shelved_image_id'] = 'fake_image_id'
system_metadata['shelved_host'] = 'fake-mini'
self.request_spec.flavor = instance.flavor
self.assertRaises(messaging.MessagingTimeout,
self.conductor_manager.unshelve_instance,
self.context, instance, self.request_spec)
@ -1582,6 +1589,7 @@ class _BaseTaskTestCase(object):
cell_mapping=objects.CellMapping.get_by_uuid(self.context,
uuids.cell1))
instance = self._create_fake_instance_obj()
fake_spec.flavor = instance.flavor
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
system_metadata = instance.system_metadata
@ -1594,7 +1602,7 @@ class _BaseTaskTestCase(object):
self.context, fake_spec, [instance.uuid], return_alternates=False)
mock_unshelve.assert_called_once_with(
self.context, instance, 'fake_host', fake_spec, image=None,
filter_properties={'limits': {}}, node='fake_node')
accel_uuids=[], filter_properties={'limits': {}}, node='fake_node')
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
@ -1607,6 +1615,7 @@ class _BaseTaskTestCase(object):
instance.save()
request_spec = objects.RequestSpec()
request_spec.flavor = instance.flavor
selection = objects.Selection(
service_host='fake_host',
@ -1629,6 +1638,87 @@ class _BaseTaskTestCase(object):
mock_fill_provider_mapping.assert_called_once_with(
request_spec, selection)
@mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_create_and_bind_arq_for_instance', )
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_schedule_instances', )
def test_unshelve_instance_resource_request_with_device_profile(
self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping,
mock_get_arq, mock_get_dp):
instance = self._create_fake_instance_obj()
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
request_spec = objects.RequestSpec()
request_spec.flavor = instance.flavor
request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
selection = objects.Selection(
service_host='fake_host',
nodename='fake_node',
limits=None)
mock_schedule.return_value = [[selection]]
mock_get_res_req.return_value = []
dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
objects.RequestGroup(requester_id='deviceprofile3')]
mock_get_dp.return_value = dp_groups
mock_get_arq.return_value = []
self.conductor_manager.unshelve_instance(
self.context, instance, request_spec)
self.assertEqual(dp_groups, request_spec.requested_resources)
mock_get_res_req.assert_called_once_with(self.context, instance.uuid)
mock_schedule.assert_called_once_with(
self.context, request_spec, [instance.uuid],
return_alternates=False)
mock_fill_provider_mapping.assert_called_once_with(
request_spec, selection)
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_create_and_bind_arq_for_instance', )
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_schedule_instances', )
def test_unshelve_instance_resource_request_with_arq_bind_fail(
self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping,
mock_get_arqs, mock_get_dp, mock_del_arqs):
instance = self._create_fake_instance_obj()
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
request_spec = objects.RequestSpec()
request_spec.flavor = instance.flavor
request_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
selection = objects.Selection(
service_host='fake_host',
nodename='fake_node',
limits=None)
mock_schedule.return_value = [[selection]]
dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
objects.RequestGroup(requester_id='deviceprofile3')]
mock_get_dp.return_value = dp_groups
mock_get_res_req.return_value = []
arqs = ["fake-arq-uuid"]
mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed(
arqs=arqs, msg='')
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
self.conductor_manager.unshelve_instance,
context=self.context, instance=instance,
request_spec=request_spec)
self.assertIn('Failed to bind accelerator requests', ex.message)
mock_del_arqs.assert_called_with(arqs)
def test_rebuild_instance(self):
inst_obj = self._create_fake_instance_obj()
rebuild_args, compute_args = self._prepare_rebuild_args(
@ -1698,6 +1788,57 @@ class _BaseTaskTestCase(object):
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
@mock.patch('nova.accelerator.cyborg._CyborgClient.'
'get_arq_uuids_for_instance')
@mock.patch('nova.compute.utils.notify_about_instance_rebuild')
def test_rebuild_instance_with_device_profile(self, mock_notify,
mock_get_arqs):
inst_obj = self._create_fake_instance_obj()
inst_obj.flavor.extra_specs = {'accel:device_profile': 'mydp'}
inst_obj.host = 'noselect'
expected_host = 'thebesthost'
expected_node = 'thebestnode'
expected_limits = None
fake_selection = objects.Selection(service_host=expected_host,
nodename=expected_node, limits=None)
rebuild_args, compute_args = self._prepare_rebuild_args(
{'host': None, 'node': expected_node, 'limits': expected_limits})
fake_spec = objects.RequestSpec()
rebuild_args['request_spec'] = fake_spec
inst_uuids = [inst_obj.uuid]
arqs = ['fake-arq-uuid']
mock_get_arqs.return_value = arqs
with test.nested(
mock.patch.object(self.conductor_manager.compute_rpcapi,
'rebuild_instance'),
mock.patch.object(scheduler_utils, 'setup_instance_group',
return_value=False),
mock.patch.object(self.conductor_manager.query_client,
'select_destinations',
return_value=[[fake_selection]])
) as (rebuild_mock, sig_mock, select_dest_mock):
self.conductor_manager.rebuild_instance(context=self.context,
instance=inst_obj,
**rebuild_args)
self.ensure_network_metadata_mock.assert_called_once_with(
inst_obj)
self.heal_reqspec_is_bfv_mock.assert_called_once_with(
self.context, fake_spec, inst_obj)
select_dest_mock.assert_called_once_with(self.context, fake_spec,
inst_uuids, return_objects=True, return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
compute_args['accel_uuids'] = arqs
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)
self.assertEqual(inst_obj.project_id, fake_spec.project_id)
self.assertEqual('compute.instance.rebuild.scheduled',
fake_notifier.NOTIFICATIONS[0].event_type)
mock_notify.assert_called_once_with(
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
def test_rebuild_instance_with_scheduler_no_host(self):
inst_obj = self._create_fake_instance_obj()
inst_obj.host = 'noselect'
@ -1896,6 +2037,151 @@ class _BaseTaskTestCase(object):
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
@mock.patch(
'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_create_and_bind_arq_for_instance', )
@mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
@mock.patch('nova.compute.utils.notify_about_instance_rebuild')
def test_evacuate_instance_with_request_spec_device_profile(
self, mock_notify, mock_get_dp, mock_get_arqs, mock_del_arqs):
inst_obj = self._create_fake_instance_obj()
inst_obj.host = 'noselect'
expected_host = 'thebesthost'
expected_node = 'thebestnode'
expected_limits = None
fake_selection = objects.Selection(service_host=expected_host,
nodename=expected_node, limits=None)
fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host])
fake_spec.flavor = inst_obj.flavor
fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
rebuild_args, compute_args = self._prepare_rebuild_args(
{'host': None, 'node': expected_node, 'limits': expected_limits,
'request_spec': fake_spec, 'recreate': True})
dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
objects.RequestGroup(requester_id='deviceprofile3')]
mock_get_dp.return_value = dp_groups
arqs = ['fake-arq-uuid']
mock_get_arqs.return_value = arqs
with test.nested(
mock.patch.object(self.conductor_manager.compute_rpcapi,
'rebuild_instance'),
mock.patch.object(scheduler_utils, 'setup_instance_group',
return_value=False),
mock.patch.object(self.conductor_manager.query_client,
'select_destinations',
return_value=[[fake_selection]]),
mock.patch.object(fake_spec, 'reset_forced_destinations'),
mock.patch('nova.scheduler.utils.fill_provider_mapping'),
mock.patch('nova.network.neutron.API.'
'get_requested_resource_for_instance',
return_value=[])
) as (rebuild_mock, sig_mock, select_dest_mock, reset_fd,
fill_rp_mapping_mock, get_req_res_mock):
self.conductor_manager.rebuild_instance(context=self.context,
instance=inst_obj,
**rebuild_args)
reset_fd.assert_called_once_with()
# The RequestSpec.ignore_hosts field should be overwritten.
self.assertEqual([inst_obj.host], fake_spec.ignore_hosts)
# The RequestSpec.requested_destination.cell field should be set.
self.assertIn('requested_destination', fake_spec)
self.assertIn('cell', fake_spec.requested_destination)
self.assertIsNotNone(fake_spec.requested_destination.cell)
select_dest_mock.assert_called_once_with(self.context,
fake_spec, [inst_obj.uuid], return_objects=True,
return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
compute_args['accel_uuids'] = arqs
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)
get_req_res_mock.assert_called_once_with(
self.context, inst_obj.uuid)
fill_rp_mapping_mock.assert_called_once_with(
fake_spec, fake_selection)
mock_del_arqs.assert_called_once()
self.assertEqual('compute.instance.rebuild.scheduled',
fake_notifier.NOTIFICATIONS[0].event_type)
mock_notify.assert_called_once_with(
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
@mock.patch('nova.accelerator.cyborg._CyborgClient.delete_arqs_by_uuid')
@mock.patch(
'nova.accelerator.cyborg._CyborgClient.delete_arqs_for_instance')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_create_and_bind_arq_for_instance', )
@mock.patch('nova.accelerator.cyborg.get_device_profile_request_groups')
@mock.patch('nova.compute.utils.notify_about_instance_rebuild')
def test_evacuate_instance_with_request_spec_arq_bind_fail(
self, mock_notify, mock_get_dp, mock_get_arqs,
mock_del_arqs_instance, mock_del_arqs):
inst_obj = self._create_fake_instance_obj()
inst_obj.host = 'noselect'
expected_host = 'thebesthost'
expected_node = 'thebestnode'
expected_limits = None
fake_selection = objects.Selection(service_host=expected_host,
nodename=expected_node, limits=None)
fake_spec = objects.RequestSpec(ignore_hosts=[uuids.ignored_host])
fake_spec.flavor = inst_obj.flavor
fake_spec.flavor.extra_specs = {'accel:device_profile': 'mydp'}
dp_groups = [objects.RequestGroup(requester_id='deviceprofile2'),
objects.RequestGroup(requester_id='deviceprofile3')]
mock_get_dp.return_value = dp_groups
fake_spec.requested_resources = dp_groups
rebuild_args, compute_args = self._prepare_rebuild_args(
{'host': None, 'node': expected_node, 'limits': expected_limits,
'request_spec': fake_spec, 'recreate': True})
arqs = ['fake-arq-uuid']
mock_get_arqs.side_effect = exc.AcceleratorRequestBindingFailed(
arqs=arqs, msg='')
with test.nested(
mock.patch.object(scheduler_utils, 'setup_instance_group',
return_value=False),
mock.patch.object(self.conductor_manager.query_client,
'select_destinations',
return_value=[[fake_selection]]),
mock.patch.object(fake_spec, 'reset_forced_destinations'),
mock.patch('nova.scheduler.utils.fill_provider_mapping'),
mock.patch('nova.network.neutron.API.'
'get_requested_resource_for_instance',
return_value=[])
) as (sig_mock, select_dest_mock, reset_fd,
fill_rp_mapping_mock, get_req_res_mock):
ex = self.assertRaises(exc.AcceleratorRequestBindingFailed,
self.conductor_manager.rebuild_instance,
context=self.context, instance=inst_obj,
**rebuild_args)
reset_fd.assert_called_once_with()
# The RequestSpec.ignore_hosts field should be overwritten.
self.assertEqual([inst_obj.host], fake_spec.ignore_hosts)
# The RequestSpec.requested_destination.cell field should be set.
self.assertIn('requested_destination', fake_spec)
self.assertIn('cell', fake_spec.requested_destination)
self.assertIsNotNone(fake_spec.requested_destination.cell)
select_dest_mock.assert_called_once_with(self.context,
fake_spec, [inst_obj.uuid], return_objects=True,
return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
get_req_res_mock.assert_called_once_with(
self.context, inst_obj.uuid)
fill_rp_mapping_mock.assert_called_once_with(
fake_spec, fake_selection)
self.assertIn('Failed to bind accelerator requests', ex.message)
mock_del_arqs.assert_called_with(arqs)
mock_del_arqs_instance.assert_called_once()
self.assertEqual('compute.instance.rebuild.scheduled',
fake_notifier.NOTIFICATIONS[0].event_type)
mock_notify.assert_called_once_with(
self.context, inst_obj, 'thebesthost', action='rebuild_scheduled',
source='nova-conductor')
@mock.patch(
'nova.conductor.tasks.cross_cell_migrate.ConfirmResizeTask.execute')
def test_confirm_snapshot_based_resize(self, mock_execute):

View File

@ -0,0 +1,12 @@
---
features:
- |
Add Cyborg shelve/unshelve support.
After a shelve the ARQs remain bound to the instance.
After a shelve offload the ARQs of the instance are freed
in Cyborg.
During unshelve the ARQs are re-created and bound to
the instance if needed.
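Taken together, the change wires the ARQ lifecycle through shelve, shelve
offload and unshelve. A condensed sketch of that flow, built from the Cyborg
client helpers used in this change and assuming a Nova runtime ``context``
and ``Instance`` object are available (illustrative only)::

    from nova.accelerator import cyborg


    def arq_uuids_for_shelve(context, instance):
        # Shelve (API layer): look up the bound ARQ UUIDs and pass them to
        # the compute RPC call; the ARQs themselves stay bound in Cyborg.
        if not instance.flavor.extra_specs.get('accel:device_profile'):
            return []
        return cyborg.get_client(context).get_arq_uuids_for_instance(
            instance)


    def free_arqs_on_offload(context, instance, accel_uuids):
        # Shelve offload (compute manager): delete the instance's ARQs so
        # the accelerator resources are released in Cyborg. On unshelve the
        # conductor re-creates and binds new ARQs before spawning.
        if accel_uuids:
            cyborg.get_client(context).delete_arqs_for_instance(
                instance.uuid)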