From cc630b4eb62c4a45ff08a1862a8339a0f129e0a3 Mon Sep 17 00:00:00 2001
From: Sundar Nadathur
Date: Wed, 16 Jan 2019 00:35:55 -0800
Subject: [PATCH] Create and bind Cyborg ARQs.

* Call Cyborg with the device profile name to get ARQs (Accelerator
  Requests). Each ARQ corresponds to a single device profile group,
  which corresponds to a single request group in the request spec.
* Match each ARQ to its associated request group, and thereby obtain
  the corresponding RP for that ARQ.
* Call Cyborg to bind the ARQ to that host/device-RP.
* When Cyborg sends the ARQ bind notification events, wait for those
  events with a timeout.

Change-Id: I0f8b6bf2b4f4510da6c84fede532533602b6af7f
Blueprint: nova-cyborg-interaction
---
 nova/accelerator/cyborg.py                  | 118 +++++++++++
 nova/compute/manager.py                     |  84 ++++++--
 nova/conductor/manager.py                   |  55 +++++
 nova/conf/compute.py                        |  11 +
 nova/exception.py                           |   4 +
 nova/tests/fixtures.py                      | 122 +++++++++++
 nova/tests/functional/test_servers.py       | 120 ++++++++++-
 nova/tests/unit/accelerator/test_cyborg.py  | 217 ++++++++++++++++++++
 nova/tests/unit/compute/test_compute_mgr.py | 182 ++++++++++++++++
 nova/tests/unit/conductor/test_conductor.py | 161 ++++++++++++++-
 10 files changed, 1054 insertions(+), 20 deletions(-)

diff --git a/nova/accelerator/cyborg.py b/nova/accelerator/cyborg.py
index 61aa78c86f8a..1991336f7638 100644
--- a/nova/accelerator/cyborg.py
+++ b/nova/accelerator/cyborg.py
@@ -80,6 +80,7 @@ def get_device_profile_request_groups(context, dp_name):
 class _CyborgClient(object):
     DEVICE_PROFILE_URL = "/device_profiles"
+    ARQ_URL = "/accelerator_requests"

     def __init__(self, context):
         auth = service_auth.get_auth_plugin(context)
@@ -145,3 +146,120 @@ class _CyborgClient(object):
             rg.add_trait(trait_name=name, trait_type=val)
             request_groups.append(rg)
         return request_groups
+
+    def _create_arqs(self, dp_name):
+        data = {"device_profile_name": dp_name}
+        resp, err_msg = self._call_cyborg(self._client.post,
+                                          self.ARQ_URL, json=data)
+
+        if err_msg:
+            raise exception.AcceleratorRequestOpFailed(
+                op=_('create'), msg=err_msg)
+
+        return resp.json().get('arqs')
+
+    def create_arqs_and_match_resource_providers(self, dp_name, rg_rp_map):
+        """Create ARQs, match them with request groups and thereby
+        determine their corresponding RPs.
+
+        :param dp_name: Device profile name
+        :param rg_rp_map: Request group - Resource Provider map
+            {requester_id: [resource_provider_uuid]}
+        :returns:
+            [arq], with each ARQ associated with an RP
+        :raises: DeviceProfileError, AcceleratorRequestOpFailed
+        """
+        LOG.info('Creating ARQs for device profile %s', dp_name)
+        arqs = self._create_arqs(dp_name)
+        if not arqs:  # None or empty list
+            msg = _('device profile name %s') % dp_name
+            raise exception.AcceleratorRequestOpFailed(op=_('create'), msg=msg)
+        for arq in arqs:
+            dp_group_id = arq['device_profile_group_id']
+            requester_id = (
+                get_device_profile_group_requester_id(dp_group_id))
+            arq['device_rp_uuid'] = rg_rp_map[requester_id][0]
+        return arqs
+
+    def bind_arqs(self, bindings):
+        """Initiate Cyborg bindings.
+
+        Handles RFC 6902-compliant JSON patching, sparing the
+        calling Nova code from those details.
+
+        :param bindings:
+            { "$arq_uuid": {
+                  "hostname": STRING
+                  "device_rp_uuid": UUID
+                  "instance_uuid": UUID
+               },
+               ...
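+               (one entry per ARQ; every entry in a single call
+                carries the same instance_uuid)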
+ } + :returns: nothing + :raises: AcceleratorRequestOpFailed + """ + LOG.info('Binding ARQs.') + # Create a JSON patch in RFC 6902 format + patch_list = {} + for arq_uuid, binding in bindings.items(): + patch = [{"path": "/" + field, + "op": "add", + "value": value + } for field, value in binding.items()] + patch_list[arq_uuid] = patch + + resp, err_msg = self._call_cyborg(self._client.patch, + self.ARQ_URL, json=patch_list) + if err_msg: + msg = _(' Binding failed for ARQ UUIDs: ') + err_msg = err_msg + msg + ','.join(bindings.keys()) + raise exception.AcceleratorRequestOpFailed( + op=_('bind'), msg=err_msg) + + def get_arqs_for_instance(self, instance_uuid, only_resolved=False): + """Get ARQs for the instance. + + :param instance_uuid: Instance UUID + :param only_resolved: flag to return only resolved ARQs + :returns: List of ARQs for the instance: + if only_resolved: only those ARQs which have completed binding + else: all ARQs + The format of the returned data structure is as below: + [ + {'uuid': $arq_uuid, + 'device_profile_name': $dp_name, + 'device_profile_group_id': $dp_request_group_index, + 'state': 'Bound', + 'device_rp_uuid': $resource_provider_uuid, + 'hostname': $host_nodename, + 'instance_uuid': $instance_uuid, + 'attach_handle_info': { # PCI bdf + 'bus': '0c', 'device': '0', + 'domain': '0000', 'function': '0'}, + 'attach_handle_type': 'PCI' + # or 'TEST_PCI' for Cyborg fake driver + } + ] + :raises: AcceleratorRequestOpFailed + """ + query = {"instance": instance_uuid} + resp, err_msg = self._call_cyborg(self._client.get, + self.ARQ_URL, params=query) + + if err_msg: + err_msg = err_msg + _(' Instance %s') % instance_uuid + raise exception.AcceleratorRequestOpFailed( + op=_('get'), msg=err_msg) + + arqs = resp.json().get('arqs') + if not arqs: + err_msg = _('Cyborg returned no accelerator requests for ' + 'instance %s') % instance_uuid + raise exception.AcceleratorRequestOpFailed( + op=_('get'), msg=err_msg) + + if only_resolved: + arqs = [arq for arq in arqs if + arq['state'] in ['Bound', 'BindFailed', 'Deleting']] + return arqs diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 519bbad779c6..c2d4a9b21e86 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -56,6 +56,7 @@ from oslo_utils import units import six from six.moves import range +from nova.accelerator import cyborg from nova import block_device from nova.compute import api as compute from nova.compute import build_results @@ -2512,6 +2513,14 @@ class ComputeManager(manager.Manager): self.host, phase=fields.NotificationPhase.END, bdms=block_device_mapping) + def _build_resources_cleanup(self, instance, network_info): + # Make sure the async call finishes + if network_info is not None: + network_info.wait(do_raise=False) + self.driver.clean_networks_preparation(instance, + network_info) + self.driver.failed_spawn_cleanup(instance) + @contextlib.contextmanager def _build_resources(self, context, instance, requested_networks, security_groups, image_meta, block_device_mapping, @@ -2566,33 +2575,34 @@ class ComputeManager(manager.Manager): except (exception.InstanceNotFound, exception.UnexpectedDeletingTaskStateError): with excutils.save_and_reraise_exception(): - # Make sure the async call finishes - if network_info is not None: - network_info.wait(do_raise=False) - self.driver.clean_networks_preparation(instance, - network_info) - self.driver.failed_spawn_cleanup(instance) + self._build_resources_cleanup(instance, network_info) except (exception.UnexpectedTaskStateError, 
exception.OverQuota, exception.InvalidBDM) as e:
-            # Make sure the async call finishes
-            if network_info is not None:
-                network_info.wait(do_raise=False)
-            self.driver.clean_networks_preparation(instance, network_info)
-            self.driver.failed_spawn_cleanup(instance)
+            self._build_resources_cleanup(instance, network_info)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=e.format_message())

         except Exception:
             LOG.exception('Failure prepping block device',
                     instance=instance)
-            # Make sure the async call finishes
-            if network_info is not None:
-                network_info.wait(do_raise=False)
-            self.driver.clean_networks_preparation(instance, network_info)
-            self.driver.failed_spawn_cleanup(instance)
+            self._build_resources_cleanup(instance, network_info)
             msg = _('Failure prepping block device.')
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=msg)

+        arqs = []
+        dp_name = instance.flavor.extra_specs.get('accel:device_profile')
+        try:
+            if dp_name:
+                arqs = self._get_bound_arq_resources(
+                    context, dp_name, instance)
+        except (Exception, eventlet.timeout.Timeout) as exc:
+            # NOTE: an eventlet Timeout has no format_message(), so
+            # stringify the exception instead of calling that.
+            LOG.exception(six.text_type(exc))
+            self._build_resources_cleanup(instance, network_info)
+            msg = _('Failure getting accelerator requests.')
+            raise exception.BuildAbortException(instance_uuid=instance.uuid,
+                                                reason=msg)
+        resources['accel_info'] = arqs
+
         try:
             yield resources
         except Exception as exc:
@@ -2623,6 +2633,48 @@ class ComputeManager(manager.Manager):
                 instance_uuid=instance.uuid,
                 reason=six.text_type(exc))

+    def _get_bound_arq_resources(self, context, dp_name, instance):
+        """Get bound accelerator requests.
+
+        The ARQ binding was kicked off in the conductor as an async
+        operation. Here we wait for the notification from Cyborg.
+
+        If the notification arrived before this point, which can happen
+        in many/most cases (see [1]), it will be lost. To handle that,
+        we use exit_wait_early.
+        [1] https://review.opendev.org/#/c/631244/46/nova/compute/
+            manager.py@2627
+
+        :param dp_name: Device profile name. Caller ensures this is valid.
+        :param instance: instance object
+        :returns: List of ARQs for which bindings have completed,
+                  successfully or otherwise
+        """
+
+        cyclient = cyborg.get_client(context)
+        arqs = cyclient.get_arqs_for_instance(instance.uuid)
+        events = [('accelerator-request-bound', arq['uuid']) for arq in arqs]
+
+        timeout = CONF.arq_binding_timeout
+        with self.virtapi.wait_for_instance_event(
+                instance, events, deadline=timeout):
+            resolved_arqs = cyclient.get_arqs_for_instance(
+                instance.uuid, only_resolved=True)
+            # Events for these resolved ARQs may have already arrived.
+            # Such 'early' events need to be ignored.
+            early_events = [('accelerator-request-bound', arq['uuid'])
+                            for arq in resolved_arqs]
+            if early_events:
+                self.virtapi.exit_wait_early(early_events)
+
+        # Since a timeout in wait_for_instance_event will raise, we get
+        # here only if all binding events have been received.
+        # Compare by UUID: lists of dicts cannot be sorted in Python 3.
+        if (sorted(resolved_arqs, key=lambda a: a['uuid']) !=
+                sorted(arqs, key=lambda a: a['uuid'])):
+            # Query Cyborg to get all.
+            arqs = cyclient.get_arqs_for_instance(instance.uuid)
+        else:
+            arqs = resolved_arqs  # latter has the right arq.state
+        return arqs
+
     def _cleanup_allocated_networks(self, context, instance,
             requested_networks):
         """Cleanup networks allocated for instance.
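
For context: the wait above is resolved when Cyborg reports bind
completion back to Nova through the external events API. A minimal
sketch of roughly what that notification looks like, assuming the
standard os-server-external-events call; nova_api, token, instance_uuid
and arq_uuid are illustrative placeholders, not part of this patch:

    import requests

    def notify_bind_complete(nova_api, token, instance_uuid, arq_uuid):
        # One event per ARQ: the tag carries the ARQ UUID that the
        # compute manager registered via wait_for_instance_event().
        payload = {
            "events": [{
                "name": "accelerator-request-bound",
                "server_uuid": instance_uuid,
                "tag": arq_uuid,
                "status": "completed",  # or "failed" on bind failure
            }]
        }
        requests.post(
            nova_api + "/os-server-external-events", json=payload,
            headers={"X-Auth-Token": token,
                     # assumed microversion that added this event
                     "X-OpenStack-Nova-API-Version": "2.82"})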
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 08a96e4cf9d6..d5a3a7695c04 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -31,6 +31,7 @@ from oslo_utils import timeutils from oslo_utils import versionutils import six +from nova.accelerator import cyborg from nova import availability_zones from nova.compute import instance_actions from nova.compute import rpcapi as compute_rpcapi @@ -835,6 +836,18 @@ class ComputeTaskManager(base.Base): LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s", host.service_host, host.nodename, alts, instance=instance) + try: + resource_provider_mapping = ( + local_reqspec.get_request_group_mapping()) + self._create_and_bind_arqs( + context, instance.uuid, instance.flavor.extra_specs, + host.nodename, resource_provider_mapping) + except Exception as exc: + LOG.exception('Failed to reschedule. Reason: %s', exc) + self._cleanup_when_reschedule_fails(context, instance, exc, + legacy_request_spec, requested_networks) + continue + self.compute_rpcapi.build_and_run_instance(context, instance=instance, host=host.service_host, image=image, request_spec=local_reqspec, @@ -1604,6 +1617,21 @@ class ComputeTaskManager(base.Base): # this one. continue + try: + resource_provider_mapping = ( + request_spec.get_request_group_mapping()) + # Using nodename instead of hostname. See: + # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa + self._create_and_bind_arqs( + context, instance.uuid, instance.flavor.extra_specs, + host.nodename, resource_provider_mapping) + except Exception as exc: + # If anything failed here we need to cleanup and bail out. + with excutils.save_and_reraise_exception(): + self._cleanup_build_artifacts( + context, exc, instances, build_requests, request_specs, + block_device_mapping, tags, cell_mapping_cache) + # NOTE(danms): Compute RPC expects security group names or ids # not objects, so convert this to a list of names until we can # pass the objects. @@ -1622,6 +1650,33 @@ class ComputeTaskManager(base.Base): host=host.service_host, node=host.nodename, limits=host.limits, host_list=host_list) + def _create_and_bind_arqs(self, context, instance_uuid, extra_specs, + hostname, resource_provider_mapping): + """Create ARQs, determine their RPs and initiate ARQ binding. + + The binding is asynchronous; Cyborg will notify on completion. + The notification will be handled in the compute manager. + """ + dp_name = extra_specs.get('accel:device_profile') + if not dp_name: + return + + LOG.debug('Calling Cyborg to get ARQs. dp_name=%s instance=%s', + dp_name, instance_uuid) + cyclient = cyborg.get_client(context) + arqs = cyclient.create_arqs_and_match_resource_providers( + dp_name, resource_provider_mapping) + LOG.debug('Got ARQs with resource provider mapping %s', arqs) + + bindings = {arq['uuid']: + {"hostname": hostname, + "device_rp_uuid": arq['device_rp_uuid'], + "instance_uuid": instance_uuid + } + for arq in arqs} + # Initiate Cyborg binding asynchronously + cyclient.bind_arqs(bindings=bindings) + @staticmethod def _map_instance_to_cell(context, instance, cell): """Update the instance mapping to point at the given cell. 
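
For context: bind_arqs() in nova/accelerator/cyborg.py converts the
bindings dict built by _create_and_bind_arqs() above into one RFC 6902
JSON patch per ARQ. A minimal sketch of that transformation, with
made-up UUIDs:

    bindings = {
        "b59d34d3-787b-4fb0-a6b9-019cd81172f8": {
            "hostname": "myhost",
            "device_rp_uuid": "c532cf11-02ed-4b03-9dd8-3e9a454131dc",
            "instance_uuid": "15d3acf8-df76-400b-bfc9-484a5208daa1",
        },
    }
    patch_list = {
        arq_uuid: [{"path": "/" + field, "op": "add", "value": value}
                   for field, value in binding.items()]
        for arq_uuid, binding in bindings.items()
    }
    # patch_list[arq_uuid] is now an RFC 6902 patch such as:
    # [{"path": "/hostname", "op": "add", "value": "myhost"}, ...]
    # The whole dict is sent as one PATCH to /accelerator_requests.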
diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 83e5487535bf..4656ac0df43d 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -176,6 +176,17 @@ Related options: * vif_plugging_is_fatal - If ``vif_plugging_timeout`` is set to zero and ``vif_plugging_is_fatal`` is False, events should not be expected to arrive at all. +"""), + cfg.IntOpt('arq_binding_timeout', + default=300, + min=1, + help=""" +Timeout for Accelerator Request (ARQ) bind event message arrival. + +Number of seconds to wait for ARQ bind resolution event to arrive. +The event indicates that every ARQ for an instance has either bound +successfully or failed to bind. If it does not arrive, instance bringup +is aborted with an exception. """), cfg.StrOpt('injected_network_template', default=paths.basedir_def('nova/virt/interfaces.template'), diff --git a/nova/exception.py b/nova/exception.py index 1fa944eba837..a5f16b85fc85 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2293,3 +2293,7 @@ class UnexpectedResourceProviderNameForPCIRequest(NovaException): class DeviceProfileError(NovaException): msg_fmt = _("Device profile name %(name)s: %(msg)s") + + +class AcceleratorRequestOpFailed(NovaException): + msg_fmt = _("Failed to %(op)s accelerator requests: %(msg)s") diff --git a/nova/tests/fixtures.py b/nova/tests/fixtures.py index db044d071cac..444d7ec9c07b 100644 --- a/nova/tests/fixtures.py +++ b/nova/tests/fixtures.py @@ -2526,14 +2526,136 @@ def _get_device_profile(dp_name, trait): return dp +def get_arqs(dp_name): + arq = { + 'uuid': 'b59d34d3-787b-4fb0-a6b9-019cd81172f8', + 'device_profile_name': dp_name, + 'device_profile_group_id': 0, + 'state': 'Initial', + 'device_rp_uuid': None, + 'hostname': None, + 'instance_uuid': None, + 'attach_handle_info': {}, + 'attach_handle_type': '', + } + bound_arq = copy.deepcopy(arq) + bound_arq.update( + {'state': 'Bound', + 'attach_handle_type': 'TEST_PCI', + 'attach_handle_info': { + 'bus': '0c', + 'device': '0', + 'domain': '0000', + 'function': '0' + }, + }) + return [arq], [bound_arq] + + class CyborgFixture(fixtures.Fixture): """Fixture that mocks Cyborg APIs used by nova/accelerator/cyborg.py""" dp_name = 'fakedev-dp' trait = 'CUSTOM_FAKE_DEVICE' + arq_list, bound_arq_list = get_arqs(dp_name) + + # NOTE(Sundar): The bindings passed to the fake_bind_arqs() from the + # conductor are indexed by ARQ UUID and include the host name, device + # RP UUID and instance UUID. (See params to fake_bind_arqs below.) + # + # Later, when the compute manager calls fake_get_arqs_for_instance() with + # the instance UUID, the returned ARQs must contain the host name and + # device RP UUID. But these can vary from test to test. + # + # So, fake_bind_arqs() below takes bindings indexed by ARQ UUID and + # converts them to bindings indexed by instance UUID, which are then + # stored in the dict below. This dict looks like: + # { $instance_uuid: [ + # {'hostname': $hostname, + # 'device_rp_uuid': $device_rp_uuid, + # 'arq_uuid': $arq_uuid + # } + # ] + # } + # Since it is indexed by instance UUID, and that is presumably unique + # across concurrently executing tests, this should be safe for + # concurrent access. + bindings_by_instance = {} + + @staticmethod + def fake_bind_arqs(bindings): + """Simulate Cyborg ARQ bindings. + + Since Nova calls Cyborg for binding on per-instance basis, the + instance UUIDs would be the same for all ARQs in a single call. 
+ + This function converts bindings indexed by ARQ UUID to bindings + indexed by instance UUID, so that fake_get_arqs_for_instance can + retrieve them later. + + :param bindings: + { "$arq_uuid": { + "hostname": STRING + "device_rp_uuid": UUID + "instance_uuid": UUID + }, + ... + } + :returns: None + """ + binding_by_instance = collections.defaultdict(list) + for index, arq_uuid in enumerate(bindings): + arq_binding = bindings[arq_uuid] + # instance_uuid is same for all ARQs in a single call. + instance_uuid = arq_binding['instance_uuid'] + newbinding = { + 'hostname': arq_binding['hostname'], + 'device_rp_uuid': arq_binding['device_rp_uuid'], + 'arq_uuid': arq_uuid, + } + binding_by_instance[instance_uuid].append(newbinding) + + CyborgFixture.bindings_by_instance.update(binding_by_instance) + + @staticmethod + def fake_get_arqs_for_instance(instance_uuid, only_resolved=False): + """Get list of bound ARQs for this instance. + + This function uses bindings indexed by instance UUID to + populate the bound ARQ templates in CyborgFixture.bound_arq_list. + """ + arq_host_rp_list = CyborgFixture.bindings_by_instance[instance_uuid] + # The above looks like: + # [{'hostname': $hostname, + # 'device_rp_uuid': $device_rp_uuid, + # 'arq_uuid': $arq_uuid + # }] + + bound_arq_list = copy.deepcopy(CyborgFixture.bound_arq_list) + for arq in bound_arq_list: + match = [(arq_host_rp['hostname'], + arq_host_rp['device_rp_uuid'], + instance_uuid) + for arq_host_rp in arq_host_rp_list + if arq_host_rp['arq_uuid'] == arq['uuid'] + ] + # Only 1 ARQ UUID would match, so len(match) == 1 + arq['hostname'], arq['device_rp_uuid'], arq['instance_uuid'] = ( + match[0][0], match[0][1], match[0][2]) + return bound_arq_list def setUp(self): super(CyborgFixture, self).setUp() self.mock_get_dp = self.useFixture(fixtures.MockPatch( 'nova.accelerator.cyborg._CyborgClient._get_device_profile_list', return_value=_get_device_profile(self.dp_name, self.trait))).mock + self.mock_create_arqs = self.useFixture(fixtures.MockPatch( + 'nova.accelerator.cyborg._CyborgClient._create_arqs', + return_value=self.arq_list)).mock + self.mock_bind_arqs = self.useFixture(fixtures.MockPatch( + 'nova.accelerator.cyborg._CyborgClient.bind_arqs', + side_effect=self.fake_bind_arqs)).mock + self.mock_get_arqs = self.useFixture(fixtures.MockPatch( + 'nova.accelerator.cyborg._CyborgClient.' 
+ 'get_arqs_for_instance', + side_effect=self.fake_get_arqs_for_instance)).mock diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index a1f89952ffd8..012050b1c35f 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -7835,6 +7835,20 @@ class AcceleratorServerBase(integrated_helpers.ProviderUsageBaseTestCase): extra_spec=extra_specs) return flavor_id + def _check_allocations_usage(self, server_uuid): + host_rp_uuid = self.compute_rp_uuids[0] + device_rp_uuid = self.device_rp_map[host_rp_uuid] + expected_host_alloc = { + 'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20}, + } + expected_device_alloc = {'resources': {'FPGA': 1}} + + host_alloc = self._get_allocations_by_provider_uuid(host_rp_uuid) + self.assertEqual(expected_host_alloc, host_alloc[server_uuid]) + + device_alloc = self._get_allocations_by_provider_uuid(device_rp_uuid) + self.assertEqual(expected_device_alloc, device_alloc[server_uuid]) + class AcceleratorServerTest(AcceleratorServerBase): def setUp(self): @@ -7844,10 +7858,110 @@ class AcceleratorServerTest(AcceleratorServerBase): def test_create_server(self): flavor_id = self._create_acc_flavor() server_name = 'accel_server1' - self._create_server( + server = self._create_server( server_name, flavor_id=flavor_id, image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', networks='none', expected_state='ACTIVE') - self.cyborg.mock_get_dp.assert_called_once_with(self.cyborg.dp_name) - # TODO(Sundar): Add checks for Placement allocations. + # Verify that the host name and the device rp UUID are set properly. + # Other fields in the ARQ are hardcoded data from the fixture. + arqs = self.cyborg.fake_get_arqs_for_instance(server['id']) + self.assertEqual(self.device_rp_uuids[0], arqs[0]['device_rp_uuid']) + self.assertEqual(server['OS-EXT-SRV-ATTR:host'], arqs[0]['hostname']) + + # Check allocations and usage + self._check_allocations_usage(server['id']) + + +class AcceleratorServerReschedTest(AcceleratorServerBase): + + def setUp(self): + self.NUM_HOSTS = 2 + super(AcceleratorServerReschedTest, self).setUp() + + def _check_allocations_usage_resched(self, server): + # Check allocations on host where instance is running + server_uuid = server['id'] + + hostname = server['OS-EXT-SRV-ATTR:host'] + server_host_rp_uuid = self._get_provider_uuid_by_host(hostname) + expected_host_alloc = { + 'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20}, + } + expected_device_alloc = {'resources': {'FPGA': 1}} + + for i in range(self.NUM_HOSTS): + compute_uuid = self.compute_rp_uuids[i] + device_uuid = self.device_rp_map[compute_uuid] + host_alloc = self._get_allocations_by_provider_uuid(compute_uuid) + device_alloc = self._get_allocations_by_provider_uuid(device_uuid) + if compute_uuid == server_host_rp_uuid: + self.assertEqual(expected_host_alloc, host_alloc[server_uuid]) + self.assertEqual(expected_device_alloc, + device_alloc[server_uuid]) + else: + self.assertEqual({}, host_alloc) + self.assertEqual({}, device_alloc) + + # NOTE(Sundar): ARQs for an instance could come from different + # devices in the same host, in general. But, in this test case, + # there is only one device in the host. So, we check for that. + device_rp_uuid = self.device_rp_map[server_host_rp_uuid] + expected_arq_bind_info = set([('Bound', hostname, + device_rp_uuid, server_uuid)]) + arqs = nova_fixtures.CyborgFixture.fake_get_arqs_for_instance( + server_uuid) + # The state is hardcoded but other fields come from the test case. 
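+        # The resulting set looks like
+        # {('Bound', $hostname, $device_rp_uuid, $server_uuid)}.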
+ arq_bind_info = {(arq['state'], arq['hostname'], + arq['device_rp_uuid'], arq['instance_uuid']) + for arq in arqs} + self.assertSetEqual(expected_arq_bind_info, arq_bind_info) + + def _check_no_allocations(self, server_uuid): + allocs = self._get_allocations_by_server_uuid(server_uuid) + self.assertEqual(allocs, {}) + + for i in range(self.NUM_HOSTS): + usage = self._get_provider_usages( + self.device_rp_uuids[i]).get('FPGA') + self.assertEqual(usage, 0) + + def test_resched(self): + orig_spawn = fake.FakeDriver.spawn + + def fake_spawn(*args, **kwargs): + fake_spawn.count += 1 + if fake_spawn.count == 1: + raise exception.ComputeResourcesUnavailable( + reason='First host fake fail.', instance_uuid='fake') + else: + orig_spawn(*args, **kwargs) + fake_spawn.count = 0 + + with mock.patch('nova.virt.fake.FakeDriver.spawn', new=fake_spawn): + flavor_id = self._create_acc_flavor() + server_name = 'accel_server1' + server = self._create_server( + server_name, flavor_id=flavor_id, + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks='none', expected_state='ACTIVE') + + self.assertEqual(2, fake_spawn.count) + self._check_allocations_usage_resched(server) + + def test_resched_fails(self): + + def throw_error(*args, **kwargs): + raise exception.ComputeResourcesUnavailable(reason='', + instance_uuid='fake') + + self.stub_out('nova.virt.fake.FakeDriver.spawn', throw_error) + + flavor_id = self._create_acc_flavor() + server_name = 'accel_server1' + server = self._create_server( + server_name, flavor_id=flavor_id, + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks='none', expected_state='ERROR') + + self._check_no_allocations(server['id']) diff --git a/nova/tests/unit/accelerator/test_cyborg.py b/nova/tests/unit/accelerator/test_cyborg.py index 9cc93eb1da56..7c7165500736 100644 --- a/nova/tests/unit/accelerator/test_cyborg.py +++ b/nova/tests/unit/accelerator/test_cyborg.py @@ -12,16 +12,20 @@ # License for the specific language governing permissions and limitations # under the License. +import itertools import mock from keystoneauth1 import exceptions as ks_exc from requests.models import Response +from oslo_serialization import jsonutils + from nova.accelerator import cyborg from nova import context from nova import exception from nova.objects import request_spec from nova import test +from nova.tests.unit import fake_requests class CyborgTestCase(test.NoDBTestCase): @@ -130,3 +134,216 @@ class CyborgTestCase(test.NoDBTestCase): self.assertRaises(exception.DeviceProfileError, self.client.get_device_profile_groups, dp_name='mydp') + + def _get_arqs_and_request_groups(self): + arq_common = { + # All ARQs for an instance have same device profile name. 
+            "device_profile_name": "noprog-dp",
+            "device_rp_uuid": "",
+            "hostname": "",
+            "instance_uuid": "",
+            "state": "Initial",
+        }
+        arq_variants = [
+            {"device_profile_group_id": 0,
+             "uuid": "edbba496-3cc8-4256-94ca-dfe3413348eb"},
+            {"device_profile_group_id": 1,
+             "uuid": "20125bcb-9f55-4e13-8e8c-3fee30e54cca"},
+        ]
+        arqs = [dict(arq_common, **variant) for variant in arq_variants]
+        rg_rp_map = {
+            'device_profile_0': ['c532cf11-02ed-4b03-9dd8-3e9a454131dc'],
+            'device_profile_1': ['2c332d7b-daaf-4726-a80d-ecf5212da4b8'],
+        }
+        return arqs, rg_rp_map
+
+    def _get_bound_arqs(self):
+        arqs, rg_rp_map = self._get_arqs_and_request_groups()
+        common = {
+            'hostname': 'myhost',
+            'instance_uuid': '15d3acf8-df76-400b-bfc9-484a5208daa1',
+        }
+        bindings = {
+            arqs[0]['uuid']: dict(
+                common, device_rp_uuid=rg_rp_map['device_profile_0'][0]),
+            arqs[1]['uuid']: dict(
+                common, device_rp_uuid=rg_rp_map['device_profile_1'][0]),
+        }
+        bound_arq_common = {
+            "attach_handle_info": {
+                "bus": "01",
+                "device": "00",
+                "domain": "0000",
+                "function": "0"  # function ID is overridden per ARQ below
+            },
+            "attach_handle_type": "PCI",
+            "state": "Bound",
+            # Device profile name is common to all bound ARQs
+            "device_profile_name": arqs[0]["device_profile_name"],
+            **common
+        }
+        bound_arqs = [
+            {'uuid': arq['uuid'],
+             'device_profile_group_id': arq['device_profile_group_id'],
+             'device_rp_uuid': bindings[arq['uuid']]['device_rp_uuid'],
+             **bound_arq_common} for arq in arqs]
+        for index, bound_arq in enumerate(bound_arqs):
+            bound_arq['attach_handle_info']['function'] = index  # fix func ID
+        return bindings, bound_arqs
+
+    @mock.patch('keystoneauth1.adapter.Adapter.post')
+    def test_create_arqs_failure(self, mock_cyborg_post):
+        # If Cyborg returns an invalid response, raise an exception.
+        mock_cyborg_post.return_value = None
+        self.assertRaises(exception.AcceleratorRequestOpFailed,
+                          self.client._create_arqs,
+                          dp_name='mydp')
+
+    @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+                '_create_arqs')
+    def test_create_arq_and_match_rps(self, mock_create_arqs):
+        # Happy path
+        arqs, rg_rp_map = self._get_arqs_and_request_groups()
+        dp_name = arqs[0]["device_profile_name"]
+
+        mock_create_arqs.return_value = arqs
+
+        ret_arqs = self.client.create_arqs_and_match_resource_providers(
+            dp_name, rg_rp_map)
+
+        # Each value in rg_rp_map is a list. We merge them into a single list.
+        expected_rp_uuids = sorted(list(
+            itertools.chain.from_iterable(rg_rp_map.values())))
+        ret_rp_uuids = sorted([arq['device_rp_uuid'] for arq in ret_arqs])
+        self.assertEqual(expected_rp_uuids, ret_rp_uuids)
+
+    @mock.patch('nova.accelerator.cyborg._CyborgClient.'
+ '_create_arqs') + def test_create_arq_and_match_rps_exception(self, mock_create_arqs): + # If Cyborg response does not contain ARQs, raise + arqs, rg_rp_map = self._get_arqs_and_request_groups() + dp_name = arqs[0]["device_profile_name"] + + mock_create_arqs.return_value = None + self.assertRaises( + exception.AcceleratorRequestOpFailed, + self.client.create_arqs_and_match_resource_providers, + dp_name, rg_rp_map) + + @mock.patch('keystoneauth1.adapter.Adapter.patch') + def test_bind_arqs(self, mock_cyborg_patch): + bindings, bound_arqs = self._get_bound_arqs() + arq_uuid = bound_arqs[0]['uuid'] + + patch_list = {} + for arq_uuid, binding in bindings.items(): + patch = [{"path": "/" + field, + "op": "add", + "value": value + } for field, value in binding.items()] + patch_list[arq_uuid] = patch + + self.client.bind_arqs(bindings) + + mock_cyborg_patch.assert_called_once_with( + self.client.ARQ_URL, json=mock.ANY) + called_params = mock_cyborg_patch.call_args.kwargs['json'] + self.assertEqual(sorted(called_params), sorted(patch_list)) + + @mock.patch('nova.accelerator.cyborg._CyborgClient._call_cyborg') + def test_bind_arqs_exception(self, mock_call_cyborg): + # If Cyborg returns invalid response, raise exception. + bindings, _ = self._get_bound_arqs() + mock_call_cyborg.return_value = None, 'Some error' + self.assertRaises(exception.AcceleratorRequestOpFailed, + self.client.bind_arqs, bindings=bindings) + + @mock.patch('keystoneauth1.adapter.Adapter.get') + def test_get_arqs_for_instance(self, mock_cyborg_get): + # Happy path, without only_resolved=True + _, bound_arqs = self._get_bound_arqs() + instance_uuid = bound_arqs[0]['instance_uuid'] + + query = {"instance": instance_uuid} + content = jsonutils.dumps({'arqs': bound_arqs}) + resp = fake_requests.FakeResponse(200, content) + mock_cyborg_get.return_value = resp + + ret_arqs = self.client.get_arqs_for_instance(instance_uuid) + + mock_cyborg_get.assert_called_once_with( + self.client.ARQ_URL, params=query) + + bound_arqs.sort(key=lambda x: x['uuid']) + ret_arqs.sort(key=lambda x: x['uuid']) + for ret_arq, bound_arq in zip(ret_arqs, bound_arqs): + self.assertDictEqual(ret_arq, bound_arq) + + @mock.patch('keystoneauth1.adapter.Adapter.get') + def test_get_arqs_for_instance_exception(self, mock_cyborg_get): + # If Cyborg returns an error code, raise exception + _, bound_arqs = self._get_bound_arqs() + instance_uuid = bound_arqs[0]['instance_uuid'] + + resp = fake_requests.FakeResponse(404, content='') + mock_cyborg_get.return_value = resp + self.assertRaises( + exception.AcceleratorRequestOpFailed, + self.client.get_arqs_for_instance, instance_uuid) + + @mock.patch('keystoneauth1.adapter.Adapter.get') + def test_get_arqs_for_instance_exception_no_resp(self, mock_cyborg_get): + # If Cyborg returns an error code, raise exception + _, bound_arqs = self._get_bound_arqs() + instance_uuid = bound_arqs[0]['instance_uuid'] + + content = jsonutils.dumps({'noarqs': 'oops'}) + resp = fake_requests.FakeResponse(200, content) + mock_cyborg_get.return_value = resp + self.assertRaisesRegex( + exception.AcceleratorRequestOpFailed, + 'Cyborg returned no accelerator requests for ', + self.client.get_arqs_for_instance, instance_uuid) + + @mock.patch('keystoneauth1.adapter.Adapter.get') + def test_get_arqs_for_instance_all_resolved(self, mock_cyborg_get): + # If all ARQs are resolved, return full list + _, bound_arqs = self._get_bound_arqs() + instance_uuid = bound_arqs[0]['instance_uuid'] + + query = {"instance": instance_uuid} + content = 
jsonutils.dumps({'arqs': bound_arqs})
+        resp = fake_requests.FakeResponse(200, content)
+        mock_cyborg_get.return_value = resp
+
+        ret_arqs = self.client.get_arqs_for_instance(
+            instance_uuid, only_resolved=True)
+
+        mock_cyborg_get.assert_called_once_with(
+            self.client.ARQ_URL, params=query)
+
+        bound_arqs.sort(key=lambda x: x['uuid'])
+        ret_arqs.sort(key=lambda x: x['uuid'])
+        for ret_arq, bound_arq in zip(ret_arqs, bound_arqs):
+            self.assertDictEqual(ret_arq, bound_arq)
+
+    @mock.patch('keystoneauth1.adapter.Adapter.get')
+    def test_get_arqs_for_instance_some_resolved(self, mock_cyborg_get):
+        # If only some ARQs are resolved, return just the resolved ones
+        unbound_arqs, _ = self._get_arqs_and_request_groups()
+        _, bound_arqs = self._get_bound_arqs()
+        # Create a mixture of unbound and bound ARQs
+        arqs = [unbound_arqs[0], bound_arqs[0]]
+        instance_uuid = bound_arqs[0]['instance_uuid']
+
+        query = {"instance": instance_uuid}
+        content = jsonutils.dumps({'arqs': arqs})
+        resp = fake_requests.FakeResponse(200, content)
+        mock_cyborg_get.return_value = resp
+
+        ret_arqs = self.client.get_arqs_for_instance(
+            instance_uuid, only_resolved=True)
+
+        mock_cyborg_get.assert_called_once_with(
+            self.client.ARQ_URL, params=query)
+        self.assertEqual(ret_arqs, [bound_arqs[0]])
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 5bfe2aaa9337..2a8a2cbe1fce 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5993,6 +5993,188 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
             mock_hooks.setdefault().run_post.assert_called_once_with(
                 'build_instance', result, mock.ANY, mock.ANY, f=None)

+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(nova.compute.manager.ComputeManager,
+                       '_default_block_device_names')
+    @mock.patch.object(nova.compute.manager.ComputeManager,
+                       '_prep_block_device')
+    @mock.patch.object(virt_driver.ComputeDriver,
+                       'prepare_for_spawn')
+    @mock.patch.object(nova.compute.manager.ComputeManager,
+                       '_build_networks_for_instance')
+    @mock.patch.object(virt_driver.ComputeDriver,
+                       'prepare_networks_before_block_device_mapping')
+    def _test_accel_build_resources(self, mock_prep_net, mock_build_net,
+            mock_prep_spawn, mock_prep_bd, mock_bdnames, mock_save):
+
+        args = (self.context, self.instance, self.requested_networks,
+                self.security_groups, self.image, self.block_device_mapping,
+                self.resource_provider_mapping)
+
+        with self.compute._build_resources(*args) as resources:
+            pass
+
+        return resources
+
+    @mock.patch.object(nova.compute.manager.ComputeManager,
+                       '_get_bound_arq_resources')
+    def test_accel_build_resources_no_device_profile(self, mock_get_arqs):
+        # If dp_name is None, the accel path is a no-op.
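+        # A flavor opts into accelerators via extra_specs, e.g.
+        # {'accel:device_profile': 'mydp'}; an empty dict requests none.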
+ self.instance.flavor.extra_specs = {} + self._test_accel_build_resources() + mock_get_arqs.assert_not_called() + + @mock.patch.object(nova.compute.manager.ComputeManager, + '_get_bound_arq_resources') + def test_accel_build_resources(self, mock_get_arqs): + # Happy path for accels in build_resources + dp_name = "mydp" + self.instance.flavor.extra_specs = {"accel:device_profile": dp_name} + arq_list = fixtures.CyborgFixture.bound_arq_list + mock_get_arqs.return_value = arq_list + + resources = self._test_accel_build_resources() + + mock_get_arqs.assert_called_once_with(self.context, + dp_name, self.instance) + self.assertEqual(sorted(resources['accel_info']), sorted(arq_list)) + + @mock.patch.object(virt_driver.ComputeDriver, + 'clean_networks_preparation') + @mock.patch.object(nova.compute.manager.ComputeManager, + '_get_bound_arq_resources') + def test_accel_build_resources_exception(self, mock_get_arqs, + mock_clean_net): + dp_name = "mydp" + self.instance.flavor.extra_specs = {"accel:device_profile": dp_name} + mock_get_arqs.side_effect = ( + exception.AcceleratorRequestOpFailed(op='get', msg='')) + + self.assertRaises(exception.NovaException, + self._test_accel_build_resources) + mock_clean_net.assert_called_once() + + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'exit_wait_early') + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arqs_for_instance') + def test_arq_bind_wait_exit_early(self, mock_get_arqs, + mock_wait_inst_ev, mock_exit_wait_early): + # Bound ARQs available on first query, quit early. + dp_name = fixtures.CyborgFixture.dp_name + arq_list = fixtures.CyborgFixture.bound_arq_list + self.instance.flavor.extra_specs = {"accel:device_profile": dp_name} + arq_events = [('accelerator-request-bound', arq['uuid']) + for arq in arq_list] + + # get_arqs_for_instance() is called twice, once to get all ARQs + # for the instance, once to get only the resolved ARQs + mock_get_arqs.return_value = arq_list + + ret_arqs = self.compute._get_bound_arq_resources( + self.context, dp_name, self.instance) + + mock_wait_inst_ev.assert_called_once_with( + self.instance, arq_events, deadline=mock.ANY) + mock_exit_wait_early.assert_called_once_with(arq_events) + + mock_get_arqs.assert_has_calls([ + mock.call(self.instance.uuid), + mock.call(self.instance.uuid, only_resolved=True)]) + + self.assertEqual(sorted(ret_arqs), sorted(arq_list)) + + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'exit_wait_early') + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arqs_for_instance') + def test_arq_bind_wait(self, mock_get_arqs, + mock_wait_inst_ev, mock_exit_wait_early): + # If binding is in progress, must wait. 
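+        # No bind events arrive early in this scenario, so
+        # exit_wait_early() must not be called (asserted below).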
+ dp_name = fixtures.CyborgFixture.dp_name + arq_list = fixtures.CyborgFixture.bound_arq_list + self.instance.flavor.extra_specs = {"accel:device_profile": dp_name} + arq_events = [('accelerator-request-bound', arq['uuid']) + for arq in arq_list] + # get_arqs_for_instance gets called 3 times, returning the full + # ARQ list first, [] as resolved ARQs next, and the full list finally + mock_get_arqs.side_effect = [arq_list, [], arq_list] + + ret_arqs = self.compute._get_bound_arq_resources( + self.context, dp_name, self.instance) + + mock_wait_inst_ev.assert_called_once_with( + self.instance, arq_events, deadline=mock.ANY) + mock_exit_wait_early.assert_not_called() + self.assertEqual(sorted(ret_arqs), sorted(arq_list)) + mock_get_arqs.assert_has_calls([ + mock.call(self.instance.uuid), + mock.call(self.instance.uuid, only_resolved=True), + mock.call(self.instance.uuid)]) + + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'exit_wait_early') + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arqs_for_instance') + def test_arq_bind_timeout(self, mock_get_arqs, + mock_wait_inst_ev, mock_exit_wait_early): + # If binding fails even after wait, exception is thrown + dp_name = fixtures.CyborgFixture.dp_name + arq_list = fixtures.CyborgFixture.bound_arq_list + self.instance.flavor.extra_specs = {"accel:device_profile": dp_name} + arq_events = [('accelerator-request-bound', arq['uuid']) + for arq in arq_list] + + mock_get_arqs.return_value = arq_list + mock_wait_inst_ev.side_effect = eventlet_timeout.Timeout + + self.assertRaises(eventlet_timeout.Timeout, + self.compute._get_bound_arq_resources, + self.context, dp_name, self.instance) + + mock_wait_inst_ev.assert_called_once_with( + self.instance, arq_events, deadline=mock.ANY) + mock_exit_wait_early.assert_not_called() + mock_get_arqs.assert_called_once_with(self.instance.uuid) + + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'exit_wait_early') + @mock.patch.object(nova.compute.manager.ComputeVirtAPI, + 'wait_for_instance_event') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' + 'get_arqs_for_instance') + def test_arq_bind_exception(self, mock_get_arqs, + mock_wait_inst_ev, mock_exit_wait_early): + # If the code inside the context manager of _get_bound_arq_resources + # raises an exception, that exception must be handled. 
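+        # Simulated below by making the second get_arqs_for_instance()
+        # call, issued inside the wait context, raise.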
+        dp_name = fixtures.CyborgFixture.dp_name
+        arq_list = fixtures.CyborgFixture.bound_arq_list
+        self.instance.flavor.extra_specs = {"accel:device_profile": dp_name}
+        arq_events = [('accelerator-request-bound', arq['uuid'])
+                      for arq in arq_list]
+
+        mock_get_arqs.side_effect = [
+            arq_list,
+            exception.AcceleratorRequestOpFailed(op='', msg='')]
+
+        self.assertRaises(exception.AcceleratorRequestOpFailed,
+                          self.compute._get_bound_arq_resources,
+                          self.context, dp_name, self.instance)
+
+        mock_wait_inst_ev.assert_called_once_with(
+            self.instance, arq_events, deadline=mock.ANY)
+        mock_exit_wait_early.assert_not_called()
+        mock_get_arqs.assert_has_calls([
+            mock.call(self.instance.uuid),
+            mock.call(self.instance.uuid, only_resolved=True)])
+
     def test_build_and_run_instance_called_with_proper_args(self):
         self._test_build_and_run_instance()

diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 39ff24c83472..a642526e36d2 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -437,6 +437,8 @@ class _BaseTaskTestCase(object):
     def test_cold_migrate_forced_shutdown(self):
         self._test_cold_migrate(clean_shutdown=False)

+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_create_and_bind_arqs')
     @mock.patch.object(compute_rpcapi.ComputeAPI, 'build_and_run_instance')
     @mock.patch.object(db, 'block_device_mapping_get_all_by_instance',
                        return_value=[])
@@ -448,7 +450,7 @@ class _BaseTaskTestCase(object):
     @mock.patch.object(objects.RequestSpec, 'from_primitives')
     def test_build_instances(self, mock_fp, mock_save, mock_getaz,
                              mock_buildreq, mock_schedule, mock_bdm,
-                             mock_build):
+                             mock_build, mock_create_bind_arqs):
         """Tests creating two instances and the scheduler returns a unique
         host/node combo for each instance.
         """
@@ -530,6 +532,86 @@ class _BaseTaskTestCase(object):
                 security_groups='security_groups',
                 block_device_mapping=mock.ANY,
                 node='node2', limits=None, host_list=sched_return[1])])
+        mock_create_bind_arqs.assert_has_calls([
+            mock.call(self.context, instances[0].uuid,
+                      instances[0].flavor.extra_specs, 'node1', mock.ANY),
+            mock.call(self.context, instances[1].uuid,
+                      instances[1].flavor.extra_specs, 'node2', mock.ANY),
+            ])
+
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_cleanup_when_reschedule_fails')
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_create_and_bind_arqs')
+    @mock.patch.object(compute_rpcapi.ComputeAPI, 'build_and_run_instance')
+    @mock.patch.object(db, 'block_device_mapping_get_all_by_instance',
+                       return_value=[])
+    @mock.patch.object(conductor_manager.ComputeTaskManager,
+                       '_schedule_instances')
+    @mock.patch('nova.objects.BuildRequest.get_by_instance_uuid')
+    @mock.patch('nova.availability_zones.get_host_availability_zone')
+    @mock.patch('nova.objects.Instance.save')
+    @mock.patch.object(objects.RequestSpec, 'from_primitives')
+    def test_build_instances_arq_failure(self, mock_fp, mock_save, mock_getaz,
+                                         mock_buildreq, mock_schedule,
+                                         mock_bdm, mock_build,
+                                         mock_create_bind_arqs, mock_cleanup):
+        """If _create_and_bind_arqs raises an exception,
+        _cleanup_when_reschedule_fails must be called for each instance.
+ """ + fake_spec = objects.RequestSpec() + mock_fp.return_value = fake_spec + instance_type = objects.Flavor.get_by_name(self.context, 'm1.small') + # NOTE(danms): Avoid datetime timezone issues with converted flavors + instance_type.created_at = None + instances = [objects.Instance(context=self.context, + id=i, + uuid=uuids.fake, + flavor=instance_type) for i in range(2)] + instance_properties = obj_base.obj_to_primitive(instances[0]) + instance_properties['system_metadata'] = flavors.save_flavor_info( + {}, instance_type) + + sched_return = copy.deepcopy(fake_host_lists2) + mock_schedule.return_value = sched_return + + # build_instances() is a cast, we need to wait for it to complete + self.useFixture(cast_as_call.CastAsCall(self)) + + mock_getaz.return_value = 'myaz' + mock_create_bind_arqs.side_effect = ( + exc.AcceleratorRequestOpFailed(op='', msg='')) + + self.conductor.build_instances(self.context, + instances=instances, + image={'fake_data': 'should_pass_silently'}, + filter_properties={}, + admin_password='admin_password', + injected_files='injected_files', + requested_networks=None, + security_groups='security_groups', + block_device_mapping='block_device_mapping', + legacy_bdm=False, host_lists=None) + mock_create_bind_arqs.assert_has_calls([ + mock.call(self.context, instances[0].uuid, + instances[0].flavor.extra_specs, 'node1', mock.ANY), + mock.call(self.context, instances[1].uuid, + instances[1].flavor.extra_specs, 'node2', mock.ANY), + ]) + # Comparing instances fails because the instance objects have changed + # in the above flow. So, we compare the fields instead. + mock_cleanup.assert_has_calls([ + mock.call(self.context, test.MatchType(objects.Instance), + test.MatchType(exc.AcceleratorRequestOpFailed), + test.MatchType(dict), None), + mock.call(self.context, test.MatchType(objects.Instance), + test.MatchType(exc.AcceleratorRequestOpFailed), + test.MatchType(dict), None), + ]) + call_list = mock_cleanup.call_args_list + for idx, instance in enumerate(instances): + actual_inst = call_list[idx][0][1] + self.assertEqual(actual_inst['uuid'], instance['uuid']) + self.assertEqual(actual_inst['flavor']['extra_specs'], {}) @mock.patch.object(scheduler_utils, 'build_request_spec') @mock.patch.object(scheduler_utils, 'setup_instance_group') @@ -1880,6 +1962,45 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): self.params = params self.flavor = objects.Flavor.get_by_name(self.ctxt, 'm1.tiny') + @mock.patch('nova.accelerator.cyborg.get_client') + def test_create_bind_arqs_no_device_profile(self, mock_get_client): + # If no device profile name, it is a no op. + hostname = 'myhost' + instance = fake_instance.fake_instance_obj(self.context) + + instance.flavor.extra_specs = {} + self.conductor._create_and_bind_arqs(self.context, + instance.uuid, instance.flavor.extra_specs, + hostname, resource_provider_mapping=mock.ANY) + mock_get_client.assert_not_called() + + @mock.patch('nova.accelerator.cyborg._CyborgClient.bind_arqs') + @mock.patch('nova.accelerator.cyborg._CyborgClient.' 
+ 'create_arqs_and_match_resource_providers') + def test_create_bind_arqs(self, mock_create, mock_bind): + # Happy path + hostname = 'myhost' + instance = fake_instance.fake_instance_obj(self.context) + dp_name = 'mydp' + instance.flavor.extra_specs = {'accel:device_profile': dp_name} + + in_arq_list, _ = fixtures.get_arqs(dp_name) + mock_create.return_value = in_arq_list + + self.conductor._create_and_bind_arqs(self.context, + instance.uuid, instance.flavor.extra_specs, + hostname, resource_provider_mapping=mock.ANY) + + mock_create.assert_called_once_with(dp_name, mock.ANY) + + expected_bindings = { + 'b59d34d3-787b-4fb0-a6b9-019cd81172f8': + {'hostname': hostname, + 'device_rp_uuid': mock.ANY, + 'instance_uuid': instance.uuid} + } + mock_bind.assert_called_once_with(bindings=expected_bindings) + @mock.patch('nova.availability_zones.get_host_availability_zone') @mock.patch('nova.compute.rpcapi.ComputeAPI.build_and_run_instance') @mock.patch('nova.scheduler.rpcapi.SchedulerAPI.select_destinations') @@ -2399,6 +2520,44 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): self.params['request_specs'][0].requested_resources = [] self._do_schedule_and_build_instances_test(self.params) + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arqs') + def test_schedule_and_build_instances_with_arqs_bind_ok( + self, mock_create_bind_arqs): + extra_specs = {'accel:device_profile': 'mydp'} + instance = self.params['build_requests'][0].instance + instance.flavor.extra_specs = extra_specs + + self._do_schedule_and_build_instances_test(self.params) + + # NOTE(Sundar): At this point, the instance has not been + # associated with a host yet. The default host.nodename is + # 'node1'. + mock_create_bind_arqs.assert_called_once_with( + self.params['context'], instance.uuid, extra_specs, + 'node1', mock.ANY) + + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_cleanup_build_artifacts') + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_create_and_bind_arqs') + def test_schedule_and_build_instances_with_arqs_bind_exception( + self, mock_create_bind_arqs, mock_cleanup): + # Exceptions in _create_and_bind_arqs result in cleanup + mock_create_bind_arqs.side_effect = ( + exc.AcceleratorRequestOpFailed(op='', msg='')) + + try: + self._do_schedule_and_build_instances_test(self.params) + except exc.AcceleratorRequestOpFailed: + pass + + mock_cleanup.assert_called_once_with( + self.params['context'], mock.ANY, mock.ANY, + self.params['build_requests'], self.params['request_specs'], + self.params['block_device_mapping'], self.params['tags'], + mock.ANY) + def test_map_instance_to_cell_already_mapped(self): """Tests a scenario where an instance is already mapped to a cell during scheduling.