From d913de7471871abbf6920fc672171657f75890dc Mon Sep 17 00:00:00 2001 From: Ed Leafe Date: Tue, 10 Oct 2017 23:25:29 +0000 Subject: [PATCH] Make conductor pass and use host_lists The earlier patches in the series generated alternates and Selection objects, and modified the RPC calls to send them to the conductor. This patch has the conductor pass these host_lists to the compute for the build process, and, if the build fails, has the compute pass the host_list back to the conductor. Also fixes a bug in the scheduler manager exposed by this change when using the CachingScheduler in a reschedule functional test. Blueprint: return-alternate-hosts Change-Id: Iae904afb6cb4fcea8bb27741d774ffbe986a5fb4 --- nova/compute/manager.py | 45 ++++--- nova/compute/rpcapi.py | 44 +++++-- nova/conductor/api.py | 5 +- nova/conductor/manager.py | 84 ++++++++++-- nova/conductor/rpcapi.py | 42 +++--- nova/objects/service.py | 4 +- nova/scheduler/filter_scheduler.py | 9 +- nova/scheduler/manager.py | 5 +- .../regressions/test_bug_1671648.py | 8 ++ nova/tests/unit/compute/test_compute_mgr.py | 38 +++--- nova/tests/unit/compute/test_rpcapi.py | 46 ++++++- nova/tests/unit/conductor/test_conductor.py | 124 +++++++++++++++--- nova/tests/unit/objects/test_service.py | 2 +- .../unit/scheduler/test_filter_scheduler.py | 10 +- 14 files changed, 365 insertions(+), 101 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index d617c4e265ef..6275bcb9ab77 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -485,7 +485,7 @@ class ComputeVirtAPI(virtapi.VirtAPI): class ComputeManager(manager.Manager): """Manages the running instances from creation to destruction.""" - target = messaging.Target(version='4.18') + target = messaging.Target(version='4.19') # How long to wait in seconds before re-issuing a shutdown # signal to an instance during power off. The overall @@ -1738,7 +1738,7 @@ class ComputeManager(manager.Manager): filter_properties, admin_password=None, injected_files=None, requested_networks=None, security_groups=None, block_device_mapping=None, - node=None, limits=None): + node=None, limits=None, host_list=None): @utils.synchronized(instance.uuid) def _locked_do_build_and_run_instance(*args, **kwargs): @@ -1760,24 +1760,24 @@ class ComputeManager(manager.Manager): result = build_results.FAILED raise finally: - fails = (build_results.FAILED, - build_results.RESCHEDULED) - if result in fails: - # Remove the allocation records from Placement for - # the instance if the build failed or is being - # rescheduled to another node. The instance.host is + if result == build_results.FAILED: + # Remove the allocation records from Placement for the + # instance if the build failed. The instance.host is # likely set to None in _do_build_and_run_instance - # which means if the user deletes the instance, it will - # be deleted in the API, not the compute service. + # which means if the user deletes the instance, it + # will be deleted in the API, not the compute service. # Setting the instance.host to None in # _do_build_and_run_instance means that the # ResourceTracker will no longer consider this instance # to be claiming resources against it, so we want to - # reflect that same thing in Placement. - rt = self._get_resource_tracker() - rt.reportclient.delete_allocation_for_instance( - instance.uuid) + # reflect that same thing in Placement. No need to + # call this for a reschedule, as the allocations will + # have already been removed in + # self._do_build_and_run_instance(). 
+ self._delete_allocation_for_instance(instance.uuid) + if result in (build_results.FAILED, + build_results.RESCHEDULED): self._build_failed() else: self._failed_builds = 0 @@ -1789,7 +1789,11 @@ class ComputeManager(manager.Manager): context, instance, image, request_spec, filter_properties, admin_password, injected_files, requested_networks, security_groups, - block_device_mapping, node, limits) + block_device_mapping, node, limits, host_list) + + def _delete_allocation_for_instance(self, instance_uuid): + rt = self._get_resource_tracker() + rt.reportclient.delete_allocation_for_instance(instance_uuid) def _check_device_tagging(self, requested_networks, block_device_mapping): tagging_requested = False @@ -1817,7 +1821,7 @@ class ComputeManager(manager.Manager): def _do_build_and_run_instance(self, context, instance, image, request_spec, filter_properties, admin_password, injected_files, requested_networks, security_groups, block_device_mapping, - node=None, limits=None): + node=None, limits=None, host_list=None): try: LOG.debug('Starting instance...', instance=instance) @@ -1895,11 +1899,18 @@ class ComputeManager(manager.Manager): self._nil_out_instance_obj_host_and_node(instance) instance.task_state = task_states.SCHEDULING instance.save() + # The instance will have already claimed resources from this host + # before this build was attempted. Now that it has failed, we need + # to unclaim those resources before casting to the conductor, so + # that if there are alternate hosts available for a retry, it can + # claim resources on that new host for the instance. + self._delete_allocation_for_instance(instance.uuid) self.compute_task_api.build_instances(context, [instance], image, filter_properties, admin_password, injected_files, requested_networks, security_groups, - block_device_mapping, request_spec=request_spec) + block_device_mapping, request_spec=request_spec, + host_lists=[host_list]) return build_results.RESCHEDULED except (exception.InstanceNotFound, exception.UnexpectedDeletingTaskStateError): diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 81149297fdc4..cf890b56bdaf 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -329,6 +329,14 @@ class ComputeAPI(object): * 4.16 - Add tag argument to attach_interface() * 4.17 - Add new_attachment_id to swap_volume. * 4.18 - Add migration to prep_resize() + + ... Pike supports messaging version 4.18. So any changes to existing + methods in 4.x after that point should be done so that they can handle + the version_cap being set to 4.18. + + * 4.19 - build_and_run_instance() now gets a 'host_list' parameter + representing potential alternate hosts for retries within a + cell. ''' VERSION_ALIASES = { @@ -339,6 +347,7 @@ class ComputeAPI(object): 'mitaka': '4.11', 'newton': '4.13', 'ocata': '4.13', + 'pike': '4.18', } def __init__(self): @@ -1116,22 +1125,31 @@ class ComputeAPI(object): def build_and_run_instance(self, ctxt, instance, host, image, request_spec, filter_properties, admin_password=None, injected_files=None, requested_networks=None, security_groups=None, - block_device_mapping=None, node=None, limits=None): + block_device_mapping=None, node=None, limits=None, + host_list=None): # NOTE(edleafe): compute nodes can only use the dict form of limits. 
if isinstance(limits, objects.SchedulerLimits): limits = limits.to_dict() - version = '4.0' - cctxt = self.router.client(ctxt).prepare( - server=host, version=version) - cctxt.cast(ctxt, 'build_and_run_instance', instance=instance, - image=image, request_spec=request_spec, - filter_properties=filter_properties, - admin_password=admin_password, - injected_files=injected_files, - requested_networks=requested_networks, - security_groups=security_groups, - block_device_mapping=block_device_mapping, node=node, - limits=limits) + kwargs = {"instance": instance, + "image": image, + "request_spec": request_spec, + "filter_properties": filter_properties, + "admin_password": admin_password, + "injected_files": injected_files, + "requested_networks": requested_networks, + "security_groups": security_groups, + "block_device_mapping": block_device_mapping, + "node": node, + "limits": limits, + "host_list": host_list, + } + client = self.router.client(ctxt) + version = '4.19' + if not client.can_send_version(version): + version = '4.0' + kwargs.pop("host_list") + cctxt = client.prepare(server=host, version=version) + cctxt.cast(ctxt, 'build_and_run_instance', **kwargs) def quiesce_instance(self, ctxt, instance): version = '4.0' diff --git a/nova/conductor/api.py b/nova/conductor/api.py index 29a466f4ada1..35317d9d85a8 100644 --- a/nova/conductor/api.py +++ b/nova/conductor/api.py @@ -113,7 +113,7 @@ class ComputeTaskAPI(object): def build_instances(self, context, instances, image, filter_properties, admin_password, injected_files, requested_networks, security_groups, block_device_mapping, legacy_bdm=True, - request_spec=None): + request_spec=None, host_lists=None): self.conductor_compute_rpcapi.build_instances(context, instances=instances, image=image, filter_properties=filter_properties, @@ -121,7 +121,8 @@ class ComputeTaskAPI(object): requested_networks=requested_networks, security_groups=security_groups, block_device_mapping=block_device_mapping, - legacy_bdm=legacy_bdm, request_spec=request_spec) + legacy_bdm=legacy_bdm, request_spec=request_spec, + host_lists=host_lists) def schedule_and_build_instances(self, context, build_requests, request_spec, image, diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index 5bbfcd9ea86a..d1013b4147dc 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -21,6 +21,7 @@ import functools from oslo_config import cfg from oslo_log import log as logging import oslo_messaging as messaging +from oslo_serialization import jsonutils from oslo_utils import excutils from oslo_utils import versionutils import six @@ -218,7 +219,7 @@ class ComputeTaskManager(base.Base): may involve coordinating activities on multiple compute nodes. 
""" - target = messaging.Target(namespace='compute_task', version='1.18') + target = messaging.Target(namespace='compute_task', version='1.19') def __init__(self): super(ComputeTaskManager, self).__init__() @@ -227,6 +228,7 @@ class ComputeTaskManager(base.Base): self.network_api = network.API() self.servicegroup_api = servicegroup.API() self.scheduler_client = scheduler_client.SchedulerClient() + self.report_client = self.scheduler_client.reportclient self.notifier = rpc.get_notifier('compute', CONF.host) def reset(self): @@ -516,7 +518,7 @@ class ComputeTaskManager(base.Base): def build_instances(self, context, instances, image, filter_properties, admin_password, injected_files, requested_networks, security_groups, block_device_mapping=None, legacy_bdm=True, - request_spec=None): + request_spec=None, host_lists=None): # TODO(ndipanov): Remove block_device_mapping and legacy_bdm in version # 2.0 of the RPC API. # TODO(danms): Remove this in version 2.0 of the RPC API @@ -548,6 +550,11 @@ class ComputeTaskManager(base.Base): # work on a RequestSpec object rather than filter_properties. request_spec = request_spec.to_legacy_request_spec_dict() + # 'host_lists' will be None in one of two cases: when running cellsv1, + # or during a reschedule from a pre-Queens compute. In all other cases, + # it will be a list of lists, though the lists may be empty if there + # are no more hosts left in a rescheduling situation. + is_reschedule = host_lists is not None try: # check retry policy. Rather ugly use of instances[0]... # but if we've exceeded max retries... then we really only @@ -559,8 +566,22 @@ class ComputeTaskManager(base.Base): instance_uuids = [instance.uuid for instance in instances] spec_obj = objects.RequestSpec.from_primitives( context, request_spec, filter_properties) - host_lists = self._schedule_instances(context, spec_obj, - instance_uuids, return_alternates=True) + LOG.debug("Rescheduling: %s", is_reschedule) + if is_reschedule: + # Make sure that we have a host, as we may have exhausted all + # our alternates + if not host_lists[0]: + # We have an empty list of hosts, so this instance has + # failed to build. + msg = ("Exhausted all hosts available for retrying build " + "failures for instance %(instance_uuid)s." % + {"instance_uuid": instances[0].uuid}) + raise exception.MaxRetriesExceeded(reason=msg) + else: + # This is not a reschedule, so we need to call the scheduler to + # get appropriate hosts for the request. + host_lists = self._schedule_instances(context, spec_obj, + instance_uuids, return_alternates=True) except Exception as exc: num_attempts = filter_properties.get( 'retry', {}).get('num_attempts', 1) @@ -585,8 +606,45 @@ class ComputeTaskManager(base.Base): context, instance, requested_networks) return + elevated = context.elevated() for (instance, host_list) in six.moves.zip(instances, host_lists): - host = host_list[0] + host = host_list.pop(0) + if is_reschedule: + # If this runs in the superconductor, the first instance will + # already have its resources claimed in placement. If this is a + # retry, though, this is running in the cell conductor, and we + # need to claim first to ensure that the alternate host still + # has its resources available. Note that there are schedulers + # that don't support Placement, so must assume that the host is + # still available. 
+ host_available = False + while host and not host_available: + if host.allocation_request: + alloc_req = jsonutils.loads(host.allocation_request) + else: + alloc_req = None + if alloc_req: + host_available = scheduler_utils.claim_resources( + elevated, self.report_client, spec_obj, + instance.uuid, alloc_req, + host.allocation_request_version) + else: + # Some deployments use different schedulers that do not + # use Placement, so they will not have an + # allocation_request to claim with. For those cases, + # there is no concept of claiming, so just assume that + # the host is valid. + host_available = True + if not host_available: + # Insufficient resources remain on that host, so + # discard it and try the next. + host = host_list.pop(0) if host_list else None + if not host_available: + # No more available hosts for retrying the build. + msg = ("Exhausted all hosts available for retrying build " + "failures for instance %(instance_uuid)s." % + {"instance_uuid": instance.uuid}) + raise exception.MaxRetriesExceeded(reason=msg) instance.availability_zone = ( availability_zones.get_host_availability_zone(context, host.service_host)) @@ -634,6 +692,10 @@ class ComputeTaskManager(base.Base): inst_mapping.destroy() return + alts = [(alt.service_host, alt.nodename) for alt in host_list] + LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s", + host.service_host, host.nodename, alts, instance=instance) + self.compute_rpcapi.build_and_run_instance(context, instance=instance, host=host.service_host, image=image, request_spec=local_reqspec, @@ -643,7 +705,7 @@ class ComputeTaskManager(base.Base): requested_networks=requested_networks, security_groups=security_groups, block_device_mapping=bdms, node=host.nodename, - limits=host.limits) + limits=host.limits, host_list=host_list) def _schedule_instances(self, context, request_spec, instance_uuids=None, return_alternates=False): @@ -803,8 +865,7 @@ class ComputeTaskManager(base.Base): # in the API. try: scheduler_utils.claim_resources_on_destination( - self.scheduler_client.reportclient, instance, - source_node, dest_node) + self.report_client, instance, source_node, dest_node) except exception.NoValidHost as ex: with excutils.save_and_reraise_exception(): self._set_vm_state_and_notify( @@ -1134,7 +1195,10 @@ class ComputeTaskManager(base.Base): cell = cell_mapping_cache[instance.uuid] # host_list is a list of one or more Selection objects, the first # of which has been selected and its resources claimed. 
- host = host_list[0] + host = host_list.pop(0) + alts = [(alt.service_host, alt.nodename) for alt in host_list] + LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s", + host.service_host, host.nodename, alts, instance=instance) filter_props = request_spec.to_legacy_filter_properties_dict() scheduler_utils.populate_retry(filter_props, instance.uuid) scheduler_utils.populate_filter_properties(filter_props, @@ -1192,7 +1256,7 @@ class ComputeTaskManager(base.Base): security_groups=legacy_secgroups, block_device_mapping=instance_bdms, host=host.service_host, node=host.nodename, - limits=host.limits) + limits=host.limits, host_list=host_list) def _cleanup_build_artifacts(self, context, exc, instances, build_requests, request_specs, cell_mapping_cache): diff --git a/nova/conductor/rpcapi.py b/nova/conductor/rpcapi.py index af69ed063b49..85ab5f66f0c5 100644 --- a/nova/conductor/rpcapi.py +++ b/nova/conductor/rpcapi.py @@ -275,6 +275,9 @@ class ComputeTaskAPI(object): 1.16 - Added schedule_and_build_instances 1.17 - Added tags to schedule_and_build_instances() 1.18 - Added request_spec to build_instances(). + 1.19 - build_instances() now gets a 'host_lists' parameter that represents + potential alternate hosts for retries within a cell for each + instance. """ def __init__(self): @@ -329,39 +332,42 @@ class ComputeTaskAPI(object): def build_instances(self, context, instances, image, filter_properties, admin_password, injected_files, requested_networks, security_groups, block_device_mapping, legacy_bdm=True, - request_spec=None): + request_spec=None, host_lists=None): image_p = jsonutils.to_primitive(image) - version = '1.18' - send_reqspec = True + kwargs = {"instances": instances, "image": image_p, + "filter_properties": filter_properties, + "admin_password": admin_password, + "injected_files": injected_files, + "requested_networks": requested_networks, + "security_groups": security_groups, + "request_spec": request_spec, + "host_lists": host_lists} + version = '1.19' + if not self.client.can_send_version(version): + version = '1.18' + kwargs.pop("host_lists") if not self.client.can_send_version(version): - send_reqspec = False version = '1.10' + kwargs.pop("request_spec") if not self.client.can_send_version(version): version = '1.9' if 'instance_type' in filter_properties: flavor = filter_properties['instance_type'] flavor_p = objects_base.obj_to_primitive(flavor) - filter_properties = dict(filter_properties, - instance_type=flavor_p) - kw = {'instances': instances, 'image': image_p, - 'filter_properties': filter_properties, - 'admin_password': admin_password, - 'injected_files': injected_files, - 'requested_networks': requested_networks, - 'security_groups': security_groups} - if send_reqspec: - kw['request_spec'] = request_spec + kwargs["filter_properties"] = dict(filter_properties, + instance_type=flavor_p) if not self.client.can_send_version(version): version = '1.8' - kw['requested_networks'] = kw['requested_networks'].as_tuples() + nets = kwargs['requested_networks'].as_tuples() + kwargs['requested_networks'] = nets if not self.client.can_send_version('1.7'): version = '1.5' bdm_p = objects_base.obj_to_primitive(block_device_mapping) - kw.update({'block_device_mapping': bdm_p, - 'legacy_bdm': legacy_bdm}) + kwargs.update({'block_device_mapping': bdm_p, + 'legacy_bdm': legacy_bdm}) cctxt = self.client.prepare(version=version) - cctxt.cast(context, 'build_instances', **kw) + cctxt.cast(context, 'build_instances', **kwargs) def schedule_and_build_instances(self, context, 
build_requests, request_specs, diff --git a/nova/objects/service.py b/nova/objects/service.py index b8760128c029..5d644fb55229 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 25 +SERVICE_VERSION = 26 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -120,6 +120,8 @@ SERVICE_VERSION_HISTORY = ( # Version 25: Compute hosts allow migration-based allocations # for live migration. {'compute_rpc': '4.18'}, + # Version 26: Adds a 'host_list' parameter to build_and_run_instance() + {'compute_rpc': '4.19'}, ) diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py index fbeee0234cce..805f765ad3be 100644 --- a/nova/scheduler/filter_scheduler.py +++ b/nova/scheduler/filter_scheduler.py @@ -345,7 +345,14 @@ class FilterScheduler(driver.Scheduler): selections_to_return = [] for selected_host in selected_hosts: # This is the list of hosts for one particular instance. - selection = objects.Selection.from_host_state(selected_host) + if alloc_reqs_by_rp_uuid: + selected_alloc_req = alloc_reqs_by_rp_uuid.get( + selected_host.uuid)[0] + else: + selected_alloc_req = None + selection = objects.Selection.from_host_state(selected_host, + allocation_request=selected_alloc_req, + allocation_request_version=allocation_request_version) selected_plus_alts = [selection] cell_uuid = selected_host.cell_uuid # This will populate the alternates with many of the same unclaimed diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py index 88fb163e8911..235767d602f1 100644 --- a/nova/scheduler/manager.py +++ b/nova/scheduler/manager.py @@ -116,7 +116,8 @@ class SchedulerManager(manager.Manager): request_spec, filter_properties) resources = utils.resources_from_request_spec(spec_obj) - alloc_reqs_by_rp_uuid, provider_summaries = None, None + alloc_reqs_by_rp_uuid, provider_summaries, allocation_request_version \ + = None, None, None if self.driver.USES_ALLOCATION_CANDIDATES: res = self.placement_client.get_allocation_candidates(resources) if res is None: @@ -144,7 +145,7 @@ class SchedulerManager(manager.Manager): rp_uuid = rr['resource_provider']['uuid'] alloc_reqs_by_rp_uuid[rp_uuid].append(ar) - # Only return alteranates if both return_objects and return_alternates + # Only return alternates if both return_objects and return_alternates # are True. return_alternates = return_alternates and return_objects selections = self.driver.select_destinations(ctxt, spec_obj, diff --git a/nova/tests/functional/regressions/test_bug_1671648.py b/nova/tests/functional/regressions/test_bug_1671648.py index 08f874a7a3f0..a5b98425f985 100644 --- a/nova/tests/functional/regressions/test_bug_1671648.py +++ b/nova/tests/functional/regressions/test_bug_1671648.py @@ -148,3 +148,11 @@ class TestRetryBetweenComputeNodeBuilds(test.TestCase): # Assert that we retried. 
self.assertEqual(2, self.attempts) + + +class TestRetryBetweenComputeNodeBuildsCachingScheduler( + TestRetryBetweenComputeNodeBuilds): + """Tests the reschedule scenario using the CachingScheduler.""" + def setUp(self): + self.flags(driver='caching_scheduler', group='scheduler') + super(TestRetryBetweenComputeNodeBuildsCachingScheduler, self).setUp() diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 186367d58ec3..2f28d13275fb 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -73,6 +73,7 @@ from nova.volume import cinder CONF = nova.conf.CONF +fake_host_list = [mock.sentinel.host1] class ComputeManagerUnitTestCase(test.NoDBTestCase): @@ -4365,7 +4366,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.ACTIVE) @@ -4396,7 +4397,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): port_id=uuids.port_instance)], security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) requested_network = mock_build_and_run.call_args[0][5][0] self.assertEqual('fake_network_id', requested_network.network_id) self.assertEqual('10.0.0.1', str(requested_network.address)) @@ -4431,7 +4432,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) self._instance_action_events(mock_start, mock_finish) self._assert_build_instance_update(mock_save) @@ -4480,7 +4481,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, - node=self.node, limits=self.limits) + node=self.node, limits=self.limits, + host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.RESCHEDULED) @@ -4498,7 +4500,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): [self.instance], self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) @mock.patch.object(manager.ComputeManager, '_shutdown_instance') @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance') @@ -4569,7 +4571,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) mock_build_and_run.assert_called_once_with(self.context, instance, @@ -4583,7 +4585,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): [instance], self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) 
@mock.patch.object(manager.ComputeManager, '_build_and_run_instance') @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') @@ -4625,7 +4627,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) mock_build_and_run.assert_called_once_with(self.context, instance, @@ -4639,7 +4641,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): [instance], self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @@ -4668,7 +4670,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.FAILED) @@ -4721,7 +4723,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, - node=self.node, limits=self.limits) + node=self.node, limits=self.limits, + host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.RESCHEDULED) @@ -4740,7 +4743,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): [self.instance], self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @@ -4770,7 +4773,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.RESCHEDULED) @@ -4789,7 +4792,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): [self.instance], self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) @mock.patch.object(objects.InstanceActionEvent, 'event_finish_with_failure') @@ -4820,7 +4823,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, node=self.node, - limits=self.limits) + limits=self.limits, host_list=fake_host_list) self._assert_build_instance_hook_called(mock_hooks, build_results.FAILED) @@ -5163,7 +5166,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): requested_networks=self.requested_networks, security_groups=self.security_groups, block_device_mapping=self.block_device_mapping, - node=self.node, limits=self.limits) + node=self.node, limits=self.limits, + host_list=fake_host_list) 
self._instance_action_events(mock_start, mock_finish) self._assert_build_instance_update(mock_save, reschedule_update=True) @@ -5177,7 +5181,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self.image, self.filter_properties, self.admin_pass, self.injected_files, self.requested_networks, self.security_groups, self.block_device_mapping, - request_spec={}) + request_spec={}, host_lists=[fake_host_list]) mock_nil.assert_called_once_with(self.instance) mock_clean.assert_called_once_with(self.context, self.instance, self.compute.host) diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 416b8039cd75..eabbdd0c5a03 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -16,6 +16,7 @@ Unit Tests for nova.compute.rpcapi """ +import copy import mock from oslo_serialization import jsonutils @@ -23,6 +24,7 @@ from nova.compute import rpcapi as compute_rpcapi import nova.conf from nova import context from nova import exception +from nova import objects from nova.objects import block_device as objects_block_dev from nova.objects import migrate_data as migrate_data_obj from nova.objects import migration as migration_obj @@ -731,7 +733,49 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): admin_password='passwd', injected_files=None, requested_networks=['network1'], security_groups=None, block_device_mapping=None, node='node', limits=[], - version='4.0') + host_list=None, version='4.19') + + def test_build_and_run_instance_4_18(self): + ctxt = context.RequestContext('fake_user', 'fake_project') + rpcapi = compute_rpcapi.ComputeAPI() + mock_client = mock.Mock() + rpcapi.router.client = mock.Mock(return_value=mock_client) + mock_client.can_send_version = mock.Mock(return_value=False) + prepare_mock = mock.Mock() + prepare_mock.cast = mock.Mock() + mock_client.prepare.return_value = prepare_mock + fake_limit = {"memory_mb": 1024, "disk_gb": 100, "vcpus": 2, + "numa_topology": None} + fake_limit_obj = objects.SchedulerLimits.from_dict(fake_limit) + args = (self.fake_instance_obj, "host", "image", "request_spec", + "filter_properties") + kwargs = { + "admin_password": 'passwd', + "injected_files": None, + "requested_networks": ['network1'], + "security_groups": None, + "block_device_mapping": None, + "node": 'node', + "limits": fake_limit_obj, + "host_list": ["host"], + } + + expected_kwargs = copy.deepcopy(kwargs) + # Since we're failing the 'can_send_version' check, the host_list + # should be removed, and the limits objects should be converted to the + # older dict format. 
+ expected_kwargs.pop("host_list") + expected_kwargs["limits"] = fake_limit_obj.to_dict() + # Add in the args, which will be added to the kwargs dict in the RPC + # call + expected_kwargs["instance"] = self.fake_instance_obj + expected_kwargs["image"] = "image" + expected_kwargs["request_spec"] = "request_spec" + expected_kwargs["filter_properties"] = "filter_properties" + + rpcapi.build_and_run_instance(ctxt, *args, **kwargs) + prepare_mock.cast.assert_called_once_with(ctxt, + "build_and_run_instance", **expected_kwargs) def test_quiesce_instance(self): self._test_compute_api('quiesce_instance', 'call', diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py index 003c124863ff..4c9d35eaa70a 100644 --- a/nova/tests/unit/conductor/test_conductor.py +++ b/nova/tests/unit/conductor/test_conductor.py @@ -65,10 +65,40 @@ from nova import utils CONF = conf.CONF +fake_alloc1 = {"allocations": [ + {"resource_provider": {"uuid": uuids.host1}, + "resources": {"VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100} + }]} +fake_alloc2 = {"allocations": [ + {"resource_provider": {"uuid": uuids.host2}, + "resources": {"VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100} + }]} +fake_alloc3 = {"allocations": [ + {"resource_provider": {"uuid": uuids.host3}, + "resources": {"VCPU": 1, + "MEMORY_MB": 1024, + "DISK_GB": 100} + }]} +fake_alloc_json1 = jsonutils.dumps(fake_alloc1) +fake_alloc_json2 = jsonutils.dumps(fake_alloc2) +fake_alloc_json3 = jsonutils.dumps(fake_alloc3) +fake_alloc_version = "1.23" fake_selection1 = objects.Selection(service_host="host1", nodename="node1", - cell_uuid=uuids.cell, limits=None) + cell_uuid=uuids.cell, limits=None, allocation_request=fake_alloc_json1, + allocation_request_version=fake_alloc_version) fake_selection2 = objects.Selection(service_host="host2", nodename="node2", - cell_uuid=uuids.cell, limits=None) + cell_uuid=uuids.cell, limits=None, allocation_request=fake_alloc_json2, + allocation_request_version=fake_alloc_version) +fake_selection3 = objects.Selection(service_host="host3", nodename="node3", + cell_uuid=uuids.cell, limits=None, allocation_request=fake_alloc_json3, + allocation_request_version=fake_alloc_version) +fake_host_lists1 = [[fake_selection1]] +fake_host_lists2 = [[fake_selection1], [fake_selection2]] +fake_host_lists_alt = [[fake_selection1, fake_selection2, fake_selection3]] class FakeContext(context.RequestContext): @@ -422,9 +452,10 @@ class _BaseTaskTestCase(object): 'instance_type': instance_type_p, 'num_instances': 2} filter_properties = {'retry': {'num_attempts': 1, 'hosts': []}} + sched_return = copy.deepcopy(fake_host_lists2) self.conductor_manager._schedule_instances(self.context, fake_spec, [uuids.fake, uuids.fake], return_alternates=True - ).AndReturn([[fake_selection1], [fake_selection2]]) + ).AndReturn(sched_return) db.block_device_mapping_get_all_by_instance(self.context, instances[0].uuid).AndReturn([]) filter_properties2 = {'retry': {'num_attempts': 1, @@ -442,7 +473,7 @@ class _BaseTaskTestCase(object): requested_networks=None, security_groups='security_groups', block_device_mapping=mox.IgnoreArg(), - node='node1', limits=None) + node='node1', limits=None, host_list=sched_return[0]) db.block_device_mapping_get_all_by_instance(self.context, instances[1].uuid).AndReturn([]) filter_properties3 = {'limits': {}, @@ -460,7 +491,7 @@ class _BaseTaskTestCase(object): requested_networks=None, security_groups='security_groups', block_device_mapping=mox.IgnoreArg(), - node='node2', limits=None) + 
node='node2', limits=None, host_list=sched_return[1]) self.mox.ReplayAll() # build_instances() is a cast, we need to wait for it to complete @@ -477,7 +508,7 @@ class _BaseTaskTestCase(object): requested_networks=None, security_groups='security_groups', block_device_mapping='block_device_mapping', - legacy_bdm=False) + legacy_bdm=False, host_lists=None) mock_getaz.assert_has_calls([ mock.call(self.context, 'host1'), mock.call(self.context, 'host2')]) @@ -670,6 +701,62 @@ class _BaseTaskTestCase(object): mock.call(self.context, instances[1].uuid)]) self.assertFalse(mock_get_by_host.called) + @mock.patch("nova.scheduler.utils.claim_resources", return_value=False) + @mock.patch.object(objects.Instance, 'save') + def test_build_instances_exhaust_host_list(self, _mock_save, mock_claim): + # A list of three alternate hosts for one instance + host_lists = copy.deepcopy(fake_host_lists_alt) + instance = fake_instance.fake_instance_obj(self.context) + image = {'fake-data': 'should_pass_silently'} + expected_claim_count = len(host_lists[0]) + + # build_instances() is a cast, we need to wait for it to complete + self.useFixture(cast_as_call.CastAsCall(self)) + # Since claim_resources() is mocked to always return False, we will run + # out of alternate hosts, and MaxRetriesExceeded should be raised. + self.assertRaises(exc.MaxRetriesExceeded, + self.conductor.build_instances, context=self.context, + instances=[instance], image=image, filter_properties={}, + admin_password='admin_password', + injected_files='injected_files', requested_networks=None, + security_groups='security_groups', + block_device_mapping=None, legacy_bdm=None, + host_lists=host_lists) + self.assertEqual(expected_claim_count, mock_claim.call_count) + + @mock.patch.object(conductor_manager.ComputeTaskManager, + '_destroy_build_request') + @mock.patch.object(conductor_manager.LOG, 'debug') + @mock.patch("nova.scheduler.utils.claim_resources", return_value=True) + @mock.patch.object(objects.Instance, 'save') + def test_build_instances_logs_selected_and_alts(self, _mock_save, + mock_claim, mock_debug, mock_destroy): + # A list of three alternate hosts for one instance + host_lists = copy.deepcopy(fake_host_lists_alt) + expected_host = host_lists[0][0] + expected_alts = host_lists[0][1:] + instance = fake_instance.fake_instance_obj(self.context) + image = {'fake-data': 'should_pass_silently'} + + # build_instances() is a cast, we need to wait for it to complete + self.useFixture(cast_as_call.CastAsCall(self)) + with mock.patch.object(self.conductor_manager.compute_rpcapi, + 'build_and_run_instance'): + self.conductor.build_instances(context=self.context, + instances=[instance], image=image, filter_properties={}, + admin_password='admin_password', + injected_files='injected_files', requested_networks=None, + security_groups='security_groups', + block_device_mapping=None, legacy_bdm=None, + host_lists=host_lists) + # The last LOG.debug call should record the selected host name and the + # list of alternates. 
+ last_call = mock_debug.call_args_list[-1][0] + self.assertIn(expected_host.service_host, last_call) + expected_alt_hosts = [(alt.service_host, alt.nodename) + for alt in expected_alts] + self.assertIn(expected_alt_hosts, last_call) + @mock.patch.object(objects.BuildRequest, 'get_by_instance_uuid') @mock.patch.object(objects.Instance, 'save') @mock.patch.object(objects.InstanceMapping, 'get_by_instance_uuid') @@ -804,7 +891,8 @@ class _BaseTaskTestCase(object): requested_networks=None, security_groups='security_groups', block_device_mapping='block_device_mapping', - legacy_bdm=False) + legacy_bdm=False, + host_lists=None) do_test() @@ -849,8 +937,11 @@ class _BaseTaskTestCase(object): requested_networks=None, security_groups='security_groups', block_device_mapping='block_device_mapping', - legacy_bdm=False) + legacy_bdm=False, host_lists=None) + expected_build_run_host_list = copy.copy(fake_host_lists1[0]) + if expected_build_run_host_list: + expected_build_run_host_list.pop(0) mock_build_and_run.assert_called_once_with( self.context, instance=mock.ANY, @@ -866,7 +957,8 @@ class _BaseTaskTestCase(object): security_groups='security_groups', block_device_mapping=test.MatchType( objects.BlockDeviceMappingList), - node='node1', limits=None) + node='node1', limits=None, + host_list=expected_build_run_host_list) mock_pop_inst_map.assert_not_called() mock_destroy_build_req.assert_not_called() @@ -2396,9 +2488,11 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): mox.IgnoreArg()).AndReturn(spec) filter_properties = {'retry': {'num_attempts': 1, 'hosts': []}} inst_uuids = [inst.uuid for inst in instances] + + sched_return = copy.deepcopy(fake_host_lists2) self.conductor_manager._schedule_instances(self.context, fake_spec, inst_uuids, return_alternates=True).AndReturn( - [[fake_selection1], [fake_selection2]]) + sched_return) instances[0].save().AndRaise( exc.InstanceNotFound(instance_id=instances[0].uuid)) instances[1].save() @@ -2415,7 +2509,7 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): requested_networks=None, security_groups='security_groups', block_device_mapping=mox.IsA(objects.BlockDeviceMappingList), - node='node2', limits=None) + node='node2', limits=None, host_list=[]) self.mox.ReplayAll() # build_instances() is a cast, we need to wait for it to complete @@ -2430,7 +2524,7 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): requested_networks=None, security_groups='security_groups', block_device_mapping='block_device_mapping', - legacy_bdm=False) + legacy_bdm=False, host_lists=None) # RequestSpec.from_primitives is called once before we call the # scheduler to select_destinations and then once per instance that # gets build in the compute. 
@@ -2496,7 +2590,7 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase): requested_networks=None, security_groups='security_groups', block_device_mapping=mock.ANY, - node='node2', limits=None) + node='node2', limits=None, host_list=[]) @mock.patch('nova.objects.Instance.save') def test_build_instances_max_retries_exceeded(self, mock_save): @@ -2828,7 +2922,7 @@ class ConductorTaskRPCAPITestCase(_BaseTaskTestCase, cctxt_mock = mock.MagicMock() @mock.patch.object(self.conductor.client, 'can_send_version', - return_value=True) + side_effect=(False, True, True, True, True)) @mock.patch.object(self.conductor.client, 'prepare', return_value=cctxt_mock) def _test(prepare_mock, can_send_mock): @@ -2859,7 +2953,7 @@ class ConductorTaskRPCAPITestCase(_BaseTaskTestCase, cctxt_mock = mock.MagicMock() @mock.patch.object(self.conductor.client, 'can_send_version', - side_effect=(False, True, True, True)) + side_effect=(False, False, True, True, True)) @mock.patch.object(self.conductor.client, 'prepare', return_value=cctxt_mock) def _test(prepare_mock, can_send_mock): diff --git a/nova/tests/unit/objects/test_service.py b/nova/tests/unit/objects/test_service.py index 9d1e749b0fc3..59704bed7182 100644 --- a/nova/tests/unit/objects/test_service.py +++ b/nova/tests/unit/objects/test_service.py @@ -470,7 +470,7 @@ class TestServiceVersion(test.TestCase): self.assertEqual( current, calculated, 'Changes detected that require a SERVICE_VERSION change. Please ' - 'increment nova.objects.service.SERVICE_VERSION, and make sure it' + 'increment nova.objects.service.SERVICE_VERSION, and make sure it ' 'is equal to nova.compute.manager.ComputeManager.target.version.') def test_version_in_init(self): diff --git a/nova/tests/unit/scheduler/test_filter_scheduler.py b/nova/tests/unit/scheduler/test_filter_scheduler.py index 884fdc47939c..7e72089b8dd9 100644 --- a/nova/tests/unit/scheduler/test_filter_scheduler.py +++ b/nova/tests/unit/scheduler/test_filter_scheduler.py @@ -201,7 +201,9 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): selected_hosts = self.driver._schedule(ctx, spec_obj, instance_uuids, alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries) - expected_selection = [[objects.Selection.from_host_state(host_state)]] + sel_obj = objects.Selection.from_host_state(host_state, + allocation_request=fake_alloc) + expected_selection = [[sel_obj]] mock_get_all_states.assert_called_once_with( ctx.elevated.return_value, spec_obj, mock.sentinel.provider_summaries) @@ -387,7 +389,8 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries, return_alternates=True) - sel0 = objects.Selection.from_host_state(host_state0) + sel0 = objects.Selection.from_host_state(host_state0, + allocation_request=fake_alloc0) sel1 = objects.Selection.from_host_state(host_state1, allocation_request=fake_alloc1) sel2 = objects.Selection.from_host_state(host_state2, @@ -452,7 +455,8 @@ class FilterSchedulerTestCase(test_scheduler.SchedulerTestCase): alloc_reqs_by_rp_uuid, mock.sentinel.provider_summaries, return_alternates=False) - sel0 = objects.Selection.from_host_state(host_state0) + sel0 = objects.Selection.from_host_state(host_state0, + allocation_request=fake_alloc0) expected_selection = [[sel0]] self.assertEqual(expected_selection, selected_hosts)
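
For reference, the reschedule flow added to ComputeTaskManager.build_instances() boils down to: pop the selected host, try to claim its allocation request in Placement, and fall back through the alternates until one claim succeeds or the list is exhausted. Below is a minimal sketch of that loop in isolation. It uses simplified stand-ins (Selection, a claim_resources callable, MaxRetriesExceeded) rather than Nova's real objects and helper signatures, so treat the names and call shapes as assumptions of the sketch, not Nova's APIs.

    import json
    from typing import List


    class MaxRetriesExceeded(Exception):
        """Raised when every alternate host for a reschedule has been tried."""


    class Selection(object):
        """Simplified stand-in for nova.objects.Selection as used here."""
        def __init__(self, service_host, nodename, allocation_request=None,
                     allocation_request_version=None):
            self.service_host = service_host
            self.nodename = nodename
            self.allocation_request = allocation_request
            self.allocation_request_version = allocation_request_version


    def pick_host_for_reschedule(host_list, claim_resources, instance_uuid):
        # type: (List[Selection], callable, str) -> Selection
        """Pop Selections from host_list until one accepts a claim.

        The first entry is the originally selected host; the rest are
        alternates. Hosts whose allocation_request can no longer be claimed
        are discarded. Schedulers that do not use Placement produce
        Selections without an allocation_request, and those are assumed to
        still be usable, mirroring the conductor change in this patch.
        """
        host = host_list.pop(0) if host_list else None
        while host is not None:
            if host.allocation_request is None:
                # Non-Placement scheduler: nothing to claim, take the host.
                return host
            alloc_req = json.loads(host.allocation_request)
            if claim_resources(alloc_req, instance_uuid,
                               host.allocation_request_version):
                return host
            # Claim failed: the host's resources were consumed since
            # scheduling, so discard it and try the next alternate, if any.
            host = host_list.pop(0) if host_list else None
        raise MaxRetriesExceeded(
            "Exhausted all hosts available for retrying build failures "
            "for instance %s." % instance_uuid)

Once the selected host has been popped, whatever remains of host_list is what build_and_run_instance() now receives as its host_list argument, so a later reschedule cast from the compute starts from the surviving alternates instead of going back to the scheduler.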