From 4cc1798bd430a3f7b411c8c029f8748e3b2a725f Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Fri, 25 Jan 2019 14:52:58 -0500 Subject: [PATCH] Add prep_snapshot_based_resize_at_dest compute method This adds a new method to the compute service which will be synchronously RPC called from (super)conductor when preparing for a cross-cell resize. It will perform an RT.resize_claim() which will claim things like PCI devices and/or NUMA topology resources which are not otherwise "claimed" in the placement service during scheduling. The MigrationContext is created in the target cell DB as part of this claim. Notifications, fault and instance action event creation should be consistent with the same-cell "prep_resize" method. One difference is that the reverts_task_state decorator is not used here since conductor is responsible for trying alternative hosts and it does not make sense for this method to reset the instance task_state to None on failure if conductor is going to try another host. The existing prep_resize method is not reused here since, for a cross-cell resize, conductor handles orchestrating the call to the source compute and any reschedules, which are things prep_resize does itself for a same-cell resize. We could munge the existing method but I felt it was cleaner to keep them separate. 
Part of blueprint cross-cell-resize Change-Id: I518ae675b7a67da64a5796e57e87860f0c3ef0db --- nova/compute/manager.py | 75 ++++++++++- nova/compute/rpcapi.py | 47 +++++++ nova/conf/rpc.py | 1 + nova/objects/service.py | 4 +- nova/tests/unit/compute/test_compute_mgr.py | 135 ++++++++++++++++++++ nova/tests/unit/compute/test_rpcapi.py | 41 ++++++ 6 files changed, 301 insertions(+), 2 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index e24ceaae54af..9ab103f93584 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -521,7 +521,7 @@ class ComputeVirtAPI(virtapi.VirtAPI): class ComputeManager(manager.Manager): """Manages the running instances from creation to destruction.""" - target = messaging.Target(version='5.4') + target = messaging.Target(version='5.5') def __init__(self, compute_driver=None, *args, **kwargs): """Load configuration options and connect to the hypervisor.""" @@ -4803,6 +4803,79 @@ class ComputeManager(manager.Manager): # not re-scheduling six.reraise(*exc_info) + @messaging.expected_exceptions(exception.MigrationPreCheckError) + @wrap_exception() + @wrap_instance_event(prefix='compute') + @wrap_instance_fault + def prep_snapshot_based_resize_at_dest( + self, ctxt, instance, flavor, nodename, migration, limits, + request_spec): + """Performs pre-cross-cell resize resource claim on the dest host. + + This runs on the destination host in a cross-cell resize operation + before the resize is actually started. + + Performs a resize_claim for resources that are not claimed in placement + like PCI devices and NUMA topology. + + Note that this is different from same-cell prep_resize in that this: + + * Does not RPC cast to the source compute, that is orchestrated from + conductor. + * This does not reschedule on failure, conductor handles that since + conductor is synchronously RPC calling this method. As such, the + reverts_task_state decorator is not used on this method. 
+ + :param ctxt: user auth request context + :param instance: the instance being resized + :param flavor: the flavor being resized to (unchanged for cold migrate) + :param nodename: Name of the target compute node + :param migration: nova.objects.Migration object for the operation + :param limits: nova.objects.SchedulerLimits object of resource limits + :param request_spec: nova.objects.RequestSpec object for the operation + :returns: nova.objects.MigrationContext; the migration context created + on the destination host during the resize_claim. + :raises: nova.exception.MigrationPreCheckError if the pre-check + validation fails for the given host selection + """ + LOG.debug('Checking if we can cross-cell migrate instance to this ' + 'host (%s).', self.host, instance=instance) + self._send_prep_resize_notifications( + ctxt, instance, fields.NotificationPhase.START, flavor) + # TODO(mriedem): _update_pci_request_spec_with_allocated_interface_name + # should be called here if the request spec has request group mappings, + # e.g. for things like QoS ports with resource requests. Do it outside + # the try/except so if it raises BuildAbortException we do not attempt + # to reschedule. + try: + # Get the allocations within the try/except block in case we get + # an error so MigrationPreCheckError is raised up. + allocations = self.reportclient.get_allocs_for_consumer( + ctxt, instance.uuid)['allocations'] + # Claim resources on this target host using the new flavor which + # will create the MigrationContext object. Note that in the future + # if we want to do other validation here we should do it within + # the MoveClaim context so we can drop the claim if anything fails. + self.rt.resize_claim( + ctxt, instance, flavor, nodename, migration, allocations, + image_meta=instance.image_meta, limits=limits) + except Exception as ex: + err = six.text_type(ex) + LOG.warning( + 'Cross-cell resize pre-checks failed for this host (%s). ' + 'Cleaning up. 
Failure: %s', self.host, err, + instance=instance, exc_info=True) + raise exception.MigrationPreCheckError( + reason=(_("Pre-checks failed on host '%(host)s'. " + "Error: %(error)s") % + {'host': self.host, 'error': err})) + finally: + self._send_prep_resize_notifications( + ctxt, instance, fields.NotificationPhase.END, flavor) + + # ResourceTracker.resize_claim() sets instance.migration_context. + return instance.migration_context + @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index e599a0919cc6..ff7ecbca583c 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -371,6 +371,7 @@ class ComputeAPI(object): check_can_live_migrate_destination(), and a new drop_move_claim_at_destination() method * 5.4 - Add cache_images() support + * 5.5 - Add prep_snapshot_based_resize_at_dest() ''' VERSION_ALIASES = { @@ -845,6 +846,52 @@ class ComputeAPI(object): cctxt = client.prepare(server=host, version=version) cctxt.cast(ctxt, 'prep_resize', **msg_args) + def prep_snapshot_based_resize_at_dest( + self, ctxt, instance, flavor, nodename, migration, limits, + request_spec, destination): + """Performs pre-cross-cell resize resource claim on the dest host. + + This runs on the destination host in a cross-cell resize operation + before the resize is actually started. + + Performs a resize_claim for resources that are not claimed in placement + like PCI devices and NUMA topology. + + Note that this is different from same-cell prep_resize in that this: + + * Does not RPC cast to the source compute, that is orchestrated from + conductor. + * This does not reschedule on failure, conductor handles that since + conductor is synchronously RPC calling this method. 
+ + :param ctxt: user auth request context + :param instance: the instance being resized + :param flavor: the flavor being resized to (unchanged for cold migrate) + :param nodename: Name of the target compute node + :param migration: nova.objects.Migration object for the operation + :param limits: nova.objects.SchedulerLimits object of resource limits + :param request_spec: nova.objects.RequestSpec object for the operation + :param destination: possible target host for the cross-cell resize + :returns: nova.objects.MigrationContext; the migration context created + on the destination host during the resize_claim. + :raises: nova.exception.MigrationPreCheckError if the pre-check + validation fails for the given host selection or the destination + compute service is too old for this method + :raises: oslo_messaging.exceptions.MessagingTimeout if the pre-check + RPC call times out + """ + version = '5.5' + client = self.router.client(ctxt) + if not client.can_send_version(version): + raise exception.MigrationPreCheckError(reason=_('Compute too old')) + cctxt = client.prepare(server=destination, version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) + return cctxt.call(ctxt, 'prep_snapshot_based_resize_at_dest', + instance=instance, flavor=flavor, nodename=nodename, + migration=migration, limits=limits, + request_spec=request_spec) + def reboot_instance(self, ctxt, instance, block_device_info, reboot_type): version = '5.0' diff --git a/nova/conf/rpc.py b/nova/conf/rpc.py index 68cd4de6315b..149020052751 100644 --- a/nova/conf/rpc.py +++ b/nova/conf/rpc.py @@ -30,6 +30,7 @@ Operations with RPC calls that utilize this value: * scheduling * enabling/disabling a compute service * image pre-caching +* snapshot-based / cross-cell resize Related options: diff --git a/nova/objects/service.py b/nova/objects/service.py index 68cd3ebc73d3..e421bac40184 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ 
LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 41 +SERVICE_VERSION = 42 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any @@ -163,6 +163,8 @@ SERVICE_VERSION_HISTORY = ( {'compute_rpc': '5.3'}, # Version 41: Add cache_images() to compute rpcapi (version 5.4) {'compute_rpc': '5.4'}, + # Version 42: Compute RPC version 5.5; +prep_snapshot_based_resize_at_dest + {'compute_rpc': '5.5'}, ) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 100332da9e6a..7cf05348ed81 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -10167,6 +10167,141 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.assertEqual(new_dev.address, updated_nw_info[1]['profile']['pci_slot']) + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocs_for_consumer') + def test_prep_snapshot_based_resize_at_dest(self, get_allocs): + """Tests happy path for prep_snapshot_based_resize_at_dest""" + # Setup mocks. + flavor = self.instance.flavor + limits = objects.SchedulerLimits() + request_spec = objects.RequestSpec() + # resize_claim normally sets instance.migration_context and returns + # a MoveClaim which is a context manager. Rather than deal with + # mocking a context manager we just set the migration_context on the + # fake instance ahead of time to ensure it is returned as expected. + self.instance.migration_context = objects.MigrationContext() + with test.nested( + mock.patch.object(self.compute, '_send_prep_resize_notifications'), + mock.patch.object(self.compute.rt, 'resize_claim'), + ) as ( + _send_prep_resize_notifications, resize_claim, + ): + # Run the code. 
+ mc = self.compute.prep_snapshot_based_resize_at_dest( + self.context, self.instance, flavor, 'nodename', + self.migration, limits, request_spec) + self.assertIs(mc, self.instance.migration_context) + # Assert the mock calls. + _send_prep_resize_notifications.assert_has_calls([ + mock.call(self.context, self.instance, + fields.NotificationPhase.START, flavor), + mock.call(self.context, self.instance, + fields.NotificationPhase.END, flavor)]) + resize_claim.assert_called_once_with( + self.context, self.instance, flavor, 'nodename', self.migration, + get_allocs.return_value['allocations'], + image_meta=test.MatchType(objects.ImageMeta), limits=limits) + + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocs_for_consumer') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + def test_prep_snapshot_based_resize_at_dest_get_allocs_fails( + self, add_fault, get_allocs): + """Tests that getting allocations fails and ExpectedException + is raised with the MigrationPreCheckError inside. + """ + # Setup mocks. + flavor = self.instance.flavor + limits = objects.SchedulerLimits() + request_spec = objects.RequestSpec() + ex1 = exception.ConsumerAllocationRetrievalFailed( + consumer_uuid=self.instance.uuid, error='oops') + get_allocs.side_effect = ex1 + with test.nested( + mock.patch.object(self.compute, + '_send_prep_resize_notifications'), + mock.patch.object(self.compute.rt, 'resize_claim') + ) as ( + _send_prep_resize_notifications, resize_claim, + ): + # Run the code. + ex2 = self.assertRaises( + messaging.ExpectedException, + self.compute.prep_snapshot_based_resize_at_dest, + self.context, self.instance, flavor, 'nodename', + self.migration, limits, request_spec) + wrapped_exc = ex2.exc_info[1] + # The original error should be in the MigrationPreCheckError which + # itself is in the ExpectedException. + self.assertIn(ex1.format_message(), six.text_type(wrapped_exc)) + # Assert the mock calls. 
+ _send_prep_resize_notifications.assert_has_calls([ + mock.call(self.context, self.instance, + fields.NotificationPhase.START, flavor), + mock.call(self.context, self.instance, + fields.NotificationPhase.END, flavor)]) + resize_claim.assert_not_called() + # Assert the decorators that are triggered on error + add_fault.assert_called_once_with( + self.context, self.instance, wrapped_exc, mock.ANY) + # There would really be three notifications but because we mocked out + # _send_prep_resize_notifications there is just the one error + # notification from the wrap_exception decorator. + self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS)) + self.assertEqual( + 'compute.%s' % fields.NotificationAction.EXCEPTION, + fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type']) + + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocs_for_consumer') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + def test_prep_snapshot_based_resize_at_dest_claim_fails( + self, add_fault, get_allocs): + """Tests that the resize_claim fails and ExpectedException + is raised with the MigrationPreCheckError inside. + """ + # Setup mocks. + flavor = self.instance.flavor + limits = objects.SchedulerLimits() + request_spec = objects.RequestSpec() + ex1 = exception.ComputeResourcesUnavailable(reason='numa') + with test.nested( + mock.patch.object(self.compute, '_send_prep_resize_notifications'), + mock.patch.object(self.compute.rt, 'resize_claim', side_effect=ex1) + ) as ( + _send_prep_resize_notifications, resize_claim, + ): + # Run the code. + ex2 = self.assertRaises( + messaging.ExpectedException, + self.compute.prep_snapshot_based_resize_at_dest, + self.context, self.instance, flavor, 'nodename', + self.migration, limits, request_spec) + wrapped_exc = ex2.exc_info[1] + # The original error should be in the MigrationPreCheckError which + # itself is in the ExpectedException. 
+ self.assertIn(ex1.format_message(), six.text_type(wrapped_exc)) + # Assert the mock calls. + _send_prep_resize_notifications.assert_has_calls([ + mock.call(self.context, self.instance, + fields.NotificationPhase.START, flavor), + mock.call(self.context, self.instance, + fields.NotificationPhase.END, flavor)]) + resize_claim.assert_called_once_with( + self.context, self.instance, flavor, 'nodename', self.migration, + get_allocs.return_value['allocations'], + image_meta=test.MatchType(objects.ImageMeta), limits=limits) + # Assert the decorators that are triggered on error + add_fault.assert_called_once_with( + self.context, self.instance, wrapped_exc, mock.ANY) + # There would really be three notifications but because we mocked out + # _send_prep_resize_notifications there is just the one error + # notification from the wrap_exception decorator. + self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS)) + self.assertEqual( + 'compute.%s' % fields.NotificationAction.EXCEPTION, + fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type']) + class ComputeManagerInstanceUsageAuditTestCase(test.TestCase): def setUp(self): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index a11d1bcd89a6..c366cd9382f0 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -19,10 +19,12 @@ Unit Tests for nova.compute.rpcapi import mock from oslo_serialization import jsonutils from oslo_utils.fixture import uuidsentinel as uuids +import six from nova.compute import rpcapi as compute_rpcapi from nova import context from nova import exception +from nova import objects from nova.objects import block_device as objects_block_dev from nova.objects import migration as migration_obj from nova.objects import service as service_obj @@ -494,6 +496,45 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): node='node', clean_shutdown=True, host_list=None, version='5.1') + def 
test_prep_snapshot_based_resize_at_dest(self): + """Tests happy path for prep_snapshot_based_resize_at_dest rpc call""" + self.flags(long_rpc_timeout=1234) + self._test_compute_api( + 'prep_snapshot_based_resize_at_dest', 'call', + # compute method kwargs + instance=self.fake_instance_obj, + flavor=self.fake_flavor_obj, + nodename='node', + migration=migration_obj.Migration(), + limits={}, + request_spec=objects.RequestSpec(), + destination='dest', + # client.prepare kwargs + version='5.5', call_monitor_timeout=60, timeout=1234, + # assert the expected return value + _return_value=mock.sentinel.migration_context) + + @mock.patch('nova.rpc.ClientRouter.client') + def test_prep_snapshot_based_resize_at_dest_old_compute(self, mock_client): + """Tests when the destination compute service is too old to call + prep_snapshot_based_resize_at_dest so MigrationPreCheckError is + raised. + """ + mock_client.return_value.can_send_version.return_value = False + rpcapi = compute_rpcapi.ComputeAPI() + ex = self.assertRaises( + exception.MigrationPreCheckError, + rpcapi.prep_snapshot_based_resize_at_dest, + self.context, + instance=self.fake_instance_obj, + flavor=self.fake_flavor_obj, + nodename='node', + migration=migration_obj.Migration(), + limits={}, + request_spec=objects.RequestSpec(), + destination='dest') + self.assertIn('Compute too old', six.text_type(ex)) + def test_reboot_instance(self): self.maxDiff = None self._test_compute_api('reboot_instance', 'cast',