Add prep_snapshot_based_resize_at_dest compute method

This adds a new method to the compute service which will
be synchronously RPC called from (super)conductor when
preparing for a cross-cell resize. It will perform an
RT.resize_claim() to claim resources, such as PCI devices and
NUMA topology, which are not otherwise "claimed" in the placement
service during scheduling. The MigrationContext is created in the
target cell DB as part of this claim.

Notifications, fault and instance action event creation should
be consistent with the same-cell "prep_resize" method. One
difference is that the reverts_task_state decorator is not used
here: conductor is responsible for trying alternate hosts, so it
does not make sense for this method to reset the instance
task_state to None on a failure when conductor may simply try
another host. The existing prep_resize method is not reused
because, for a cross-cell resize, conductor orchestrates the call
to the source compute and handles reschedules, which prep_resize
itself does for a same-cell resize. We could munge the existing
method, but keeping them separate felt cleaner. A sketch of the
conductor-side usage follows the commit metadata below.

Part of blueprint cross-cell-resize

Change-Id: I518ae675b7a67da64a5796e57e87860f0c3ef0db
Author: Matt Riedemann
Date:   2019-01-25 14:52:58 -05:00
parent 6b60cae019
commit 4cc1798bd4
6 changed files with 301 additions and 2 deletions
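
A rough sketch of how the conductor-side caller is expected to drive
this method (the conductor task is not part of this change, so the
loop shape and names like "selections" are illustrative only):

    # Hypothetical conductor-side usage (not in this change). Because
    # prep_snapshot_based_resize_at_dest is a synchronous RPC call,
    # conductor can catch MigrationPreCheckError and try the next
    # alternate host rather than relying on the compute to reset the
    # instance task_state.
    for selection in selections:  # primary host plus alternates
        try:
            migration_context = (
                compute_rpcapi.prep_snapshot_based_resize_at_dest(
                    ctxt, instance, flavor, selection.nodename,
                    migration, selection.limits, request_spec,
                    destination=selection.service_host))
            break  # the resize_claim succeeded on this host
        except exception.MigrationPreCheckError:
            continue  # claim failed on this host; try the next one
    else:
        raise exception.MaxRetriesExceeded(reason='No valid host found')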

--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -521,7 +521,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
 class ComputeManager(manager.Manager):
     """Manages the running instances from creation to destruction."""
 
-    target = messaging.Target(version='5.4')
+    target = messaging.Target(version='5.5')
 
     def __init__(self, compute_driver=None, *args, **kwargs):
         """Load configuration options and connect to the hypervisor."""
@@ -4803,6 +4803,79 @@ class ComputeManager(manager.Manager):
                 # not re-scheduling
                 six.reraise(*exc_info)
 
+    @messaging.expected_exceptions(exception.MigrationPreCheckError)
+    @wrap_exception()
+    @wrap_instance_event(prefix='compute')
+    @wrap_instance_fault
+    def prep_snapshot_based_resize_at_dest(
+            self, ctxt, instance, flavor, nodename, migration, limits,
+            request_spec):
+        """Performs pre-cross-cell resize resource claim on the dest host.
+
+        This runs on the destination host in a cross-cell resize operation
+        before the resize is actually started.
+
+        Performs a resize_claim for resources that are not claimed in
+        placement like PCI devices and NUMA topology.
+
+        Note that this is different from same-cell prep_resize in that this:
+
+        * Does not RPC cast to the source compute, that is orchestrated from
+          conductor.
+        * This does not reschedule on failure, conductor handles that since
+          conductor is synchronously RPC calling this method. As such, the
+          reverts_task_state decorator is not used on this method.
+
+        :param ctxt: user auth request context
+        :param instance: the instance being resized
+        :param flavor: the flavor being resized to (unchanged for cold
+            migrate)
+        :param nodename: Name of the target compute node
+        :param migration: nova.objects.Migration object for the operation
+        :param limits: nova.objects.SchedulerLimits object of resource
+            limits
+        :param request_spec: nova.objects.RequestSpec object for the
+            operation
+        :returns: nova.objects.MigrationContext; the migration context
+            created on the destination host during the resize_claim.
+        :raises: nova.exception.MigrationPreCheckError if the pre-check
+            validation fails for the given host selection
+        """
+        LOG.debug('Checking if we can cross-cell migrate instance to this '
+                  'host (%s).', self.host, instance=instance)
+        self._send_prep_resize_notifications(
+            ctxt, instance, fields.NotificationPhase.START, flavor)
+        # TODO(mriedem): _update_pci_request_spec_with_allocated_interface_name
+        # should be called here if the request spec has request group
+        # mappings, e.g. for things like QoS ports with resource requests.
+        # Do it outside the try/except so if it raises BuildAbortException
+        # we do not attempt to reschedule.
+        try:
+            # Get the allocations within the try/except block in case we
+            # get an error so MigrationPreCheckError is raised up.
+            allocations = self.reportclient.get_allocs_for_consumer(
+                ctxt, instance.uuid)['allocations']
+            # Claim resources on this target host using the new flavor which
+            # will create the MigrationContext object. Note that in the
+            # future if we want to do other validation here we should do it
+            # within the MoveClaim context so we can drop the claim if
+            # anything fails.
+            self.rt.resize_claim(
+                ctxt, instance, flavor, nodename, migration, allocations,
+                image_meta=instance.image_meta, limits=limits)
+        except Exception as ex:
+            err = six.text_type(ex)
+            LOG.warning(
+                'Cross-cell resize pre-checks failed for this host (%s). '
+                'Cleaning up. Failure: %s', self.host, err,
+                instance=instance, exc_info=True)
+            raise exception.MigrationPreCheckError(
+                reason=(_("Pre-checks failed on host '%(host)s'. "
+                          "Error: %(error)s") %
+                        {'host': self.host, 'error': err}))
+        finally:
+            self._send_prep_resize_notifications(
+                ctxt, instance, fields.NotificationPhase.END, flavor)
+        # ResourceTracker.resize_claim() sets instance.migration_context.
+        return instance.migration_context
+
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_event(prefix='compute')
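
A note on the decorator interplay that the new unit tests assert on:
@messaging.expected_exceptions re-raises the MigrationPreCheckError
wrapped in oslo_messaging's ExpectedException so the RPC server does
not log it as an unexpected error, while the original exception (and
its exc_info) is still delivered to the caller. A minimal sketch of
that wrapping, assuming simplified semantics:

    import sys

    class ExpectedException(Exception):
        """Simplified stand-in for oslo_messaging.ExpectedException."""
        def __init__(self):
            super(ExpectedException, self).__init__()
            # Capture the active exception so the dispatcher can
            # re-raise it for the client; this is what the tests poke
            # at via ex2.exc_info[1].
            self.exc_info = sys.exc_info()

    def expected_exceptions(*exc_types):
        # Roughly what messaging.expected_exceptions does: catch the
        # listed exception types and wrap them as "expected" so no
        # traceback is logged server-side.
        def decorator(func):
            def wrapper(*args, **kwargs):
                try:
                    return func(*args, **kwargs)
                except exc_types:
                    raise ExpectedException()
            return wrapper
        return decorator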

--- a/nova/compute/rpcapi.py
+++ b/nova/compute/rpcapi.py
@@ -371,6 +371,7 @@ class ComputeAPI(object):
           check_can_live_migrate_destination(), and a new
           drop_move_claim_at_destination() method
         * 5.4 - Add cache_images() support
+        * 5.5 - Add prep_snapshot_based_resize_at_dest()
     '''
 
     VERSION_ALIASES = {
@@ -845,6 +846,52 @@ class ComputeAPI(object):
         cctxt = client.prepare(server=host, version=version)
         cctxt.cast(ctxt, 'prep_resize', **msg_args)
 
+    def prep_snapshot_based_resize_at_dest(
+            self, ctxt, instance, flavor, nodename, migration, limits,
+            request_spec, destination):
+        """Performs pre-cross-cell resize resource claim on the dest host.
+
+        This runs on the destination host in a cross-cell resize operation
+        before the resize is actually started.
+
+        Performs a resize_claim for resources that are not claimed in
+        placement like PCI devices and NUMA topology.
+
+        Note that this is different from same-cell prep_resize in that this:
+
+        * Does not RPC cast to the source compute, that is orchestrated from
+          conductor.
+        * This does not reschedule on failure, conductor handles that since
+          conductor is synchronously RPC calling this method.
+
+        :param ctxt: user auth request context
+        :param instance: the instance being resized
+        :param flavor: the flavor being resized to (unchanged for cold
+            migrate)
+        :param nodename: Name of the target compute node
+        :param migration: nova.objects.Migration object for the operation
+        :param limits: nova.objects.SchedulerLimits object of resource
+            limits
+        :param request_spec: nova.objects.RequestSpec object for the
+            operation
+        :param destination: possible target host for the cross-cell resize
+        :returns: nova.objects.MigrationContext; the migration context
+            created on the destination host during the resize_claim.
+        :raises: nova.exception.MigrationPreCheckError if the pre-check
+            validation fails for the given host selection or the destination
+            compute service is too old for this method
+        :raises: oslo_messaging.exceptions.MessagingTimeout if the pre-check
+            RPC call times out
+        """
+        version = '5.5'
+        client = self.router.client(ctxt)
+        if not client.can_send_version(version):
+            raise exception.MigrationPreCheckError(
+                reason=_('Compute too old'))
+        cctxt = client.prepare(server=destination, version=version,
+                               call_monitor_timeout=CONF.rpc_response_timeout,
+                               timeout=CONF.long_rpc_timeout)
+        return cctxt.call(ctxt, 'prep_snapshot_based_resize_at_dest',
+                          instance=instance, flavor=flavor,
+                          nodename=nodename, migration=migration,
+                          limits=limits, request_spec=request_spec)
+
     def reboot_instance(self, ctxt, instance, block_device_info,
                         reboot_type):
         version = '5.0'

--- a/nova/conf/rpc.py
+++ b/nova/conf/rpc.py
@@ -30,6 +30,7 @@ Operations with RPC calls that utilize this value:
 * scheduling
 * enabling/disabling a compute service
 * image pre-caching
+* snapshot-based / cross-cell resize
 
 Related options:
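
A deployment-facing note (not part of this diff): because the new
pre-check call uses long_rpc_timeout, operators with slow resize
pre-checks would tune it in nova.conf; 1800 seconds is the upstream
default at the time of this change:

    [DEFAULT]
    # Overall deadline (seconds) for long-running RPC calls such as
    # prep_snapshot_based_resize_at_dest; the call monitor still
    # heartbeats every rpc_response_timeout seconds while waiting.
    long_rpc_timeout = 1800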

--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
 
 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 41
+SERVICE_VERSION = 42
 
 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -163,6 +163,8 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '5.3'},
     # Version 41: Add cache_images() to compute rpcapi (version 5.4)
     {'compute_rpc': '5.4'},
+    # Version 42: Compute RPC version 5.5; +prep_snapshot_based_resize_at_dest
+    {'compute_rpc': '5.5'},
 )
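
The service version bump to 42 is what lets the control plane gate
the new flow on upgraded computes. A hedged sketch of such a gate
(the call site is not in this commit; get_minimum_version_all_cells
is an existing helper in nova.objects.service):

    from nova.objects import service as service_obj

    def computes_support_cross_cell_resize(ctxt):
        # Hypothetical gate (not part of this change): only use the
        # new prep_snapshot_based_resize_at_dest flow once every
        # nova-compute in every cell reports at least service version
        # 42, i.e. compute RPC 5.5.
        minimum = service_obj.get_minimum_version_all_cells(
            ctxt, ['nova-compute'])
        return minimum >= 42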

--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -10167,6 +10167,141 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
         self.assertEqual(new_dev.address,
                          updated_nw_info[1]['profile']['pci_slot'])
 
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocs_for_consumer')
+    def test_prep_snapshot_based_resize_at_dest(self, get_allocs):
+        """Tests happy path for prep_snapshot_based_resize_at_dest"""
+        # Setup mocks.
+        flavor = self.instance.flavor
+        limits = objects.SchedulerLimits()
+        request_spec = objects.RequestSpec()
+        # resize_claim normally sets instance.migration_context and returns
+        # a MoveClaim which is a context manager. Rather than deal with
+        # mocking a context manager we just set the migration_context on
+        # the fake instance ahead of time to ensure it is returned as
+        # expected.
+        self.instance.migration_context = objects.MigrationContext()
+        with test.nested(
+            mock.patch.object(self.compute,
+                              '_send_prep_resize_notifications'),
+            mock.patch.object(self.compute.rt, 'resize_claim'),
+        ) as (
+            _send_prep_resize_notifications, resize_claim,
+        ):
+            # Run the code.
+            mc = self.compute.prep_snapshot_based_resize_at_dest(
+                self.context, self.instance, flavor, 'nodename',
+                self.migration, limits, request_spec)
+            self.assertIs(mc, self.instance.migration_context)
+        # Assert the mock calls.
+        _send_prep_resize_notifications.assert_has_calls([
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.START, flavor),
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.END, flavor)])
+        resize_claim.assert_called_once_with(
+            self.context, self.instance, flavor, 'nodename',
+            self.migration, get_allocs.return_value['allocations'],
+            image_meta=test.MatchType(objects.ImageMeta), limits=limits)
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocs_for_consumer')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_prep_snapshot_based_resize_at_dest_get_allocs_fails(
+            self, add_fault, get_allocs):
+        """Tests that getting allocations fails and ExpectedException
+        is raised with the MigrationPreCheckError inside.
+        """
+        # Setup mocks.
+        flavor = self.instance.flavor
+        limits = objects.SchedulerLimits()
+        request_spec = objects.RequestSpec()
+        ex1 = exception.ConsumerAllocationRetrievalFailed(
+            consumer_uuid=self.instance.uuid, error='oops')
+        get_allocs.side_effect = ex1
+        with test.nested(
+            mock.patch.object(self.compute,
+                              '_send_prep_resize_notifications'),
+            mock.patch.object(self.compute.rt, 'resize_claim'),
+        ) as (
+            _send_prep_resize_notifications, resize_claim,
+        ):
+            # Run the code.
+            ex2 = self.assertRaises(
+                messaging.ExpectedException,
+                self.compute.prep_snapshot_based_resize_at_dest,
+                self.context, self.instance, flavor, 'nodename',
+                self.migration, limits, request_spec)
+            wrapped_exc = ex2.exc_info[1]
+            # The original error should be in the MigrationPreCheckError
+            # which itself is in the ExpectedException.
+            self.assertIn(ex1.format_message(), six.text_type(wrapped_exc))
+        # Assert the mock calls.
+        _send_prep_resize_notifications.assert_has_calls([
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.START, flavor),
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.END, flavor)])
+        resize_claim.assert_not_called()
+        # Assert the decorators that are triggered on error.
+        add_fault.assert_called_once_with(
+            self.context, self.instance, wrapped_exc, mock.ANY)
+        # There would really be three notifications but because we mocked
+        # out _send_prep_resize_notifications there is just the one error
+        # notification from the wrap_exception decorator.
+        self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS))
+        self.assertEqual(
+            'compute.%s' % fields.NotificationAction.EXCEPTION,
+            fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type'])
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocs_for_consumer')
+    @mock.patch('nova.compute.utils.add_instance_fault_from_exc')
+    def test_prep_snapshot_based_resize_at_dest_claim_fails(
+            self, add_fault, get_allocs):
+        """Tests that the resize_claim fails and ExpectedException
+        is raised with the MigrationPreCheckError inside.
+        """
+        # Setup mocks.
+        flavor = self.instance.flavor
+        limits = objects.SchedulerLimits()
+        request_spec = objects.RequestSpec()
+        ex1 = exception.ComputeResourcesUnavailable(reason='numa')
+        with test.nested(
+            mock.patch.object(self.compute,
+                              '_send_prep_resize_notifications'),
+            mock.patch.object(self.compute.rt, 'resize_claim',
+                              side_effect=ex1),
+        ) as (
+            _send_prep_resize_notifications, resize_claim,
+        ):
+            # Run the code.
+            ex2 = self.assertRaises(
+                messaging.ExpectedException,
+                self.compute.prep_snapshot_based_resize_at_dest,
+                self.context, self.instance, flavor, 'nodename',
+                self.migration, limits, request_spec)
+            wrapped_exc = ex2.exc_info[1]
+            # The original error should be in the MigrationPreCheckError
+            # which itself is in the ExpectedException.
+            self.assertIn(ex1.format_message(), six.text_type(wrapped_exc))
+        # Assert the mock calls.
+        _send_prep_resize_notifications.assert_has_calls([
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.START, flavor),
+            mock.call(self.context, self.instance,
+                      fields.NotificationPhase.END, flavor)])
+        resize_claim.assert_called_once_with(
+            self.context, self.instance, flavor, 'nodename',
+            self.migration, get_allocs.return_value['allocations'],
+            image_meta=test.MatchType(objects.ImageMeta), limits=limits)
+        # Assert the decorators that are triggered on error.
+        add_fault.assert_called_once_with(
+            self.context, self.instance, wrapped_exc, mock.ANY)
+        # There would really be three notifications but because we mocked
+        # out _send_prep_resize_notifications there is just the one error
+        # notification from the wrap_exception decorator.
+        self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS))
+        self.assertEqual(
+            'compute.%s' % fields.NotificationAction.EXCEPTION,
+            fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type'])
+
 
 class ComputeManagerInstanceUsageAuditTestCase(test.TestCase):
 
     def setUp(self):

--- a/nova/tests/unit/compute/test_rpcapi.py
+++ b/nova/tests/unit/compute/test_rpcapi.py
@@ -19,10 +19,12 @@ Unit Tests for nova.compute.rpcapi
 import mock
 from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids
+import six
 
 from nova.compute import rpcapi as compute_rpcapi
 from nova import context
 from nova import exception
+from nova import objects
 from nova.objects import block_device as objects_block_dev
 from nova.objects import migration as migration_obj
 from nova.objects import service as service_obj
@@ -494,6 +496,45 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
                 node='node', clean_shutdown=True, host_list=None,
                 version='5.1')
 
+    def test_prep_snapshot_based_resize_at_dest(self):
+        """Tests happy path for prep_snapshot_based_resize_at_dest rpc call"""
+        self.flags(long_rpc_timeout=1234)
+        self._test_compute_api(
+            'prep_snapshot_based_resize_at_dest', 'call',
+            # compute method kwargs
+            instance=self.fake_instance_obj,
+            flavor=self.fake_flavor_obj,
+            nodename='node',
+            migration=migration_obj.Migration(),
+            limits={},
+            request_spec=objects.RequestSpec(),
+            destination='dest',
+            # client.prepare kwargs
+            version='5.5', call_monitor_timeout=60, timeout=1234,
+            # assert the expected return value
+            _return_value=mock.sentinel.migration_context)
+
+    @mock.patch('nova.rpc.ClientRouter.client')
+    def test_prep_snapshot_based_resize_at_dest_old_compute(
+            self, mock_client):
+        """Tests when the destination compute service is too old to call
+        prep_snapshot_based_resize_at_dest so MigrationPreCheckError is
+        raised.
+        """
+        mock_client.return_value.can_send_version.return_value = False
+        rpcapi = compute_rpcapi.ComputeAPI()
+        ex = self.assertRaises(
+            exception.MigrationPreCheckError,
+            rpcapi.prep_snapshot_based_resize_at_dest,
+            self.context,
+            instance=self.fake_instance_obj,
+            flavor=self.fake_flavor_obj,
+            nodename='node',
+            migration=migration_obj.Migration(),
+            limits={},
+            request_spec=objects.RequestSpec(),
+            destination='dest')
+        self.assertIn('Compute too old', six.text_type(ex))
+
     def test_reboot_instance(self):
         self.maxDiff = None
         self._test_compute_api('reboot_instance', 'cast',