Merge "Cleanup allocations in failed prep_resize"

This commit is contained in:
Jenkins 2017-08-25 00:27:39 +00:00 committed by Gerrit Code Review
commit a99e936ff1
4 changed files with 122 additions and 39 deletions

View File

@ -3789,6 +3789,7 @@ class ComputeManager(manager.Manager):
current_period=True) current_period=True)
self._notify_about_instance_usage( self._notify_about_instance_usage(
context, instance, "resize.prep.start") context, instance, "resize.prep.start")
failed = False
try: try:
self._prep_resize(context, image, instance, self._prep_resize(context, image, instance,
instance_type, filter_properties, instance_type, filter_properties,
@ -3797,14 +3798,31 @@ class ComputeManager(manager.Manager):
# instance to be migrated is backed by LVM. # instance to be migrated is backed by LVM.
# Remove when LVM migration is implemented. # Remove when LVM migration is implemented.
except exception.MigrationPreCheckError: except exception.MigrationPreCheckError:
# TODO(mriedem): How is it even possible to get here?
# _prep_resize does not call the driver. The resize_instance
# method does, but we RPC cast to the source node to do that
# so we shouldn't even get this exception...
failed = True
raise raise
except Exception: except Exception:
failed = True
# try to re-schedule the resize elsewhere: # try to re-schedule the resize elsewhere:
exc_info = sys.exc_info() exc_info = sys.exc_info()
self._reschedule_resize_or_reraise(context, image, instance, self._reschedule_resize_or_reraise(context, image, instance,
exc_info, instance_type, request_spec, exc_info, instance_type, request_spec,
filter_properties) filter_properties)
finally: finally:
if failed:
# Since we hit a failure, we're either rescheduling or dead
# and either way we need to cleanup any allocations created
# by the scheduler for the destination node. Note that for
# a resize to the same host, the scheduler will merge the
# flavors, so here we'd be subtracting the new flavor from
# the allocated resources on this node.
rt = self._get_resource_tracker()
rt.delete_allocation_for_failed_resize(
instance, node, instance_type)
extra_usage_info = dict( extra_usage_info = dict(
new_instance_type=instance_type.name, new_instance_type=instance_type.name,
new_instance_type_id=instance_type.id) new_instance_type_id=instance_type.id)

View File

@ -1237,6 +1237,30 @@ class ResourceTracker(object):
'on the source node %s', cn.uuid, 'on the source node %s', cn.uuid,
instance=instance) instance=instance)
def delete_allocation_for_failed_resize(self, instance, node, flavor):
"""Delete instance allocations for the node during a failed resize
:param instance: The instance being resized/migrated.
:param node: The node provider on which the instance should have
allocations to remove. If this is a resize to the same host, then
the new_flavor resources are subtracted from the single allocation.
:param flavor: This is the new_flavor during a resize.
"""
resources = scheduler_utils.resources_from_flavor(instance, flavor)
cn = self.compute_nodes[node]
res = self.reportclient.remove_provider_from_instance_allocation(
instance.uuid, cn.uuid, instance.user_id, instance.project_id,
resources)
if not res:
if instance.instance_type_id == flavor.id:
operation = 'migration'
else:
operation = 'resize'
LOG.error('Failed to clean allocation after a failed '
'%(operation)s on node %(node)s',
{'operation': operation, 'node': cn.uuid},
instance=instance)
def _find_orphaned_instances(self): def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for """Given the set of instances and migrations already account for
by resource tracker, sanity check the hypervisor to determine by resource tracker, sanity check the hypervisor to determine

View File

@ -1029,11 +1029,15 @@ class SchedulerReportClient(object):
then PUTs the resulting allocation back to the placement API for the then PUTs the resulting allocation back to the placement API for the
consumer. consumer.
We call this method on two occasions: on the source host during a We call this method on three occasions:
confirm_resize() operation and on the destination host during a
revert_resize() operation. This is important to reconcile the 1. On the source host during a confirm_resize() operation.
"doubled-up" allocation that the scheduler constructs when claiming 2. On the destination host during a revert_resize() operation.
resources against the destination host during a move operation. 3. On the destination host when prep_resize fails.
This is important to reconcile the "doubled-up" allocation that the
scheduler constructs when claiming resources against the destination
host during a move operation.
If the move was between hosts, the entire allocation for rp_uuid will If the move was between hosts, the entire allocation for rp_uuid will
be dropped. If the move is a resize on the same host, then we will be dropped. If the move is a resize on the same host, then we will

View File

@ -1949,39 +1949,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation( self.assertFlavorMatchesAllocation(
{'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
def test_rescheduling_when_migrating_instance(self): def _wait_for_prep_resize_fail_completion(self, server, expected_action):
"""Tests that allocations are removed from the destination node by """Polls instance action events for the given instance and action
the compute service when a cold migrate / resize fails and a reschedule until it finds the compute_prep_resize action event with an error
request is sent back to conductor. result.
""" """
source_hostname = self.compute1.manager.host
server = self._boot_and_check_allocations(
self.flavor1, source_hostname)
def fake_prep_resize(*args, **kwargs):
raise test.TestingException('Simulated _prep_resize failure.')
# Yes this isn't great in a functional test, but it's simple.
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
fake_prep_resize)
# Now migrate the server which is going to fail on the destination.
self.api.post_server_action(server['id'], {'migrate': None})
# When the destination compute calls conductor and conductor calls
# the scheduler to get another host, it's going to fail since there
# are only two hosts and we started on the first and failed on the
# second, so the scheduler will raise NoValidHost and a fault will
# be recorded on the instance. However, the instance is not put into
# ERROR state since it's still OK on the source host, but faults are
# only shown in the API for ERROR/DELETED instances, so we can't poll
# for the fault. So we poll for instance action events instead.
completion_event = None completion_event = None
for attempt in range(10): for attempt in range(10):
actions = self.api.get_instance_actions(server['id']) actions = self.api.get_instance_actions(server['id'])
# Look for the migrate action. # Look for the migrate action.
for action in actions: for action in actions:
if action['action'] == 'migrate': if action['action'] == expected_action:
events = ( events = (
self.api.api_get( self.api.api_get(
'/servers/%s/os-instance-actions/%s' % '/servers/%s/os-instance-actions/%s' %
@ -2005,19 +1983,78 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.fail('Timed out waiting for compute_prep_resize failure ' self.fail('Timed out waiting for compute_prep_resize failure '
'event. Current instance actions: %s' % actions) 'event. Current instance actions: %s' % actions)
def test_rescheduling_when_migrating_instance(self):
"""Tests that allocations are removed from the destination node by
the compute service when a cold migrate / resize fails and a reschedule
request is sent back to conductor.
"""
source_hostname = self.compute1.manager.host
server = self._boot_and_check_allocations(
self.flavor1, source_hostname)
def fake_prep_resize(*args, **kwargs):
raise test.TestingException('Simulated _prep_resize failure.')
# Yes this isn't great in a functional test, but it's simple.
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
fake_prep_resize)
# Now migrate the server which is going to fail on the destination.
self.api.post_server_action(server['id'], {'migrate': None})
self._wait_for_prep_resize_fail_completion(server, 'migrate')
dest_hostname = self._other_hostname(source_hostname) dest_hostname = self._other_hostname(source_hostname)
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
failed_usages = self._get_provider_usages(dest_rp_uuid) failed_usages = self._get_provider_usages(dest_rp_uuid)
# FIXME(mriedem): Due to bug 1712850 the allocations are not removed # Expects no allocation records on the failed host.
# from the destination node before the reschedule. Remove this once self.assertFlavorMatchesAllocation(
# the bug is fixed as it should be: {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
# # Expects no allocation records on the failed host.
# self.assertFlavorMatchesAllocation(
# {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
self.assertFlavorMatchesAllocation(self.flavor1, failed_usages)
# Ensure the allocation records still exist on the source host. # Ensure the allocation records still exist on the source host.
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname) source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
source_usages = self._get_provider_usages(source_rp_uuid) source_usages = self._get_provider_usages(source_rp_uuid)
self.assertFlavorMatchesAllocation(self.flavor1, source_usages) self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
def test_resize_to_same_host_prep_resize_fails(self):
"""Tests that when we resize to the same host and resize fails in
the prep_resize method, we cleanup the allocations before rescheduling.
"""
# make sure that the test only uses a single host
compute2_service_id = self.admin_api.get_services(
host=self.compute2.host, binary='nova-compute')[0]['id']
self.admin_api.put_service(compute2_service_id, {'status': 'disabled'})
hostname = self.compute1.manager.host
rp_uuid = self._get_provider_uuid_by_host(hostname)
server = self._boot_and_check_allocations(self.flavor1, hostname)
def fake_prep_resize(*args, **kwargs):
# Ensure the allocations are doubled now before we fail.
usages = self._get_provider_usages(rp_uuid)
self.assertFlavorsMatchAllocation(
self.flavor1, self.flavor2, usages)
raise test.TestingException('Simulated _prep_resize failure.')
# Yes this isn't great in a functional test, but it's simple.
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
fake_prep_resize)
self.flags(allow_resize_to_same_host=True)
resize_req = {
'resize': {
'flavorRef': self.flavor2['id']
}
}
self.api.post_server_action(server['id'], resize_req)
self._wait_for_prep_resize_fail_completion(server, 'resize')
# Ensure the allocation records still exist on the host.
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
source_usages = self._get_provider_usages(source_rp_uuid)
# The new_flavor should have been subtracted from the doubled
# allocation which just leaves us with the original flavor.
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)