Merge "Cleanup allocations in failed prep_resize"
This commit is contained in:
commit
a99e936ff1
@ -3789,6 +3789,7 @@ class ComputeManager(manager.Manager):
|
|||||||
current_period=True)
|
current_period=True)
|
||||||
self._notify_about_instance_usage(
|
self._notify_about_instance_usage(
|
||||||
context, instance, "resize.prep.start")
|
context, instance, "resize.prep.start")
|
||||||
|
failed = False
|
||||||
try:
|
try:
|
||||||
self._prep_resize(context, image, instance,
|
self._prep_resize(context, image, instance,
|
||||||
instance_type, filter_properties,
|
instance_type, filter_properties,
|
||||||
@ -3797,14 +3798,31 @@ class ComputeManager(manager.Manager):
|
|||||||
# instance to be migrated is backed by LVM.
|
# instance to be migrated is backed by LVM.
|
||||||
# Remove when LVM migration is implemented.
|
# Remove when LVM migration is implemented.
|
||||||
except exception.MigrationPreCheckError:
|
except exception.MigrationPreCheckError:
|
||||||
|
# TODO(mriedem): How is it even possible to get here?
|
||||||
|
# _prep_resize does not call the driver. The resize_instance
|
||||||
|
# method does, but we RPC cast to the source node to do that
|
||||||
|
# so we shouldn't even get this exception...
|
||||||
|
failed = True
|
||||||
raise
|
raise
|
||||||
except Exception:
|
except Exception:
|
||||||
|
failed = True
|
||||||
# try to re-schedule the resize elsewhere:
|
# try to re-schedule the resize elsewhere:
|
||||||
exc_info = sys.exc_info()
|
exc_info = sys.exc_info()
|
||||||
self._reschedule_resize_or_reraise(context, image, instance,
|
self._reschedule_resize_or_reraise(context, image, instance,
|
||||||
exc_info, instance_type, request_spec,
|
exc_info, instance_type, request_spec,
|
||||||
filter_properties)
|
filter_properties)
|
||||||
finally:
|
finally:
|
||||||
|
if failed:
|
||||||
|
# Since we hit a failure, we're either rescheduling or dead
|
||||||
|
# and either way we need to cleanup any allocations created
|
||||||
|
# by the scheduler for the destination node. Note that for
|
||||||
|
# a resize to the same host, the scheduler will merge the
|
||||||
|
# flavors, so here we'd be subtracting the new flavor from
|
||||||
|
# the allocated resources on this node.
|
||||||
|
rt = self._get_resource_tracker()
|
||||||
|
rt.delete_allocation_for_failed_resize(
|
||||||
|
instance, node, instance_type)
|
||||||
|
|
||||||
extra_usage_info = dict(
|
extra_usage_info = dict(
|
||||||
new_instance_type=instance_type.name,
|
new_instance_type=instance_type.name,
|
||||||
new_instance_type_id=instance_type.id)
|
new_instance_type_id=instance_type.id)
|
||||||
|
@ -1237,6 +1237,30 @@ class ResourceTracker(object):
|
|||||||
'on the source node %s', cn.uuid,
|
'on the source node %s', cn.uuid,
|
||||||
instance=instance)
|
instance=instance)
|
||||||
|
|
||||||
|
def delete_allocation_for_failed_resize(self, instance, node, flavor):
|
||||||
|
"""Delete instance allocations for the node during a failed resize
|
||||||
|
|
||||||
|
:param instance: The instance being resized/migrated.
|
||||||
|
:param node: The node provider on which the instance should have
|
||||||
|
allocations to remove. If this is a resize to the same host, then
|
||||||
|
the new_flavor resources are subtracted from the single allocation.
|
||||||
|
:param flavor: This is the new_flavor during a resize.
|
||||||
|
"""
|
||||||
|
resources = scheduler_utils.resources_from_flavor(instance, flavor)
|
||||||
|
cn = self.compute_nodes[node]
|
||||||
|
res = self.reportclient.remove_provider_from_instance_allocation(
|
||||||
|
instance.uuid, cn.uuid, instance.user_id, instance.project_id,
|
||||||
|
resources)
|
||||||
|
if not res:
|
||||||
|
if instance.instance_type_id == flavor.id:
|
||||||
|
operation = 'migration'
|
||||||
|
else:
|
||||||
|
operation = 'resize'
|
||||||
|
LOG.error('Failed to clean allocation after a failed '
|
||||||
|
'%(operation)s on node %(node)s',
|
||||||
|
{'operation': operation, 'node': cn.uuid},
|
||||||
|
instance=instance)
|
||||||
|
|
||||||
def _find_orphaned_instances(self):
|
def _find_orphaned_instances(self):
|
||||||
"""Given the set of instances and migrations already account for
|
"""Given the set of instances and migrations already account for
|
||||||
by resource tracker, sanity check the hypervisor to determine
|
by resource tracker, sanity check the hypervisor to determine
|
||||||
|
@ -1029,11 +1029,15 @@ class SchedulerReportClient(object):
|
|||||||
then PUTs the resulting allocation back to the placement API for the
|
then PUTs the resulting allocation back to the placement API for the
|
||||||
consumer.
|
consumer.
|
||||||
|
|
||||||
We call this method on two occasions: on the source host during a
|
We call this method on three occasions:
|
||||||
confirm_resize() operation and on the destination host during a
|
|
||||||
revert_resize() operation. This is important to reconcile the
|
1. On the source host during a confirm_resize() operation.
|
||||||
"doubled-up" allocation that the scheduler constructs when claiming
|
2. On the destination host during a revert_resize() operation.
|
||||||
resources against the destination host during a move operation.
|
3. On the destination host when prep_resize fails.
|
||||||
|
|
||||||
|
This is important to reconcile the "doubled-up" allocation that the
|
||||||
|
scheduler constructs when claiming resources against the destination
|
||||||
|
host during a move operation.
|
||||||
|
|
||||||
If the move was between hosts, the entire allocation for rp_uuid will
|
If the move was between hosts, the entire allocation for rp_uuid will
|
||||||
be dropped. If the move is a resize on the same host, then we will
|
be dropped. If the move is a resize on the same host, then we will
|
||||||
|
@ -1949,39 +1949,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
|
|||||||
self.assertFlavorMatchesAllocation(
|
self.assertFlavorMatchesAllocation(
|
||||||
{'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
|
{'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
|
||||||
|
|
||||||
def test_rescheduling_when_migrating_instance(self):
|
def _wait_for_prep_resize_fail_completion(self, server, expected_action):
|
||||||
"""Tests that allocations are removed from the destination node by
|
"""Polls instance action events for the given instance and action
|
||||||
the compute service when a cold migrate / resize fails and a reschedule
|
until it finds the compute_prep_resize action event with an error
|
||||||
request is sent back to conductor.
|
result.
|
||||||
"""
|
"""
|
||||||
source_hostname = self.compute1.manager.host
|
|
||||||
server = self._boot_and_check_allocations(
|
|
||||||
self.flavor1, source_hostname)
|
|
||||||
|
|
||||||
def fake_prep_resize(*args, **kwargs):
|
|
||||||
raise test.TestingException('Simulated _prep_resize failure.')
|
|
||||||
|
|
||||||
# Yes this isn't great in a functional test, but it's simple.
|
|
||||||
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
|
|
||||||
fake_prep_resize)
|
|
||||||
|
|
||||||
# Now migrate the server which is going to fail on the destination.
|
|
||||||
self.api.post_server_action(server['id'], {'migrate': None})
|
|
||||||
|
|
||||||
# When the destination compute calls conductor and conductor calls
|
|
||||||
# the scheduler to get another host, it's going to fail since there
|
|
||||||
# are only two hosts and we started on the first and failed on the
|
|
||||||
# second, so the scheduler will raise NoValidHost and a fault will
|
|
||||||
# be recorded on the instance. However, the instance is not put into
|
|
||||||
# ERROR state since it's still OK on the source host, but faults are
|
|
||||||
# only shown in the API for ERROR/DELETED instances, so we can't poll
|
|
||||||
# for the fault. So we poll for instance action events instead.
|
|
||||||
completion_event = None
|
completion_event = None
|
||||||
for attempt in range(10):
|
for attempt in range(10):
|
||||||
actions = self.api.get_instance_actions(server['id'])
|
actions = self.api.get_instance_actions(server['id'])
|
||||||
# Look for the migrate action.
|
# Look for the migrate action.
|
||||||
for action in actions:
|
for action in actions:
|
||||||
if action['action'] == 'migrate':
|
if action['action'] == expected_action:
|
||||||
events = (
|
events = (
|
||||||
self.api.api_get(
|
self.api.api_get(
|
||||||
'/servers/%s/os-instance-actions/%s' %
|
'/servers/%s/os-instance-actions/%s' %
|
||||||
@ -2005,19 +1983,78 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
|
|||||||
self.fail('Timed out waiting for compute_prep_resize failure '
|
self.fail('Timed out waiting for compute_prep_resize failure '
|
||||||
'event. Current instance actions: %s' % actions)
|
'event. Current instance actions: %s' % actions)
|
||||||
|
|
||||||
|
def test_rescheduling_when_migrating_instance(self):
|
||||||
|
"""Tests that allocations are removed from the destination node by
|
||||||
|
the compute service when a cold migrate / resize fails and a reschedule
|
||||||
|
request is sent back to conductor.
|
||||||
|
"""
|
||||||
|
source_hostname = self.compute1.manager.host
|
||||||
|
server = self._boot_and_check_allocations(
|
||||||
|
self.flavor1, source_hostname)
|
||||||
|
|
||||||
|
def fake_prep_resize(*args, **kwargs):
|
||||||
|
raise test.TestingException('Simulated _prep_resize failure.')
|
||||||
|
|
||||||
|
# Yes this isn't great in a functional test, but it's simple.
|
||||||
|
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
|
||||||
|
fake_prep_resize)
|
||||||
|
|
||||||
|
# Now migrate the server which is going to fail on the destination.
|
||||||
|
self.api.post_server_action(server['id'], {'migrate': None})
|
||||||
|
|
||||||
|
self._wait_for_prep_resize_fail_completion(server, 'migrate')
|
||||||
|
|
||||||
dest_hostname = self._other_hostname(source_hostname)
|
dest_hostname = self._other_hostname(source_hostname)
|
||||||
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
|
dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
|
||||||
|
|
||||||
failed_usages = self._get_provider_usages(dest_rp_uuid)
|
failed_usages = self._get_provider_usages(dest_rp_uuid)
|
||||||
# FIXME(mriedem): Due to bug 1712850 the allocations are not removed
|
# Expects no allocation records on the failed host.
|
||||||
# from the destination node before the reschedule. Remove this once
|
self.assertFlavorMatchesAllocation(
|
||||||
# the bug is fixed as it should be:
|
{'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
|
||||||
# # Expects no allocation records on the failed host.
|
|
||||||
# self.assertFlavorMatchesAllocation(
|
|
||||||
# {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
|
|
||||||
self.assertFlavorMatchesAllocation(self.flavor1, failed_usages)
|
|
||||||
|
|
||||||
# Ensure the allocation records still exist on the source host.
|
# Ensure the allocation records still exist on the source host.
|
||||||
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
|
source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
|
||||||
source_usages = self._get_provider_usages(source_rp_uuid)
|
source_usages = self._get_provider_usages(source_rp_uuid)
|
||||||
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
|
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
|
||||||
|
|
||||||
|
def test_resize_to_same_host_prep_resize_fails(self):
|
||||||
|
"""Tests that when we resize to the same host and resize fails in
|
||||||
|
the prep_resize method, we cleanup the allocations before rescheduling.
|
||||||
|
"""
|
||||||
|
# make sure that the test only uses a single host
|
||||||
|
compute2_service_id = self.admin_api.get_services(
|
||||||
|
host=self.compute2.host, binary='nova-compute')[0]['id']
|
||||||
|
self.admin_api.put_service(compute2_service_id, {'status': 'disabled'})
|
||||||
|
|
||||||
|
hostname = self.compute1.manager.host
|
||||||
|
rp_uuid = self._get_provider_uuid_by_host(hostname)
|
||||||
|
|
||||||
|
server = self._boot_and_check_allocations(self.flavor1, hostname)
|
||||||
|
|
||||||
|
def fake_prep_resize(*args, **kwargs):
|
||||||
|
# Ensure the allocations are doubled now before we fail.
|
||||||
|
usages = self._get_provider_usages(rp_uuid)
|
||||||
|
self.assertFlavorsMatchAllocation(
|
||||||
|
self.flavor1, self.flavor2, usages)
|
||||||
|
raise test.TestingException('Simulated _prep_resize failure.')
|
||||||
|
|
||||||
|
# Yes this isn't great in a functional test, but it's simple.
|
||||||
|
self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
|
||||||
|
fake_prep_resize)
|
||||||
|
|
||||||
|
self.flags(allow_resize_to_same_host=True)
|
||||||
|
resize_req = {
|
||||||
|
'resize': {
|
||||||
|
'flavorRef': self.flavor2['id']
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.api.post_server_action(server['id'], resize_req)
|
||||||
|
|
||||||
|
self._wait_for_prep_resize_fail_completion(server, 'resize')
|
||||||
|
|
||||||
|
# Ensure the allocation records still exist on the host.
|
||||||
|
source_rp_uuid = self._get_provider_uuid_by_host(hostname)
|
||||||
|
source_usages = self._get_provider_usages(source_rp_uuid)
|
||||||
|
# The new_flavor should have been subtracted from the doubled
|
||||||
|
# allocation which just leaves us with the original flavor.
|
||||||
|
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
|
||||||
|
Loading…
Reference in New Issue
Block a user