Cleanup allocations on invalid dest node during live migration

Starting in Pike, the scheduler creates an allocation on a
chosen destination node during live migration. This happens
before the destination node pre-checks occur, which could
fail and trigger a retry to the scheduler. The allocations
created in Placement against the failed destination node
were not being cleaned up though.

This change adds some cleanup code to the live migration task
in conductor to clean the allocations for the failed destination
node before retrying.

The functional recreate test for the bug is updated to show
that the bug is fixed now.

Also updates the docstring in the SchedulerReportClient
remove_provider_from_instance_allocation method so that we
no longer have to enumerate all of the places that call it.

Change-Id: I41e5e1fa9938b5e04f7e20f78ccd77eca658885f
Closes-Bug: #1712411
This commit is contained in:
Matt Riedemann 2017-08-29 12:28:43 -04:00
parent 0b05655ef8
commit 94b904abba
4 changed files with 89 additions and 31 deletions

View File

@ -329,8 +329,9 @@ class LiveMigrationTask(base.TaskBase):
self._check_not_over_max_retries(attempted_hosts)
request_spec.ignore_hosts = attempted_hosts
try:
host = self.scheduler_client.select_destinations(self.context,
request_spec, [self.instance.uuid])[0]['host']
hoststate = self.scheduler_client.select_destinations(
self.context, request_spec, [self.instance.uuid])[0]
host = hoststate['host']
except messaging.RemoteError as ex:
# TODO(ShaoHe Feng) There maybe multi-scheduler, and the
# scheduling algorithm is R-R, we can let other scheduler try.
@ -346,9 +347,47 @@ class LiveMigrationTask(base.TaskBase):
LOG.debug("Skipping host: %(host)s because: %(e)s",
{"host": host, "e": e})
attempted_hosts.append(host)
# The scheduler would have created allocations against the
# selected destination host in Placement, so we need to remove
# those before moving on.
self._remove_host_allocations(host, hoststate['nodename'])
host = None
return host
def _remove_host_allocations(self, host, node):
"""Removes instance allocations against the given host from Placement
:param host: The name of the host.
:param node: The name of the node.
"""
# Get the compute node object since we need the UUID.
# TODO(mriedem): If the result of select_destinations eventually
# returns the compute node uuid, we wouldn't need to look it
# up via host/node and we can save some time.
try:
compute_node = objects.ComputeNode.get_by_host_and_nodename(
self.context, host, node)
except exception.ComputeHostNotFound:
# This shouldn't happen, but we're being careful.
LOG.info('Unable to remove instance allocations from host %s '
'and node %s since it was not found.', host, node,
instance=self.instance)
return
# Calculate the resource class amounts to subtract from the allocations
# on the node based on the instance flavor.
resources = scheduler_utils.resources_from_flavor(
self.instance, self.instance.flavor)
# Now remove the allocations for our instance against that node.
# Note that this does not remove allocations against any other node
# or shared resource provider, it's just undoing what the scheduler
# allocated for the given (destination) node.
self.scheduler_client.reportclient.\
remove_provider_from_instance_allocation(
self.instance.uuid, compute_node.uuid, self.instance.user_id,
self.instance.project_id, resources)
def _check_not_over_max_retries(self, attempted_hosts):
if CONF.migrate_max_retries == -1:
return

View File

@ -1029,13 +1029,7 @@ class SchedulerReportClient(object):
then PUTs the resulting allocation back to the placement API for the
consumer.
We call this method on three occasions:
1. On the source host during a confirm_resize() operation.
2. On the destination host during a revert_resize() operation.
3. On the destination host when prep_resize fails.
This is important to reconcile the "doubled-up" allocation that the
This is used to reconcile the "doubled-up" allocation that the
scheduler constructs when claiming resources against the destination
host during a move operation.

View File

@ -1949,23 +1949,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
# FIXME(mriedem): This is bug 1712411 where the allocations, created
# by the scheduler, via conductor, are not cleaned up on the migration
# pre-check error. Uncomment the following code when this is fixed.
self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
# # Assert the allocations, created by the scheduler, are cleaned up
# # after the migration pre-check error happens.
# self.assertFlavorMatchesAllocation(
# {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
# Assert the allocations, created by the scheduler, are cleaned up
# after the migration pre-check error happens.
self.assertFlavorMatchesAllocation(
{'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
# FIXME(mriedem): After bug 1712411 is fixed there should only be 1
# allocation for the instance, on the source node.
self.assertEqual(2, len(allocations))
# There should only be 1 allocation for the instance on the source node
self.assertEqual(1, len(allocations))
self.assertIn(source_rp_uuid, allocations)
self.assertIn(dest_rp_uuid, allocations)
dest_allocation = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
self.assertFlavorMatchesAllocation(
self.flavor1, allocations[source_rp_uuid]['resources'])
self._delete_and_check_allocations(
server, source_rp_uuid, dest_rp_uuid)

View File

@ -373,7 +373,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(error)
@ -384,7 +384,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_old_hypervisor(self):
self._test_find_destination_retry_hypervisor_raises(
@ -409,7 +413,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.Invalid)
@ -421,7 +425,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_failed_migration_pre_checks(self):
self.flags(migrate_max_retries=1)
@ -438,7 +446,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.MigrationPreCheckError("reason"))
@ -450,7 +458,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_exceeds_max(self):
self.flags(migrate_max_retries=0)
@ -466,16 +478,23 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(exception.DestinationHypervisorTooOld)
self.mox.ReplayAll()
with mock.patch.object(self.task.migration, 'save') as save_mock:
with test.nested(
mock.patch.object(self.task.migration, 'save'),
mock.patch.object(self.task, '_remove_host_allocations')
) as (
save_mock, remove_allocs
):
self.assertRaises(exception.MaxRetriesExceeded,
self.task._find_destination)
self.assertEqual('failed', self.task.migration.status)
save_mock.assert_called_once_with()
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_when_runs_out_of_hosts(self):
self.mox.StubOutWithMock(utils, 'get_image_from_system_metadata')
@ -636,3 +655,15 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._get_destination_cell_mapping)
mock_get.assert_called_once_with(
self.task.context, self.task.destination)
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename',
side_effect=exception.ComputeHostNotFound(host='host'))
def test_remove_host_allocations_compute_host_not_found(self, get_cn):
"""Tests that failing to find a ComputeNode will not blow up
the _remove_host_allocations method.
"""
with mock.patch.object(
self.task.scheduler_client.reportclient,
'remove_provider_from_instance_allocation') as remove_provider:
self.task._remove_host_allocations('host', 'node')
remove_provider.assert_not_called()