Merge "Cleanup allocations on invalid dest node during live migration" into stable/pike

This commit is contained in:
Jenkins 2017-09-15 22:19:12 +00:00 committed by Gerrit Code Review
commit 46554875d2
4 changed files with 89 additions and 31 deletions

View File

@@ -267,8 +267,9 @@ class LiveMigrationTask(base.TaskBase):
self._check_not_over_max_retries(attempted_hosts)
request_spec.ignore_hosts = attempted_hosts
try:
host = self.scheduler_client.select_destinations(self.context,
request_spec, [self.instance.uuid])[0]['host']
hoststate = self.scheduler_client.select_destinations(
self.context, request_spec, [self.instance.uuid])[0]
host = hoststate['host']
except messaging.RemoteError as ex:
# TODO(ShaoHe Feng) There maybe multi-scheduler, and the
# scheduling algorithm is R-R, we can let other scheduler try.
@@ -284,9 +285,47 @@ class LiveMigrationTask(base.TaskBase):
LOG.debug("Skipping host: %(host)s because: %(e)s",
{"host": host, "e": e})
attempted_hosts.append(host)
# The scheduler would have created allocations against the
# selected destination host in Placement, so we need to remove
# those before moving on.
self._remove_host_allocations(host, hoststate['nodename'])
host = None
return host
def _remove_host_allocations(self, host, node):
    """Remove this instance's allocations against the given host/node.

    Used to undo the allocations the scheduler made against a selected
    destination when that destination turns out to be unusable.

    :param host: The name of the host.
    :param node: The name of the node.
    """
    # The placement call needs the compute node UUID, so resolve the
    # host/node pair to a ComputeNode object first.
    # TODO(mriedem): If the result of select_destinations eventually
    # returns the compute node uuid, we wouldn't need to look it
    # up via host/node and we can save some time.
    try:
        cn = objects.ComputeNode.get_by_host_and_nodename(
            self.context, host, node)
    except exception.ComputeHostNotFound:
        # This shouldn't happen, but we're being careful: a missing
        # compute node just means there is nothing to clean up.
        LOG.info('Unable to remove instance allocations from host %s '
                 'and node %s since it was not found.', host, node,
                 instance=self.instance)
        return

    # Work out, from the flavor, which resource class amounts the
    # scheduler would have allocated on the node for this instance.
    flavor_resources = scheduler_utils.resources_from_flavor(
        self.instance, self.instance.flavor)

    # Strip only this node's provider from the instance's allocations.
    # Allocations against other nodes or shared resource providers are
    # left intact; this just undoes what the scheduler claimed for the
    # given (destination) node.
    reportclient = self.scheduler_client.reportclient
    reportclient.remove_provider_from_instance_allocation(
        self.instance.uuid, cn.uuid, self.instance.user_id,
        self.instance.project_id, flavor_resources)
def _check_not_over_max_retries(self, attempted_hosts):
if CONF.migrate_max_retries == -1:
return

View File

@@ -1014,13 +1014,7 @@ class SchedulerReportClient(object):
then PUTs the resulting allocation back to the placement API for the
consumer.
We call this method on three occasions:
1. On the source host during a confirm_resize() operation.
2. On the destination host during a revert_resize() operation.
3. On the destination host when prep_resize fails.
This is important to reconcile the "doubled-up" allocation that the
This is used to reconcile the "doubled-up" allocation that the
scheduler constructs when claiming resources against the destination
host during a move operation.

View File

@@ -2185,23 +2185,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
# FIXME(mriedem): This is bug 1712411 where the allocations, created
# by the scheduler, via conductor, are not cleaned up on the migration
# pre-check error. Uncomment the following code when this is fixed.
self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
# # Assert the allocations, created by the scheduler, are cleaned up
# # after the migration pre-check error happens.
# self.assertFlavorMatchesAllocation(
# {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
# Assert the allocations, created by the scheduler, are cleaned up
# after the migration pre-check error happens.
self.assertFlavorMatchesAllocation(
{'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
# FIXME(mriedem): After bug 1712411 is fixed there should only be 1
# allocation for the instance, on the source node.
self.assertEqual(2, len(allocations))
# There should only be 1 allocation for the instance on the source node
self.assertEqual(1, len(allocations))
self.assertIn(source_rp_uuid, allocations)
self.assertIn(dest_rp_uuid, allocations)
dest_allocation = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
self.assertFlavorMatchesAllocation(
self.flavor1, allocations[source_rp_uuid]['resources'])
self._delete_and_check_allocations(
server, source_rp_uuid, dest_rp_uuid)

View File

@@ -375,7 +375,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(error)
@@ -386,7 +386,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_old_hypervisor(self):
self._test_find_destination_retry_hypervisor_raises(
@@ -411,7 +415,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.Invalid)
@@ -423,7 +427,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_failed_migration_pre_checks(self):
self.flags(migrate_max_retries=1)
@@ -440,7 +448,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.MigrationPreCheckError("reason"))
@@ -452,7 +460,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_exceeds_max(self):
self.flags(migrate_max_retries=0)
@@ -468,16 +480,23 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(exception.DestinationHypervisorTooOld)
self.mox.ReplayAll()
with mock.patch.object(self.task.migration, 'save') as save_mock:
with test.nested(
mock.patch.object(self.task.migration, 'save'),
mock.patch.object(self.task, '_remove_host_allocations')
) as (
save_mock, remove_allocs
):
self.assertRaises(exception.MaxRetriesExceeded,
self.task._find_destination)
self.assertEqual('failed', self.task.migration.status)
save_mock.assert_called_once_with()
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_when_runs_out_of_hosts(self):
self.mox.StubOutWithMock(utils, 'get_image_from_system_metadata')
@@ -542,3 +561,15 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._get_destination_cell_mapping)
mock_get.assert_called_once_with(
self.task.context, self.task.destination)
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename',
                   side_effect=exception.ComputeHostNotFound(host='host'))
def test_remove_host_allocations_compute_host_not_found(self, mock_get_cn):
    """A missing ComputeNode must not blow up _remove_host_allocations.

    When the host/node lookup raises ComputeHostNotFound, the method
    should return quietly without calling placement.
    """
    reportclient = self.task.scheduler_client.reportclient
    with mock.patch.object(
            reportclient,
            'remove_provider_from_instance_allocation') as mock_remove:
        self.task._remove_host_allocations('host', 'node')
    # No compute node means nothing to remove from placement.
    mock_remove.assert_not_called()