From a1462d2608bd0c3ee9cd8686b145bb1d84ffab3b Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Tue, 29 Aug 2017 12:28:43 -0400
Subject: [PATCH] Cleanup allocations on invalid dest node during live
 migration

Starting in Pike, the scheduler creates an allocation against the
chosen destination node during live migration. This happens before the
destination node pre-checks run, and those pre-checks can fail and
trigger a retry to the scheduler. However, the allocations created in
Placement against the failed destination node were not being cleaned
up.

This change adds cleanup code to the live migration task in conductor
to remove the allocations against the failed destination node before
retrying.

The functional test that recreates the bug is updated to show that the
bug is now fixed.

Also updates the docstring in the SchedulerReportClient
remove_provider_from_instance_allocation method so that we no longer
have to enumerate all of the places that call it.

Change-Id: I41e5e1fa9938b5e04f7e20f78ccd77eca658885f
Closes-Bug: #1712411
(cherry picked from commit 94b904abbad1c9655b6dec1a2e58d73bc913ed47)
---
 nova/conductor/tasks/live_migrate.py          | 43 ++++++++++++++++-
 nova/scheduler/client/report.py               |  8 +---
 nova/tests/functional/test_servers.py         | 22 ++++-----
 .../unit/conductor/tasks/test_live_migrate.py | 47 +++++++++++++++----
 4 files changed, 89 insertions(+), 31 deletions(-)

diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py
index e063b01991c8..0fb8bc527cbf 100644
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@@ -329,8 +329,9 @@ class LiveMigrationTask(base.TaskBase):
         self._check_not_over_max_retries(attempted_hosts)
         request_spec.ignore_hosts = attempted_hosts
         try:
-            host = self.scheduler_client.select_destinations(self.context,
-                request_spec, [self.instance.uuid])[0]['host']
+            hoststate = self.scheduler_client.select_destinations(
+                self.context, request_spec, [self.instance.uuid])[0]
+            host = hoststate['host']
         except messaging.RemoteError as ex:
             # TODO(ShaoHe Feng) There maybe multi-scheduler, and the
             # scheduling algorithm is R-R, we can let other scheduler try.
@@ -346,9 +347,47 @@ class LiveMigrationTask(base.TaskBase):
                 LOG.debug("Skipping host: %(host)s because: %(e)s",
                           {"host": host, "e": e})
                 attempted_hosts.append(host)
+                # The scheduler would have created allocations against the
+                # selected destination host in Placement, so we need to
+                # remove those before moving on.
+                self._remove_host_allocations(host, hoststate['nodename'])
                 host = None
         return host
 
+    def _remove_host_allocations(self, host, node):
+        """Removes instance allocations against the given host from Placement.
+
+        :param host: The name of the host.
+        :param node: The name of the node.
+        """
+        # Get the compute node object since we need the UUID.
+        # TODO(mriedem): If the result of select_destinations eventually
+        # returns the compute node uuid, we wouldn't need to look it
+        # up via host/node and we can save some time.
+        try:
+            compute_node = objects.ComputeNode.get_by_host_and_nodename(
+                self.context, host, node)
+        except exception.ComputeHostNotFound:
+            # This shouldn't happen, but we're being careful.
+            LOG.info('Unable to remove instance allocations from host %s '
+                     'and node %s since it was not found.', host, node,
+                     instance=self.instance)
+            return
+
+        # Calculate the resource class amounts to subtract from the
+        # allocations on the node based on the instance flavor.
+        resources = scheduler_utils.resources_from_flavor(
+            self.instance, self.instance.flavor)
+
+        # Now remove the allocations for our instance against that node.
+        # Note that this does not remove allocations against any other node
+        # or shared resource provider, it's just undoing what the scheduler
+        # allocated for the given (destination) node.
+        self.scheduler_client.reportclient.\
+            remove_provider_from_instance_allocation(
+                self.instance.uuid, compute_node.uuid, self.instance.user_id,
+                self.instance.project_id, resources)
+
     def _check_not_over_max_retries(self, attempted_hosts):
         if CONF.migrate_max_retries == -1:
             return
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index 08b5235188f7..c7099c16dd79 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -1014,13 +1014,7 @@ class SchedulerReportClient(object):
         then PUTs the resulting allocation back to the placement API for the
         consumer.
 
-        We call this method on three occasions:
-
-        1. On the source host during a confirm_resize() operation.
-        2. On the destination host during a revert_resize() operation.
-        3. On the destination host when prep_resize fails.
-
-        This is important to reconcile the "doubled-up" allocation that the
+        This is used to reconcile the "doubled-up" allocation that the
         scheduler constructs when claiming resources against the destination
         host during a move operation.
 
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index 9787f6ee1697..6f259bfb9edd 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1953,23 +1953,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
 
         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # FIXME(mriedem): This is bug 1712411 where the allocations, created
-        # by the scheduler, via conductor, are not cleaned up on the migration
-        # pre-check error. Uncomment the following code when this is fixed.
-        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-        # # Assert the allocations, created by the scheduler, are cleaned up
-        # # after the migration pre-check error happens.
-        # self.assertFlavorMatchesAllocation(
-        #     {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
+        # Assert the allocations, created by the scheduler, are cleaned up
+        # after the migration pre-check error happens.
+        self.assertFlavorMatchesAllocation(
+            {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # FIXME(mriedem): After bug 1712411 is fixed there should only be 1
-        # allocation for the instance, on the source node.
-        self.assertEqual(2, len(allocations))
+        # There should only be 1 allocation for the instance, on the
+        # source node.
+        self.assertEqual(1, len(allocations))
         self.assertIn(source_rp_uuid, allocations)
-        self.assertIn(dest_rp_uuid, allocations)
-        dest_allocation = allocations[dest_rp_uuid]['resources']
-        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
+        self.assertFlavorMatchesAllocation(
+            self.flavor1, allocations[source_rp_uuid]['resources'])
 
         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
diff --git a/nova/tests/unit/conductor/tasks/test_live_migrate.py b/nova/tests/unit/conductor/tasks/test_live_migrate.py
index 29a2ba407d19..5ae6303c97c6 100644
--- a/nova/tests/unit/conductor/tasks/test_live_migrate.py
+++ b/nova/tests/unit/conductor/tasks/test_live_migrate.py
@@ -373,7 +373,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         scheduler_utils.setup_instance_group(self.context, self.fake_spec)
         self.task.scheduler_client.select_destinations(self.context,
                 self.fake_spec, [self.instance.uuid]).AndReturn(
-                [{'host': 'host1'}])
+                [{'host': 'host1', 'nodename': 'node1'}])
         self.task._check_compatible_with_source_hypervisor("host1")\
             .AndRaise(error)
 
@@ -384,7 +384,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         self.task._call_livem_checks_on_host("host2")
 
         self.mox.ReplayAll()
-        self.assertEqual("host2", self.task._find_destination())
+        with mock.patch.object(self.task,
+                               '_remove_host_allocations') as remove_allocs:
+            self.assertEqual("host2", self.task._find_destination())
+            # Should have removed allocations for the first host.
+            remove_allocs.assert_called_once_with('host1', 'node1')
 
     def test_find_destination_retry_with_old_hypervisor(self):
         self._test_find_destination_retry_hypervisor_raises(
@@ -409,7 +413,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         scheduler_utils.setup_instance_group(self.context, self.fake_spec)
         self.task.scheduler_client.select_destinations(self.context,
                 self.fake_spec, [self.instance.uuid]).AndReturn(
-                [{'host': 'host1'}])
+                [{'host': 'host1', 'nodename': 'node1'}])
         self.task._check_compatible_with_source_hypervisor("host1")
         self.task._call_livem_checks_on_host("host1")\
             .AndRaise(exception.Invalid)
@@ -421,7 +425,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         self.task._call_livem_checks_on_host("host2")
 
         self.mox.ReplayAll()
-        self.assertEqual("host2", self.task._find_destination())
+        with mock.patch.object(self.task,
+                               '_remove_host_allocations') as remove_allocs:
+            self.assertEqual("host2", self.task._find_destination())
+            # Should have removed allocations for the first host.
+ remove_allocs.assert_called_once_with('host1', 'node1') def test_find_destination_retry_with_failed_migration_pre_checks(self): self.flags(migrate_max_retries=1) @@ -438,7 +446,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase): scheduler_utils.setup_instance_group(self.context, self.fake_spec) self.task.scheduler_client.select_destinations(self.context, self.fake_spec, [self.instance.uuid]).AndReturn( - [{'host': 'host1'}]) + [{'host': 'host1', 'nodename': 'node1'}]) self.task._check_compatible_with_source_hypervisor("host1") self.task._call_livem_checks_on_host("host1")\ .AndRaise(exception.MigrationPreCheckError("reason")) @@ -450,7 +458,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase): self.task._call_livem_checks_on_host("host2") self.mox.ReplayAll() - self.assertEqual("host2", self.task._find_destination()) + with mock.patch.object(self.task, + '_remove_host_allocations') as remove_allocs: + self.assertEqual("host2", self.task._find_destination()) + # Should have removed allocations for the first host. + remove_allocs.assert_called_once_with('host1', 'node1') def test_find_destination_retry_exceeds_max(self): self.flags(migrate_max_retries=0) @@ -466,16 +478,23 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase): scheduler_utils.setup_instance_group(self.context, self.fake_spec) self.task.scheduler_client.select_destinations(self.context, self.fake_spec, [self.instance.uuid]).AndReturn( - [{'host': 'host1'}]) + [{'host': 'host1', 'nodename': 'node1'}]) self.task._check_compatible_with_source_hypervisor("host1")\ .AndRaise(exception.DestinationHypervisorTooOld) self.mox.ReplayAll() - with mock.patch.object(self.task.migration, 'save') as save_mock: + with test.nested( + mock.patch.object(self.task.migration, 'save'), + mock.patch.object(self.task, '_remove_host_allocations') + ) as ( + save_mock, remove_allocs + ): self.assertRaises(exception.MaxRetriesExceeded, self.task._find_destination) self.assertEqual('failed', self.task.migration.status) save_mock.assert_called_once_with() + # Should have removed allocations for the first host. + remove_allocs.assert_called_once_with('host1', 'node1') def test_find_destination_when_runs_out_of_hosts(self): self.mox.StubOutWithMock(utils, 'get_image_from_system_metadata') @@ -636,3 +655,15 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase): self.task._get_destination_cell_mapping) mock_get.assert_called_once_with( self.task.context, self.task.destination) + + @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename', + side_effect=exception.ComputeHostNotFound(host='host')) + def test_remove_host_allocations_compute_host_not_found(self, get_cn): + """Tests that failing to find a ComputeNode will not blow up + the _remove_host_allocations method. + """ + with mock.patch.object( + self.task.scheduler_client.reportclient, + 'remove_provider_from_instance_allocation') as remove_provider: + self.task._remove_host_allocations('host', 'node') + remove_provider.assert_not_called()
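
Reviewer note (not part of the patch): below is a minimal, self-contained
sketch of the retry-with-cleanup pattern that _find_destination now follows.
Every name in it (FakePlacement, PreCheckError, find_destination, the toy
flavor dict) is a hypothetical stand-in for illustration; it is not Nova
code or the real Placement API.

    # remove_allocs_sketch.py - illustrative only; not part of the patch.


    class PreCheckError(Exception):
        """Raised when a candidate destination fails its pre-checks."""


    class FakePlacement(object):
        """Toy stand-in for Placement: tracks per-node allocations."""

        def __init__(self):
            self.allocations = {}  # {nodename: {resource class: amount}}

        def allocate(self, node, resources):
            # The scheduler claims resources against the chosen node
            # up front, before any destination pre-checks run.
            self.allocations[node] = dict(resources)

        def remove_allocation(self, node):
            # Undo only the allocation against the given node; allocations
            # against other providers are left untouched.
            self.allocations.pop(node, None)


    def find_destination(candidates, pre_check, placement, resources,
                         max_retries=3):
        """Try candidates in order, cleaning up Placement on each failure."""
        attempted = []
        for node in candidates:
            if len(attempted) > max_retries:
                raise RuntimeError('Exceeded max scheduling retries')
            placement.allocate(node, resources)
            try:
                pre_check(node)
                return node
            except PreCheckError:
                attempted.append(node)
                # The fix in this patch: remove the allocation created for
                # the failed destination before retrying the scheduler.
                placement.remove_allocation(node)
        raise RuntimeError('No valid host found: %s' % attempted)


    if __name__ == '__main__':
        placement = FakePlacement()
        flavor = {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20}

        def pre_check(node):
            if node == 'node1':  # simulate a pre-check failure on node1
                raise PreCheckError(node)

        dest = find_destination(['node1', 'node2'], pre_check, placement,
                                flavor)
        assert dest == 'node2'
        # Only the successful destination holds an allocation afterwards.
        assert list(placement.allocations) == ['node2']
        print('migrated to %s; allocations: %s'
              % (dest, placement.allocations))

In the real change, the amounts to remove come from
scheduler_utils.resources_from_flavor() and the removal goes through
SchedulerReportClient.remove_provider_from_instance_allocation(), which
subtracts the flavor's resources from the "doubled-up" allocation for just
the failed destination provider rather than deleting the consumer's
allocations wholesale.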