Cleanup allocations on invalid dest node during live migration

Starting in Pike, the scheduler creates an allocation on a chosen destination node during live migration. This happens before the destination node pre-checks occur, which could fail and trigger a retry to the scheduler. The allocations created in Placement against the failed destination node were not being cleaned up though. This change adds some cleanup code to the live migration task in conductor to clean the allocations for the failed destination node before retrying. The functional recreate test for the bug is updated to show that the bug is fixed now. Also updates the docstring in the SchedulerReportClient remove_provider_from_instance_allocation method so that we no longer have to enumerate all of the places that call it. Change-Id: I41e5e1fa9938b5e04f7e20f78ccd77eca658885f Closes-Bug: #1712411
2017-08-29 12:28:43 -04:00 · 2017-08-29 12:28:43 -04:00 · 94b904abba
parent 0b05655ef8
commit 94b904abba
4 changed files with 89 additions and 31 deletions
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@ -329,8 +329,9 @@ class LiveMigrationTask(base.TaskBase):
            self._check_not_over_max_retries(attempted_hosts)
            request_spec.ignore_hosts = attempted_hosts
            try:
-                host = self.scheduler_client.select_destinations(self.context,
-                        request_spec, [self.instance.uuid])[0]['host']
+                hoststate = self.scheduler_client.select_destinations(
+                    self.context, request_spec, [self.instance.uuid])[0]
+                host = hoststate['host']
            except messaging.RemoteError as ex:
                # TODO(ShaoHe Feng) There maybe multi-scheduler, and the
                # scheduling algorithm is R-R, we can let other scheduler try.
@ -346,9 +347,47 @@ class LiveMigrationTask(base.TaskBase):
                LOG.debug("Skipping host: %(host)s because: %(e)s",
                    {"host": host, "e": e})
                attempted_hosts.append(host)
+                # The scheduler would have created allocations against the
+                # selected destination host in Placement, so we need to remove
+                # those before moving on.
+                self._remove_host_allocations(host, hoststate['nodename'])
                host = None
        return host

+    def _remove_host_allocations(self, host, node):
+        """Removes instance allocations against the given host from Placement
+
+        :param host: The name of the host.
+        :param node: The name of the node.
+        """
+        # Get the compute node object since we need the UUID.
+        # TODO(mriedem): If the result of select_destinations eventually
+        # returns the compute node uuid, we wouldn't need to look it
+        # up via host/node and we can save some time.
+        try:
+            compute_node = objects.ComputeNode.get_by_host_and_nodename(
+                self.context, host, node)
+        except exception.ComputeHostNotFound:
+            # This shouldn't happen, but we're being careful.
+            LOG.info('Unable to remove instance allocations from host %s '
+                     'and node %s since it was not found.', host, node,
+                     instance=self.instance)
+            return
+
+        # Calculate the resource class amounts to subtract from the allocations
+        # on the node based on the instance flavor.
+        resources = scheduler_utils.resources_from_flavor(
+            self.instance, self.instance.flavor)
+
+        # Now remove the allocations for our instance against that node.
+        # Note that this does not remove allocations against any other node
+        # or shared resource provider, it's just undoing what the scheduler
+        # allocated for the given (destination) node.
+        self.scheduler_client.reportclient.\
+            remove_provider_from_instance_allocation(
+                self.instance.uuid, compute_node.uuid, self.instance.user_id,
+                self.instance.project_id, resources)
+
    def _check_not_over_max_retries(self, attempted_hosts):
        if CONF.migrate_max_retries == -1:
            return
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@ -1029,13 +1029,7 @@ class SchedulerReportClient(object):
        then PUTs the resulting allocation back to the placement API for the
        consumer.

-        We call this method on three occasions:
-
-        1. On the source host during a confirm_resize() operation.
-        2. On the destination host during a revert_resize() operation.
-        3. On the destination host when prep_resize fails.
-
-        This is important to reconcile the "doubled-up" allocation that the
+        This is used to reconcile the "doubled-up" allocation that the
        scheduler constructs when claiming resources against the destination
        host during a move operation.

--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@ -1949,23 +1949,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)

        dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # FIXME(mriedem): This is bug 1712411 where the allocations, created
-        # by the scheduler, via conductor, are not cleaned up on the migration
-        # pre-check error. Uncomment the following code when this is fixed.
-        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-        # # Assert the allocations, created by the scheduler, are cleaned up
-        # # after the migration pre-check error happens.
-        # self.assertFlavorMatchesAllocation(
-        #     {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
+        # Assert the allocations, created by the scheduler, are cleaned up
+        # after the migration pre-check error happens.
+        self.assertFlavorMatchesAllocation(
+            {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)

        allocations = self._get_allocations_by_server_uuid(server['id'])
-        # FIXME(mriedem): After bug 1712411 is fixed there should only be 1
-        # allocation for the instance, on the source node.
-        self.assertEqual(2, len(allocations))
+        # There should only be 1 allocation for the instance on the source node
+        self.assertEqual(1, len(allocations))
        self.assertIn(source_rp_uuid, allocations)
-        self.assertIn(dest_rp_uuid, allocations)
-        dest_allocation = allocations[dest_rp_uuid]['resources']
-        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
+        self.assertFlavorMatchesAllocation(
+            self.flavor1, allocations[source_rp_uuid]['resources'])

        self._delete_and_check_allocations(
            server, source_rp_uuid, dest_rp_uuid)
--- a/nova/tests/unit/conductor/tasks/test_live_migrate.py
+++ b/nova/tests/unit/conductor/tasks/test_live_migrate.py
@ -373,7 +373,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        scheduler_utils.setup_instance_group(self.context, self.fake_spec)
        self.task.scheduler_client.select_destinations(self.context,
                self.fake_spec, [self.instance.uuid]).AndReturn(
-                        [{'host': 'host1'}])
+                        [{'host': 'host1', 'nodename': 'node1'}])
        self.task._check_compatible_with_source_hypervisor("host1")\
                .AndRaise(error)

@ -384,7 +384,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.task._call_livem_checks_on_host("host2")

        self.mox.ReplayAll()
-        self.assertEqual("host2", self.task._find_destination())
+        with mock.patch.object(self.task,
+                               '_remove_host_allocations') as remove_allocs:
+            self.assertEqual("host2", self.task._find_destination())
+        # Should have removed allocations for the first host.
+        remove_allocs.assert_called_once_with('host1', 'node1')

    def test_find_destination_retry_with_old_hypervisor(self):
        self._test_find_destination_retry_hypervisor_raises(
@ -409,7 +413,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        scheduler_utils.setup_instance_group(self.context, self.fake_spec)
        self.task.scheduler_client.select_destinations(self.context,
                self.fake_spec, [self.instance.uuid]).AndReturn(
-                        [{'host': 'host1'}])
+                        [{'host': 'host1', 'nodename': 'node1'}])
        self.task._check_compatible_with_source_hypervisor("host1")
        self.task._call_livem_checks_on_host("host1")\
                .AndRaise(exception.Invalid)
@ -421,7 +425,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.task._call_livem_checks_on_host("host2")

        self.mox.ReplayAll()
-        self.assertEqual("host2", self.task._find_destination())
+        with mock.patch.object(self.task,
+                               '_remove_host_allocations') as remove_allocs:
+            self.assertEqual("host2", self.task._find_destination())
+        # Should have removed allocations for the first host.
+        remove_allocs.assert_called_once_with('host1', 'node1')

    def test_find_destination_retry_with_failed_migration_pre_checks(self):
        self.flags(migrate_max_retries=1)
@ -438,7 +446,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        scheduler_utils.setup_instance_group(self.context, self.fake_spec)
        self.task.scheduler_client.select_destinations(self.context,
                self.fake_spec, [self.instance.uuid]).AndReturn(
-                        [{'host': 'host1'}])
+                        [{'host': 'host1', 'nodename': 'node1'}])
        self.task._check_compatible_with_source_hypervisor("host1")
        self.task._call_livem_checks_on_host("host1")\
                .AndRaise(exception.MigrationPreCheckError("reason"))
@ -450,7 +458,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        self.task._call_livem_checks_on_host("host2")

        self.mox.ReplayAll()
-        self.assertEqual("host2", self.task._find_destination())
+        with mock.patch.object(self.task,
+                               '_remove_host_allocations') as remove_allocs:
+            self.assertEqual("host2", self.task._find_destination())
+        # Should have removed allocations for the first host.
+        remove_allocs.assert_called_once_with('host1', 'node1')

    def test_find_destination_retry_exceeds_max(self):
        self.flags(migrate_max_retries=0)
@ -466,16 +478,23 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
        scheduler_utils.setup_instance_group(self.context, self.fake_spec)
        self.task.scheduler_client.select_destinations(self.context,
                self.fake_spec, [self.instance.uuid]).AndReturn(
-                        [{'host': 'host1'}])
+                        [{'host': 'host1', 'nodename': 'node1'}])
        self.task._check_compatible_with_source_hypervisor("host1")\
                .AndRaise(exception.DestinationHypervisorTooOld)

        self.mox.ReplayAll()
-        with mock.patch.object(self.task.migration, 'save') as save_mock:
+        with test.nested(
+            mock.patch.object(self.task.migration, 'save'),
+            mock.patch.object(self.task, '_remove_host_allocations')
+        ) as (
+            save_mock, remove_allocs
+        ):
            self.assertRaises(exception.MaxRetriesExceeded,
                              self.task._find_destination)
            self.assertEqual('failed', self.task.migration.status)
            save_mock.assert_called_once_with()
+            # Should have removed allocations for the first host.
+            remove_allocs.assert_called_once_with('host1', 'node1')

    def test_find_destination_when_runs_out_of_hosts(self):
        self.mox.StubOutWithMock(utils, 'get_image_from_system_metadata')
@ -636,3 +655,15 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
                          self.task._get_destination_cell_mapping)
        mock_get.assert_called_once_with(
            self.task.context, self.task.destination)
+
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename',
+                       side_effect=exception.ComputeHostNotFound(host='host'))
+    def test_remove_host_allocations_compute_host_not_found(self, get_cn):
+        """Tests that failing to find a ComputeNode will not blow up
+        the _remove_host_allocations method.
+        """
+        with mock.patch.object(
+                self.task.scheduler_client.reportclient,
+                'remove_provider_from_instance_allocation') as remove_provider:
+            self.task._remove_host_allocations('host', 'node')
+        remove_provider.assert_not_called()