Merge "Cleanup allocations on invalid dest node during live migration" into stable/pike

This commit is contained in:
Jenkins 2017-09-15 22:19:12 +00:00 committed by Gerrit Code Review
commit 46554875d2
4 changed files with 89 additions and 31 deletions

View File

@@ -267,8 +267,9 @@ class LiveMigrationTask(base.TaskBase):
self._check_not_over_max_retries(attempted_hosts)
request_spec.ignore_hosts = attempted_hosts
try:
host = self.scheduler_client.select_destinations(self.context,
request_spec, [self.instance.uuid])[0]['host']
hoststate = self.scheduler_client.select_destinations(
self.context, request_spec, [self.instance.uuid])[0]
host = hoststate['host']
except messaging.RemoteError as ex:
# TODO(ShaoHe Feng) There maybe multi-scheduler, and the
# scheduling algorithm is R-R, we can let other scheduler try.
@@ -284,9 +285,47 @@ class LiveMigrationTask(base.TaskBase):
LOG.debug("Skipping host: %(host)s because: %(e)s",
{"host": host, "e": e})
attempted_hosts.append(host)
# The scheduler would have created allocations against the
# selected destination host in Placement, so we need to remove
# those before moving on.
self._remove_host_allocations(host, hoststate['nodename'])
host = None
return host
def _remove_host_allocations(self, host, node):
    """Remove this instance's allocations against the given host/node.

    Used to undo the allocations the scheduler made against a selected
    destination when that destination turns out to be unusable.

    :param host: The name of the host.
    :param node: The name of the node.
    """
    # The placement call needs the compute node UUID, so resolve the
    # host/node pair to a ComputeNode object first.
    # TODO(mriedem): If the result of select_destinations eventually
    # returns the compute node uuid, we wouldn't need to look it
    # up via host/node and we can save some time.
    try:
        cn = objects.ComputeNode.get_by_host_and_nodename(
            self.context, host, node)
    except exception.ComputeHostNotFound:
        # This shouldn't happen, but we're being careful: a missing
        # compute node just means there is nothing to clean up.
        LOG.info('Unable to remove instance allocations from host %s '
                 'and node %s since it was not found.', host, node,
                 instance=self.instance)
        return

    # Work out, from the flavor, which resource class amounts the
    # scheduler would have allocated on the node for this instance.
    flavor_resources = scheduler_utils.resources_from_flavor(
        self.instance, self.instance.flavor)

    # Strip only this node's provider from the instance's allocations.
    # Allocations against other nodes or shared resource providers are
    # left intact; this just undoes what the scheduler claimed for the
    # given (destination) node.
    reportclient = self.scheduler_client.reportclient
    reportclient.remove_provider_from_instance_allocation(
        self.instance.uuid, cn.uuid, self.instance.user_id,
        self.instance.project_id, flavor_resources)
def _check_not_over_max_retries(self, attempted_hosts):
if CONF.migrate_max_retries == -1:
return

View File

@@ -1014,13 +1014,7 @@ class SchedulerReportClient(object):
then PUTs the resulting allocation back to the placement API for the
consumer.
We call this method on three occasions:
1. On the source host during a confirm_resize() operation.
2. On the destination host during a revert_resize() operation.
3. On the destination host when prep_resize fails.
This is important to reconcile the "doubled-up" allocation that the
This is used to reconcile the "doubled-up" allocation that the
scheduler constructs when claiming resources against the destination
host during a move operation.

View File

@@ -2185,23 +2185,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
# FIXME(mriedem): This is bug 1712411 where the allocations, created
# by the scheduler, via conductor, are not cleaned up on the migration
# pre-check error. Uncomment the following code when this is fixed.
self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
# # Assert the allocations, created by the scheduler, are cleaned up
# # after the migration pre-check error happens.
# self.assertFlavorMatchesAllocation(
# {'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
# Assert the allocations, created by the scheduler, are cleaned up
# after the migration pre-check error happens.
self.assertFlavorMatchesAllocation(
{'vcpus': 0, 'ram': 0, 'disk': 0}, dest_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
# FIXME(mriedem): After bug 1712411 is fixed there should only be 1
# allocation for the instance, on the source node.
self.assertEqual(2, len(allocations))
# There should only be 1 allocation for the instance on the source node
self.assertEqual(1, len(allocations))
self.assertIn(source_rp_uuid, allocations)
self.assertIn(dest_rp_uuid, allocations)
dest_allocation = allocations[dest_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
self.assertFlavorMatchesAllocation(
self.flavor1, allocations[source_rp_uuid]['resources'])
self._delete_and_check_allocations(
server, source_rp_uuid, dest_rp_uuid)

View File

@@ -375,7 +375,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(error)
@@ -386,7 +386,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_old_hypervisor(self):
self._test_find_destination_retry_hypervisor_raises(
@@ -411,7 +415,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.Invalid)
@@ -423,7 +427,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_with_failed_migration_pre_checks(self):
self.flags(migrate_max_retries=1)
@@ -440,7 +448,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")
self.task._call_livem_checks_on_host("host1")\
.AndRaise(exception.MigrationPreCheckError("reason"))
@@ -452,7 +460,11 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._call_livem_checks_on_host("host2")
self.mox.ReplayAll()
self.assertEqual("host2", self.task._find_destination())
with mock.patch.object(self.task,
'_remove_host_allocations') as remove_allocs:
self.assertEqual("host2", self.task._find_destination())
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_retry_exceeds_max(self):
self.flags(migrate_max_retries=0)
@@ -468,16 +480,23 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
scheduler_utils.setup_instance_group(self.context, self.fake_spec)
self.task.scheduler_client.select_destinations(self.context,
self.fake_spec, [self.instance.uuid]).AndReturn(
[{'host': 'host1'}])
[{'host': 'host1', 'nodename': 'node1'}])
self.task._check_compatible_with_source_hypervisor("host1")\
.AndRaise(exception.DestinationHypervisorTooOld)
self.mox.ReplayAll()
with mock.patch.object(self.task.migration, 'save') as save_mock:
with test.nested(
mock.patch.object(self.task.migration, 'save'),
mock.patch.object(self.task, '_remove_host_allocations')
) as (
save_mock, remove_allocs
):
self.assertRaises(exception.MaxRetriesExceeded,
self.task._find_destination)
self.assertEqual('failed', self.task.migration.status)
save_mock.assert_called_once_with()
# Should have removed allocations for the first host.
remove_allocs.assert_called_once_with('host1', 'node1')
def test_find_destination_when_runs_out_of_hosts(self):
self.mox.StubOutWithMock(utils, 'get_image_from_system_metadata')
@@ -542,3 +561,15 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
self.task._get_destination_cell_mapping)
mock_get.assert_called_once_with(
self.task.context, self.task.destination)
@mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename',
                   side_effect=exception.ComputeHostNotFound(host='host'))
def test_remove_host_allocations_compute_host_not_found(self, mock_get_cn):
    """A missing ComputeNode must not blow up _remove_host_allocations.

    When the host/node lookup raises ComputeHostNotFound, the method
    should return quietly without calling placement.
    """
    reportclient = self.task.scheduler_client.reportclient
    with mock.patch.object(
            reportclient,
            'remove_provider_from_instance_allocation') as mock_remove:
        self.task._remove_host_allocations('host', 'node')
    # No compute node means nothing to remove from placement.
    mock_remove.assert_not_called()