Remove allocations when a booting instance is rescheduled or aborted

In the Pike release there is no auto-heal in the resource tracker,
so when an instance fails to boot, its allocation records are not
cleaned up.

This patch cleans up the allocation records when the instance build
is rescheduled or aborted.

Change-Id: I95d2f4c9392883052188fb7901451530068502db
Closes-Bug: #1712718
(cherry picked from commit c19ebcbd58)
Authored by He Jie Xu on 2017-08-24 11:38:31 +08:00; committed by Matt Riedemann
parent 495621bfc5
commit 27fa0f03d2
3 changed files with 102 additions and 4 deletions
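
For reviewers who want to see the leak directly: before this change, an
instance whose build failed or was rescheduled away from a host kept its
consumer allocations in Placement. One way to confirm that by hand is to
query the allocations for the instance UUID. The sketch below is only an
illustration; the endpoint, token and UUID are placeholders, not values
from this patch.

    # Hypothetical manual check for leaked allocations; the endpoint,
    # token and UUID below are placeholders.
    import requests

    PLACEMENT = 'http://placement.example.com/placement'
    TOKEN = 'admin-token'
    INSTANCE_UUID = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'

    resp = requests.get(
        '%s/allocations/%s' % (PLACEMENT, INSTANCE_UUID),
        headers={'X-Auth-Token': TOKEN})
    resp.raise_for_status()

    # Placement keys the allocations by resource provider UUID. After a
    # failed or aborted boot this dict should be empty; before this patch
    # it could still hold VCPU/MEMORY_MB/DISK_GB entries for the failed
    # host.
    allocations = resp.json()['allocations']
    print('leaked' if allocations else 'clean', allocations)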

nova/compute/manager.py

@@ -1724,12 +1724,34 @@ class ComputeManager(manager.Manager):
            try:
                result = self._do_build_and_run_instance(*args, **kwargs)
            except Exception:
                # NOTE(mriedem): This should really only happen if
                # _decode_files in _do_build_and_run_instance fails, and
                # that's before a guest is spawned so it's OK to remove
                # allocations for the instance for this node from Placement
                # below as there is no guest consuming resources anyway.
                # The _decode_files case could be handled more specifically
                # but that's left for another day.
                result = build_results.FAILED
                raise
            finally:
                fails = (build_results.FAILED,
                         build_results.RESCHEDULED)
                if result in fails:
                    # Remove the allocation records from Placement for
                    # the instance if the build failed or is being
                    # rescheduled to another node. The instance.host is
                    # likely set to None in _do_build_and_run_instance
                    # which means if the user deletes the instance, it will
                    # be deleted in the API, not the compute service.
                    # Setting the instance.host to None in
                    # _do_build_and_run_instance means that the
                    # ResourceTracker will no longer consider this instance
                    # to be claiming resources against it, so we want to
                    # reflect that same thing in Placement.
                    rt = self._get_resource_tracker()
                    rt.reportclient.delete_allocation_for_instance(
                        instance.uuid)
                    self._build_failed()
                else:
                    self._failed_builds = 0
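
The reportclient call added in the finally block above is the actual
cleanup. In terms of the Placement HTTP API it boils down to deleting every
allocation held by the instance as a consumer. The sketch below is a rough
equivalent, not the report client implementation: the endpoint and token
are placeholders, and the real SchedulerReportClient handles auth, retries
and logging that are omitted here.

    # Rough, simplified equivalent of
    # rt.reportclient.delete_allocation_for_instance(instance.uuid);
    # not the actual report client code.
    import requests

    PLACEMENT = 'http://placement.example.com/placement'  # placeholder
    TOKEN = 'admin-token'                                  # placeholder

    def delete_allocations_for_consumer(consumer_uuid):
        # DELETE /allocations/{consumer_uuid} drops all allocations the
        # consumer holds, across every resource provider.
        resp = requests.delete(
            '%s/allocations/%s' % (PLACEMENT, consumer_uuid),
            headers={'X-Auth-Token': TOKEN})
        # 204: deleted; 404: nothing was allocated. Both are acceptable
        # outcomes for a cleanup path.
        return resp.status_code in (204, 404)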

nova/tests/functional/test_servers.py

@@ -23,6 +23,7 @@ from oslo_serialization import base64
from oslo_utils import timeutils
from nova.compute import api as compute_api
from nova.compute import manager as compute_manager
from nova.compute import rpcapi
from nova import context
from nova import exception
@@ -1877,3 +1878,77 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
        self._delete_and_check_allocations(
            server, source_rp_uuid, dest_rp_uuid)

    def test_rescheduling_when_booting_instance(self):
        self.failed_hostname = None
        old_build_resources = (compute_manager.ComputeManager.
                               _build_resources)

        def fake_build_resources(sl, *args, **kwargs):
            # Fail the build on the first host the instance is scheduled to.
            if not self.failed_hostname:
                self.failed_hostname = sl.host
                raise Exception()
            return old_build_resources(sl, *args, **kwargs)

        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
                      fake_build_resources)

        server_req = self._build_minimal_create_server_request(
            self.api, 'some-server', flavor_id=self.flavor1['id'],
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks=[])

        created_server = self.api.post_server({'server': server_req})
        server = self._wait_for_state_change(
            self.admin_api, created_server, 'ACTIVE')
        dest_hostname = server['OS-EXT-SRV-ATTR:host']

        LOG.info('failed on %s', self.failed_hostname)
        LOG.info('booting on %s', dest_hostname)

        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        failed_usages = self._get_provider_usages(failed_rp_uuid)
        # Expect no allocation records on the failed host.
        self.assertFlavorMatchesAllocation(
            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)

        # Ensure the allocation records exist on the destination host.
        dest_usages = self._get_provider_usages(dest_rp_uuid)
        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)

    def test_abort_when_booting_instance(self):
        self.failed_hostname = None
        old_build_resources = (compute_manager.ComputeManager.
                               _build_resources)

        def fake_build_resources(sl, *args, **kwargs):
            # Abort the build on the first host the instance is scheduled to.
            if not self.failed_hostname:
                self.failed_hostname = sl.host
                raise exception.BuildAbortException(instance_uuid='fake_uuid',
                                                    reason='just abort')
            return old_build_resources(sl, *args, **kwargs)

        self.stub_out('nova.compute.manager.ComputeManager._build_resources',
                      fake_build_resources)

        server_req = self._build_minimal_create_server_request(
            self.api, 'some-server', flavor_id=self.flavor1['id'],
            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
            networks=[])

        created_server = self.api.post_server({'server': server_req})
        self._wait_for_state_change(self.admin_api, created_server, 'ERROR')

        LOG.info('failed on %s', self.failed_hostname)

        failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname)
        failed_usages = self._get_provider_usages(failed_rp_uuid)
        # Expect no allocation records on the failed host.
        self.assertFlavorMatchesAllocation(
            {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
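
The assertions above lean on two helpers from the functional test base
class: _get_provider_usages wraps GET /resource_providers/{uuid}/usages,
and assertFlavorMatchesAllocation compares a flavor against that usage
dict. A standalone sketch of the comparison is below; the function name is
made up for illustration, while the usages shape matches what Placement
returns.

    # Illustrative version of the flavor-vs-usage check; not the real
    # helper implementation.
    def flavor_matches_usage(flavor, usages):
        # 'usages' is the dict under the "usages" key of
        # GET /resource_providers/{uuid}/usages, e.g.
        # {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}
        return (usages.get('VCPU', 0) == flavor['vcpus'] and
                usages.get('MEMORY_MB', 0) == flavor['ram'] and
                usages.get('DISK_GB', 0) == flavor['disk'])

    # For the failed host the tests expect everything to be zero:
    assert flavor_matches_usage({'vcpus': 0, 'ram': 0, 'disk': 0},
                                {'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0})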

nova/tests/unit/compute/test_compute_mgr.py

@@ -4834,13 +4834,13 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
    @mock.patch.object(manager.ComputeManager,
                       '_nil_out_instance_obj_host_and_node')
    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
-   @mock.patch.object(manager.ComputeManager, '_get_resource_tracker')
-   def test_reschedule_on_resources_unavailable(self, mock_get_resource,
+   @mock.patch.object(resource_tracker.ResourceTracker, 'instance_claim')
+   def test_reschedule_on_resources_unavailable(self, mock_claim,
            mock_build, mock_nil, mock_save, mock_start,
            mock_finish, mock_notify):
        reason = 'resource unavailable'
        exc = exception.ComputeResourcesUnavailable(reason=reason)
-       mock_get_resource.side_effect = exc
+       mock_claim.side_effect = exc
        self._do_build_instance_update(mock_save, reschedule_update=True)
        with mock.patch.object(
@@ -4858,7 +4858,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
        self._instance_action_events(mock_start, mock_finish)
        self._assert_build_instance_update(mock_save, reschedule_update=True)
-       mock_get_resource.assert_called_once_with()
+       mock_claim.assert_called_once_with(self.context, self.instance,
+                                          self.node, self.limits)
        mock_notify.assert_has_calls([
            mock.call(self.context, self.instance, 'create.start',
                extra_usage_info={'image_name': self.image.get('name')}),