diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 45947298ef9c..3386c1934f20 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1723,12 +1723,34 @@ class ComputeManager(manager.Manager): try: result = self._do_build_and_run_instance(*args, **kwargs) except Exception: + # NOTE(mriedem): This should really only happen if + # _decode_files in _do_build_and_run_instance fails, and + # that's before a guest is spawned so it's OK to remove + # allocations for the instance for this node from Placement + # below as there is no guest consuming resources anyway. + # The _decode_files case could be handled more specifically + # but that's left for another day. result = build_results.FAILED raise finally: fails = (build_results.FAILED, build_results.RESCHEDULED) if result in fails: + # Remove the allocation records from Placement for + # the instance if the build failed or is being + # rescheduled to another node. The instance.host is + # likely set to None in _do_build_and_run_instance + # which means if the user deletes the instance, it will + # be deleted in the API, not the compute service. + # Setting the instance.host to None in + # _do_build_and_run_instance means that the + # ResourceTracker will no longer consider this instance + # to be claiming resources against it, so we want to + # reflect that same thing in Placement. + rt = self._get_resource_tracker() + rt.reportclient.delete_allocation_for_instance( + instance.uuid) + self._build_failed() else: self._failed_builds = 0 diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 73769b653401..9d3e96f33da2 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -23,6 +23,7 @@ from oslo_serialization import base64 from oslo_utils import timeutils from nova.compute import api as compute_api +from nova.compute import manager as compute_manager from nova.compute import rpcapi from nova import context from nova import exception @@ -1875,3 +1876,77 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin): self._delete_and_check_allocations( server, source_rp_uuid, dest_rp_uuid) + + def test_rescheduling_when_booting_instance(self): + self.failed_hostname = None + old_build_resources = (compute_manager.ComputeManager. + _build_resources) + + def fake_build_resources(sl, *args, **kwargs): + # We failed on the first scheduling + if not self.failed_hostname: + self.failed_hostname = sl.host + raise Exception() + + return old_build_resources(sl, *args, **kwargs) + + self.stub_out('nova.compute.manager.ComputeManager._build_resources', + fake_build_resources) + + server_req = self._build_minimal_create_server_request( + self.api, 'some-server', flavor_id=self.flavor1['id'], + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks=[]) + + created_server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change( + self.admin_api, created_server, 'ACTIVE') + dest_hostname = server['OS-EXT-SRV-ATTR:host'] + + LOG.info('failed on %s', self.failed_hostname) + LOG.info('booting on %s', dest_hostname) + + failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname) + dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname) + + failed_usages = self._get_provider_usages(failed_rp_uuid) + # Expects no allocation records on the failed host. + self.assertFlavorMatchesAllocation( + {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) + + # Ensure the allocation records on the destination host. + dest_usages = self._get_provider_usages(dest_rp_uuid) + self.assertFlavorMatchesAllocation(self.flavor1, dest_usages) + + def test_abort_when_booting_instance(self): + self.failed_hostname = None + old_build_resources = (compute_manager.ComputeManager. + _build_resources) + + def fake_build_resources(sl, *args, **kwargs): + # We failed on the first scheduling + if not self.failed_hostname: + self.failed_hostname = sl.host + raise exception.BuildAbortException(instance_uuid='fake_uuid', + reason='just abort') + + return old_build_resources(sl, *args, **kwargs) + + self.stub_out('nova.compute.manager.ComputeManager._build_resources', + fake_build_resources) + + server_req = self._build_minimal_create_server_request( + self.api, 'some-server', flavor_id=self.flavor1['id'], + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks=[]) + + created_server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.admin_api, created_server, 'ERROR') + + LOG.info('failed on %s', self.failed_hostname) + + failed_rp_uuid = self._get_provider_uuid_by_host(self.failed_hostname) + failed_usages = self._get_provider_usages(failed_rp_uuid) + # Expects no allocation records on the failed host. + self.assertFlavorMatchesAllocation( + {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index ac33e1be19b3..3487c42b2434 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -4840,13 +4840,13 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): @mock.patch.object(manager.ComputeManager, '_nil_out_instance_obj_host_and_node') @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances') - @mock.patch.object(manager.ComputeManager, '_get_resource_tracker') - def test_reschedule_on_resources_unavailable(self, mock_get_resource, + @mock.patch.object(resource_tracker.ResourceTracker, 'instance_claim') + def test_reschedule_on_resources_unavailable(self, mock_claim, mock_build, mock_nil, mock_save, mock_start, mock_finish, mock_notify): reason = 'resource unavailable' exc = exception.ComputeResourcesUnavailable(reason=reason) - mock_get_resource.side_effect = exc + mock_claim.side_effect = exc self._do_build_instance_update(mock_save, reschedule_update=True) with mock.patch.object( @@ -4864,7 +4864,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self._instance_action_events(mock_start, mock_finish) self._assert_build_instance_update(mock_save, reschedule_update=True) - mock_get_resource.assert_called_once_with() + mock_claim.assert_called_once_with(self.context, self.instance, + self.node, self.limits) mock_notify.assert_has_calls([ mock.call(self.context, self.instance, 'create.start', extra_usage_info= {'image_name': self.image.get('name')}),