Delete allocations from API if nova-compute is down

When performing a "local delete" of an instance, we
need to delete the allocations that the instance has
against any resource providers in Placement.

It should be noted that without this change, restarting
the nova-compute service will delete the allocations
for its compute node (assuming the compute node UUID
is the same as before the instance was deleted). That
is shown in the existing functional test modified here.

The more important reason for this change is that in
order to fix bug 1756179, we need to make sure the
resource provider allocations for a given compute node
are gone by the time the compute service is deleted.

This adds a new functional test and a release note
for the new behavior and the need to configure
nova-api for talking to placement. The allocation
cleanup is a no-op if placement is not configured,
thanks to the @safe_connect decorator used in
SchedulerReportClient.
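
For context, this is roughly what a safe_connect-style
decorator does (an illustrative sketch only; the real
decorator lives in nova/scheduler/client/report.py and
handles more failure modes than shown here):

    import functools
    import logging

    from keystoneauth1 import exceptions as ks_exc

    LOG = logging.getLogger(__name__)

    def safe_connect(f):
        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            try:
                return f(self, *args, **kwargs)
            except ks_exc.EndpointNotFound:
                # Placement is not configured or not reachable; warn and
                # return None so the caller treats this as a no-op.
                LOG.warning('Placement API endpoint not found.')
                return None
        return wrapper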

Closes-Bug: #1679750
Related-Bug: #1756179

Change-Id: If507e23f0b7e5fa417041c3870d77786498f741d
Matt Riedemann 2018-04-11 21:24:43 -04:00
commit ea9d0af313 (parent 42f62f1ed2)
4 changed files with 103 additions and 21 deletions

@@ -250,6 +250,13 @@ class API(base.Base):
         self.image_api = image_api or image.API()
         self.network_api = network_api or network.API()
         self.volume_api = volume_api or cinder.API()
+        # NOTE(mriedem): This looks a bit weird but we get the reportclient
+        # via SchedulerClient since it lazy-loads SchedulerReportClient on
+        # the first usage which helps to avoid a bunch of lockutils spam in
+        # the nova-api logs every time the service is restarted (remember
+        # that pretty much all of the REST API controllers construct this
+        # API class).
+        self.placementclient = scheduler_client.SchedulerClient().reportclient
         self.security_group_api = (security_group_api or
             openstack_driver.get_openstack_security_group_driver())
         self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI()
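
(The lazy-load mentioned in the note above can be pictured with a small
proxy class along these lines; this is a generic sketch, not Nova's
exact LazyLoader implementation.)

    import importlib

    class LazyLoader(object):
        """Defer constructing the wrapped class until first attribute use."""

        def __init__(self, klass_path, *args, **kwargs):
            self.klass_path = klass_path  # e.g. 'x.y.SchedulerReportClient'
            self.args = args
            self.kwargs = kwargs
            self.instance = None

        def __getattr__(self, name):
            if self.instance is None:
                module_name, _, class_name = self.klass_path.rpartition('.')
                module = importlib.import_module(module_name)
                self.instance = getattr(module, class_name)(
                    *self.args, **self.kwargs)
            return getattr(self.instance, name)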
@@ -2059,6 +2066,10 @@ class API(base.Base):
             # cleanup volumes
             self._local_cleanup_bdm_volumes(bdms, instance, context)
+            # Cleanup allocations in Placement since we can't do it from the
+            # compute service.
+            self.placementclient.delete_allocation_for_instance(
+                context, instance.uuid)
             cb(context, instance, bdms, local=True)
         instance.destroy()
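
(At the REST level, the cleanup added above amounts to a single Placement
call, roughly as follows; a sketch that assumes a keystoneauth1 session
named `session` scoped to the placement service.)

    # DELETE /allocations/{consumer_uuid} drops every allocation the
    # consumer (here, the instance) holds against any resource provider.
    resp = session.delete('/allocations/%s' % instance.uuid,
                          endpoint_filter={'service_type': 'placement'})
    # 204 No Content on success.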

@@ -10,6 +10,7 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
+from nova.scheduler.client import report as reportclient
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import integrated_helpers
@@ -55,12 +56,87 @@ class TestLocalDeleteAllocations(test.TestCase,
         resp = self.placement_api.get(fmt % {'uuid': rp_uuid})
         return resp.body['usages']
 
-    def test_local_delete_removes_allocations(self):
+    # NOTE(mriedem): It would be preferable to use the PlacementFixture as
+    # a context manager but that causes some issues when trying to delete the
+    # server in test_local_delete_removes_allocations_after_compute_restart.
+    def _stub_compute_api_to_not_configure_placement(self):
+        """Prior to the compute API deleting allocations in the "local delete"
+        case, nova.conf for nova-api might not be configured for talking to
+        the placement service, so we can mock that behavior by stubbing out
+        the placement client in the compute API to no-op as if safe_connect
+        failed and returned None to the caller.
+        """
+        orig_delete_alloc = (
+            reportclient.SchedulerReportClient.delete_allocation_for_instance)
+        self.call_count = 0
+
+        def fake_delete_allocation_for_instance(*args, **kwargs):
+            # The first call will be from the API, so ignore that one and
+            # return None like the @safe_connect decorator would if nova-api
+            # wasn't configured to talk to placement.
+            if self.call_count:
+                orig_delete_alloc(*args, **kwargs)
+            else:
+                self.call_count += 1
+
+        self.stub_out('nova.scheduler.client.report.SchedulerReportClient.'
+                      'delete_allocation_for_instance',
+                      fake_delete_allocation_for_instance)
+
+    def test_local_delete_removes_allocations_after_compute_restart(self):
         """Tests that allocations are removed after a local delete.
 
         This tests the scenario where a server is local deleted (because the
         compute host is down) and we want to make sure that its allocations
-        have been cleaned up.
+        have been cleaned up once the nova-compute service restarts.
+        """
+        self._stub_compute_api_to_not_configure_placement()
+        # Get allocations, make sure they are 0.
+        resp = self.placement_api.get('/resource_providers')
+        rp_uuid = resp.body['resource_providers'][0]['uuid']
+        usages_before = self._get_usages(rp_uuid)
+        for usage in usages_before.values():
+            self.assertEqual(0, usage)
+
+        # Create a server.
+        server = self._build_minimal_create_server_request(self.api,
+            'local-delete-test', self.image_id, self.flavor_id, 'none')
+        server = self.admin_api.post_server({'server': server})
+        server = self._wait_for_state_change(self.api, server, 'ACTIVE')
+
+        # Assert usages are non zero now.
+        usages_during = self._get_usages(rp_uuid)
+        for usage in usages_during.values():
+            self.assertNotEqual(0, usage)
+
+        # Force-down compute to trigger local delete.
+        self.compute.stop()
+        compute_service_id = self.admin_api.get_services(
+            host=self.compute.host, binary='nova-compute')[0]['id']
+        self.admin_api.put_service(compute_service_id, {'forced_down': True})
+
+        # Delete the server (will be a local delete because compute is down).
+        self.api.delete_server(server['id'])
+
+        # Assert usages are still non-zero.
+        usages_during = self._get_usages(rp_uuid)
+        for usage in usages_during.values():
+            self.assertNotEqual(0, usage)
+
+        # Start the compute service again. Before it comes up, it will call the
+        # update_available_resource code in the ResourceTracker which is what
+        # "heals" the allocations for the deleted instance.
+        self.compute.start()
+
+        # Get the allocations again to check against the original.
+        usages_after = self._get_usages(rp_uuid)
+
+        # They should match.
+        self.assertEqual(usages_before, usages_after)
+
+    def test_local_delete_removes_allocations_from_api(self):
+        """Tests that the compute API deletes allocations when the compute
+        service on which the instance was running is down.
         """
         # Get allocations, make sure they are 0.
         resp = self.placement_api.get('/resource_providers')
@@ -89,12 +165,7 @@ class TestLocalDeleteAllocations(test.TestCase,
         # Delete the server (will be a local delete because compute is down).
         self.api.delete_server(server['id'])
 
-        # Start the compute service again. Before it comes up, it will call the
-        # update_available_resource code in the ResourceTracker which is what
-        # "heals" the allocations for the deleted instance.
-        self.compute.start()
-
-        # Get the allocations again to check against the original.
+        # Get the allocations again to make sure they were deleted.
         usages_after = self._get_usages(rp_uuid)
 
         # They should match.
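
(For reference, the _get_usages helper used throughout these tests wraps
the Placement usages endpoint; an illustrative exchange, with made-up
numbers, looks like this.)

    # GET /resource_providers/{uuid}/usages
    resp = self.placement_api.get('/resource_providers/%s/usages' % rp_uuid)
    # resp.body looks roughly like:
    # {'resource_provider_generation': 5,
    #  'usages': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}}
    usages = resp.body['usages']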

@@ -86,12 +86,7 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
             self.assertEqual(409, ex.response.status_code)
 
         # Now delete the instance and wait for it to be gone.
-        # Note that we can't use self._delete_and_check_allocations here
-        # because of bug 1679750 where allocations are not deleted when
-        # an instance is deleted and the compute service it's running on
-        # is down.
-        self.api.delete_server(server['id'])
-        self._wait_until_deleted(server)
+        self._delete_and_check_allocations(server)
 
         # Now we can delete the service.
         self.admin_api.api_delete('/os-services/%s' % service['id'])
@@ -120,10 +115,3 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
         # in a 404 response.
         resp = self.placement_api.get('/resource_providers/%s' % rp_uuid)
         self.assertEqual(200, resp.status)
-
-        # Assert the allocations still exist for the server against the
-        # compute node resource provider. Once the bug is fixed, there should
-        # be no allocations for the server.
-        allocations = self._get_allocations_by_server_uuid(server['id'])
-        self.assertEqual(1, len(allocations))
-        allocation = allocations[rp_uuid]['resources']
-        self.assertFlavorMatchesAllocation(flavor, allocation)

@@ -0,0 +1,12 @@
+---
+upgrade:
+  - |
+    The ``nova-api`` service now requires the ``[placement]`` section to be
+    configured in nova.conf if you are using a separate config file just for
+    that service. This is because the ``nova-api`` service now needs to talk
+    to the placement service in order to delete resource provider allocations
+    when deleting an instance whose ``nova-compute`` service is down. The
+    allocation cleanup is a no-op if ``[placement]`` is not configured for
+    ``nova-api``, but it will result in new warnings in the logs until it
+    is configured. See bug
+    https://bugs.launchpad.net/nova/+bug/1679750 for more details.
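
(An illustrative ``[placement]`` section for the nova-api config file;
the values are placeholders and the option names assume the standard
keystoneauth password auth plugin.)

    [placement]
    auth_type = password
    auth_url = http://controller/identity/v3
    project_name = service
    project_domain_name = Default
    username = placement
    user_domain_name = Default
    password = PLACEMENT_PASS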