Delete allocations from API if nova-compute is down

When performing a "local delete" of an instance, we
need to delete the allocations that the instance has
against any resource providers in Placement.

It should be noted that without this change, restarting
the nova-compute service will delete the allocations
for its compute node (assuming the compute node UUID
is the same as before the instance was deleted). That
is shown in the existing functional test modified here.

The more important reason for this change is that in
order to fix bug 1756179, we need to make sure the
resource provider allocations for a given compute node
are gone by the time the compute service is deleted.

This adds a new functional test and a release note
for the new behavior and the need to configure
nova-api for talking to placement. The allocation
cleanup is a no-op if placement is not configured,
thanks to the @safe_connect decorator used in
SchedulerReportClient.
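
For context, this is roughly what a safe_connect-style
decorator does (an illustrative sketch only; the real
decorator lives in nova/scheduler/client/report.py and
handles more failure modes than shown here):

    import functools
    import logging

    from keystoneauth1 import exceptions as ks_exc

    LOG = logging.getLogger(__name__)

    def safe_connect(f):
        @functools.wraps(f)
        def wrapper(self, *args, **kwargs):
            try:
                return f(self, *args, **kwargs)
            except ks_exc.EndpointNotFound:
                # Placement is not configured or not reachable; warn and
                # return None so the caller treats this as a no-op.
                LOG.warning('Placement API endpoint not found.')
                return None
        return wrapper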

Closes-Bug: #1679750
Related-Bug: #1756179

Change-Id: If507e23f0b7e5fa417041c3870d77786498f741d
Matt Riedemann 2018-04-11 21:24:43 -04:00
commit ea9d0af313 (parent 42f62f1ed2)
4 changed files with 103 additions and 21 deletions

@@ -250,6 +250,13 @@ class API(base.Base):
         self.image_api = image_api or image.API()
         self.network_api = network_api or network.API()
         self.volume_api = volume_api or cinder.API()
+        # NOTE(mriedem): This looks a bit weird but we get the reportclient
+        # via SchedulerClient since it lazy-loads SchedulerReportClient on
+        # the first usage which helps to avoid a bunch of lockutils spam in
+        # the nova-api logs every time the service is restarted (remember
+        # that pretty much all of the REST API controllers construct this
+        # API class).
+        self.placementclient = scheduler_client.SchedulerClient().reportclient
         self.security_group_api = (security_group_api or
             openstack_driver.get_openstack_security_group_driver())
         self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI()
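
(The lazy-load mentioned in the note above can be pictured with a small
proxy class along these lines; this is a generic sketch, not Nova's
exact LazyLoader implementation.)

    import importlib

    class LazyLoader(object):
        """Defer constructing the wrapped class until first attribute use."""

        def __init__(self, klass_path, *args, **kwargs):
            self.klass_path = klass_path  # e.g. 'x.y.SchedulerReportClient'
            self.args = args
            self.kwargs = kwargs
            self.instance = None

        def __getattr__(self, name):
            if self.instance is None:
                module_name, _, class_name = self.klass_path.rpartition('.')
                module = importlib.import_module(module_name)
                self.instance = getattr(module, class_name)(
                    *self.args, **self.kwargs)
            return getattr(self.instance, name)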
@@ -2059,6 +2066,10 @@ class API(base.Base):
             # cleanup volumes
             self._local_cleanup_bdm_volumes(bdms, instance, context)
+            # Cleanup allocations in Placement since we can't do it from the
+            # compute service.
+            self.placementclient.delete_allocation_for_instance(
+                context, instance.uuid)
             cb(context, instance, bdms, local=True)
         instance.destroy()
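
(At the REST level, the cleanup added above amounts to a single Placement
call, roughly as follows; a sketch that assumes a keystoneauth1 session
named `session` scoped to the placement service.)

    # DELETE /allocations/{consumer_uuid} drops every allocation the
    # consumer (here, the instance) holds against any resource provider.
    resp = session.delete('/allocations/%s' % instance.uuid,
                          endpoint_filter={'service_type': 'placement'})
    # 204 No Content on success.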

@@ -10,6 +10,7 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
+from nova.scheduler.client import report as reportclient
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import integrated_helpers
@@ -55,12 +56,87 @@ class TestLocalDeleteAllocations(test.TestCase,
         resp = self.placement_api.get(fmt % {'uuid': rp_uuid})
         return resp.body['usages']
 
-    def test_local_delete_removes_allocations(self):
+    # NOTE(mriedem): It would be preferable to use the PlacementFixture as
+    # a context manager but that causes some issues when trying to delete the
+    # server in test_local_delete_removes_allocations_after_compute_restart.
+    def _stub_compute_api_to_not_configure_placement(self):
+        """Prior to the compute API deleting allocations in the "local delete"
+        case, nova.conf for nova-api might not be configured for talking to
+        the placement service, so we can mock that behavior by stubbing out
+        the placement client in the compute API to no-op as if safe_connect
+        failed and returned None to the caller.
+        """
+        orig_delete_alloc = (
+            reportclient.SchedulerReportClient.delete_allocation_for_instance)
+        self.call_count = 0
+
+        def fake_delete_allocation_for_instance(*args, **kwargs):
+            # The first call will be from the API, so ignore that one and
+            # return None like the @safe_connect decorator would if nova-api
+            # wasn't configured to talk to placement.
+            if self.call_count:
+                orig_delete_alloc(*args, **kwargs)
+            else:
+                self.call_count += 1
+
+        self.stub_out('nova.scheduler.client.report.SchedulerReportClient.'
+                      'delete_allocation_for_instance',
+                      fake_delete_allocation_for_instance)
+
+    def test_local_delete_removes_allocations_after_compute_restart(self):
         """Tests that allocations are removed after a local delete.
 
         This tests the scenario where a server is local deleted (because the
         compute host is down) and we want to make sure that its allocations
-        have been cleaned up.
+        have been cleaned up once the nova-compute service restarts.
+        """
+        self._stub_compute_api_to_not_configure_placement()
+        # Get allocations, make sure they are 0.
+        resp = self.placement_api.get('/resource_providers')
+        rp_uuid = resp.body['resource_providers'][0]['uuid']
+        usages_before = self._get_usages(rp_uuid)
+        for usage in usages_before.values():
+            self.assertEqual(0, usage)
+
+        # Create a server.
+        server = self._build_minimal_create_server_request(self.api,
+            'local-delete-test', self.image_id, self.flavor_id, 'none')
+        server = self.admin_api.post_server({'server': server})
+        server = self._wait_for_state_change(self.api, server, 'ACTIVE')
+
+        # Assert usages are non zero now.
+        usages_during = self._get_usages(rp_uuid)
+        for usage in usages_during.values():
+            self.assertNotEqual(0, usage)
+
+        # Force-down compute to trigger local delete.
+        self.compute.stop()
+        compute_service_id = self.admin_api.get_services(
+            host=self.compute.host, binary='nova-compute')[0]['id']
+        self.admin_api.put_service(compute_service_id, {'forced_down': True})
+
+        # Delete the server (will be a local delete because compute is down).
+        self.api.delete_server(server['id'])
+
+        # Assert usages are still non-zero.
+        usages_during = self._get_usages(rp_uuid)
+        for usage in usages_during.values():
+            self.assertNotEqual(0, usage)
+
+        # Start the compute service again. Before it comes up, it will call the
+        # update_available_resource code in the ResourceTracker which is what
+        # "heals" the allocations for the deleted instance.
+        self.compute.start()
+
+        # Get the allocations again to check against the original.
+        usages_after = self._get_usages(rp_uuid)
+
+        # They should match.
+        self.assertEqual(usages_before, usages_after)
+
+    def test_local_delete_removes_allocations_from_api(self):
+        """Tests that the compute API deletes allocations when the compute
+        service on which the instance was running is down.
         """
         # Get allocations, make sure they are 0.
         resp = self.placement_api.get('/resource_providers')
@@ -89,12 +165,7 @@ class TestLocalDeleteAllocations(test.TestCase,
         # Delete the server (will be a local delete because compute is down).
         self.api.delete_server(server['id'])
 
-        # Start the compute service again. Before it comes up, it will call the
-        # update_available_resource code in the ResourceTracker which is what
-        # "heals" the allocations for the deleted instance.
-        self.compute.start()
-
-        # Get the allocations again to check against the original.
+        # Get the allocations again to make sure they were deleted.
         usages_after = self._get_usages(rp_uuid)
 
         # They should match.
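
(For reference, the _get_usages helper used throughout these tests wraps
the Placement usages endpoint; an illustrative exchange, with made-up
numbers, looks like this.)

    # GET /resource_providers/{uuid}/usages
    resp = self.placement_api.get('/resource_providers/%s/usages' % rp_uuid)
    # resp.body looks roughly like:
    # {'resource_provider_generation': 5,
    #  'usages': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}}
    usages = resp.body['usages']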

@@ -86,12 +86,7 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
             self.assertEqual(409, ex.response.status_code)
 
         # Now delete the instance and wait for it to be gone.
-        # Note that we can't use self._delete_and_check_allocations here
-        # because of bug 1679750 where allocations are not deleted when
-        # an instance is deleted and the compute service it's running on
-        # is down.
-        self.api.delete_server(server['id'])
-        self._wait_until_deleted(server)
+        self._delete_and_check_allocations(server)
 
         # Now we can delete the service.
         self.admin_api.api_delete('/os-services/%s' % service['id'])
@@ -120,10 +115,3 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
         # in a 404 response.
         resp = self.placement_api.get('/resource_providers/%s' % rp_uuid)
         self.assertEqual(200, resp.status)
-
-        # Assert the allocations still exist for the server against the
-        # compute node resource provider. Once the bug is fixed, there should
-        # be no allocations for the server.
-        allocations = self._get_allocations_by_server_uuid(server['id'])
-        self.assertEqual(1, len(allocations))
-        allocation = allocations[rp_uuid]['resources']
-        self.assertFlavorMatchesAllocation(flavor, allocation)

@@ -0,0 +1,12 @@
+---
+upgrade:
+  - |
+    The ``nova-api`` service now requires the ``[placement]`` section to be
+    configured in nova.conf if you are using a separate config file just for
+    that service. This is because the ``nova-api`` service now needs to talk
+    to the placement service in order to delete resource provider allocations
+    when deleting an instance whose ``nova-compute`` service is down. The
+    allocation cleanup is a no-op if ``[placement]`` is not configured for
+    ``nova-api``, but it will result in new warnings in the logs until it
+    is configured. See bug
+    https://bugs.launchpad.net/nova/+bug/1679750 for more details.
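
(An illustrative ``[placement]`` section for the nova-api config file;
the values are placeholders and the option names assume the standard
keystoneauth password auth plugin.)

    [placement]
    auth_type = password
    auth_url = http://controller/identity/v3
    project_name = service
    project_domain_name = Default
    username = placement
    user_domain_name = Default
    password = PLACEMENT_PASS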