Delete allocations from API if nova-compute is down
When performing a "local delete" of an instance, we need to delete the allocations that the instance has against any resource providers in Placement. It should be noted that without this change, restarting the nova-compute service will delete the allocations for its compute node (assuming the compute node UUID is the same as before the instance was deleted). That is shown in the existing functional test modified here. The more important reason for this change is that in order to fix bug 1756179, we need to make sure the resource provider allocations for a given compute node are gone by the time the compute service is deleted. This adds a new functional test and a release note for the new behavior and need to configure nova-api for talking to placement, which is idempotent if not configured thanks to the @safe_connect decorator used in SchedulerReportClient. Closes-Bug: #1679750 Related-Bug: #1756179 Change-Id: If507e23f0b7e5fa417041c3870d77786498f741d
This commit is contained in:
parent
42f62f1ed2
commit
ea9d0af313
@ -250,6 +250,13 @@ class API(base.Base):
|
|||||||
self.image_api = image_api or image.API()
|
self.image_api = image_api or image.API()
|
||||||
self.network_api = network_api or network.API()
|
self.network_api = network_api or network.API()
|
||||||
self.volume_api = volume_api or cinder.API()
|
self.volume_api = volume_api or cinder.API()
|
||||||
|
# NOTE(mriedem): This looks a bit weird but we get the reportclient
|
||||||
|
# via SchedulerClient since it lazy-loads SchedulerReportClient on
|
||||||
|
# the first usage which helps to avoid a bunch of lockutils spam in
|
||||||
|
# the nova-api logs every time the service is restarted (remember
|
||||||
|
# that pretty much all of the REST API controllers construct this
|
||||||
|
# API class).
|
||||||
|
self.placementclient = scheduler_client.SchedulerClient().reportclient
|
||||||
self.security_group_api = (security_group_api or
|
self.security_group_api = (security_group_api or
|
||||||
openstack_driver.get_openstack_security_group_driver())
|
openstack_driver.get_openstack_security_group_driver())
|
||||||
self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI()
|
self.consoleauth_rpcapi = consoleauth_rpcapi.ConsoleAuthAPI()
|
||||||
@ -2059,6 +2066,10 @@ class API(base.Base):
|
|||||||
|
|
||||||
# cleanup volumes
|
# cleanup volumes
|
||||||
self._local_cleanup_bdm_volumes(bdms, instance, context)
|
self._local_cleanup_bdm_volumes(bdms, instance, context)
|
||||||
|
# Cleanup allocations in Placement since we can't do it from the
|
||||||
|
# compute service.
|
||||||
|
self.placementclient.delete_allocation_for_instance(
|
||||||
|
context, instance.uuid)
|
||||||
cb(context, instance, bdms, local=True)
|
cb(context, instance, bdms, local=True)
|
||||||
instance.destroy()
|
instance.destroy()
|
||||||
|
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
from nova.scheduler.client import report as reportclient
|
||||||
from nova import test
|
from nova import test
|
||||||
from nova.tests import fixtures as nova_fixtures
|
from nova.tests import fixtures as nova_fixtures
|
||||||
from nova.tests.functional import integrated_helpers
|
from nova.tests.functional import integrated_helpers
|
||||||
@ -55,12 +56,87 @@ class TestLocalDeleteAllocations(test.TestCase,
|
|||||||
resp = self.placement_api.get(fmt % {'uuid': rp_uuid})
|
resp = self.placement_api.get(fmt % {'uuid': rp_uuid})
|
||||||
return resp.body['usages']
|
return resp.body['usages']
|
||||||
|
|
||||||
def test_local_delete_removes_allocations(self):
|
# NOTE(mriedem): It would be preferable to use the PlacementFixture as
|
||||||
|
# a context manager but that causes some issues when trying to delete the
|
||||||
|
# server in test_local_delete_removes_allocations_after_compute_restart.
|
||||||
|
def _stub_compute_api_to_not_configure_placement(self):
|
||||||
|
"""Prior to the compute API deleting allocations in the "local delete"
|
||||||
|
case, nova.conf for nova-api might not be configured for talking to
|
||||||
|
the placement service, so we can mock that behavior by stubbing out
|
||||||
|
the placement client in the compute API to no-op as if safe_connect
|
||||||
|
failed and returned None to the caller.
|
||||||
|
"""
|
||||||
|
orig_delete_alloc = (
|
||||||
|
reportclient.SchedulerReportClient.delete_allocation_for_instance)
|
||||||
|
self.call_count = 0
|
||||||
|
|
||||||
|
def fake_delete_allocation_for_instance(*args, **kwargs):
|
||||||
|
# The first call will be from the API, so ignore that one and
|
||||||
|
# return None like the @safe_connect decorator would if nova-api
|
||||||
|
# wasn't configured to talk to placement.
|
||||||
|
if self.call_count:
|
||||||
|
orig_delete_alloc(*args, **kwargs)
|
||||||
|
else:
|
||||||
|
self.call_count += 1
|
||||||
|
|
||||||
|
self.stub_out('nova.scheduler.client.report.SchedulerReportClient.'
|
||||||
|
'delete_allocation_for_instance',
|
||||||
|
fake_delete_allocation_for_instance)
|
||||||
|
|
||||||
|
def test_local_delete_removes_allocations_after_compute_restart(self):
|
||||||
"""Tests that allocations are removed after a local delete.
|
"""Tests that allocations are removed after a local delete.
|
||||||
|
|
||||||
This tests the scenario where a server is local deleted (because the
|
This tests the scenario where a server is local deleted (because the
|
||||||
compute host is down) and we want to make sure that its allocations
|
compute host is down) and we want to make sure that its allocations
|
||||||
have been cleaned up.
|
have been cleaned up once the nova-compute service restarts.
|
||||||
|
"""
|
||||||
|
self._stub_compute_api_to_not_configure_placement()
|
||||||
|
# Get allocations, make sure they are 0.
|
||||||
|
resp = self.placement_api.get('/resource_providers')
|
||||||
|
rp_uuid = resp.body['resource_providers'][0]['uuid']
|
||||||
|
usages_before = self._get_usages(rp_uuid)
|
||||||
|
for usage in usages_before.values():
|
||||||
|
self.assertEqual(0, usage)
|
||||||
|
|
||||||
|
# Create a server.
|
||||||
|
server = self._build_minimal_create_server_request(self.api,
|
||||||
|
'local-delete-test', self.image_id, self.flavor_id, 'none')
|
||||||
|
server = self.admin_api.post_server({'server': server})
|
||||||
|
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||||
|
|
||||||
|
# Assert usages are non zero now.
|
||||||
|
usages_during = self._get_usages(rp_uuid)
|
||||||
|
for usage in usages_during.values():
|
||||||
|
self.assertNotEqual(0, usage)
|
||||||
|
|
||||||
|
# Force-down compute to trigger local delete.
|
||||||
|
self.compute.stop()
|
||||||
|
compute_service_id = self.admin_api.get_services(
|
||||||
|
host=self.compute.host, binary='nova-compute')[0]['id']
|
||||||
|
self.admin_api.put_service(compute_service_id, {'forced_down': True})
|
||||||
|
|
||||||
|
# Delete the server (will be a local delete because compute is down).
|
||||||
|
self.api.delete_server(server['id'])
|
||||||
|
|
||||||
|
# Assert usages are still non-zero.
|
||||||
|
usages_during = self._get_usages(rp_uuid)
|
||||||
|
for usage in usages_during.values():
|
||||||
|
self.assertNotEqual(0, usage)
|
||||||
|
|
||||||
|
# Start the compute service again. Before it comes up, it will call the
|
||||||
|
# update_available_resource code in the ResourceTracker which is what
|
||||||
|
# "heals" the allocations for the deleted instance.
|
||||||
|
self.compute.start()
|
||||||
|
|
||||||
|
# Get the allocations again to check against the original.
|
||||||
|
usages_after = self._get_usages(rp_uuid)
|
||||||
|
|
||||||
|
# They should match.
|
||||||
|
self.assertEqual(usages_before, usages_after)
|
||||||
|
|
||||||
|
def test_local_delete_removes_allocations_from_api(self):
|
||||||
|
"""Tests that the compute API deletes allocations when the compute
|
||||||
|
service on which the instance was running is down.
|
||||||
"""
|
"""
|
||||||
# Get allocations, make sure they are 0.
|
# Get allocations, make sure they are 0.
|
||||||
resp = self.placement_api.get('/resource_providers')
|
resp = self.placement_api.get('/resource_providers')
|
||||||
@ -89,12 +165,7 @@ class TestLocalDeleteAllocations(test.TestCase,
|
|||||||
# Delete the server (will be a local delete because compute is down).
|
# Delete the server (will be a local delete because compute is down).
|
||||||
self.api.delete_server(server['id'])
|
self.api.delete_server(server['id'])
|
||||||
|
|
||||||
# Start the compute service again. Before it comes up, it will call the
|
# Get the allocations again to make sure they were deleted.
|
||||||
# update_available_resource code in the ResourceTracker which is what
|
|
||||||
# "heals" the allocations for the deleted instance.
|
|
||||||
self.compute.start()
|
|
||||||
|
|
||||||
# Get the allocations again to check against the original.
|
|
||||||
usages_after = self._get_usages(rp_uuid)
|
usages_after = self._get_usages(rp_uuid)
|
||||||
|
|
||||||
# They should match.
|
# They should match.
|
||||||
|
@ -86,12 +86,7 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
|
|||||||
self.assertEqual(409, ex.response.status_code)
|
self.assertEqual(409, ex.response.status_code)
|
||||||
|
|
||||||
# Now delete the instance and wait for it to be gone.
|
# Now delete the instance and wait for it to be gone.
|
||||||
# Note that we can't use self._delete_and_check_allocations here
|
self._delete_and_check_allocations(server)
|
||||||
# because of bug 1679750 where allocations are not deleted when
|
|
||||||
# an instance is deleted and the compute service it's running on
|
|
||||||
# is down.
|
|
||||||
self.api.delete_server(server['id'])
|
|
||||||
self._wait_until_deleted(server)
|
|
||||||
|
|
||||||
# Now we can delete the service.
|
# Now we can delete the service.
|
||||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||||
@ -120,10 +115,3 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
|
|||||||
# in a 404 response.
|
# in a 404 response.
|
||||||
resp = self.placement_api.get('/resource_providers/%s' % rp_uuid)
|
resp = self.placement_api.get('/resource_providers/%s' % rp_uuid)
|
||||||
self.assertEqual(200, resp.status)
|
self.assertEqual(200, resp.status)
|
||||||
# Assert the allocations still exist for the server against the
|
|
||||||
# compute node resource provider. Once the bug is fixed, there should
|
|
||||||
# be no allocations for the server.
|
|
||||||
allocations = self._get_allocations_by_server_uuid(server['id'])
|
|
||||||
self.assertEqual(1, len(allocations))
|
|
||||||
allocation = allocations[rp_uuid]['resources']
|
|
||||||
self.assertFlavorMatchesAllocation(flavor, allocation)
|
|
||||||
|
@ -0,0 +1,12 @@
|
|||||||
|
---
|
||||||
|
upgrade:
|
||||||
|
- |
|
||||||
|
The ``nova-api`` service now requires the ``[placement]`` section to be
|
||||||
|
configured in nova.conf if you are using a separate config file just for
|
||||||
|
that service. This is because the ``nova-api`` service now needs to talk
|
||||||
|
to the placement service in order to delete resource provider allocations
|
||||||
|
when deleting an instance and the ``nova-compute`` service on which that
|
||||||
|
instance is running is down. This change is idempotent if
|
||||||
|
``[placement]`` is not configured in ``nova-api`` but it will result in
|
||||||
|
new warnings in the logs until configured. See bug
|
||||||
|
https://bugs.launchpad.net/nova/+bug/1679750 for more details.
|
Loading…
Reference in New Issue
Block a user