Delete resource providers for all nodes when deleting compute service

Change I7b8622b178d5043ed1556d7bdceaf60f47e5ac80 started deleting the
compute node resource provider associated with a compute node when
deleting a nova-compute service. However, it would only delete the
provider for the first compute node associated with the service, which
means that for an ironic compute service that is managing multiple
nodes, the resource providers of the other nodes were not cleaned up
in placement. This fixes the issue by
iterating all the compute nodes and cleaning up their providers.
Note this could potentially be a lot of nodes, but we don't really
have any good option here other than to iterate over them and clean
them up one at a time.

Note that this is best-effort: because the
SchedulerReportClient.delete_resource_provider method ignores
ResourceProviderInUse errors, and because we could have stale
allocations on the host for which delete_resource_provider is not
accounting, namely allocations from evacuated instances (or incomplete
migrations, though you can't migrate baremetal instances today), we
could still delete the compute service and orphan those in-use
providers. That,
however, is no worse than before this change where we did not try
to cleanup all providers. The issue described above is being tracked
with bug 1829479 and will be dealt with separately.

Change-Id: I9e852e25ea89f32bf19cdaeb1f5dac8f749f5dbc
Closes-Bug: #1811726
This commit is contained in:
Matt Riedemann 2019-05-03 15:23:57 -04:00
parent b9bcbab86b
commit 650fe118d1
3 changed files with 32 additions and 9 deletions

View File

@ -266,9 +266,14 @@ class ServiceController(wsgi.Controller):
ag.id, ag.id,
service.host) service.host)
# remove the corresponding resource provider record from # remove the corresponding resource provider record from
# placement for this compute node # placement for the compute nodes managed by this service;
self.placementclient.delete_resource_provider( # remember that an ironic compute service can manage multiple
context, service.compute_node, cascade=True) # nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes:
self.placementclient.delete_resource_provider(
context, compute_node, cascade=True)
# remove the host_mapping of this host. # remove the host_mapping of this host.
try: try:
hm = objects.HostMapping.get_by_host(context, service.host) hm = objects.HostMapping.get_by_host(context, service.host)

View File

@ -713,25 +713,33 @@ class ServicesTestV21(test.TestCase):
"""Tests that we are still able to successfully delete a nova-compute """Tests that we are still able to successfully delete a nova-compute
service even if the HostMapping is not found. service even if the HostMapping is not found.
""" """
@mock.patch('nova.objects.ComputeNodeList.get_all_by_host',
return_value=objects.ComputeNodeList(objects=[
objects.ComputeNode(host='host1',
hypervisor_hostname='node1'),
objects.ComputeNode(host='host1',
hypervisor_hostname='node2')]))
@mock.patch.object(self.controller.host_api, 'service_get_by_id', @mock.patch.object(self.controller.host_api, 'service_get_by_id',
return_value=objects.Service( return_value=objects.Service(
host='host1', binary='nova-compute', host='host1', binary='nova-compute'))
compute_node=objects.ComputeNode()))
@mock.patch.object(self.controller.aggregate_api, @mock.patch.object(self.controller.aggregate_api,
'get_aggregates_by_host', 'get_aggregates_by_host',
return_value=objects.AggregateList()) return_value=objects.AggregateList())
@mock.patch.object(self.controller.placementclient, @mock.patch.object(self.controller.placementclient,
'delete_resource_provider') 'delete_resource_provider')
def _test(delete_resource_provider, def _test(delete_resource_provider,
get_aggregates_by_host, service_get_by_id): get_aggregates_by_host, service_get_by_id,
cn_get_all_by_host):
self.controller.delete(self.req, 2) self.controller.delete(self.req, 2)
ctxt = self.req.environ['nova.context'] ctxt = self.req.environ['nova.context']
service_get_by_id.assert_called_once_with(ctxt, 2) service_get_by_id.assert_called_once_with(ctxt, 2)
get_instances.assert_called_once_with(ctxt, 'host1') get_instances.assert_called_once_with(ctxt, 'host1')
get_aggregates_by_host.assert_called_once_with(ctxt, 'host1') get_aggregates_by_host.assert_called_once_with(ctxt, 'host1')
delete_resource_provider.assert_called_once_with( self.assertEqual(2, delete_resource_provider.call_count)
ctxt, service_get_by_id.return_value.compute_node, nodes = cn_get_all_by_host.return_value
cascade=True) delete_resource_provider.assert_has_calls([
mock.call(ctxt, node, cascade=True) for node in nodes
], any_order=True)
get_hm.assert_called_once_with(ctxt, 'host1') get_hm.assert_called_once_with(ctxt, 'host1')
service_delete.assert_called_once_with() service_delete.assert_called_once_with()
_test() _test()

View File

@ -0,0 +1,10 @@
---
fixes:
- |
`Bug 1811726`_ is fixed by deleting the resource provider (in placement)
associated with each compute node record managed by a ``nova-compute``
service when that service is deleted via the
``DELETE /os-services/{service_id}`` API. This is particularly important
for compute services managing ironic baremetal nodes.

.. _Bug 1811726: https://bugs.launchpad.net/nova/+bug/1811726