Merge "Clean up allocations left by evacuation when deleting service" into stable/wallaby
commit 05848fbbb2
@@ -36,6 +36,7 @@ from nova import context as nova_context
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils
 
 
@@ -2218,6 +2219,22 @@ class SchedulerReportClient(object):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}
 
+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.
 
@@ -2236,16 +2253,19 @@ class SchedulerReportClient(object):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(context, instance_uuid)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
@@ -1573,71 +1573,6 @@ class TestNovaManagePlacementAudit(
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)
 
-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should have in theory a single
-        # allocation against the destination resource provider, but evacuated
-        # instances are not having their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-
 
 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""
@@ -120,8 +120,7 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')
@@ -152,23 +151,15 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]
 
     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in
@@ -3184,15 +3184,41 @@ class TestAssociations(SchedulerReportClientTestCase):
 
 class TestAllocations(SchedulerReportClientTestCase):
 
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]
@@ -3208,6 +3234,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
@@ -3217,11 +3244,13 @@ class TestAllocations(SchedulerReportClientTestCase):
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
@@ -3236,6 +3265,7 @@ class TestAllocations(SchedulerReportClientTestCase):
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
releasenotes/notes/bug-1829479-cd2db21526965e6d.yaml (new file)
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
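
Review note: the following is a condensed sketch, not code from the patch, of what the cascade branch of SchedulerReportClient.delete_resource_provider() does once this change is applied. The function name delete_resource_provider_sketch and the client/context/compute_node parameters are illustrative placeholders; error handling and the subsequent top-down provider-tree deletion are omitted.

from nova import objects

def delete_resource_provider_sketch(client, context, compute_node):
    # 1. Pre-existing behaviour: drop allocations held by instances still
    #    recorded on this host.
    instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
        context, compute_node.host, compute_node.hypervisor_hostname)
    for instance_uuid in instance_uuids:
        client.delete_allocation_for_instance(context, instance_uuid)
    # 2. New with this change: also drop the source-node allocations left
    #    behind by instances that were successfully evacuated off this host,
    #    so the resource provider can actually be deleted afterwards.
    client._remove_allocations_for_evacuated_instances(context, compute_node)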