Add a placement audit command
There are different situations in which allocations can be orphaned. Add a new nova-manage command that looks at all resource providers and checks, against the related compute nodes, whether they have orphaned allocations.

Change-Id: I537ed74503d208957f0a97af3ab754a6750dac20
Closes-Bug: #1793569
@@ -686,6 +686,42 @@ Placement

         - An unexpected error occurred.

``nova-manage placement audit [--verbose] [--delete] [--resource_provider <uuid>]``
    Iterates over all the Resource Providers (or just one if you provide the
    UUID) and then verifies whether the compute allocations are related to
    either an existing instance or a migration UUID.
    If not, it reports which allocations are orphaned.

    You can also ask to delete all the orphaned allocations by specifying
    ``--delete``.

    Specify ``--verbose`` to get detailed progress output during execution.

    This command requires that the
    :oslo.config:option:`api_database.connection` and
    :oslo.config:group:`placement` configuration options are set. Placement
    API >= 1.14 is required.

    **Return Codes**

    .. list-table::
       :widths: 20 80
       :header-rows: 1

       * - Return code
         - Description
       * - 0
         - No orphaned allocations were found
       * - 1
         - An unexpected error occurred
       * - 3
         - Orphaned allocations were found
       * - 4
         - All found orphaned allocations were deleted
       * - 127
         - Invalid input

See Also
========

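As an illustration only, and not part of this change: the documented return codes lend themselves to scripted checks. The following hypothetical wrapper shells out to the command and maps each documented return code to a message. The command name, flags, and codes come from the documentation above; everything else is assumed (in particular, that ``nova-manage`` is installed and configured on the host running the script).

# Illustrative wrapper only, not part of this change. Assumes nova-manage is
# installed and configured; the flags and return codes are taken from the
# documentation above.
import subprocess
import sys

RETURN_CODES = {
    0: 'No orphaned allocations were found',
    1: 'An unexpected error occurred',
    3: 'Orphaned allocations were found',
    4: 'All found orphaned allocations were deleted',
    127: 'Invalid input',
}


def run_audit(delete=False, provider_uuid=None):
    # Build the command line from the documented options.
    cmd = ['nova-manage', 'placement', 'audit', '--verbose']
    if delete:
        cmd.append('--delete')
    if provider_uuid:
        cmd.extend(['--resource_provider', provider_uuid])
    result = subprocess.run(cmd)
    # Translate the documented return code into a human-readable message.
    print(RETURN_CODES.get(result.returncode, 'Unknown return code'))
    return result.returncode


if __name__ == '__main__':
    sys.exit(run_audit(delete='--delete' in sys.argv[1:]))
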
@@ -32,6 +32,7 @@ import traceback
from dateutil import parser as dateutil_parser
from keystoneauth1 import exceptions as ks_exc
from neutronclient.common import exceptions as neutron_client_exc
import os_resource_classes as orc
from oslo_config import cfg
from oslo_db import exception as db_exc
from oslo_log import log as logging
@@ -2391,6 +2392,300 @@ class PlacementCommands(object):

        return return_code

    def _get_instances_and_current_migrations(self, ctxt, cn_uuid):
        if self.cn_uuid_mapping.get(cn_uuid):
            cell_uuid, cn_host, cn_node = self.cn_uuid_mapping[cn_uuid]
        else:
            # We need to find the compute node record from all cells.
            results = context.scatter_gather_skip_cell0(
                ctxt, objects.ComputeNode.get_by_uuid, cn_uuid)
            for result_cell_uuid, result in results.items():
                if not context.is_cell_failure_sentinel(result):
                    cn = result
                    cell_uuid = result_cell_uuid
                    break
            else:
                return False
            cn_host, cn_node = (cn.host, cn.hypervisor_hostname)
            self.cn_uuid_mapping[cn_uuid] = (cell_uuid, cn_host, cn_node)
        cell_mapping = objects.CellMapping.get_by_uuid(ctxt, cell_uuid)

        # Get all the active instances from this compute node
        if self.instances_mapping.get(cn_uuid):
            inst_uuids = self.instances_mapping[cn_uuid]
        else:
            # Get the instance list record from the cell.
            with context.target_cell(ctxt, cell_mapping) as cctxt:
                instances = objects.InstanceList.get_by_host_and_node(
                    cctxt, cn_host, cn_node, expected_attrs=[])
            inst_uuids = [instance.uuid for instance in instances]
            self.instances_mapping[cn_uuid] = inst_uuids

        # Get all *active* migrations for this compute node
        # NOTE(sbauza): Since migrations are transient, it's better to not
        # cache the results as they could be stale
        with context.target_cell(ctxt, cell_mapping) as cctxt:
            migs = objects.MigrationList.get_in_progress_by_host_and_node(
                cctxt, cn_host, cn_node)
        mig_uuids = [migration.uuid for migration in migs]

        return (inst_uuids, mig_uuids)

    def _delete_allocations_from_consumer(self, ctxt, placement, provider,
                                          consumer_uuid, consumer_type):
        """Deletes allocations from a resource provider with consumer UUID.

        :param ctxt: nova.context.RequestContext
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param provider: Resource Provider to look at.
        :param consumer_uuid: the consumer UUID having allocations.
        :param consumer_type: the type of consumer,
            either 'instance' or 'migration'
        :returns: bool whether the allocations were deleted.
        """
        # We need to be careful and only remove the allocations
        # against this specific RP or we would delete the
        # whole instance usage and then it would require some
        # healing.
        # TODO(sbauza): Remove this extra check once placement
        # supports querying allocation delete on both
        # consumer and resource provider parameters.
        allocations = placement.get_allocs_for_consumer(
            ctxt, consumer_uuid)
        if len(allocations['allocations']) > 1:
            # This consumer has resources spread amongst
            # multiple RPs (think nested or shared for example)
            # We then need to just update the usage to remove
            # the orphaned resources on the specific RP
            del allocations['allocations'][provider['uuid']]
            try:
                placement.put_allocations(
                    ctxt, consumer_uuid, allocations)
            except exception.AllocationUpdateFailed:
                return False

        else:
            try:
                placement.delete_allocation_for_instance(
                    ctxt, consumer_uuid, consumer_type)
            except exception.AllocationDeleteFailed:
                return False
        return True

    def _check_orphaned_allocations_for_provider(self, ctxt, placement,
                                                 output, provider,
                                                 delete):
        """Finds orphaned allocations for a specific resource provider.

        :param ctxt: nova.context.RequestContext
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param output: function that takes a single message for verbose output
        :param provider: Resource Provider to look at.
        :param delete: deletes the found orphaned allocations.
        :return: a tuple (<number of orphaned allocs>, <number of faults>)
        """
        num_processed = 0
        faults = 0

        # TODO(sbauza): Are we sure we have all Nova RCs?
        # FIXME(sbauza): Possibly use consumer types once Placement API
        # supports them.
        # NOTE(sbauza): We check allocations having *any* below RC, not having
        # *all* of them.
        NOVA_RCS = [orc.VCPU, orc.MEMORY_MB, orc.DISK_GB, orc.VGPU,
                    orc.NET_BW_EGR_KILOBIT_PER_SEC,
                    orc.NET_BW_IGR_KILOBIT_PER_SEC,
                    orc.PCPU, orc.MEM_ENCRYPTION_CONTEXT]

        # Since the RP can be a child RP, we need to get the root RP as it's
        # the compute node UUID
        # NOTE(sbauza): In case Placement doesn't support 1.14 microversion,
        # that means we don't have nested RPs.
        # Since we ask for microversion 1.14, all RPs have a root RP UUID.
        cn_uuid = provider.get("root_provider_uuid")
        # Now get all the existing instances and active migrations for this
        # compute node
        result = self._get_instances_and_current_migrations(ctxt, cn_uuid)
        if result is False:
            # We don't want to hard stop here because the compute service
            # could have disappeared while we could still have orphaned
            # allocations.
            output(_('The compute node for UUID %s can not be '
                     'found') % cn_uuid)
        inst_uuids, mig_uuids = result or ([], [])
        try:
            pallocs = placement.get_allocations_for_resource_provider(
                ctxt, provider['uuid'])
        except exception.ResourceProviderAllocationRetrievalFailed:
            print(_('Not able to find allocations for resource '
                    'provider %s.') % provider['uuid'])
            raise

        # Verify every allocation for each consumer UUID
        for consumer_uuid, consumer_resources in six.iteritems(
                pallocs.allocations):
            consumer_allocs = consumer_resources['resources']
            if any(rc in NOVA_RCS
                   for rc in consumer_allocs):
                # We reset the consumer type for each allocation
                consumer_type = None
                # This is an allocation for Nova resources
                # We need to guess whether the instance was deleted
                # or if the instance is currently migrating
                if not (consumer_uuid in inst_uuids or
                        consumer_uuid in mig_uuids):
                    # By default we suspect the orphaned allocation was for a
                    # migration...
                    consumer_type = 'migration'
                    if not (consumer_uuid in inst_uuids):
                        # ... but if we can't find it either for an instance,
                        # that means it was for this.
                        consumer_type = 'instance'
                if consumer_type is not None:
                    output(_('Allocations were set against consumer UUID '
                             '%(consumer_uuid)s but no existing instances or '
                             'active migrations are related. ')
                           % {'consumer_uuid': consumer_uuid})
                    if delete:
                        deleted = self._delete_allocations_from_consumer(
                            ctxt, placement, provider, consumer_uuid,
                            consumer_type)
                        if not deleted:
                            print(_('Not able to delete allocations '
                                    'for consumer UUID %s')
                                  % consumer_uuid)
                            faults += 1
                            continue
                        output(_('Deleted allocations for consumer UUID '
                                 '%(consumer_uuid)s on Resource Provider '
                                 '%(rp)s: %(allocations)s')
                               % {'consumer_uuid': consumer_uuid,
                                  'rp': provider['uuid'],
                                  'allocations': consumer_allocs})
                    else:
                        output(_('Allocations for consumer UUID '
                                 '%(consumer_uuid)s on Resource Provider '
                                 '%(rp)s can be deleted: '
                                 '%(allocations)s')
                               % {'consumer_uuid': consumer_uuid,
                                  'rp': provider['uuid'],
                                  'allocations': consumer_allocs})
                    num_processed += 1
        return (num_processed, faults)

    # TODO(sbauza): Move this to the scheduler report client?
    def _get_resource_provider(self, context, placement, uuid):
        """Returns a single Resource Provider by its UUID.

        :param context: The nova.context.RequestContext auth context
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param uuid: A specific Resource Provider UUID
        :return: the existing resource provider.
        :raises: keystoneauth1.exceptions.base.ClientException on failure to
            communicate with the placement API
        """

        resource_providers = self._get_resource_providers(context, placement,
                                                          uuid=uuid)
        if not resource_providers:
            # The endpoint never returns a 404; it rather returns an empty
            # list.
            raise exception.ResourceProviderNotFound(name_or_uuid=uuid)
        return resource_providers[0]

    def _get_resource_providers(self, context, placement, **kwargs):
        """Returns all resource providers regardless of their relationships.

        :param context: The nova.context.RequestContext auth context
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param kwargs: extra attributes for the query string
        :return: list of resource providers.
        :raises: keystoneauth1.exceptions.base.ClientException on failure to
            communicate with the placement API
        """
        url = '/resource_providers'
        if 'uuid' in kwargs:
            url += '?uuid=%s' % kwargs['uuid']

        resp = placement.get(url, global_request_id=context.global_id,
                             version='1.14')
        if resp is None:
            raise exception.PlacementAPIConnectFailure()

        data = resp.json()
        resource_providers = data.get('resource_providers')

        return resource_providers

    @action_description(
        _("Audits orphaned allocations that are no longer corresponding to "
          "existing instance resources. This command requires that "
          "the [api_database]/connection and [placement] configuration "
          "options are set."))
    @args('--verbose', action='store_true', dest='verbose', default=False,
          help='Provide verbose output during execution.')
    @args('--resource_provider', metavar='<provider_uuid>',
          dest='provider_uuid',
          help='UUID of a specific resource provider to verify.')
    @args('--delete', action='store_true', dest='delete', default=False,
          help='Deletes orphaned allocations that were found.')
    def audit(self, verbose=False, provider_uuid=None, delete=False):
        """Provides information about orphaned allocations that can be removed

        Return codes:

        * 0: Command completed successfully and no orphaned allocations exist.
        * 1: An unexpected error happened during run.
        * 3: Orphaned allocations were detected.
        * 4: Orphaned allocations were detected and deleted.
        * 127: Invalid input.
        """

        ctxt = context.get_admin_context()
        output = lambda msg: None
        if verbose:
            output = lambda msg: print(msg)

        placement = report.SchedulerReportClient()
        # Resets two in-memory dicts for knowing instances per compute node
        self.cn_uuid_mapping = collections.defaultdict(tuple)
        self.instances_mapping = collections.defaultdict(list)

        num_processed = 0
        faults = 0

        if provider_uuid:
            try:
                resource_provider = self._get_resource_provider(
                    ctxt, placement, provider_uuid)
            except exception.ResourceProviderNotFound:
                print(_('Resource provider with UUID %s does not exist.') %
                      provider_uuid)
                return 127
            resource_providers = [resource_provider]
        else:
            resource_providers = self._get_resource_providers(ctxt, placement)

        for provider in resource_providers:
            (nb_p, faults) = self._check_orphaned_allocations_for_provider(
                ctxt, placement, output, provider, delete)
            num_processed += nb_p
            if faults > 0:
                print(_('The Resource Provider %s had problems when '
                        'deleting allocations. Stopping now. Please fix the '
                        'problem by hand and run again.') %
                      provider['uuid'])
                return 1
        if num_processed > 0:
            suffix = 's.' if num_processed > 1 else '.'
            output(_('Processed %(num)s allocation%(suffix)s')
                   % {'num': num_processed,
                      'suffix': suffix})
            return 4 if delete else 3
        return 0


CATEGORIES = {
    'api_db': ApiDbCommands,
@@ -1393,6 +1393,232 @@ class TestNovaManagePlacementSyncAggregates(
                '%s should be in two provider aggregates' % host)


class TestNovaManagePlacementAudit(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Functional tests for nova-manage placement audit"""

    # Let's just use a simple fake driver
    compute_driver = 'fake.SmallFakeDriver'

    def setUp(self):
        super(TestNovaManagePlacementAudit, self).setUp()
        self.cli = manage.PlacementCommands()
        # Make sure we have two computes for migrations
        self.compute1 = self._start_compute('host1')
        self.compute2 = self._start_compute('host2')

        # Make sure we have two hypervisors reported in the API.
        hypervisors = self.admin_api.api_get(
            '/os-hypervisors').body['hypervisors']
        self.assertEqual(2, len(hypervisors))

        self.output = StringIO()
        self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output))

        self.flavor = self.api.get_flavors()[0]

    def _delete_instance_but_keep_its_allocations(self, server):
        """Mocks out the call to Placement for deleting the allocations but
        still performs the instance deletion.
        """

        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.delete_server(server['id'])
            self._wait_until_deleted(server)

    def test_audit_orphaned_allocation_from_instance_delete(self):
        """Creates a server and deletes it by retaining its allocations so the
        audit command can find it.
        """
        target_hostname = self.compute1.host
        rp_uuid = self._get_provider_uuid_by_host(target_hostname)

        server = self._boot_and_check_allocations(self.flavor,
                                                  target_hostname)

        # let's mock the allocation delete call to placement
        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.delete_server(server['id'])
            self._wait_until_deleted(server)

        # make sure the allocation is still around
        self.assertFlavorMatchesUsage(rp_uuid, self.flavor)

        # Don't ask to delete the orphaned allocations, just audit them
        ret = self.cli.audit(verbose=True)
        # The allocation should still exist
        self.assertFlavorMatchesUsage(rp_uuid, self.flavor)

        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': server['id'],
             'rp_uuid': rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now ask the audit command to delete the rogue allocations.
        ret = self.cli.audit(delete=True, verbose=True)

        # The allocations are now deleted
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %s' % server['id'], output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)

    def test_audit_orphaned_allocations_from_confirmed_resize(self):
        """Resize a server but when confirming it, leave the migration
        allocation there so the audit command can find it.
        """
        source_hostname = self.compute1.host
        dest_hostname = self.compute2.host

        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        old_flavor = self.flavor
        new_flavor = self.api.get_flavors()[1]
        # we want to make sure we resize to compute2
        self.flags(allow_resize_to_same_host=False)

        server = self._boot_and_check_allocations(self.flavor,
                                                  source_hostname)

        # Do a resize
        post = {
            'resize': {
                'flavorRef': new_flavor['id']
            }
        }
        self._move_and_check_allocations(
            server, request=post, old_flavor=old_flavor,
            new_flavor=new_flavor, source_rp_uuid=source_rp_uuid,
            dest_rp_uuid=dest_rp_uuid)

        # Retain the migration UUID record for later usage
        migration_uuid = self.get_migration_uuid_for_instance(server['id'])

        # Confirm the resize so it should in theory delete the source
        # allocations but mock out the allocation delete for the source
        post = {'confirmResize': None}
        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.post_server_action(
                server['id'], post, check_response_status=[204])
            self._wait_for_state_change(server, 'ACTIVE')

        # The target host usage should be according to the new flavor...
        self.assertFlavorMatchesUsage(dest_rp_uuid, new_flavor)
        # ...but we should still see allocations for the source compute
        self.assertFlavorMatchesUsage(source_rp_uuid, old_flavor)

        # Now, run the audit command that will find this orphaned allocation
        ret = self.cli.audit(verbose=True)
        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': migration_uuid,
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now we want to delete the orphaned allocation that is duplicate
        ret = self.cli.audit(delete=True, verbose=True)

        # There should no longer be usage for the source host since the
        # allocation disappeared
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, source_rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s' %
            {'consumer_uuid': migration_uuid,
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)

    # TODO(sbauza): Mock this test once bug #1829479 is fixed
    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
        """Evacuate a server and then delete the source node so that it will
        leave a source allocation that the audit command will find.
        """

        source_hostname = self.compute1.host
        dest_hostname = self.compute2.host

        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        server = self._boot_and_check_allocations(self.flavor,
                                                  source_hostname)

        # Stop the service and fake it down
        self.compute1.stop()
        source_service_id = self.admin_api.get_services(
            host=source_hostname, binary='nova-compute')[0]['id']
        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})

        # evacuate the instance to the target
        post = {'evacuate': {"host": dest_hostname}}
        self.admin_api.post_server_action(server['id'], post)
        self._wait_for_server_parameter(server,
            {'OS-EXT-SRV-ATTR:host': dest_hostname,
             'status': 'ACTIVE'})

        # Now the instance is gone, we can delete the compute service
        self.admin_api.api_delete('/os-services/%s' % source_service_id)

        # Since the compute is deleted, we should have in theory a single
        # allocation against the destination resource provider, but evacuated
        # instances do not have their allocations deleted. See bug #1829479.
        # We have two allocations for the same consumer, source and
        # destination
        self._check_allocation_during_evacuate(
            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)

        # Now, run the audit command that will find this orphaned allocation
        ret = self.cli.audit(verbose=True)
        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': server['id'],
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now we want to delete the orphaned allocation that is duplicate
        ret = self.cli.audit(delete=True, verbose=True)

        # We finally should only have the target allocations
        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, source_rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s' %
            {'consumer_uuid': server['id'],
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)


class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
    """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""
    api_major_version = 'v2.1'
@@ -34,6 +34,7 @@ from nova.db import migration
from nova.db.sqlalchemy import migration as sqla_migration
from nova import exception
from nova import objects
from nova.scheduler.client import report
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.unit import fake_requests
@@ -2851,6 +2852,142 @@ class TestNovaManagePlacement(test.NoDBTestCase):
        neutron.update_port.assert_called_once_with(
            uuidsentinel.port_id, body=expected_update_body)

    def test_audit_with_wrong_provider_uuid(self):
        with mock.patch.object(
                self.cli, '_get_resource_provider',
                side_effect=exception.ResourceProviderNotFound(
                    name_or_uuid=uuidsentinel.fake_uuid)):
            ret = self.cli.audit(
                provider_uuid=uuidsentinel.fake_uuid)
            self.assertEqual(127, ret)
        output = self.output.getvalue()
        self.assertIn(
            'Resource provider with UUID %s' % uuidsentinel.fake_uuid,
            output)

    @mock.patch.object(manage.PlacementCommands,
                       '_check_orphaned_allocations_for_provider')
    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.get')
    def _test_audit(self, get_resource_providers, check_orphaned_allocs,
                    verbose=False, delete=False, errors=False, found=False):
        rps = [
            {"generation": 1,
             "uuid": uuidsentinel.rp1,
             "links": None,
             "name": "rp1",
             "parent_provider_uuid": None,
             "root_provider_uuid": uuidsentinel.rp1},
            {"generation": 1,
             "uuid": uuidsentinel.rp2,
             "links": None,
             "name": "rp2",
             "parent_provider_uuid": None,
             "root_provider_uuid": uuidsentinel.rp2},
        ]
        get_resource_providers.return_value = fake_requests.FakeResponse(
            200, content=jsonutils.dumps({"resource_providers": rps}))

        if errors:
            # We found one orphaned allocation per RP but RP1 got a fault
            check_orphaned_allocs.side_effect = ((1, 1), (1, 0))
        elif found:
            # we found one orphaned allocation per RP and we had no faults
            check_orphaned_allocs.side_effect = ((1, 0), (1, 0))
        else:
            # No orphaned allocations are found for all the RPs
            check_orphaned_allocs.side_effect = ((0, 0), (0, 0))

        ret = self.cli.audit(verbose=verbose, delete=delete)
        if errors:
            # Any fault stops the audit and provides a return code equal to 1
            expected_ret = 1
        elif found and delete:
            # We found orphaned allocations and deleted them
            expected_ret = 4
        elif found and not delete:
            # We found orphaned allocations but we left them
            expected_ret = 3
        else:
            # Nothing was found
            expected_ret = 0
        self.assertEqual(expected_ret, ret)

        call1 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[0], delete)
        call2 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[1], delete)
        if errors:
            # We stop checking other RPs once we got a fault
            check_orphaned_allocs.assert_has_calls([call1])
        else:
            # All the RPs are checked
            check_orphaned_allocs.assert_has_calls([call1, call2])

        if verbose and found:
            output = self.output.getvalue()
            self.assertIn('Processed 2 allocations', output)
        if errors:
            output = self.output.getvalue()
            self.assertIn(
                'The Resource Provider %s had problems' % rps[0]["uuid"],
                output)

    def test_audit_not_found_orphaned_allocs(self):
        self._test_audit(found=False)

    def test_audit_found_orphaned_allocs_not_verbose(self):
        self._test_audit(found=True)

    def test_audit_found_orphaned_allocs_verbose(self):
        self._test_audit(found=True, verbose=True)

    def test_audit_found_orphaned_allocs_and_deleted_them(self):
        self._test_audit(found=True, delete=True)

    def test_audit_found_orphaned_allocs_but_got_errors(self):
        self._test_audit(errors=True)

    @mock.patch.object(manage.PlacementCommands,
                       '_delete_allocations_from_consumer')
    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                'get_allocations_for_resource_provider')
    @mock.patch.object(manage.PlacementCommands,
                       '_get_instances_and_current_migrations')
    def test_check_orphaned_allocations_for_provider(self,
                                                     get_insts_and_migs,
                                                     get_allocs_for_rp,
                                                     delete_allocs):
        provider = {"generation": 1,
                    "uuid": uuidsentinel.rp1,
                    "links": None,
                    "name": "rp1",
                    "parent_provider_uuid": None,
                    "root_provider_uuid": uuidsentinel.rp1}
        compute_resources = {'VCPU': 1, 'MEMORY_MB': 2048, 'DISK_GB': 20}
        allocations = {
            # Some orphaned compute allocation
            uuidsentinel.orphaned_alloc1: {'resources': compute_resources},
            # Some existing instance allocation
            uuidsentinel.inst1: {'resources': compute_resources},
            # Some existing migration allocation
            uuidsentinel.mig1: {'resources': compute_resources},
            # Some other allocation not related to Nova
            uuidsentinel.other_alloc1: {'resources': {'CUSTOM_GOO'}},
        }

        get_insts_and_migs.return_value = (
            [uuidsentinel.inst1],
            [uuidsentinel.mig1])
        get_allocs_for_rp.return_value = report.ProviderAllocInfo(allocations)

        ctxt = context.RequestContext()
        placement = report.SchedulerReportClient()
        ret = self.cli._check_orphaned_allocations_for_provider(
            ctxt, placement, lambda x: x, provider, True)
        get_allocs_for_rp.assert_called_once_with(ctxt, uuidsentinel.rp1)
        delete_allocs.assert_called_once_with(ctxt, placement, provider,
                                              uuidsentinel.orphaned_alloc1,
                                              'instance')
        self.assertEqual((1, 0), ret)


class TestNovaManageMain(test.NoDBTestCase):
    """Tests the nova-manage:main() setup code."""
releasenotes/notes/placement-audit-59a00dcfb188c6ac.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
---
other:
  - |
    A new ``nova-manage placement audit`` CLI has been added to help identify
    orphaned compute allocations in the Placement API that are no longer
    related to either instances or migrations.
    Some race conditions in Nova can fail to remove allocations for
    instances or migrations once they are done, which then creates capacity
    issues. With this command, you can identify the orphaned allocations and
    ask to remove them.
    For more details on CLI usage, see the man page entry:
    https://docs.openstack.org/nova/latest/cli/nova-manage.html#placement
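As an illustration only, and not part of this change: the audit described in the release note boils down to a set difference. The consumer UUIDs that hold Nova allocations on a compute resource provider are compared against the instance and migration UUIDs known for that compute node, and any consumer that matches neither is orphaned. A standalone sketch of that idea, with hypothetical names and data, follows; the real command reads these sets from the Placement API and the cell databases instead.

# Standalone sketch of the audit idea, with hypothetical names and data;
# not part of this change. The real command reads these sets from the
# Placement API and the cell databases.
def find_orphaned_consumers(allocation_consumers, instance_uuids,
                            migration_uuids):
    """Return consumer UUIDs that no instance or migration accounts for."""
    known = set(instance_uuids) | set(migration_uuids)
    return sorted(set(allocation_consumers) - known)


if __name__ == '__main__':
    # Three consumers hold allocations, but only two map to a known instance
    # or in-progress migration: the third one is orphaned.
    orphans = find_orphaned_consumers(
        allocation_consumers=['inst-1', 'mig-1', 'orphan-1'],
        instance_uuids=['inst-1'],
        migration_uuids=['mig-1'])
    print(orphans)  # ['orphan-1']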