Add a placement audit command
There are different situations in which allocations can be orphaned. Add a new nova-manage command that looks at all resource providers and checks, against the related compute nodes, whether they have orphaned allocations.

Change-Id: I537ed74503d208957f0a97af3ab754a6750dac20
Closes-Bug: #1793569
@@ -686,6 +686,42 @@ Placement

         - An unexpected error occurred.

``nova-manage placement audit [--verbose] [--delete] [--resource_provider <uuid>]``
    Iterates over all the Resource Providers (or just one if you provide the
    UUID) and then verifies whether the compute allocations are related to
    either an existing instance or a migration UUID.
    If not, it reports which allocations are orphaned.

    You can also ask to delete all the orphaned allocations by specifying
    ``--delete``.

    Specify ``--verbose`` to get detailed progress output during execution.

    This command requires that the
    :oslo.config:option:`api_database.connection` and
    :oslo.config:group:`placement` configuration options are set. Placement
    API >= 1.14 is required.

    **Return Codes**

    .. list-table::
       :widths: 20 80
       :header-rows: 1

       * - Return code
         - Description
       * - 0
         - No orphaned allocations were found
       * - 1
         - An unexpected error occurred
       * - 3
         - Orphaned allocations were found
       * - 4
         - All found orphaned allocations were deleted
       * - 127
         - Invalid input

See Also
========

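As an illustration only, and not part of this change: the documented return codes lend themselves to scripted checks. The following hypothetical wrapper shells out to the command and maps each documented return code to a message. The command name, flags, and codes come from the documentation above; everything else is assumed (in particular, that ``nova-manage`` is installed and configured on the host running the script).

# Illustrative wrapper only, not part of this change. Assumes nova-manage is
# installed and configured; the flags and return codes are taken from the
# documentation above.
import subprocess
import sys

RETURN_CODES = {
    0: 'No orphaned allocations were found',
    1: 'An unexpected error occurred',
    3: 'Orphaned allocations were found',
    4: 'All found orphaned allocations were deleted',
    127: 'Invalid input',
}


def run_audit(delete=False, provider_uuid=None):
    # Build the command line from the documented options.
    cmd = ['nova-manage', 'placement', 'audit', '--verbose']
    if delete:
        cmd.append('--delete')
    if provider_uuid:
        cmd.extend(['--resource_provider', provider_uuid])
    result = subprocess.run(cmd)
    # Translate the documented return code into a human-readable message.
    print(RETURN_CODES.get(result.returncode, 'Unknown return code'))
    return result.returncode


if __name__ == '__main__':
    sys.exit(run_audit(delete='--delete' in sys.argv[1:]))
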
@@ -32,6 +32,7 @@ import traceback
from dateutil import parser as dateutil_parser
from keystoneauth1 import exceptions as ks_exc
from neutronclient.common import exceptions as neutron_client_exc
import os_resource_classes as orc
from oslo_config import cfg
from oslo_db import exception as db_exc
from oslo_log import log as logging
@@ -2391,6 +2392,300 @@ class PlacementCommands(object):

        return return_code

    def _get_instances_and_current_migrations(self, ctxt, cn_uuid):
        if self.cn_uuid_mapping.get(cn_uuid):
            cell_uuid, cn_host, cn_node = self.cn_uuid_mapping[cn_uuid]
        else:
            # We need to find the compute node record from all cells.
            results = context.scatter_gather_skip_cell0(
                ctxt, objects.ComputeNode.get_by_uuid, cn_uuid)
            for result_cell_uuid, result in results.items():
                if not context.is_cell_failure_sentinel(result):
                    cn = result
                    cell_uuid = result_cell_uuid
                    break
            else:
                return False
            cn_host, cn_node = (cn.host, cn.hypervisor_hostname)
            self.cn_uuid_mapping[cn_uuid] = (cell_uuid, cn_host, cn_node)
        cell_mapping = objects.CellMapping.get_by_uuid(ctxt, cell_uuid)

        # Get all the active instances from this compute node
        if self.instances_mapping.get(cn_uuid):
            inst_uuids = self.instances_mapping[cn_uuid]
        else:
            # Get the instance list record from the cell.
            with context.target_cell(ctxt, cell_mapping) as cctxt:
                instances = objects.InstanceList.get_by_host_and_node(
                    cctxt, cn_host, cn_node, expected_attrs=[])
            inst_uuids = [instance.uuid for instance in instances]
            self.instances_mapping[cn_uuid] = inst_uuids

        # Get all *active* migrations for this compute node
        # NOTE(sbauza): Since migrations are transient, it's better to not
        # cache the results as they could be stale
        with context.target_cell(ctxt, cell_mapping) as cctxt:
            migs = objects.MigrationList.get_in_progress_by_host_and_node(
                cctxt, cn_host, cn_node)
        mig_uuids = [migration.uuid for migration in migs]

        return (inst_uuids, mig_uuids)

    def _delete_allocations_from_consumer(self, ctxt, placement, provider,
                                          consumer_uuid, consumer_type):
        """Deletes allocations from a resource provider with consumer UUID.

        :param ctxt: nova.context.RequestContext
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param provider: Resource Provider to look at.
        :param consumer_uuid: the consumer UUID having allocations.
        :param consumer_type: the type of consumer,
            either 'instance' or 'migration'
        :returns: bool whether the allocations were deleted.
        """
        # We need to be careful and only remove the allocations
        # against this specific RP or we would delete the
        # whole instance usage and then it would require some
        # healing.
        # TODO(sbauza): Remove this extra check once placement
        # supports querying allocation delete on both
        # consumer and resource provider parameters.
        allocations = placement.get_allocs_for_consumer(
            ctxt, consumer_uuid)
        if len(allocations['allocations']) > 1:
            # This consumer has resources spread amongst
            # multiple RPs (think nested or shared for example)
            # We then need to just update the usage to remove
            # the orphaned resources on the specific RP
            del allocations['allocations'][provider['uuid']]
            try:
                placement.put_allocations(
                    ctxt, consumer_uuid, allocations)
            except exception.AllocationUpdateFailed:
                return False

        else:
            try:
                placement.delete_allocation_for_instance(
                    ctxt, consumer_uuid, consumer_type)
            except exception.AllocationDeleteFailed:
                return False
        return True

    def _check_orphaned_allocations_for_provider(self, ctxt, placement,
                                                 output, provider,
                                                 delete):
        """Finds orphaned allocations for a specific resource provider.

        :param ctxt: nova.context.RequestContext
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param output: function that takes a single message for verbose output
        :param provider: Resource Provider to look at.
        :param delete: deletes the found orphaned allocations.
        :return: a tuple (<number of orphaned allocs>, <number of faults>)
        """
        num_processed = 0
        faults = 0

        # TODO(sbauza): Are we sure we have all Nova RCs?
        # FIXME(sbauza): Possibly use consumer types once Placement API
        # supports them.
        # NOTE(sbauza): We check allocations having *any* below RC, not having
        # *all* of them.
        NOVA_RCS = [orc.VCPU, orc.MEMORY_MB, orc.DISK_GB, orc.VGPU,
                    orc.NET_BW_EGR_KILOBIT_PER_SEC,
                    orc.NET_BW_IGR_KILOBIT_PER_SEC,
                    orc.PCPU, orc.MEM_ENCRYPTION_CONTEXT]

        # Since the RP can be a child RP, we need to get the root RP as it's
        # the compute node UUID
        # NOTE(sbauza): In case Placement doesn't support 1.14 microversion,
        # that means we don't have nested RPs.
        # Since we ask for microversion 1.14, all RPs have a root RP UUID.
        cn_uuid = provider.get("root_provider_uuid")
        # Now get all the existing instances and active migrations for this
        # compute node
        result = self._get_instances_and_current_migrations(ctxt, cn_uuid)
        if result is False:
            # We don't want to hard stop here because the compute service
            # could have disappeared while we could still have orphaned
            # allocations.
            output(_('The compute node for UUID %s can not be '
                     'found') % cn_uuid)
        inst_uuids, mig_uuids = result or ([], [])
        try:
            pallocs = placement.get_allocations_for_resource_provider(
                ctxt, provider['uuid'])
        except exception.ResourceProviderAllocationRetrievalFailed:
            print(_('Not able to find allocations for resource '
                    'provider %s.') % provider['uuid'])
            raise

        # Verify every allocation for each consumer UUID
        for consumer_uuid, consumer_resources in six.iteritems(
                pallocs.allocations):
            consumer_allocs = consumer_resources['resources']
            if any(rc in NOVA_RCS
                   for rc in consumer_allocs):
                # We reset the consumer type for each allocation
                consumer_type = None
                # This is an allocation for Nova resources
                # We need to guess whether the instance was deleted
                # or if the instance is currently migrating
                if not (consumer_uuid in inst_uuids or
                        consumer_uuid in mig_uuids):
                    # By default we suspect the orphaned allocation was for a
                    # migration...
                    consumer_type = 'migration'
                    if not (consumer_uuid in inst_uuids):
                        # ... but if we can't find it either for an instance,
                        # that means it was for this.
                        consumer_type = 'instance'
                if consumer_type is not None:
                    output(_('Allocations were set against consumer UUID '
                             '%(consumer_uuid)s but no existing instances or '
                             'active migrations are related. ')
                           % {'consumer_uuid': consumer_uuid})
                    if delete:
                        deleted = self._delete_allocations_from_consumer(
                            ctxt, placement, provider, consumer_uuid,
                            consumer_type)
                        if not deleted:
                            print(_('Not able to delete allocations '
                                    'for consumer UUID %s')
                                  % consumer_uuid)
                            faults += 1
                            continue
                        output(_('Deleted allocations for consumer UUID '
                                 '%(consumer_uuid)s on Resource Provider '
                                 '%(rp)s: %(allocations)s')
                               % {'consumer_uuid': consumer_uuid,
                                  'rp': provider['uuid'],
                                  'allocations': consumer_allocs})
                    else:
                        output(_('Allocations for consumer UUID '
                                 '%(consumer_uuid)s on Resource Provider '
                                 '%(rp)s can be deleted: '
                                 '%(allocations)s')
                               % {'consumer_uuid': consumer_uuid,
                                  'rp': provider['uuid'],
                                  'allocations': consumer_allocs})
                    num_processed += 1
        return (num_processed, faults)

    # TODO(sbauza): Move this to the scheduler report client?
    def _get_resource_provider(self, context, placement, uuid):
        """Returns a single Resource Provider by its UUID.

        :param context: The nova.context.RequestContext auth context
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param uuid: A specific Resource Provider UUID
        :return: the existing resource provider.
        :raises: keystoneauth1.exceptions.base.ClientException on failure to
            communicate with the placement API
        """

        resource_providers = self._get_resource_providers(context, placement,
                                                          uuid=uuid)
        if not resource_providers:
            # The endpoint never returns a 404; it rather returns an empty
            # list.
            raise exception.ResourceProviderNotFound(name_or_uuid=uuid)
        return resource_providers[0]

    def _get_resource_providers(self, context, placement, **kwargs):
        """Returns all resource providers regardless of their relationships.

        :param context: The nova.context.RequestContext auth context
        :param placement: nova.scheduler.client.report.SchedulerReportClient
            to communicate with the Placement service API.
        :param kwargs: extra attributes for the query string
        :return: list of resource providers.
        :raises: keystoneauth1.exceptions.base.ClientException on failure to
            communicate with the placement API
        """
        url = '/resource_providers'
        if 'uuid' in kwargs:
            url += '?uuid=%s' % kwargs['uuid']

        resp = placement.get(url, global_request_id=context.global_id,
                             version='1.14')
        if resp is None:
            raise exception.PlacementAPIConnectFailure()

        data = resp.json()
        resource_providers = data.get('resource_providers')

        return resource_providers

    @action_description(
        _("Audits orphaned allocations that are no longer corresponding to "
          "existing instance resources. This command requires that "
          "the [api_database]/connection and [placement] configuration "
          "options are set."))
    @args('--verbose', action='store_true', dest='verbose', default=False,
          help='Provide verbose output during execution.')
    @args('--resource_provider', metavar='<provider_uuid>',
          dest='provider_uuid',
          help='UUID of a specific resource provider to verify.')
    @args('--delete', action='store_true', dest='delete', default=False,
          help='Deletes orphaned allocations that were found.')
    def audit(self, verbose=False, provider_uuid=None, delete=False):
        """Provides information about orphaned allocations that can be removed

        Return codes:

        * 0: Command completed successfully and no orphaned allocations exist.
        * 1: An unexpected error happened during run.
        * 3: Orphaned allocations were detected.
        * 4: Orphaned allocations were detected and deleted.
        * 127: Invalid input.
        """

        ctxt = context.get_admin_context()
        output = lambda msg: None
        if verbose:
            output = lambda msg: print(msg)

        placement = report.SchedulerReportClient()
        # Resets two in-memory dicts for knowing instances per compute node
        self.cn_uuid_mapping = collections.defaultdict(tuple)
        self.instances_mapping = collections.defaultdict(list)

        num_processed = 0
        faults = 0

        if provider_uuid:
            try:
                resource_provider = self._get_resource_provider(
                    ctxt, placement, provider_uuid)
            except exception.ResourceProviderNotFound:
                print(_('Resource provider with UUID %s does not exist.') %
                      provider_uuid)
                return 127
            resource_providers = [resource_provider]
        else:
            resource_providers = self._get_resource_providers(ctxt, placement)

        for provider in resource_providers:
            (nb_p, faults) = self._check_orphaned_allocations_for_provider(
                ctxt, placement, output, provider, delete)
            num_processed += nb_p
            if faults > 0:
                print(_('The Resource Provider %s had problems when '
                        'deleting allocations. Stopping now. Please fix the '
                        'problem by hand and run again.') %
                      provider['uuid'])
                return 1
        if num_processed > 0:
            suffix = 's.' if num_processed > 1 else '.'
            output(_('Processed %(num)s allocation%(suffix)s')
                   % {'num': num_processed,
                      'suffix': suffix})
            return 4 if delete else 3
        return 0


CATEGORIES = {
    'api_db': ApiDbCommands,
@@ -1393,6 +1393,232 @@ class TestNovaManagePlacementSyncAggregates(
                '%s should be in two provider aggregates' % host)


class TestNovaManagePlacementAudit(
        integrated_helpers.ProviderUsageBaseTestCase):
    """Functional tests for nova-manage placement audit"""

    # Let's just use a simple fake driver
    compute_driver = 'fake.SmallFakeDriver'

    def setUp(self):
        super(TestNovaManagePlacementAudit, self).setUp()
        self.cli = manage.PlacementCommands()
        # Make sure we have two computes for migrations
        self.compute1 = self._start_compute('host1')
        self.compute2 = self._start_compute('host2')

        # Make sure we have two hypervisors reported in the API.
        hypervisors = self.admin_api.api_get(
            '/os-hypervisors').body['hypervisors']
        self.assertEqual(2, len(hypervisors))

        self.output = StringIO()
        self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output))

        self.flavor = self.api.get_flavors()[0]

    def _delete_instance_but_keep_its_allocations(self, server):
        """Mocks out the call to Placement for deleting the allocations but
        still performs the instance deletion.
        """

        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.delete_server(server['id'])
            self._wait_until_deleted(server)

    def test_audit_orphaned_allocation_from_instance_delete(self):
        """Creates a server and deletes it by retaining its allocations so the
        audit command can find it.
        """
        target_hostname = self.compute1.host
        rp_uuid = self._get_provider_uuid_by_host(target_hostname)

        server = self._boot_and_check_allocations(self.flavor,
                                                  target_hostname)

        # let's mock the allocation delete call to placement
        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.delete_server(server['id'])
            self._wait_until_deleted(server)

        # make sure the allocation is still around
        self.assertFlavorMatchesUsage(rp_uuid, self.flavor)

        # Don't ask to delete the orphaned allocations, just audit them
        ret = self.cli.audit(verbose=True)
        # The allocation should still exist
        self.assertFlavorMatchesUsage(rp_uuid, self.flavor)

        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': server['id'],
             'rp_uuid': rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now ask the audit command to delete the rogue allocations.
        ret = self.cli.audit(delete=True, verbose=True)

        # The allocations are now deleted
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %s' % server['id'], output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)

    def test_audit_orphaned_allocations_from_confirmed_resize(self):
        """Resize a server but when confirming it, leave the migration
        allocation there so the audit command can find it.
        """
        source_hostname = self.compute1.host
        dest_hostname = self.compute2.host

        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        old_flavor = self.flavor
        new_flavor = self.api.get_flavors()[1]
        # we want to make sure we resize to compute2
        self.flags(allow_resize_to_same_host=False)

        server = self._boot_and_check_allocations(self.flavor,
                                                  source_hostname)

        # Do a resize
        post = {
            'resize': {
                'flavorRef': new_flavor['id']
            }
        }
        self._move_and_check_allocations(
            server, request=post, old_flavor=old_flavor,
            new_flavor=new_flavor, source_rp_uuid=source_rp_uuid,
            dest_rp_uuid=dest_rp_uuid)

        # Retain the migration UUID record for later usage
        migration_uuid = self.get_migration_uuid_for_instance(server['id'])

        # Confirm the resize so it should in theory delete the source
        # allocations but mock out the allocation delete for the source
        post = {'confirmResize': None}
        with mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                        'delete_allocation_for_instance'):
            self.api.post_server_action(
                server['id'], post, check_response_status=[204])
            self._wait_for_state_change(server, 'ACTIVE')

        # The target host usage should be according to the new flavor...
        self.assertFlavorMatchesUsage(dest_rp_uuid, new_flavor)
        # ...but we should still see allocations for the source compute
        self.assertFlavorMatchesUsage(source_rp_uuid, old_flavor)

        # Now, run the audit command that will find this orphaned allocation
        ret = self.cli.audit(verbose=True)
        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': migration_uuid,
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now we want to delete the orphaned allocation that is duplicate
        ret = self.cli.audit(delete=True, verbose=True)

        # There should no longer be usage for the source host since the
        # allocation disappeared
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, source_rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s' %
            {'consumer_uuid': migration_uuid,
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)

    # TODO(sbauza): Mock this test once bug #1829479 is fixed
    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
        """Evacuate a server and then delete the source node so that it will
        leave a source allocation that the audit command will find.
        """

        source_hostname = self.compute1.host
        dest_hostname = self.compute2.host

        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)

        server = self._boot_and_check_allocations(self.flavor,
                                                  source_hostname)

        # Stop the service and fake it down
        self.compute1.stop()
        source_service_id = self.admin_api.get_services(
            host=source_hostname, binary='nova-compute')[0]['id']
        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})

        # evacuate the instance to the target
        post = {'evacuate': {"host": dest_hostname}}
        self.admin_api.post_server_action(server['id'], post)
        self._wait_for_server_parameter(server,
            {'OS-EXT-SRV-ATTR:host': dest_hostname,
             'status': 'ACTIVE'})

        # Now the instance is gone, we can delete the compute service
        self.admin_api.api_delete('/os-services/%s' % source_service_id)

        # Since the compute is deleted, we should have in theory a single
        # allocation against the destination resource provider, but evacuated
        # instances do not have their allocations deleted. See bug #1829479.
        # We have two allocations for the same consumer, source and
        # destination
        self._check_allocation_during_evacuate(
            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)

        # Now, run the audit command that will find this orphaned allocation
        ret = self.cli.audit(verbose=True)
        output = self.output.getvalue()
        self.assertIn(
            'Allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s can be deleted' %
            {'consumer_uuid': server['id'],
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(3, ret)

        # Now we want to delete the orphaned allocation that is duplicate
        ret = self.cli.audit(delete=True, verbose=True)

        # We finally should only have the target allocations
        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
        self.assertRequestMatchesUsage({'VCPU': 0,
                                        'MEMORY_MB': 0,
                                        'DISK_GB': 0}, source_rp_uuid)

        output = self.output.getvalue()
        self.assertIn(
            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
            'Resource Provider %(rp_uuid)s' %
            {'consumer_uuid': server['id'],
             'rp_uuid': source_rp_uuid},
            output)
        self.assertIn('Processed 1 allocation.', output)
        self.assertEqual(4, ret)


class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
    """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""
    api_major_version = 'v2.1'
@@ -34,6 +34,7 @@ from nova.db import migration
from nova.db.sqlalchemy import migration as sqla_migration
from nova import exception
from nova import objects
from nova.scheduler.client import report
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.unit import fake_requests
@@ -2851,6 +2852,142 @@ class TestNovaManagePlacement(test.NoDBTestCase):
        neutron.update_port.assert_called_once_with(
            uuidsentinel.port_id, body=expected_update_body)

    def test_audit_with_wrong_provider_uuid(self):
        with mock.patch.object(
                self.cli, '_get_resource_provider',
                side_effect=exception.ResourceProviderNotFound(
                    name_or_uuid=uuidsentinel.fake_uuid)):
            ret = self.cli.audit(
                provider_uuid=uuidsentinel.fake_uuid)
            self.assertEqual(127, ret)
        output = self.output.getvalue()
        self.assertIn(
            'Resource provider with UUID %s' % uuidsentinel.fake_uuid,
            output)

    @mock.patch.object(manage.PlacementCommands,
                       '_check_orphaned_allocations_for_provider')
    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.get')
    def _test_audit(self, get_resource_providers, check_orphaned_allocs,
                    verbose=False, delete=False, errors=False, found=False):
        rps = [
            {"generation": 1,
             "uuid": uuidsentinel.rp1,
             "links": None,
             "name": "rp1",
             "parent_provider_uuid": None,
             "root_provider_uuid": uuidsentinel.rp1},
            {"generation": 1,
             "uuid": uuidsentinel.rp2,
             "links": None,
             "name": "rp2",
             "parent_provider_uuid": None,
             "root_provider_uuid": uuidsentinel.rp2},
        ]
        get_resource_providers.return_value = fake_requests.FakeResponse(
            200, content=jsonutils.dumps({"resource_providers": rps}))

        if errors:
            # We found one orphaned allocation per RP but RP1 got a fault
            check_orphaned_allocs.side_effect = ((1, 1), (1, 0))
        elif found:
            # we found one orphaned allocation per RP and we had no faults
            check_orphaned_allocs.side_effect = ((1, 0), (1, 0))
        else:
            # No orphaned allocations are found for all the RPs
            check_orphaned_allocs.side_effect = ((0, 0), (0, 0))

        ret = self.cli.audit(verbose=verbose, delete=delete)
        if errors:
            # Any fault stops the audit and provides a return code equal to 1
            expected_ret = 1
        elif found and delete:
            # We found orphaned allocations and deleted them
            expected_ret = 4
        elif found and not delete:
            # We found orphaned allocations but we left them
            expected_ret = 3
        else:
            # Nothing was found
            expected_ret = 0
        self.assertEqual(expected_ret, ret)

        call1 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[0], delete)
        call2 = mock.call(mock.ANY, mock.ANY, mock.ANY, rps[1], delete)
        if errors:
            # We stop checking other RPs once we got a fault
            check_orphaned_allocs.assert_has_calls([call1])
        else:
            # All the RPs are checked
            check_orphaned_allocs.assert_has_calls([call1, call2])

        if verbose and found:
            output = self.output.getvalue()
            self.assertIn('Processed 2 allocations', output)
        if errors:
            output = self.output.getvalue()
            self.assertIn(
                'The Resource Provider %s had problems' % rps[0]["uuid"],
                output)

    def test_audit_not_found_orphaned_allocs(self):
        self._test_audit(found=False)

    def test_audit_found_orphaned_allocs_not_verbose(self):
        self._test_audit(found=True)

    def test_audit_found_orphaned_allocs_verbose(self):
        self._test_audit(found=True, verbose=True)

    def test_audit_found_orphaned_allocs_and_deleted_them(self):
        self._test_audit(found=True, delete=True)

    def test_audit_found_orphaned_allocs_but_got_errors(self):
        self._test_audit(errors=True)

    @mock.patch.object(manage.PlacementCommands,
                       '_delete_allocations_from_consumer')
    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                'get_allocations_for_resource_provider')
    @mock.patch.object(manage.PlacementCommands,
                       '_get_instances_and_current_migrations')
    def test_check_orphaned_allocations_for_provider(self,
                                                     get_insts_and_migs,
                                                     get_allocs_for_rp,
                                                     delete_allocs):
        provider = {"generation": 1,
                    "uuid": uuidsentinel.rp1,
                    "links": None,
                    "name": "rp1",
                    "parent_provider_uuid": None,
                    "root_provider_uuid": uuidsentinel.rp1}
        compute_resources = {'VCPU': 1, 'MEMORY_MB': 2048, 'DISK_GB': 20}
        allocations = {
            # Some orphaned compute allocation
            uuidsentinel.orphaned_alloc1: {'resources': compute_resources},
            # Some existing instance allocation
            uuidsentinel.inst1: {'resources': compute_resources},
            # Some existing migration allocation
            uuidsentinel.mig1: {'resources': compute_resources},
            # Some other allocation not related to Nova
            uuidsentinel.other_alloc1: {'resources': {'CUSTOM_GOO'}},
        }

        get_insts_and_migs.return_value = (
            [uuidsentinel.inst1],
            [uuidsentinel.mig1])
        get_allocs_for_rp.return_value = report.ProviderAllocInfo(allocations)

        ctxt = context.RequestContext()
        placement = report.SchedulerReportClient()
        ret = self.cli._check_orphaned_allocations_for_provider(
            ctxt, placement, lambda x: x, provider, True)
        get_allocs_for_rp.assert_called_once_with(ctxt, uuidsentinel.rp1)
        delete_allocs.assert_called_once_with(ctxt, placement, provider,
                                              uuidsentinel.orphaned_alloc1,
                                              'instance')
        self.assertEqual((1, 0), ret)


class TestNovaManageMain(test.NoDBTestCase):
    """Tests the nova-manage:main() setup code."""
releasenotes/notes/placement-audit-59a00dcfb188c6ac.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
---
other:
  - |
    A new ``nova-manage placement audit`` CLI has been added to help identify
    orphaned compute allocations in the Placement API that are no longer
    related to either instances or migrations.
    Some race conditions in Nova can fail to remove allocations for
    instances or migrations once they are done, which then creates capacity
    issues. With this command, you can identify the orphaned allocations and
    ask to remove them.
    For more details on CLI usage, see the man page entry:
    https://docs.openstack.org/nova/latest/cli/nova-manage.html#placement
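As an illustration only, and not part of this change: the audit described in the release note boils down to a set difference. The consumer UUIDs that hold Nova allocations on a compute resource provider are compared against the instance and migration UUIDs known for that compute node, and any consumer that matches neither is orphaned. A standalone sketch of that idea, with hypothetical names and data, follows; the real command reads these sets from the Placement API and the cell databases instead.

# Standalone sketch of the audit idea, with hypothetical names and data;
# not part of this change. The real command reads these sets from the
# Placement API and the cell databases.
def find_orphaned_consumers(allocation_consumers, instance_uuids,
                            migration_uuids):
    """Return consumer UUIDs that no instance or migration accounts for."""
    known = set(instance_uuids) | set(migration_uuids)
    return sorted(set(allocation_consumers) - known)


if __name__ == '__main__':
    # Three consumers hold allocations, but only two map to a known instance
    # or in-progress migration: the third one is orphaned.
    orphans = find_orphaned_consumers(
        allocation_consumers=['inst-1', 'mig-1', 'orphan-1'],
        instance_uuids=['inst-1'],
        migration_uuids=['mig-1'])
    print(orphans)  # ['orphan-1']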