From 95106d2fa1ab86607231c338fa0abc0c3488f0f8 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 2 May 2018 15:44:21 -0400 Subject: [PATCH] Add nova-manage placement heal_allocations CLI This adds a new CLI which will iterate all non-cell0 cells looking for instances that (1) have a host, (2) aren't undergoing a task state transition and (3) don't have allocations in placement and try to allocate resources, based on the instance embedded flavor, against the compute node resource provider on which the instance is currently running. This is meant as a way to help migrate CachingScheduler users off the CachingScheduler by first shoring up instance allocations in placement for any instances created after Pike, when the nova-compute resource tracker code stopped creating allocations in placement since the FilterScheduler does it at the time of scheduling (but the CachingScheduler doesn't). This will be useful beyond just getting deployments off the CachingScheduler, however, since operators will be able to use it to fix incorrect allocations resulting from failed operations. There are several TODOs and NOTEs inline about things we could build on top of this or improve, but for now this is the basic idea. Change-Id: Iab67fd56ab4845f8ee19ca36e7353730638efb21 --- doc/source/cli/nova-manage.rst | 32 +++ nova/cmd/manage.py | 243 ++++++++++++++++++ nova/exception.py | 5 + nova/test.py | 3 +- nova/tests/functional/test_nova_manage.py | 223 ++++++++++++++++ nova/tests/functional/test_servers.py | 6 +- nova/tests/unit/test_nova_manage.py | 91 +++++++ ...ent-heal-allocations-13a9a0a3df910e0b.yaml | 18 ++ 8 files changed, 618 insertions(+), 3 deletions(-) create mode 100644 releasenotes/notes/nova-manage-placement-heal-allocations-13a9a0a3df910e0b.yaml diff --git a/doc/source/cli/nova-manage.rst b/doc/source/cli/nova-manage.rst index 5ea10d3b05bf..f0640296b138 100644 --- a/doc/source/cli/nova-manage.rst +++ b/doc/source/cli/nova-manage.rst @@ -276,6 +276,38 @@ Nova Cells v2 found, 3 if a host with that name is not in a cell with that uuid, 4 if a host with that name has instances (host not empty). + +Placement +~~~~~~~~~ + +``nova-manage placement heal_allocations [--max-count ] [--verbose]`` + Iterates over non-cell0 cells looking for instances which do not have + allocations in the Placement service and which are not undergoing a task + state transition. For each instance found, allocations are created against + the compute node resource provider for that instance based on the flavor + associated with the instance. + + Specify ``--max-count`` to control the maximum number of instances to + process. If not specified, all instances in each cell will be mapped in + batches of 50. If you have a large number of instances, consider + specifying a custom value and run the command until it exits with 0 or 4. + + Specify ``--verbose`` to get detailed progress output during execution. + + This command requires that the ``[api_database]/connection`` and + ``[placement]`` configuration options are set. + + Return codes: + + * 0: Command completed successfully and allocations were created. + * 1: --max-count was reached and there are more instances to process. + * 2: Unable to find a compute node record for a given instance. + * 3: Unable to create allocations for an instance against its + compute node resource provider. + * 4: Command completed successfully but no allocations were created. + * 127: Invalid input. 
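
The return codes above are what make unattended batching practical. As a
minimal sketch, assuming nova is installed and its configuration (including
``[api_database]/connection`` and ``[placement]``) has already been loaded the
way nova-manage loads it, the command can be driven to completion from Python
through the same entry point the functional tests in this patch use; the
``heal_all_allocations`` helper and the batch size of 50 below are
illustrative, not part of the patch:

    from nova.cmd import manage

    def heal_all_allocations(batch_size=50):
        """Call heal_allocations in batches until no instances remain."""
        cli = manage.PlacementCommands()
        while True:
            rc = cli.heal_allocations(max_count=batch_size, verbose=True)
            if rc == 1:
                # --max-count was reached and more instances remain.
                continue
            # 0 or 4 means everything is healed; 2, 3 or 127 means stop
            # and investigate the printed error.
            return rc
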
+ + See Also ======== diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 29c1bf292202..812fddedd5d3 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -63,6 +63,8 @@ from nova.objects import quotas as quotas_obj from nova.objects import request_spec from nova import quota from nova import rpc +from nova.scheduler.client import report +from nova.scheduler import utils as scheduler_utils from nova import utils from nova import version from nova.virt import ironic @@ -1707,6 +1709,246 @@ class CellV2Commands(object): return 0 +class PlacementCommands(object): + """Commands for managing placement resources.""" + + @staticmethod + def _get_compute_node_uuid(ctxt, instance, node_cache): + """Find the ComputeNode.uuid for the given Instance + + :param ctxt: cell-targeted nova.context.RequestContext + :param instance: the instance to lookup a compute node + :param node_cache: dict of Instance.node keys to ComputeNode.uuid + values; this cache is updated if a new node is processed. + :returns: ComputeNode.uuid for the given instance + :raises: nova.exception.ComputeHostNotFound + """ + if instance.node in node_cache: + return node_cache[instance.node] + + compute_node = objects.ComputeNode.get_by_host_and_nodename( + ctxt, instance.host, instance.node) + node_uuid = compute_node.uuid + node_cache[instance.node] = node_uuid + return node_uuid + + def _heal_instances_in_cell(self, ctxt, max_count, unlimited, output, + placement): + """Checks for instances to heal in a given cell. + + :param ctxt: cell-targeted nova.context.RequestContext + :param max_count: batch size (limit per instance query) + :param unlimited: True if all instances in the cell should be + processed, else False to just process $max_count instances + :param outout: function that takes a single message for verbose output + :param placement: nova.scheduler.client.report.SchedulerReportClient + to communicate with the Placement service API. + :return: Number of instances that had allocations created. + :raises: nova.exception.ComputeHostNotFound if a compute node for a + given instance cannot be found + :raises: AllocationCreateFailed if unable to create allocations for + a given instance against a given compute node resource provider + """ + # Keep a cache of instance.node to compute node resource provider UUID. + # This will save some queries for non-ironic instances to the + # compute_nodes table. + node_cache = {} + # Track the total number of instances that have allocations created + # for them in this cell. We return when num_processed equals max_count + # and unlimited=True or we exhaust the number of instances to process + # in this cell. + num_processed = 0 + # Get all instances from this cell which have a host and are not + # undergoing a task state transition. Go from oldest to newest. + # NOTE(mriedem): Unfortunately we don't have a marker to use + # between runs where the user is specifying --max-count. + # TODO(mriedem): Store a marker in system_metadata so we can + # automatically pick up where we left off without the user having + # to pass it in (if unlimited is False). + instances = objects.InstanceList.get_by_filters( + ctxt, filters={}, sort_key='created_at', sort_dir='asc', + limit=max_count, expected_attrs=['flavor']) + while instances: + output(_('Found %s candidate instances.') % len(instances)) + # For each instance in this list, we need to see if it has + # allocations in placement and if so, assume it's correct and + # continue. 
+ for instance in instances: + if instance.task_state is not None: + output(_('Instance %(instance)s is undergoing a task ' + 'state transition: %(task_state)s') % + {'instance': instance.uuid, + 'task_state': instance.task_state}) + continue + + if instance.node is None: + output(_('Instance %s is not on a host.') % instance.uuid) + continue + + allocations = placement.get_allocations_for_consumer( + ctxt, instance.uuid) + if allocations: + output(_('Instance %s already has allocations.') % + instance.uuid) + # TODO(mriedem): Check to see if the allocation project_id + # and user_id matches the instance project and user and + # fix the allocation project/user if they don't match; see + # blueprint add-consumer-generation for details. + continue + + # This instance doesn't have allocations so we need to find + # its compute node resource provider. + node_uuid = self._get_compute_node_uuid( + ctxt, instance, node_cache) + + # Now get the resource allocations for the instance based + # on its embedded flavor. + resources = scheduler_utils.resources_from_flavor( + instance, instance.flavor) + if placement.put_allocations( + ctxt, node_uuid, instance.uuid, resources, + instance.project_id, instance.user_id): + num_processed += 1 + output(_('Successfully created allocations for ' + 'instance %(instance)s against resource ' + 'provider %(provider)s.') % + {'instance': instance.uuid, 'provider': node_uuid}) + else: + raise exception.AllocationCreateFailed( + instance=instance.uuid, provider=node_uuid) + + # Make sure we don't go over the max count. Note that we + # don't include instances that already have allocations in the + # max_count number, only the number of instances that have + # successfully created allocations. + if not unlimited and num_processed == max_count: + return num_processed + + # Use a marker to get the next page of instances in this cell. + # Note that InstanceList doesn't support slice notation. + marker = instances[len(instances) - 1].uuid + instances = objects.InstanceList.get_by_filters( + ctxt, filters={}, sort_key='created_at', sort_dir='asc', + limit=max_count, marker=marker, expected_attrs=['flavor']) + + return num_processed + + @action_description( + _("Iterates over non-cell0 cells looking for instances which do " + "not have allocations in the Placement service and which are not " + "undergoing a task state transition. For each instance found, " + "allocations are created against the compute node resource provider " + "for that instance based on the flavor associated with the " + "instance. This command requires that the [api_database]/connection " + "and [placement] configuration options are set.")) + @args('--max-count', metavar='', dest='max_count', + help='Maximum number of instances to process. If not specified, all ' + 'instances in each cell will be mapped in batches of 50. ' + 'If you have a large number of instances, consider specifying ' + 'a custom value and run the command until it exits with ' + '0 or 4.') + @args('--verbose', action='store_true', dest='verbose', default=False, + help='Provide verbose output during execution.') + def heal_allocations(self, max_count=None, verbose=False): + """Heals instance allocations in the Placement service + + Return codes: + + * 0: Command completed successfully and allocations were created. + * 1: --max-count was reached and there are more instances to process. + * 2: Unable to find a compute node record for a given instance. 
+ * 3: Unable to create allocations for an instance against its + compute node resource provider. + * 4: Command completed successfully but no allocations were created. + * 127: Invalid input. + """ + # NOTE(mriedem): Thoughts on ways to expand this: + # - add a --dry-run option to just print which instances would have + # allocations created for them + # - allow passing a specific cell to heal + # - allow filtering on enabled/disabled cells + # - allow passing a specific instance to heal + # - add a force option to force allocations for instances which have + # task_state is not None (would get complicated during a migration); + # for example, this could cleanup ironic instances that have + # allocations on VCPU/MEMORY_MB/DISK_GB but are now using a custom + # resource class + # - add an option to overwrite allocations for instances which already + # have allocations (but the operator thinks might be wrong?); this + # would probably only be safe with a specific instance. + # - deal with nested resource providers? + + output = lambda msg: None + if verbose: + output = lambda msg: print(msg) + + # TODO(mriedem): Rather than --max-count being both a total and batch + # count, should we have separate options to be specific, i.e. --total + # and --batch-size? Then --batch-size defaults to 50 and --total + # defaults to None to mean unlimited. + if max_count is not None: + try: + max_count = int(max_count) + except ValueError: + max_count = -1 + unlimited = False + if max_count < 1: + print(_('Must supply a positive integer for --max-count.')) + return 127 + else: + max_count = 50 + unlimited = True + output(_('Running batches of %i until complete') % max_count) + + ctxt = context.get_admin_context() + cells = objects.CellMappingList.get_all(ctxt) + if not cells: + output(_('No cells to process.')) + return 4 + + placement = report.SchedulerReportClient() + num_processed = 0 + # TODO(mriedem): Use context.scatter_gather_skip_cell0. + for cell in cells: + # Skip cell0 since that is where instances go that do not get + # scheduled and hence would not have allocations against a host. + if cell.uuid == objects.CellMapping.CELL0_UUID: + continue + output(_('Looking for instances in cell: %s') % cell.identity) + + limit_per_cell = max_count + if not unlimited: + # Adjust the limit for the next cell. For example, if the user + # only wants to process a total of 100 instances and we did + # 75 in cell1, then we only need 25 more from cell2 and so on. + limit_per_cell = max_count - num_processed + + with context.target_cell(ctxt, cell) as cctxt: + try: + num_processed += self._heal_instances_in_cell( + cctxt, limit_per_cell, unlimited, output, placement) + except exception.ComputeHostNotFound as e: + print(e.format_message()) + return 2 + except exception.AllocationCreateFailed as e: + print(e.format_message()) + return 3 + + # Make sure we don't go over the max count. Note that we + # don't include instances that already have allocations in the + # max_count number, only the number of instances that have + # successfully created allocations. + if num_processed == max_count: + output(_('Max count reached. 
Processed %s instances.') + % num_processed) + return 1 + + output(_('Processed %s instances.') % num_processed) + if not num_processed: + return 4 + return 0 + + CATEGORIES = { 'api_db': ApiDbCommands, 'cell': CellCommands, @@ -1714,6 +1956,7 @@ CATEGORIES = { 'db': DbCommands, 'floating': FloatingIpCommands, 'network': NetworkCommands, + 'placement': PlacementCommands } diff --git a/nova/exception.py b/nova/exception.py index b6365b36bdbb..d9964c2331d7 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2274,3 +2274,8 @@ class DeviceDeletionException(NovaException): class OptRequiredIfOtherOptValue(NovaException): msg_fmt = _("The %(then_opt)s option is required if %(if_opt)s is " "specified as '%(if_value)s'.") + + +class AllocationCreateFailed(NovaException): + msg_fmt = _('Failed to create allocations for instance %(instance)s ' + 'against resource provider %(provider)s.') diff --git a/nova/test.py b/nova/test.py index a22025f4889f..7eb9ea0b1989 100644 --- a/nova/test.py +++ b/nova/test.py @@ -413,7 +413,8 @@ class TestCase(testtools.TestCase): # otherwise we'll fail to update the scheduler while running # the compute node startup routines below. ctxt = context.get_context() - cell = self.cell_mappings[kwargs.pop('cell', CELL1_NAME)] + cell_name = kwargs.pop('cell', CELL1_NAME) or CELL1_NAME + cell = self.cell_mappings[cell_name] hm = objects.HostMapping(context=ctxt, host=host or name, cell_mapping=cell) diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py index 8544d1db6aa2..304c78735e4e 100644 --- a/nova/tests/functional/test_nova_manage.py +++ b/nova/tests/functional/test_nova_manage.py @@ -10,10 +10,14 @@ # License for the specific language governing permissions and limitations # under the License. +import fixtures +from six.moves import StringIO + from nova.cmd import manage from nova import context from nova import objects from nova import test +from nova.tests.functional import test_servers class NovaManageDBIronicTest(test.TestCase): @@ -348,3 +352,222 @@ class NovaManageCellV2Test(test.TestCase): cns = objects.ComputeNodeList.get_all(self.context) self.assertEqual(1, len(cns)) self.assertEqual(0, cns[0].mapped) + + +class TestNovaManagePlacementHealAllocations( + test_servers.ProviderUsageBaseTestCase): + """Functional tests for nova-manage placement heal_allocations""" + + # This is required by the parent class. + compute_driver = 'fake.SmallFakeDriver' + # We want to test iterating across multiple cells. + NUMBER_OF_CELLS = 2 + + def setUp(self): + # Since the CachingScheduler does not use Placement, we want to use + # the CachingScheduler to create instances and then we can heal their + # allocations via the CLI. + self.flags(driver='caching_scheduler', group='scheduler') + super(TestNovaManagePlacementHealAllocations, self).setUp() + self.cli = manage.PlacementCommands() + # We need to start a compute in each non-cell0 cell. + for cell_name, cell_mapping in self.cell_mappings.items(): + if cell_mapping.uuid == objects.CellMapping.CELL0_UUID: + continue + self._start_compute(cell_name, cell_name=cell_name) + # Make sure we have two hypervisors reported in the API. 
+ hypervisors = self.admin_api.api_get( + '/os-hypervisors').body['hypervisors'] + self.assertEqual(2, len(hypervisors)) + self.flavor = self.api.get_flavors()[0] + self.output = StringIO() + self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output)) + + def _boot_and_assert_no_allocations(self, flavor, hostname): + """Creates a server on the given host and asserts neither have usage + + :param flavor: the flavor used to create the server + :param hostname: the host on which to create the server + :returns: two-item tuple of the server and the compute node resource + provider uuid + """ + server_req = self._build_minimal_create_server_request( + self.api, 'some-server', flavor_id=flavor['id'], + image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6', + networks=[]) + server_req['availability_zone'] = 'nova:%s' % hostname + created_server = self.api.post_server({'server': server_req}) + server = self._wait_for_state_change( + self.admin_api, created_server, 'ACTIVE') + + # Verify that our source host is what the server ended up on + self.assertEqual(hostname, server['OS-EXT-SRV-ATTR:host']) + + # Check that the compute node resource provider has no allocations. + rp_uuid = self._get_provider_uuid_by_host(hostname) + provider_usages = self._get_provider_usages(rp_uuid) + for resource_class, usage in provider_usages.items(): + self.assertEqual( + 0, usage, + 'Compute node resource provider %s should not have %s ' + 'usage when using the CachingScheduler.' % + (hostname, resource_class)) + + # Check that the server has no allocations. + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual({}, allocations, + 'Server should not have allocations when using ' + 'the CachingScheduler.') + return server, rp_uuid + + def _assert_healed(self, server, rp_uuid): + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(rp_uuid, allocations, + 'Allocations not found for server %s and compute node ' + 'resource provider. %s\nOutput:%s' % + (server['id'], rp_uuid, self.output.getvalue())) + self.assertFlavorMatchesAllocation( + self.flavor, allocations[rp_uuid]['resources']) + + def test_heal_allocations_paging(self): + """This test runs the following scenario: + + * Schedule server1 to cell1 and assert it doesn't have allocations. + * Schedule server2 to cell2 and assert it doesn't have allocations. + * Run "nova-manage placement heal_allocations --max-count 1" to make + sure we stop with just one instance and the return code is 1. + * Run "nova-manage placement heal_allocations" and assert both + both instances now have allocations against their respective compute + node resource providers. + """ + server1, rp_uuid1 = self._boot_and_assert_no_allocations( + self.flavor, 'cell1') + server2, rp_uuid2 = self._boot_and_assert_no_allocations( + self.flavor, 'cell2') + + # heal server1 and server2 in separate calls + for x in range(2): + result = self.cli.heal_allocations(max_count=1, verbose=True) + self.assertEqual(1, result, self.output.getvalue()) + output = self.output.getvalue() + self.assertIn('Max count reached. Processed 1 instances.', output) + # If this is the 2nd call, we'll have skipped the first instance. 
+ if x == 0: + self.assertNotIn('already has allocations', output) + else: + self.assertIn('already has allocations', output) + + self._assert_healed(server1, rp_uuid1) + self._assert_healed(server2, rp_uuid2) + + # run it again to make sure nothing was processed + result = self.cli.heal_allocations(verbose=True) + self.assertEqual(4, result, self.output.getvalue()) + self.assertIn('already has allocations', self.output.getvalue()) + + def test_heal_allocations_paging_max_count_more_than_num_instances(self): + """Sets up 2 instances in cell1 and 1 instance in cell2. Then specify + --max-count=10, processes 3 instances, rc is 0 + """ + servers = [] # This is really a list of 2-item tuples. + for x in range(2): + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell1')) + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell2')) + result = self.cli.heal_allocations(max_count=10, verbose=True) + self.assertEqual(0, result, self.output.getvalue()) + self.assertIn('Processed 3 instances.', self.output.getvalue()) + for server, rp_uuid in servers: + self._assert_healed(server, rp_uuid) + + def test_heal_allocations_paging_more_instances_remain(self): + """Tests that there is one instance in cell1 and two instances in + cell2, with a --max-count=2. This tests that we stop in cell2 once + max_count is reached. + """ + servers = [] # This is really a list of 2-item tuples. + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell1')) + for x in range(2): + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell2')) + result = self.cli.heal_allocations(max_count=2, verbose=True) + self.assertEqual(1, result, self.output.getvalue()) + self.assertIn('Max count reached. Processed 2 instances.', + self.output.getvalue()) + # Assert that allocations were healed on the instances we expect. Order + # works here because cell mappings are retrieved by id in ascending + # order so oldest to newest, and instances are also retrieved from each + # cell by created_at in ascending order, which matches the order we put + # created servers in our list. + for x in range(2): + self._assert_healed(*servers[x]) + # And assert the remaining instance does not have allocations. + allocations = self._get_allocations_by_server_uuid( + servers[2][0]['id']) + self.assertEqual({}, allocations) + + def test_heal_allocations_unlimited(self): + """Sets up 2 instances in cell1 and 1 instance in cell2. Then + don't specify --max-count, processes 3 instances, rc is 0. + """ + servers = [] # This is really a list of 2-item tuples. + for x in range(2): + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell1')) + servers.append( + self._boot_and_assert_no_allocations(self.flavor, 'cell2')) + result = self.cli.heal_allocations(verbose=True) + self.assertEqual(0, result, self.output.getvalue()) + self.assertIn('Processed 3 instances.', self.output.getvalue()) + for server, rp_uuid in servers: + self._assert_healed(server, rp_uuid) + + def test_heal_allocations_shelved(self): + """Tests the scenario that an instance with no allocations is shelved + so heal_allocations skips it (since the instance is not on a host). 
+ """ + server, rp_uuid = self._boot_and_assert_no_allocations( + self.flavor, 'cell1') + self.api.post_server_action(server['id'], {'shelve': None}) + # The server status goes to SHELVED_OFFLOADED before the host/node + # is nulled out in the compute service, so we also have to wait for + # that so we don't race when we run heal_allocations. + server = self._wait_for_server_parameter( + self.admin_api, server, + {'OS-EXT-SRV-ATTR:host': None, 'status': 'SHELVED_OFFLOADED'}) + result = self.cli.heal_allocations(verbose=True) + self.assertEqual(4, result, self.output.getvalue()) + self.assertIn('Instance %s is not on a host.' % server['id'], + self.output.getvalue()) + # Check that the server has no allocations. + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual({}, allocations, + 'Shelved-offloaded server should not have ' + 'allocations.') + + def test_heal_allocations_task_in_progress(self): + """Tests the case that heal_allocations skips over an instance which + is undergoing a task state transition (in this case pausing). + """ + server, rp_uuid = self._boot_and_assert_no_allocations( + self.flavor, 'cell1') + + def fake_pause_instance(_self, ctxt, instance, *a, **kw): + self.assertEqual('pausing', instance.task_state) + # We have to stub out pause_instance so that the instance is stuck with + # task_state != None. + self.stub_out('nova.compute.manager.ComputeManager.pause_instance', + fake_pause_instance) + self.api.post_server_action(server['id'], {'pause': None}) + result = self.cli.heal_allocations(verbose=True) + self.assertEqual(4, result, self.output.getvalue()) + # Check that the server has no allocations. + allocations = self._get_allocations_by_server_uuid(server['id']) + self.assertEqual({}, allocations, + 'Server undergoing task state transition should ' + 'not have allocations.') + # Assert something was logged for this instance when it was skipped. + self.assertIn('Instance %s is undergoing a task state transition: ' + 'pausing' % server['id'], self.output.getvalue()) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index a9a7887fce93..2aaf6e5bb279 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -1416,16 +1416,18 @@ class ProviderUsageBaseTestCase(test.TestCase, self.computes = {} - def _start_compute(self, host): + def _start_compute(self, host, cell_name=None): """Start a nova compute service on the given host :param host: the name of the host that will be associated to the compute service. + :param cell_name: optional name of the cell in which to start the + compute service (defaults to cell1) :return: the nova compute service object """ fake.set_nodes([host]) self.addCleanup(fake.restore_nodes) - compute = self.start_service('compute', host=host) + compute = self.start_service('compute', host=host, cell=cell_name) self.computes[host] = compute return compute diff --git a/nova/tests/unit/test_nova_manage.py b/nova/tests/unit/test_nova_manage.py index a4ce36983f10..91be05dc8133 100644 --- a/nova/tests/unit/test_nova_manage.py +++ b/nova/tests/unit/test_nova_manage.py @@ -2395,6 +2395,97 @@ class CellV2CommandsTestCase(test.NoDBTestCase): node.save.assert_called_once_with() +@ddt.ddt +class TestNovaManagePlacement(test.NoDBTestCase): + """Unit tests for the nova-manage placement commands. + + Tests in this class should be simple and can rely on mock, so they + are usually restricted to negative or side-effect type tests. 
+ + For more involved functional scenarios, use + nova.tests.functional.test_nova_manage. + """ + def setUp(self): + super(TestNovaManagePlacement, self).setUp() + self.output = StringIO() + self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output)) + self.cli = manage.PlacementCommands() + + @ddt.data(-1, 0, "one") + def test_heal_allocations_invalid_max_count(self, max_count): + self.assertEqual(127, self.cli.heal_allocations(max_count=max_count)) + + @mock.patch('nova.objects.CellMappingList.get_all', + return_value=objects.CellMappingList()) + def test_heal_allocations_no_cells(self, mock_get_all_cells): + self.assertEqual(4, self.cli.heal_allocations(verbose=True)) + self.assertIn('No cells to process', self.output.getvalue()) + + @mock.patch('nova.objects.CellMappingList.get_all', + return_value=objects.CellMappingList(objects=[ + objects.CellMapping(name='cell1', + uuid=uuidsentinel.cell1)])) + @mock.patch('nova.objects.InstanceList.get_by_filters', + return_value=objects.InstanceList()) + def test_heal_allocations_no_instances( + self, mock_get_instances, mock_get_all_cells): + self.assertEqual(4, self.cli.heal_allocations(verbose=True)) + self.assertIn('Processed 0 instances.', self.output.getvalue()) + + @mock.patch('nova.objects.CellMappingList.get_all', + return_value=objects.CellMappingList(objects=[ + objects.CellMapping(name='cell1', + uuid=uuidsentinel.cell1)])) + @mock.patch('nova.objects.InstanceList.get_by_filters', + return_value=objects.InstanceList(objects=[ + objects.Instance( + uuid=uuidsentinel.instance, host='fake', node='fake', + task_state=None)])) + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocations_for_consumer', return_value={}) + @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename', + side_effect=exception.ComputeHostNotFound(host='fake')) + def test_heal_allocations_compute_host_not_found( + self, mock_get_compute_node, mock_get_allocs, mock_get_instances, + mock_get_all_cells): + self.assertEqual(2, self.cli.heal_allocations()) + self.assertIn('Compute host fake could not be found.', + self.output.getvalue()) + + @mock.patch('nova.objects.CellMappingList.get_all', + return_value=objects.CellMappingList(objects=[ + objects.CellMapping(name='cell1', + uuid=uuidsentinel.cell1)])) + @mock.patch('nova.objects.InstanceList.get_by_filters', + return_value=objects.InstanceList(objects=[ + objects.Instance( + uuid=uuidsentinel.instance, host='fake', node='fake', + task_state=None, flavor=objects.Flavor(), + project_id='fake-project', user_id='fake-user')])) + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocations_for_consumer', return_value={}) + @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename', + return_value=objects.ComputeNode(uuid=uuidsentinel.node)) + @mock.patch('nova.scheduler.utils.resources_from_flavor', + return_value=mock.sentinel.resources) + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
+ 'put_allocations', return_value=False) + def test_heal_allocations_put_allocations_fails( + self, mock_put_allocations, mock_res_from_flavor, + mock_get_compute_node, mock_get_allocs, mock_get_instances, + mock_get_all_cells): + self.assertEqual(3, self.cli.heal_allocations()) + self.assertIn('Failed to create allocations for instance', + self.output.getvalue()) + instance = mock_get_instances.return_value[0] + mock_res_from_flavor.assert_called_once_with( + instance, instance.flavor) + mock_put_allocations.assert_called_once_with( + test.MatchType(context.RequestContext), uuidsentinel.node, + uuidsentinel.instance, mock.sentinel.resources, 'fake-project', + 'fake-user') + + class TestNovaManageMain(test.NoDBTestCase): """Tests the nova-manage:main() setup code.""" diff --git a/releasenotes/notes/nova-manage-placement-heal-allocations-13a9a0a3df910e0b.yaml b/releasenotes/notes/nova-manage-placement-heal-allocations-13a9a0a3df910e0b.yaml new file mode 100644 index 000000000000..88144592e658 --- /dev/null +++ b/releasenotes/notes/nova-manage-placement-heal-allocations-13a9a0a3df910e0b.yaml @@ -0,0 +1,18 @@ +--- +other: + - | + A new ``nova-manage placement heal_allocations`` CLI has been added to + help migrate users from the deprecated CachingScheduler. Starting in + 16.0.0 (Pike), the nova-compute service no longer reports instance + allocations to the Placement service because the FilterScheduler does + that as part of scheduling. However, the CachingScheduler does not create + the allocations in the Placement service, so any instances created using + the CachingScheduler after Ocata will not have allocations in Placement. + The new CLI allows operators using the CachingScheduler to find all + instances in all cells which do not have allocations in Placement and + create those allocations. The CLI will skip any instances that are + undergoing a task state transition, so ideally this would be run when + the API is down but it can be run, if necessary, while the API is up. + For more details on CLI usage, see the man page entry: + + https://docs.openstack.org/nova/latest/cli/nova-manage.html#placement
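
To make the release note concrete, the sketch below shows the approximate
shape of the data the functional tests above assert for a healed server: the
consumer's allocations keyed by the compute node resource provider UUID, with
amounts derived from the instance's embedded flavor. The
``expected_allocations`` helper is hypothetical and deliberately simplified;
the real ``resources_from_flavor`` helper also folds ephemeral disk and swap
into ``DISK_GB`` and honors resource class overrides in flavor extra specs.

    def expected_allocations(rp_uuid, flavor):
        # Approximate shape checked by _assert_healed(): one entry per
        # resource provider, with standard resource classes taken from
        # the flavor (API representation: vcpus/ram/disk).
        return {
            rp_uuid: {
                'resources': {
                    'VCPU': flavor['vcpus'],
                    'MEMORY_MB': flavor['ram'],
                    'DISK_GB': flavor['disk'],
                },
            },
        }
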