Merge "nova-status: check for compute resource providers"

This commit is contained in:
Jenkins 2017-01-05 18:57:57 +00:00 committed by Gerrit Code Review
commit 966446553b
2 changed files with 372 additions and 39 deletions

View File

@ -35,13 +35,17 @@ from sqlalchemy import MetaData, Table, select
from nova.cmd import common as cmd_common
import nova.conf
from nova import config
from nova import context as nova_context
from nova.db.sqlalchemy import api as db_session
from nova.i18n import _
from nova.objects import cell_mapping as cell_mapping_obj
from nova.objects import fields
from nova import version
CONF = nova.conf.CONF
PLACEMENT_DOCS_LINK = 'http://docs.openstack.org/developer/nova/placement.html'
class UpgradeCheckCode(enum.IntEnum):
"""These are the status codes for the nova-status upgrade check command
@ -92,9 +96,24 @@ class UpgradeCommands(object):
schema migrations.
"""
def _count_compute_nodes(self, context=None):
    """Returns the number of compute nodes in the cell database.

    :param context: The request context which, if provided, targets the
        appropriate cell database engine; defaults to the currently
        configured main database.
    :returns: Count of records in the compute_nodes table.
    """
    # NOTE(mriedem): This does not filter based on the service status
    # because a disabled nova-compute service could still be reporting
    # inventory info to the placement service. There could be an outside
    # chance that there are compute node records in the database for
    # disabled nova-compute services that aren't yet upgraded to Ocata or
    # the nova-compute service was deleted and the service isn't actually
    # running on the compute host but the operator hasn't cleaned up the
    # compute_nodes entry in the database yet. We consider those edge cases
    # here and the worst case scenario is we give a warning that there are
    # more compute nodes than resource providers. We can tighten this up
    # later if needed, for example by not including compute nodes that
    # don't have a corresponding nova-compute service in the services
    # table, or by only counting compute nodes with a service version of at
    # least 15 which was the highest service version when Newton was
    # released.
    meta = MetaData(bind=db_session.get_engine(context=context))
    compute_nodes = Table('compute_nodes', meta, autoload=True)
    return select([sqlfunc.count()]).select_from(compute_nodes).scalar()
@ -201,25 +220,129 @@ class UpgradeCommands(object):
msg = _('Placement API does not seem to be running.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
# TODO(mriedem): The placement service is running, fun! Now let's query
# the API DB to count the number of resource providers and compare that
# to the number of compute nodes (probably across all cells?). If there
# are no resource providers it's a clear fail. If there are fewer RPs
# than computes then it's a warning because you might be underutilized.
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS)
@staticmethod
def _count_compute_resource_providers():
    """Return the number of compute resource providers in the API database.

    Only resource providers that have an inventories record for the VCPU
    resource class are counted, since those records are assumed to be
    created solely by the ResourceTracker in compute nodes.
    """
    # TODO(mriedem): If/when we support a separate placement database this
    # will need to change to just use the REST API.

    # Resolve the VCPU resource class to its ID so we can filter on it.
    vcpu_class_id = fields.ResourceClass.STANDARD.index(
        fields.ResourceClass.VCPU)
    # There is a unique constraint per (resource provider, resource class)
    # in the inventories table, so counting the inventories records for
    # the VCPU class uniquely identifies the compute resource providers
    # we care about.
    api_meta = MetaData(bind=db_session.get_api_engine())
    inventories = Table('inventories', api_meta, autoload=True)
    query = select([sqlfunc.count()]).select_from(inventories).where(
        inventories.c.resource_class_id == vcpu_class_id)
    return query.scalar()
@staticmethod
def _get_non_cell0_mappings():
    """Queries the API database for non-cell0 cell mappings.

    :returns: A list of row results for every cell_mappings record whose
        uuid is not the well-known cell0 UUID.
    """
    api_meta = MetaData(bind=db_session.get_api_engine())
    cell_mappings = Table('cell_mappings', api_meta, autoload=True)
    not_cell0 = (cell_mappings.c.uuid !=
                 cell_mapping_obj.CellMapping.CELL0_UUID)
    return cell_mappings.select().where(not_cell0).execute().fetchall()
def _check_resource_providers(self):
    """Checks the status of resource provider reporting.

    This check relies on the cells v2 check passing because it queries the
    cells for compute nodes using cell mappings.

    This check relies on the placement service running because if it's not
    then there won't be any resource providers for the filter scheduler to
    use during instance build and move requests.

    :returns: An UpgradeCheckResult with code FAILURE if compute nodes
        exist but no compute resource providers are reporting, WARNING if
        there are fewer providers than compute nodes, SUCCESS otherwise.
    """
    # Get the total count of resource providers from the API DB that can
    # host compute resources. This might be 0 so we have to figure out if
    # this is a fresh install and if so we don't consider this an error.
    num_rps = self._count_compute_resource_providers()

    cell_mappings = self._get_non_cell0_mappings()
    num_computes = 0
    # NOTE: This must be an explicit "if" guard; a "for ... else" would
    # run the else clause whenever the loop finishes without "break" and
    # clobber the per-cell totals.
    if cell_mappings:
        ctxt = nova_context.get_admin_context()
        for cell_mapping in cell_mappings:
            with nova_context.target_cell(ctxt, cell_mapping):
                num_computes += self._count_compute_nodes(ctxt)
    else:
        # There are no cell mappings, cells v2 was maybe not deployed in
        # Newton, but placement might have been, so let's check the single
        # database for compute nodes.
        num_computes = self._count_compute_nodes()

    if num_rps == 0:
        if num_computes != 0:
            # This is a failure because there are compute nodes in the
            # database but nothing is reporting resource providers to the
            # placement service.
            msg = (_('There are no compute resource providers in the '
                     'Placement service but there are %(num_computes)s '
                     'compute nodes in the deployment. This means no '
                     'compute nodes are reporting into the Placement '
                     'service and need to be upgraded and/or fixed. See '
                     '%(placement_docs_link)s for more details.') %
                   {'num_computes': num_computes,
                    'placement_docs_link': PLACEMENT_DOCS_LINK})
            return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)

        # There are no resource providers and no compute nodes so we
        # assume this is a fresh install and move on. We should return a
        # success code with a message here though.
        msg = (_('There are no compute resource providers in the '
                 'Placement service nor are there compute nodes in the '
                 'database. Remember to configure new compute nodes to '
                 'report into the Placement service. See '
                 '%(placement_docs_link)s for more details.') %
               {'placement_docs_link': PLACEMENT_DOCS_LINK})
        return UpgradeCheckResult(UpgradeCheckCode.SUCCESS, msg)
    elif num_rps < num_computes:
        # There are fewer resource providers than compute nodes, so return
        # a warning explaining that the deployment might be underutilized.
        msg = (_('There are %(num_resource_providers)s compute resource '
                 'providers and %(num_compute_nodes)s compute nodes in '
                 'the deployment. Ideally the number of compute resource '
                 'providers should equal the number of enabled compute '
                 'nodes otherwise the cloud may be underutilized. '
                 'See %(placement_docs_link)s for more details.') %
               {'num_resource_providers': num_rps,
                'num_compute_nodes': num_computes,
                'placement_docs_link': PLACEMENT_DOCS_LINK})
        return UpgradeCheckResult(UpgradeCheckCode.WARNING, msg)
    else:
        # We have RPs >= CNs which is what we want to see.
        return UpgradeCheckResult(UpgradeCheckCode.SUCCESS)
# The format of the check functions is to return an UpgradeCheckResult
# object with the appropriate UpgradeCheckCode and details set. If the
# check hits warnings or failures then those should be stored in the
# returned UpgradeCheckResult's "details" attribute. The summary will
# be rolled up at the end of the check() function. These functions are
# intended to be run in order and build on top of each other so order
# matters; an ordered tuple of (name, function) pairs is used instead of
# a dict so the run order is explicit and deterministic.
_upgrade_checks = (
    # Added in Ocata
    (_('Cells v2'), _check_cellsv2),
    # Added in Ocata
    (_('Placement API'), _check_placement),
    # Added in Ocata
    (_('Resource Providers'), _check_resource_providers),
)
def _get_details(self, upgrade_check_result):
if upgrade_check_result.details is not None:
@ -240,9 +363,7 @@ class UpgradeCommands(object):
return_code = UpgradeCheckCode.SUCCESS
# This is a list of 2-item tuples for the check name and its results.
check_results = []
# Sort the checks by name so that we have predictable test results.
for name in sorted(self._upgrade_checks.keys()):
func = self._upgrade_checks[name]
for name, func in self._upgrade_checks:
result = func(self)
# store the result of the check for the summary table
check_results.append((name, result))

View File

@ -22,6 +22,7 @@ from six.moves import StringIO
from keystoneauth1 import exceptions as ks_exc
from keystoneauth1 import loading as keystone
from oslo_utils import uuidutils
from nova.cmd import status
import nova.conf
@ -29,6 +30,7 @@ from nova import context
# NOTE(mriedem): We only use objects as a convenience to populate the database
# in the tests, we don't use them in the actual CLI.
from nova import objects
from nova.objects import fields
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests import uuidsentinel as uuids
@ -206,11 +208,11 @@ class TestUpgradeCheckBasic(test.NoDBTestCase):
self.cmd = status.UpgradeCommands()
def test_check_success(self):
fake_checks = {
'good': mock.Mock(return_value=status.UpgradeCheckResult(
fake_checks = (
('good', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.SUCCESS
)),
}
))),
)
with mock.patch.object(self.cmd, '_upgrade_checks', fake_checks):
self.assertEqual(status.UpgradeCheckCode.SUCCESS, self.cmd.check())
expected = """\
@ -225,14 +227,14 @@ class TestUpgradeCheckBasic(test.NoDBTestCase):
self.assertEqual(expected, self.output.getvalue())
def test_check_warning(self):
fake_checks = {
'good': mock.Mock(return_value=status.UpgradeCheckResult(
fake_checks = (
('good', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.SUCCESS
)),
'warn': mock.Mock(return_value=status.UpgradeCheckResult(
))),
('warn', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.WARNING, 'there might be a problem'
)),
}
))),
)
with mock.patch.object(self.cmd, '_upgrade_checks', fake_checks):
self.assertEqual(status.UpgradeCheckCode.WARNING, self.cmd.check())
expected = """\
@ -253,28 +255,23 @@ class TestUpgradeCheckBasic(test.NoDBTestCase):
def test_check_failure(self):
# make the error details over 60 characters so we test the wrapping
error_details = 'go back to bed' + '!' * 60
fake_checks = {
'good': mock.Mock(return_value=status.UpgradeCheckResult(
fake_checks = (
('good', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.SUCCESS
)),
'warn': mock.Mock(return_value=status.UpgradeCheckResult(
))),
('warn', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.WARNING, 'there might be a problem'
)),
'fail': mock.Mock(return_value=status.UpgradeCheckResult(
))),
('fail', mock.Mock(return_value=status.UpgradeCheckResult(
status.UpgradeCheckCode.FAILURE, error_details
)),
}
))),
)
with mock.patch.object(self.cmd, '_upgrade_checks', fake_checks):
self.assertEqual(status.UpgradeCheckCode.FAILURE, self.cmd.check())
expected = """\
+-----------------------------------------------------------------------+
| Upgrade Check Results |
+-----------------------------------------------------------------------+
| Check: fail |
| Result: Failure |
| Details: go back to bed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| !!!!!!!!!!!!!! |
+-----------------------------------------------------------------------+
| Check: good |
| Result: Success |
| Details: None |
@ -283,6 +280,11 @@ class TestUpgradeCheckBasic(test.NoDBTestCase):
| Result: Warning |
| Details: there might be a problem |
+-----------------------------------------------------------------------+
| Check: fail |
| Result: Failure |
| Details: go back to bed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| !!!!!!!!!!!!!! |
+-----------------------------------------------------------------------+
"""
self.assertEqual(expected, self.output.getvalue())
@ -375,3 +377,213 @@ class TestUpgradeCheckCellsV2(test.NoDBTestCase):
result = self.cmd._check_cellsv2()
self.assertEqual(status.UpgradeCheckCode.SUCCESS, result.code)
self.assertIsNone(result.details)
# Inventory for the VCPU resource class, shaped like what the
# ResourceTracker sets up in the nova-compute service.
FAKE_VCPU_INVENTORY = {
    'resource_class': fields.ResourceClass.VCPU,
    'total': 32,
    'reserved': 4,
    'min_unit': 1,
    'max_unit': 1,
    'step_size': 1,
    'allocation_ratio': 1.0,
}

# Inventory for an IPv4 address pool, the kind of thing that Neutron
# will set up externally for routed networks.
FAKE_IP_POOL_INVENTORY = {
    'resource_class': fields.ResourceClass.IPV4_ADDRESS,
    'total': 256,
    'reserved': 10,
    'min_unit': 1,
    'max_unit': 1,
    'step_size': 1,
    'allocation_ratio': 1.0,
}
class TestUpgradeCheckResourceProviders(test.NoDBTestCase):
    """Tests for the nova-status upgrade check on resource providers."""
    # We'll setup the database ourselves because we need to use cells fixtures
    # for multiple cell mappings.
    USES_DB_SELF = True

    def setUp(self):
        super(TestUpgradeCheckResourceProviders, self).setUp()
        self.output = StringIO()
        self.useFixture(fixtures.MonkeyPatch('sys.stdout', self.output))
        # We always need the API DB to be setup.
        self.useFixture(nova_fixtures.Database(database='api'))
        self.cmd = status.UpgradeCommands()

    @staticmethod
    def _create_compute_node(host='fake-host', ctxt=None):
        """Creates and returns a fake compute node record.

        The node lands in cell1 by default. This consolidates the compute
        node construction that was previously duplicated in each test.
        """
        cn = objects.ComputeNode(
            context=ctxt or context.get_admin_context(),
            host=host,
            vcpus=4,
            memory_mb=8 * 1024,
            local_gb=40,
            vcpus_used=2,
            memory_mb_used=2 * 1024,
            local_gb_used=10,
            hypervisor_type='fake',
            hypervisor_version=1,
            cpu_info='{"arch": "x86_64"}')
        cn.create()
        return cn

    def _create_resource_provider(self, inventory):
        """Helper method to create a resource provider with inventory.

        :param inventory: dict of Inventory field names to values, e.g.
            FAKE_VCPU_INVENTORY.
        :returns: The created ResourceProvider object.
        """
        ctxt = context.get_admin_context()
        rp_uuid = uuidutils.generate_uuid()
        rp = objects.ResourceProvider(
            context=ctxt,
            name=rp_uuid,
            uuid=rp_uuid)
        rp.create()
        # Use a distinct name so we don't rebind the "inventory" parameter.
        inv = objects.Inventory(
            context=ctxt,
            resource_provider=rp,
            **inventory)
        inv.create()
        return rp

    def test_check_resource_providers_fresh_install_no_mappings(self):
        """Tests the scenario where we don't have any cell mappings (no cells
        v2 setup yet) and no compute nodes in the single main database.
        """
        # We don't have a cell mapping, just the regular old main database
        # because let's assume they haven't run simple_cell_setup yet.
        self.useFixture(nova_fixtures.Database())
        result = self.cmd._check_resource_providers()
        # this is assumed to be base install so it's OK but with details
        self.assertEqual(status.UpgradeCheckCode.SUCCESS, result.code)
        self.assertIn('There are no compute resource providers in the '
                      'Placement service nor are there compute nodes in the '
                      'database',
                      result.details)

    def test_check_resource_providers_no_rps_no_computes_in_cell1(self):
        """Tests the scenario where we have a cell mapping with no computes in
        it and no resource providers (because of no computes).
        """
        # this will setup two cell mappings, one for cell0 and a single cell1
        self._setup_cells()
        # there are no compute nodes in the cell1 database so we have 0
        # resource providers and 0 compute nodes, so it's assumed to be a fresh
        # install and not a failure.
        result = self.cmd._check_resource_providers()
        # this is assumed to be base install so it's OK but with details
        self.assertEqual(status.UpgradeCheckCode.SUCCESS, result.code)
        self.assertIn('There are no compute resource providers in the '
                      'Placement service nor are there compute nodes in the '
                      'database',
                      result.details)

    def test_check_resource_providers_no_rps_one_compute(self):
        """Tests the scenario where we have compute nodes in the cell but no
        resource providers yet - VCPU or otherwise. This is a failure because
        the compute isn't reporting into placement.
        """
        self._setup_cells()
        # create a compute node which will be in cell1 by default
        self._create_compute_node()
        result = self.cmd._check_resource_providers()
        self.assertEqual(status.UpgradeCheckCode.FAILURE, result.code)
        self.assertIn('There are no compute resource providers in the '
                      'Placement service but there are 1 compute nodes in the '
                      'deployment.', result.details)

    def test_check_resource_providers_no_compute_rps_one_compute(self):
        """Tests the scenario where we have compute nodes in the cell but no
        compute (VCPU) resource providers yet. This is a failure because the
        compute isn't reporting into placement.
        """
        self._setup_cells()
        # create a compute node which will be in cell1 by default
        self._create_compute_node()
        # create a single resource provider that represents an external shared
        # IP allocation pool - this tests our filtering when counting resource
        # providers
        self._create_resource_provider(FAKE_IP_POOL_INVENTORY)
        result = self.cmd._check_resource_providers()
        self.assertEqual(status.UpgradeCheckCode.FAILURE, result.code)
        self.assertIn('There are no compute resource providers in the '
                      'Placement service but there are 1 compute nodes in the '
                      'deployment.', result.details)

    def test_check_resource_providers_fewer_rps_than_computes(self):
        """Tests the scenario that we have fewer resource providers than
        compute nodes which is a warning because we're underutilized.
        """
        # setup the cell0 and cell1 mappings
        self._setup_cells()
        # create two compute nodes (by default in cell1)
        ctxt = context.get_admin_context()
        for x in range(2):
            self._create_compute_node(host=getattr(uuids, str(x)), ctxt=ctxt)
        # create a single resource provider with some VCPU inventory
        self._create_resource_provider(FAKE_VCPU_INVENTORY)
        result = self.cmd._check_resource_providers()
        self.assertEqual(status.UpgradeCheckCode.WARNING, result.code)
        self.assertIn('There are 1 compute resource providers and 2 compute '
                      'nodes in the deployment.', result.details)

    def test_check_resource_providers_equal_rps_to_computes(self):
        """This tests the happy path scenario where we have an equal number
        of compute resource providers to compute nodes.
        """
        # setup the cell0 and cell1 mappings
        self._setup_cells()
        # create a single compute node
        self._create_compute_node(host=uuids.host)
        # create a single resource provider with some VCPU inventory
        self._create_resource_provider(FAKE_VCPU_INVENTORY)
        # create an externally shared IP allocation pool resource provider
        self._create_resource_provider(FAKE_IP_POOL_INVENTORY)
        result = self.cmd._check_resource_providers()
        self.assertEqual(status.UpgradeCheckCode.SUCCESS, result.code)
        self.assertIsNone(result.details)