nova/nova/cmd/status.py

466 lines
21 KiB
Python

# Copyright 2016 IBM Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
CLI interface for nova status commands.
"""
from __future__ import print_function
# enum comes from the enum34 package if python < 3.4, else it's stdlib
import enum
import functools
import sys
import textwrap
import traceback
from keystoneauth1 import exceptions as ks_exc
from keystoneauth1 import loading as keystone
from oslo_config import cfg
import pkg_resources
import prettytable
from sqlalchemy import func as sqlfunc
from sqlalchemy import MetaData, Table, select
from nova.cmd import common as cmd_common
import nova.conf
from nova import config
from nova import context as nova_context
from nova.db.sqlalchemy import api as db_session
from nova.i18n import _
from nova.objects import cell_mapping as cell_mapping_obj
from nova.objects import fields
from nova import version
CONF = nova.conf.CONF
PLACEMENT_DOCS_LINK = 'https://docs.openstack.org/nova/latest' \
'/user/placement.html'
class UpgradeCheckCode(enum.IntEnum):
"""These are the status codes for the nova-status upgrade check command
and internal check commands.
"""
# All upgrade readiness checks passed successfully and there is
# nothing to do.
SUCCESS = 0
# At least one check encountered an issue and requires further
# investigation. This is considered a warning but the upgrade may be OK.
WARNING = 1
# There was an upgrade status check failure that needs to be
# investigated. This should be considered something that stops an upgrade.
FAILURE = 2
UPGRADE_CHECK_MSG_MAP = {
UpgradeCheckCode.SUCCESS: _('Success'),
UpgradeCheckCode.WARNING: _('Warning'),
UpgradeCheckCode.FAILURE: _('Failure'),
}
class UpgradeCheckResult(object):
"""Class used for 'nova-status upgrade check' results.
The 'code' attribute is an UpgradeCheckCode enum.
The 'details' attribute is a translated message generally only used for
checks that result in a warning or failure code. The details should provide
information on what issue was discovered along with any remediation.
"""
def __init__(self, code, details=None):
super(UpgradeCheckResult, self).__init__()
self.code = code
self.details = details
class UpgradeCommands(object):
"""Commands related to upgrades.
The subcommands here must not rely on the nova object model since they
should be able to run on n-1 data. Any queries to the database should be
done through the sqlalchemy query language directly like the database
schema migrations.
"""
def _count_compute_nodes(self, context=None):
"""Returns the number of compute nodes in the cell database."""
# NOTE(mriedem): This does not filter based on the service status
# because a disabled nova-compute service could still be reporting
# inventory info to the placement service. There could be an outside
# chance that there are compute node records in the database for
# disabled nova-compute services that aren't yet upgraded to Ocata or
# the nova-compute service was deleted and the service isn't actually
# running on the compute host but the operator hasn't cleaned up the
# compute_nodes entry in the database yet. We consider those edge cases
# here and the worst case scenario is we give a warning that there are
# more compute nodes than resource providers. We can tighten this up
# later if needed, for example by not including compute nodes that
# don't have a corresponding nova-compute service in the services
# table, or by only counting compute nodes with a service version of at
# least 15 which was the highest service version when Newton was
# released.
meta = MetaData(bind=db_session.get_engine(context=context))
compute_nodes = Table('compute_nodes', meta, autoload=True)
return select([sqlfunc.count()]).select_from(compute_nodes).scalar()
def _check_cellsv2(self):
"""Checks to see if cells v2 has been setup.
These are the same checks performed in the 030_require_cell_setup API
DB migration except we expect this to be run AFTER the
nova-manage cell_v2 simple_cell_setup command, which would create the
cell and host mappings and sync the cell0 database schema, so we don't
check for flavors at all because you could create those after doing
this on an initial install. This also has to be careful about checking
for compute nodes if there are no host mappings on a fresh install.
"""
meta = MetaData()
meta.bind = db_session.get_api_engine()
cell_mappings = Table('cell_mappings', meta, autoload=True)
count = select([sqlfunc.count()]).select_from(cell_mappings).scalar()
# Two mappings are required at a minimum, cell0 and your first cell
if count < 2:
msg = _('There needs to be at least two cell mappings, one for '
'cell0 and one for your first cell. Run command '
'\'nova-manage cell_v2 simple_cell_setup\' and then '
'retry.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
count = select([sqlfunc.count()]).select_from(cell_mappings).where(
cell_mappings.c.uuid ==
cell_mapping_obj.CellMapping.CELL0_UUID).scalar()
if count != 1:
msg = _('No cell0 mapping found. Run command '
'\'nova-manage cell_v2 simple_cell_setup\' and then '
'retry.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
host_mappings = Table('host_mappings', meta, autoload=True)
count = select([sqlfunc.count()]).select_from(host_mappings).scalar()
if count == 0:
# This may be a fresh install in which case there may not be any
# compute_nodes in the cell database if the nova-compute service
# hasn't started yet to create those records. So let's query the
# cell database for compute_nodes records and if we find at least
# one it's a failure.
num_computes = self._count_compute_nodes()
if num_computes > 0:
msg = _('No host mappings found but there are compute nodes. '
'Run command \'nova-manage cell_v2 '
'simple_cell_setup\' and then retry.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
msg = _('No host mappings or compute nodes were found. Remember '
'to run command \'nova-manage cell_v2 discover_hosts\' '
'when new compute hosts are deployed.')
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS, msg)
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS)
def _placement_get(self, path):
"""Do an HTTP get call against placement engine.
This is in a dedicated method to make it easier for unit
testing purposes.
"""
ks_filter = {'service_type': 'placement',
'region_name': CONF.placement.os_region_name,
'interface': CONF.placement.os_interface}
auth = keystone.load_auth_from_conf_options(
CONF, 'placement')
client = keystone.load_session_from_conf_options(
CONF, 'placement', auth=auth)
return client.get(path, endpoint_filter=ks_filter).json()
def _check_placement(self):
"""Checks to see if the placement API is ready for scheduling.
Checks to see that the placement API service is registered in the
service catalog and that we can make requests against it. Also checks
that there are compute nodes registered with the placement service
as resource providers so that when the Ocata nova-scheduler code starts
handling requests it has somewhere to send those builds.
"""
try:
versions = self._placement_get("/")
max_version = pkg_resources.parse_version(
versions["versions"][0]["max_version"])
# NOTE(rpodolyaka): 1.10 is needed in Pike and further as
# FilterScheduler requires GET /allocation_candidates in the
# Placement API.
needs_version = pkg_resources.parse_version("1.10")
if max_version < needs_version:
msg = (_('Placement API version %(needed)s needed, '
'you have %(current)s.') %
{'needed': needs_version, 'current': max_version})
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
except ks_exc.MissingAuthPlugin:
msg = _('No credentials specified for placement API in nova.conf.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
except ks_exc.Unauthorized:
msg = _('Placement service credentials do not work.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
except ks_exc.EndpointNotFound:
msg = _('Placement API endpoint not found.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
except ks_exc.DiscoveryFailure:
msg = _('Discovery for placement API URI failed.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
except ks_exc.NotFound:
msg = _('Placement API does not seem to be running.')
return UpgradeCheckResult(UpgradeCheckCode.FAILURE, msg)
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS)
@staticmethod
def _count_compute_resource_providers():
"""Returns the number of compute resource providers in the API database
The resource provider count is filtered based on resource providers
which have inventories records for the VCPU resource class, which is
assumed to only come from the ResourceTracker in compute nodes.
"""
# TODO(mriedem): If/when we support a separate placement database this
# will need to change to just use the REST API.
# Get the VCPU resource class ID for filtering.
vcpu_rc_id = fields.ResourceClass.STANDARD.index(
fields.ResourceClass.VCPU)
# The inventories table has a unique constraint per resource provider
# and resource class, so we can simply count the number of inventories
# records for the given resource class and those will uniquely identify
# the number of resource providers we care about.
meta = MetaData(bind=db_session.get_api_engine())
inventories = Table('inventories', meta, autoload=True)
return select([sqlfunc.count()]).select_from(
inventories).where(
inventories.c.resource_class_id == vcpu_rc_id).scalar()
@staticmethod
def _get_non_cell0_mappings():
"""Queries the API database for non-cell0 cell mappings."""
meta = MetaData(bind=db_session.get_api_engine())
cell_mappings = Table('cell_mappings', meta, autoload=True)
return cell_mappings.select().where(
cell_mappings.c.uuid !=
cell_mapping_obj.CellMapping.CELL0_UUID).execute().fetchall()
def _check_resource_providers(self):
"""Checks the status of resource provider reporting.
This check relies on the cells v2 check passing because it queries the
cells for compute nodes using cell mappings.
This check relies on the placement service running because if it's not
then there won't be any resource providers for the filter scheduler to
use during instance build and move requests.
Note that in Ocata, the filter scheduler will only use placement if
the minimum nova-compute service version in the deployment is >= 16
which signals when nova-compute will fail to start if placement is not
configured on the compute. Otherwise the scheduler will fallback
to pulling compute nodes from the database directly as it has always
done. That fallback will be removed in Pike.
"""
# Get the total count of resource providers from the API DB that can
# host compute resources. This might be 0 so we have to figure out if
# this is a fresh install and if so we don't consider this an error.
num_rps = self._count_compute_resource_providers()
cell_mappings = self._get_non_cell0_mappings()
ctxt = nova_context.get_admin_context()
num_computes = 0
for cell_mapping in cell_mappings:
with nova_context.target_cell(ctxt, cell_mapping) as cctxt:
num_computes += self._count_compute_nodes(cctxt)
else:
# There are no cell mappings, cells v2 was maybe not deployed in
# Newton, but placement might have been, so let's check the single
# database for compute nodes.
num_computes = self._count_compute_nodes()
if num_rps == 0:
if num_computes != 0:
# This is a warning because there are compute nodes in the
# database but nothing is reporting resource providers to the
# placement service. This will not result in scheduling
# failures in Ocata because of the fallback that is in place
# but we signal it as a warning since there is work to do.
msg = (_('There are no compute resource providers in the '
'Placement service but there are %(num_computes)s '
'compute nodes in the deployment. This means no '
'compute nodes are reporting into the Placement '
'service and need to be upgraded and/or fixed. See '
'%(placement_docs_link)s for more details.') %
{'num_computes': num_computes,
'placement_docs_link': PLACEMENT_DOCS_LINK})
return UpgradeCheckResult(UpgradeCheckCode.WARNING, msg)
# There are no resource providers and no compute nodes so we
# assume this is a fresh install and move on. We should return a
# success code with a message here though.
msg = (_('There are no compute resource providers in the '
'Placement service nor are there compute nodes in the '
'database. Remember to configure new compute nodes to '
'report into the Placement service. See '
'%(placement_docs_link)s for more details.') %
{'placement_docs_link': PLACEMENT_DOCS_LINK})
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS, msg)
elif num_rps < num_computes:
# There are fewer resource providers than compute nodes, so return
# a warning explaining that the deployment might be underutilized.
# Technically this is not going to result in scheduling failures in
# Ocata because of the fallback that is in place if there are older
# compute nodes still, but it is probably OK to leave the wording
# on this as-is to prepare for when the fallback is removed in
# Pike.
msg = (_('There are %(num_resource_providers)s compute resource '
'providers and %(num_compute_nodes)s compute nodes in '
'the deployment. Ideally the number of compute resource '
'providers should equal the number of enabled compute '
'nodes otherwise the cloud may be underutilized. '
'See %(placement_docs_link)s for more details.') %
{'num_resource_providers': num_rps,
'num_compute_nodes': num_computes,
'placement_docs_link': PLACEMENT_DOCS_LINK})
return UpgradeCheckResult(UpgradeCheckCode.WARNING, msg)
else:
# We have RPs >= CNs which is what we want to see.
return UpgradeCheckResult(UpgradeCheckCode.SUCCESS)
# The format of the check functions is to return an UpgradeCheckResult
# object with the appropriate UpgradeCheckCode and details set. If the
# check hits warnings or failures then those should be stored in the
# returned UpgradeCheckResult's "details" attribute. The summary will
# be rolled up at the end of the check() function. These functions are
# intended to be run in order and build on top of each other so order
# matters.
_upgrade_checks = (
# Added in Ocata
(_('Cells v2'), _check_cellsv2),
# Added in Ocata
(_('Placement API'), _check_placement),
# Added in Ocata
(_('Resource Providers'), _check_resource_providers),
)
def _get_details(self, upgrade_check_result):
if upgrade_check_result.details is not None:
# wrap the text on the details to 60 characters
return '\n'.join(textwrap.wrap(upgrade_check_result.details, 60,
subsequent_indent=' '))
def check(self):
"""Performs checks to see if the deployment is ready for upgrade.
These checks are expected to be run BEFORE services are restarted with
new code. These checks also require access to potentially all of the
Nova databases (nova, nova_api, nova_api_cell0) and external services
such as the placement API service.
:returns: UpgradeCheckCode
"""
return_code = UpgradeCheckCode.SUCCESS
# This is a list if 2-item tuples for the check name and it's results.
check_results = []
for name, func in self._upgrade_checks:
result = func(self)
# store the result of the check for the summary table
check_results.append((name, result))
# we want to end up with the highest level code of all checks
if result.code > return_code:
return_code = result.code
# We're going to build a summary table that looks like:
# +----------------------------------------------------+
# | Upgrade Check Results |
# +----------------------------------------------------+
# | Check: Cells v2 |
# | Result: Success |
# | Details: None |
# +----------------------------------------------------+
# | Check: Placement API |
# | Result: Failure |
# | Details: There is no placement-api endpoint in the |
# | service catalog. |
# +----------------------------------------------------+
t = prettytable.PrettyTable([_('Upgrade Check Results')],
hrules=prettytable.ALL)
t.align = 'l'
for name, result in check_results:
cell = (
_('Check: %(name)s\n'
'Result: %(result)s\n'
'Details: %(details)s') %
{
'name': name,
'result': UPGRADE_CHECK_MSG_MAP[result.code],
'details': self._get_details(result),
}
)
t.add_row([cell])
print(t)
return return_code
CATEGORIES = {
'upgrade': UpgradeCommands,
}
add_command_parsers = functools.partial(cmd_common.add_command_parsers,
categories=CATEGORIES)
category_opt = cfg.SubCommandOpt('category',
title='Command categories',
help='Available categories',
handler=add_command_parsers)
def main():
"""Parse options and call the appropriate class/method."""
CONF.register_cli_opt(category_opt)
config.parse_args(sys.argv)
if CONF.category.name == "version":
print(version.version_string_with_package())
return 0
if CONF.category.name == "bash-completion":
cmd_common.print_bash_completion(CATEGORIES)
return 0
try:
fn, fn_args, fn_kwargs = cmd_common.get_action_fn()
ret = fn(*fn_args, **fn_kwargs)
return ret
except Exception:
print(_('Error:\n%s') % traceback.format_exc())
# This is 255 so it's not confused with the upgrade check exit codes.
return 255