Support passing an ignore alarm list to kube upgrade start API

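The health utilities now accept an optional list of alarm IDs to ignore
when checking for active alarms.
The kube_upgrade start API reads 'alarm_ignore_list' from the request body
and passes it through the conductor get_system_health RPC to those checks.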

Story: 2008137
Task: 41559
Signed-off-by: albailey <Al.Bailey@windriver.com>
Change-Id: I19db852f2e87273551d8a30f4bab470afa420de2
This commit is contained in:
albailey 2021-01-06 19:18:09 -06:00
parent 84936cf189
commit 2ab82b7262
6 changed files with 134 additions and 41 deletions

View File

@ -187,6 +187,7 @@ class KubeUpgradeController(rest.RestController):
"""Create a new Kubernetes Upgrade and start upgrade."""
force = body.get('force', False) is True
alarm_ignore_list = body.get('alarm_ignore_list')
# There must not be a platform upgrade in progress
try:
@ -247,7 +248,10 @@ class KubeUpgradeController(rest.RestController):
# The system must be healthy
success, output = pecan.request.rpcapi.get_system_health(
pecan.request.context, force=force, kube_upgrade=True)
pecan.request.context,
force=force,
kube_upgrade=True,
alarm_ignore_list=alarm_ignore_list)
if not success:
LOG.info("Health query failure during kubernetes upgrade start: %s"
% output)

View File

@ -94,8 +94,11 @@ class Health(object):
success = not not_patch_current_hosts and not hostnames
return success, not_patch_current_hosts, hostnames
def _check_alarms(self, context, force=False):
def _check_alarms(self, context, force=False, alarm_ignore_list=None):
"""Checks that no alarms are active"""
if alarm_ignore_list is None:
alarm_ignore_list = []
alarms = fmclient(context).alarm.list(include_suppress=True)
success = True
@ -103,14 +106,15 @@ class Health(object):
affecting = 0
# Separate alarms that are mgmt affecting
for alarm in alarms:
mgmt_affecting = alarm.mgmt_affecting == "True"
if not mgmt_affecting:
allowed += 1
if not force:
if alarm.alarm_id not in alarm_ignore_list:
mgmt_affecting = alarm.mgmt_affecting == "True"
if not mgmt_affecting:
allowed += 1
if not force:
success = False
else:
affecting += 1
success = False
else:
affecting += 1
success = False
return success, allowed, affecting
@ -245,7 +249,7 @@ class Health(object):
return True
def get_system_health(self, context, force=False):
def get_system_health(self, context, force=False, alarm_ignore_list=None):
"""Returns the general health of the system
Checks the following:
@ -257,8 +261,12 @@ class Health(object):
- For ceph systems: The storage cluster is healthy
- All kubernetes nodes are ready
- All kubernetes control plane pods are ready
"""
:param context: request context.
:param force: set to true to ignore minor and warning alarms
:param alarm_ignore_list: list of alarm ids to ignore when performing
a health check
"""
hosts = self._dbapi.ihost_get_list()
output = _('System Health:\n')
health_ok = True
@ -316,7 +324,10 @@ class Health(object):
health_ok = health_ok and success
success, allowed, affecting = self._check_alarms(context, force)
success, allowed, affecting = self._check_alarms(
context,
force=force,
alarm_ignore_list=alarm_ignore_list)
output += _('No alarms: [%s]\n') \
% (Health.SUCCESS_MSG if success else Health.FAIL_MSG)
if not success:
@ -345,17 +356,29 @@ class Health(object):
return health_ok, output
def get_system_health_upgrade(self, context, force=False):
"""Ensures the system is in a valid state for an upgrade"""
def get_system_health_upgrade(self,
context,
force=False,
alarm_ignore_list=None):
"""
Ensures the system is in a valid state for an upgrade
:param context: request context.
:param force: set to true to ignore minor and warning alarms
:param alarm_ignore_list: list of alarm ids to ignore when performing
a health check
"""
# Does a general health check then does the following:
# A load is imported
# The load patch requirements are met
# The license is valid for the N+1 load
system_mode = self._dbapi.isystem_get_one().system_mode
simplex = (system_mode == constants.SYSTEM_MODE_SIMPLEX)
health_ok, output = self.get_system_health(context, force)
health_ok, output = self.get_system_health(
context,
force=force,
alarm_ignore_list=alarm_ignore_list)
loads = self._dbapi.load_get_list()
try:
imported_load = utils.get_imported_load(loads)
@ -412,14 +435,25 @@ class Health(object):
return health_ok, output
def get_system_health_kube_upgrade(self, context, force=False):
"""Ensures the system is in a valid state for a kubernetes upgrade
def get_system_health_kube_upgrade(self,
context,
force=False,
alarm_ignore_list=None):
"""
Ensures the system is in a valid state for a kubernetes upgrade
Does a general health check then does the following:
- All kubernetes applications are in a stable state
"""
health_ok, output = self.get_system_health(context, force)
:param context: request context.
:param force: set to true to ignore minor and warning alarms
:param alarm_ignore_list: list of alarm ids to ignore when performing
a health check
"""
health_ok, output = self.get_system_health(
context,
force=force,
alarm_ignore_list=alarm_ignore_list)
success, apps_not_valid = self._check_kube_applications()
output += _(

View File

@ -9792,7 +9792,8 @@ class ConductorManager(service.PeriodicService):
return
def get_system_health(self, context, force=False, upgrade=False,
kube_upgrade=False):
kube_upgrade=False,
alarm_ignore_list=None):
"""
Performs a system health check.
@ -9801,18 +9802,26 @@ class ConductorManager(service.PeriodicService):
:param upgrade: set to true to perform an upgrade health check
:param kube_upgrade: set to true to perform a kubernetes upgrade health
check
:param alarm_ignore_list: list of alarm ids to ignore when performing
a health check
"""
health_util = health.Health(self.dbapi)
if upgrade is True:
return health_util.get_system_health_upgrade(context=context,
force=force)
return health_util.get_system_health_upgrade(
context=context,
force=force,
alarm_ignore_list=alarm_ignore_list)
elif kube_upgrade is True:
return health_util.get_system_health_kube_upgrade(context=context,
force=force)
return health_util.get_system_health_kube_upgrade(
context=context,
force=force,
alarm_ignore_list=alarm_ignore_list)
else:
return health_util.get_system_health(context=context,
force=force)
return health_util.get_system_health(
context=context,
force=force,
alarm_ignore_list=alarm_ignore_list)
def _get_cinder_address_name(self, network_type):
ADDRESS_FORMAT_ARGS = (constants.CONTROLLER_HOSTNAME,

View File

@ -1336,7 +1336,7 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
success=success))
def get_system_health(self, context, force=False, upgrade=False,
kube_upgrade=False):
kube_upgrade=False, alarm_ignore_list=None):
"""
Performs a system health check.
@ -1345,11 +1345,14 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
:param upgrade: set to true to perform an upgrade health check
:param kube_upgrade: set to true to perform a kubernetes upgrade health
check
:param alarm_ignore_list: list of alarm ids to ignore when performing
a health check
"""
return self.call(context,
self.make_msg('get_system_health',
force=force, upgrade=upgrade,
kube_upgrade=kube_upgrade))
kube_upgrade=kube_upgrade,
alarm_ignore_list=alarm_ignore_list))
def reserve_ip_for_first_storage_node(self, context):
"""

View File

@ -54,6 +54,21 @@ FAKE_KUBE_VERSIONS = [
]
# Minimal stand-in for an alarm object returned by the fm client's
# alarm.list(). It carries only the two attributes set below; the
# _check_alarms hunk shown in this change reads alarm.alarm_id and compares
# alarm.mgmt_affecting against the string "True".
class FakeAlarm(object):
def __init__(self, alarm_id, mgmt_affecting):
# alarm_id: identifier string, e.g. '900.401'
self.alarm_id = alarm_id
# mgmt_affecting: the string "True" or "False" (not a bool)
self.mgmt_affecting = mgmt_affecting
# Shared alarm fixtures for the tests below: one alarm flagged as management
# affecting and one that is not. They differ in both alarm_id and flag.
FAKE_MGMT_AFFECTING_ALARM = FakeAlarm('900.401', "True")
FAKE_NON_MGMT_AFFECTING_ALARM = FakeAlarm('900.400', "False")
# Stand-in for the object returned by sysinv.common.health.fmclient(context).
# Tests control the active alarms by setting alarm.list.return_value.
class FakeFmClient(object):
def __init__(self):
# MagicMock so that alarm.list(...) can be called with any arguments
self.alarm = mock.MagicMock()
class FakeConductorAPI(object):
def __init__(self):
@ -62,9 +77,13 @@ class FakeConductorAPI(object):
self.service = ConductorManager('test-host', 'test-topic')
def get_system_health(self, context, force=False, upgrade=False,
kube_upgrade=False):
return self.service.get_system_health(context, force, upgrade,
kube_upgrade)
kube_upgrade=False, alarm_ignore_list=None):
return self.service.get_system_health(
context,
force=force,
upgrade=upgrade,
kube_upgrade=kube_upgrade,
alarm_ignore_list=alarm_ignore_list)
class TestKubeUpgrade(base.FunctionalTest):
@ -167,11 +186,11 @@ class TestKubeUpgrade(base.FunctionalTest):
self.mock_patch_query_hosts.return_value = self._patch_current()
self.addCleanup(p.stop)
# _check_alarms
# _check_alarms returns (Success Boolean, Allow Int, Affecting Int)
p = mock.patch.object(health.Health, '_check_alarms')
self.mock_check_alarms = p.start()
self.mock_check_alarms.return_value = (True, 0, 0)
# _check_alarms calls fmclient alarms.list
self.fake_fm_client = FakeFmClient()
p = mock.patch('sysinv.common.health.fmclient')
self.mock_fm_client = p.start()
self.mock_fm_client.return_value = self.fake_fm_client
self.addCleanup(p.stop)
# _check_kube_nodes_ready
@ -357,7 +376,8 @@ class TestPostKubeUpgrade(TestKubeUpgrade,
"""Test creation of a kube upgrade while there are alarms"""
# Test creation of upgrade when system health check fails
# 1 alarm, when force is not specified will return False
self.mock_check_alarms.return_value = (False, 1, 0)
self.fake_fm_client.alarm.list.return_value = \
[FAKE_NON_MGMT_AFFECTING_ALARM, ]
create_dict = dbutils.post_get_test_kube_upgrade(to_version='v1.43.2')
result = self.post_json('/kube_upgrade', create_dict,
@ -375,7 +395,8 @@ class TestPostKubeUpgrade(TestKubeUpgrade,
# overridden with force
# mock a 'non' mgmt_affecting alarm, upgrade can be forced
self.mock_check_alarms.return_value = (True, 1, 0)
self.fake_fm_client.alarm.list.return_value = \
[FAKE_NON_MGMT_AFFECTING_ALARM, ]
create_dict = dbutils.post_get_test_kube_upgrade(
to_version='v1.43.2')
create_dict['force'] = True
@ -392,7 +413,8 @@ class TestPostKubeUpgrade(TestKubeUpgrade,
""" Test kube upgrade create fails when mgmt affecting alarms found"""
# mock a mgmt_affecting alarm, upgrade cannot be forced
self.mock_check_alarms.return_value = (False, 0, 1)
self.fake_fm_client.alarm.list.return_value = \
[FAKE_MGMT_AFFECTING_ALARM, ]
create_dict = dbutils.post_get_test_kube_upgrade(
to_version='v1.43.2')
create_dict['force'] = True
@ -406,6 +428,26 @@ class TestPostKubeUpgrade(TestKubeUpgrade,
self.assertIn("System is not in a valid state",
result.json['error_message'])
def test_create_system_can_ignore_alarms(self):
# Test creation of an upgrade when the only active alarm is listed in
# alarm_ignore_list (this test does not set create_dict['force'])
# mock a mgmt_affecting alarm (alarm_id '900.401')
self.fake_fm_client.alarm.list.return_value = \
[FAKE_MGMT_AFFECTING_ALARM, ]
create_dict = dbutils.post_get_test_kube_upgrade(
to_version='v1.43.2')
# ignore the alarm_id for the mgmt affecting alarm
# NOTE(review): the ignore list is posted as a string, not a list. The
# _check_alarms change tests `alarm.alarm_id not in alarm_ignore_list`,
# which is a substring match when the value is a str - confirm whether
# the value is parsed into a list before it reaches the health check.
create_dict['alarm_ignore_list'] = "['900.401',]"
result = self.post_json('/kube_upgrade', create_dict,
headers={'User-Agent': 'sysinv-test'})
# Verify that the upgrade has the expected attributes
self.assertEqual(result.json['from_version'], 'v1.43.1')
self.assertEqual(result.json['to_version'], 'v1.43.2')
self.assertEqual(result.json['state'],
kubernetes.KUBE_UPGRADE_STARTED)
def test_create_system_unhealthy_from_bad_apps(self):
""" Test kube upgrade create fails when invalid kube app found"""

View File

@ -25,7 +25,8 @@ class FakeConductorAPI(object):
self.start_upgrade = mock.MagicMock()
self.get_system_health_return = (True, "System is super healthy")
def get_system_health(self, context, force=False, upgrade=False):
def get_system_health(self, context, force=False, upgrade=False,
kube_upgrade=False, alarm_ignore_list=None):
if force:
return True, "System is healthy because I was forced to say that"
else: