Use patch-alarm to raise software alarms

The patch-alarm package was previously used to raise useful
sw-patch alarms. This change modifies the package to raise
software alarms when the following states are identified:

Raise 900.023 alarm when a software release is in progress
Raise 900.021 alarm when a deploy fails on a host
Raise 900.024 alarm when an unavailable release is present

Keeps mechanism for raising 500.101 alarm (dev patch enabled)
in case it is needed for USM in the future.

Depends-On: https://review.opendev.org/c/starlingx/fault/+/931963
Depends-On: https://review.opendev.org/c/starlingx/nfv/+/931964

Test Plan:
PASS: Deploy software release with no unwanted alarms
PASS: Force a failed deploy host and verify alarm
PASS: Deploy a major release and verify 900.024 alarm
PASS: Use sw-manager to deploy major & minor releases

Story: 2010676
Task: 51120

Change-Id: I36701e66bc99aeadd22befd1b0bf82030047256a
Signed-off-by: mmachado <mmachado@windriver.com>
This commit is contained in:
mmachado 2024-10-02 01:14:21 -03:00
parent 94cc0843ab
commit 5985fd9ab6
3 changed files with 61 additions and 51 deletions

View File

@ -1,5 +1,5 @@
"""
Copyright (c) 2014-2023 Wind River Systems, Inc.
Copyright (c) 2014-2024 Wind River Systems, Inc.
SPDX-License-Identifier: Apache-2.0
@ -18,14 +18,14 @@ from daemon import runner
from fm_api import constants as fm_constants
from fm_api import fm_api
import cgcs_patch.config as cfg
from cgcs_patch.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
from cgcs_patch.patch_functions import configure_logging
import software.config as cfg
from software.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
from software.software_functions import configure_logging
from software.software_functions import LOG
###################
# CONSTANTS
###################
LOG_FILE = '/var/log/patch-alarms.log'
PID_FILE = '/var/run/patch-alarm-manager.pid'
@ -91,7 +91,7 @@ class PatchAlarmDaemon(object):
self._get_handle_failed_hosts()
def _handle_patch_alarms(self):
url = "http://%s/patch/query" % self.api_addr
url = "http://%s/v1/release" % self.api_addr
try:
req = requests.get(url)
@ -100,67 +100,61 @@ class PatchAlarmDaemon(object):
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller")
raise_pip_alarm = False
raise_dip_alarm = False
raise_obs_alarm = False
raise_cert_alarm = False
if req.status_code == 200:
data = json.loads(req.text)
if 'pd' in data:
for patch_id, metadata in data['pd'].items():
if 'patchstate' in metadata and \
(metadata['patchstate'] == 'Partial-Apply' or metadata['patchstate'] == 'Partial-Remove'):
raise_pip_alarm = True
if 'status' in metadata and \
(metadata['status'] == 'OBS' or metadata['status'] == 'Obsolete'):
for rel_metadata in data:
if 'state' in rel_metadata:
if rel_metadata['state'] in ['deploying', 'removing']:
raise_dip_alarm = True
elif rel_metadata['state'] == 'unavailable':
raise_obs_alarm = True
# If there is a patch in the system (in any state) that is
# named some variation of "enable-dev-certificate", raise
# the 'developer certificate could allow for untrusted
# patches' alarm
if ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in patch_id:
if 'release_id' in rel_metadata and ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in rel_metadata['release_id']:
raise_cert_alarm = True
pip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
dip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
entity_instance_id)
if raise_pip_alarm and pip_alarm is None:
logging.info("Raising patch-in-progress alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
if raise_dip_alarm and dip_alarm is None:
LOG.info("Raising deploy-in-progress alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
alarm_type=fm_constants.FM_ALARM_TYPE_5,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text='Patching operation in progress',
reason_text='Software release deploy in progress',
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Complete reboots of affected hosts',
proposed_repair_action='Complete release',
service_affecting=False)
self.fm_api.set_fault(fault)
elif not raise_pip_alarm and pip_alarm is not None:
logging.info("Clearing patch-in-progress alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
elif not raise_dip_alarm and dip_alarm is not None:
LOG.info("Clearing deploy-in-progress alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
entity_instance_id)
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
entity_instance_id)
if raise_obs_alarm and obs_alarm is None:
logging.info("Raising obsolete-patch-in-system alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
LOG.info("Raising obsolete-patch-in-system alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
alarm_type=fm_constants.FM_ALARM_TYPE_5,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_WARNING,
reason_text='Obsolete patch in system',
reason_text='Obsolete release in system',
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Remove and delete obsolete patches',
proposed_repair_action='Delete unavailable releases',
service_affecting=False)
self.fm_api.set_fault(fault)
elif not raise_obs_alarm and obs_alarm is not None:
logging.info("Clearing obsolete-patch-in-system alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
LOG.info("Clearing obsolete-patch-in-system alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
entity_instance_id)
cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH,
@ -182,7 +176,7 @@ class PatchAlarmDaemon(object):
self.fm_api.set_fault(fault)
def _get_handle_failed_hosts(self):
url = "http://%s/patch/query_hosts" % self.api_addr
url = "http://%s/v1/deploy_host" % self.api_addr
try:
req = requests.get(url)
@ -195,27 +189,39 @@ class PatchAlarmDaemon(object):
if req.status_code == 200:
data = json.loads(req.text)
if 'data' in data:
for host in data['data']:
if 'hostname' in host and 'patch_failed' in host and host['patch_failed']:
failed_hosts.append(host['hostname'])
for host_metadata in data:
if 'host_state' in host_metadata:
if host_metadata['host_state'] in ['failed', 'rollback-failed']:
failed_hosts.append(host_metadata['hostname'])
# Query existing alarms
patch_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
deploy_host_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
entity_instance_id)
if len(failed_hosts) > 0:
reason_text = "Patch installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
reason_text = "Release installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
if patch_failed_alarm is None or reason_text != patch_failed_alarm.reason_text:
if patch_failed_alarm is None:
logging.info("Raising patch-host-install-failure alarm")
if deploy_host_failed_alarm is None or reason_text != deploy_host_failed_alarm.reason_text:
if deploy_host_failed_alarm is None:
LOG.info("Raising deploy-host-failure alarm")
else:
logging.info("Updating patch-host-install-failure alarm")
LOG.info("Updating deploy-host-failure alarm")
elif patch_failed_alarm is not None:
logging.info("Clearing patch-host-install-failure alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
alarm_type=fm_constants.FM_ALARM_TYPE_5,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
reason_text=reason_text,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Undo software operation',
service_affecting=False)
self.fm_api.set_fault(fault)
elif deploy_host_failed_alarm is not None:
LOG.info("Clearing patch-host-install-failure alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
entity_instance_id)
return False

View File

@ -36,10 +36,12 @@ install_command = pip install \
deps = -r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
-r{[tox]stxdir}/update/software/requirements.txt
-e{[tox]stxdir}/config/sysinv/sysinv/sysinv
-e{[tox]stxdir}/fault/fm-api/source
-e{[tox]stxdir}/config/tsconfig/tsconfig
-e{[tox]stxdir}/update/sw-patch/cgcs-patch
-e{[tox]stxdir}/update/software
allowlist_externals = find
sh

View File

@ -203,3 +203,5 @@ DC_VAULT_DIR = "/opt/dc-vault"
DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR
DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks"
ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'