Use patch-alarm to raise software alarms

The patch-alarm package was previously used to raise sw-patch
alarms. This change modifies the package to raise software
alarms instead when any of the following states is identified:

- Raise 900.023 alarm when a software release deploy or removal is in progress
- Raise 900.021 alarm when a deploy fails on a host
- Raise 900.024 alarm when an unavailable release is present

The mechanism for raising the 500.101 alarm (dev patch enabled) is
kept in case it is needed for USM in the future.

Depends-On: https://review.opendev.org/c/starlingx/fault/+/931963
Depends-On: https://review.opendev.org/c/starlingx/nfv/+/931964

Test Plan:
PASS: Deploy software release with no unwanted alarms
PASS: Force a failed deploy host and verify alarm
PASS: Deploy a major release and verify 900.024 alarm
PASS: Use sw-manager to deploy major & minor releases

Story: 2010676
Task: 51120

Change-Id: I36701e66bc99aeadd22befd1b0bf82030047256a
Signed-off-by: mmachado <mmachado@windriver.com>
This commit is contained in:
mmachado 2024-10-02 01:14:21 -03:00
parent 94cc0843ab
commit 5985fd9ab6
3 changed files with 61 additions and 51 deletions

View File

@ -1,5 +1,5 @@
""" """
Copyright (c) 2014-2023 Wind River Systems, Inc. Copyright (c) 2014-2024 Wind River Systems, Inc.
SPDX-License-Identifier: Apache-2.0 SPDX-License-Identifier: Apache-2.0
@ -18,14 +18,14 @@ from daemon import runner
from fm_api import constants as fm_constants from fm_api import constants as fm_constants
from fm_api import fm_api from fm_api import fm_api
import cgcs_patch.config as cfg import software.config as cfg
from cgcs_patch.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER from software.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
from cgcs_patch.patch_functions import configure_logging from software.software_functions import configure_logging
from software.software_functions import LOG
################### ###################
# CONSTANTS # CONSTANTS
################### ###################
LOG_FILE = '/var/log/patch-alarms.log'
PID_FILE = '/var/run/patch-alarm-manager.pid' PID_FILE = '/var/run/patch-alarm-manager.pid'
@ -91,7 +91,7 @@ class PatchAlarmDaemon(object):
self._get_handle_failed_hosts() self._get_handle_failed_hosts()
def _handle_patch_alarms(self): def _handle_patch_alarms(self):
url = "http://%s/patch/query" % self.api_addr url = "http://%s/v1/release" % self.api_addr
try: try:
req = requests.get(url) req = requests.get(url)
@ -100,67 +100,61 @@ class PatchAlarmDaemon(object):
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller") entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller")
raise_pip_alarm = False raise_dip_alarm = False
raise_obs_alarm = False raise_obs_alarm = False
raise_cert_alarm = False raise_cert_alarm = False
if req.status_code == 200: if req.status_code == 200:
data = json.loads(req.text) data = json.loads(req.text)
if 'pd' in data: for rel_metadata in data:
for patch_id, metadata in data['pd'].items(): if 'state' in rel_metadata:
if 'patchstate' in metadata and \ if rel_metadata['state'] in ['deploying', 'removing']:
(metadata['patchstate'] == 'Partial-Apply' or metadata['patchstate'] == 'Partial-Remove'): raise_dip_alarm = True
raise_pip_alarm = True elif rel_metadata['state'] == 'unavailable':
if 'status' in metadata and \
(metadata['status'] == 'OBS' or metadata['status'] == 'Obsolete'):
raise_obs_alarm = True raise_obs_alarm = True
# If there is a patch in the system (in any state) that is if 'release_id' in rel_metadata and ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in rel_metadata['release_id']:
# named some variation of "enable-dev-certificate", raise
# the 'developer certificate could allow for untrusted
# patches' alarm
if ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in patch_id:
raise_cert_alarm = True raise_cert_alarm = True
pip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, dip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
entity_instance_id) entity_instance_id)
if raise_pip_alarm and pip_alarm is None: if raise_dip_alarm and dip_alarm is None:
logging.info("Raising patch-in-progress alarm") LOG.info("Raising deploy-in-progress alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
alarm_type=fm_constants.FM_ALARM_TYPE_5, alarm_type=fm_constants.FM_ALARM_TYPE_5,
alarm_state=fm_constants.FM_ALARM_STATE_SET, alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id, entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR, severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text='Patching operation in progress', reason_text='Software release deploy in progress',
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Complete reboots of affected hosts', proposed_repair_action='Complete release',
service_affecting=False) service_affecting=False)
self.fm_api.set_fault(fault) self.fm_api.set_fault(fault)
elif not raise_pip_alarm and pip_alarm is not None: elif not raise_dip_alarm and dip_alarm is not None:
logging.info("Clearing patch-in-progress alarm") LOG.info("Clearing deploy-in-progress alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
entity_instance_id) entity_instance_id)
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
entity_instance_id) entity_instance_id)
if raise_obs_alarm and obs_alarm is None: if raise_obs_alarm and obs_alarm is None:
logging.info("Raising obsolete-patch-in-system alarm") LOG.info("Raising obsolete-patch-in-system alarm")
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
alarm_type=fm_constants.FM_ALARM_TYPE_5, alarm_type=fm_constants.FM_ALARM_TYPE_5,
alarm_state=fm_constants.FM_ALARM_STATE_SET, alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id, entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_WARNING, severity=fm_constants.FM_ALARM_SEVERITY_WARNING,
reason_text='Obsolete patch in system', reason_text='Obsolete release in system',
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Remove and delete obsolete patches', proposed_repair_action='Delete unavailable releases',
service_affecting=False) service_affecting=False)
self.fm_api.set_fault(fault) self.fm_api.set_fault(fault)
elif not raise_obs_alarm and obs_alarm is not None: elif not raise_obs_alarm and obs_alarm is not None:
logging.info("Clearing obsolete-patch-in-system alarm") LOG.info("Clearing obsolete-patch-in-system alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
entity_instance_id) entity_instance_id)
cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH, cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH,
@ -182,7 +176,7 @@ class PatchAlarmDaemon(object):
self.fm_api.set_fault(fault) self.fm_api.set_fault(fault)
def _get_handle_failed_hosts(self): def _get_handle_failed_hosts(self):
url = "http://%s/patch/query_hosts" % self.api_addr url = "http://%s/v1/deploy_host" % self.api_addr
try: try:
req = requests.get(url) req = requests.get(url)
@ -195,27 +189,39 @@ class PatchAlarmDaemon(object):
if req.status_code == 200: if req.status_code == 200:
data = json.loads(req.text) data = json.loads(req.text)
if 'data' in data: for host_metadata in data:
for host in data['data']: if 'host_state' in host_metadata:
if 'hostname' in host and 'patch_failed' in host and host['patch_failed']: if host_metadata['host_state'] in ['failed', 'rollback-failed']:
failed_hosts.append(host['hostname']) failed_hosts.append(host_metadata['hostname'])
# Query existing alarms # Query existing alarms
patch_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED, deploy_host_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
entity_instance_id) entity_instance_id)
if len(failed_hosts) > 0: if len(failed_hosts) > 0:
reason_text = "Patch installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts)) reason_text = "Release installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
if patch_failed_alarm is None or reason_text != patch_failed_alarm.reason_text: if deploy_host_failed_alarm is None or reason_text != deploy_host_failed_alarm.reason_text:
if patch_failed_alarm is None: if deploy_host_failed_alarm is None:
logging.info("Raising patch-host-install-failure alarm") LOG.info("Raising deploy-host-failure alarm")
else: else:
logging.info("Updating patch-host-install-failure alarm") LOG.info("Updating deploy-host-failure alarm")
elif patch_failed_alarm is not None: fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
logging.info("Clearing patch-host-install-failure alarm") alarm_type=fm_constants.FM_ALARM_TYPE_5,
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED, alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
reason_text=reason_text,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
proposed_repair_action='Undo software operation',
service_affecting=False)
self.fm_api.set_fault(fault)
elif deploy_host_failed_alarm is not None:
LOG.info("Clearing patch-host-install-failure alarm")
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
entity_instance_id) entity_instance_id)
return False return False

View File

@ -36,10 +36,12 @@ install_command = pip install \
deps = -r{toxinidir}/requirements.txt deps = -r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt -r{toxinidir}/test-requirements.txt
-r{[tox]stxdir}/update/software/requirements.txt
-e{[tox]stxdir}/config/sysinv/sysinv/sysinv -e{[tox]stxdir}/config/sysinv/sysinv/sysinv
-e{[tox]stxdir}/fault/fm-api/source -e{[tox]stxdir}/fault/fm-api/source
-e{[tox]stxdir}/config/tsconfig/tsconfig -e{[tox]stxdir}/config/tsconfig/tsconfig
-e{[tox]stxdir}/update/sw-patch/cgcs-patch -e{[tox]stxdir}/update/sw-patch/cgcs-patch
-e{[tox]stxdir}/update/software
allowlist_externals = find allowlist_externals = find
sh sh

View File

@ -203,3 +203,5 @@ DC_VAULT_DIR = "/opt/dc-vault"
DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR
DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks" PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks"
ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'