From 5985fd9ab6be1039bf0cb57721af29dc427d1945 Mon Sep 17 00:00:00 2001 From: mmachado Date: Wed, 2 Oct 2024 01:14:21 -0300 Subject: [PATCH] Use patch-alarm to raise software alarms The patch-alarm package was previously used to raise useful sw-patch alarms. This change modifies the package to raise software alarms when the following states are identified: Raise 900.023 alarm when a software release is in progress Raise 900.021 alarm when a deploy fails on a host Raise 900.024 alarm when an unavailable release is present Keeps mechanism for raising 500.101 alarm (dev patch enabled) in case it is needed for USM in the future. Depends-On: https://review.opendev.org/c/starlingx/fault/+/931963 Depends-On: https://review.opendev.org/c/starlingx/nfv/+/931964 Test Plan: PASS: Deploy software release with no unwanted alarms PASS: Force a failed deploy host and verify alarm PASS: Deploy a major release and verify 900.024 alarm PASS: Use sw-manager to deploy major & minor releases Story: 2010676 Task: 51120 Change-Id: I36701e66bc99aeadd22befd1b0bf82030047256a Signed-off-by: mmachado --- .../patch_alarm/patch_alarm_manager.py | 108 +++++++++--------- patch-alarm/patch-alarm/tox.ini | 2 + software/software/constants.py | 2 + 3 files changed, 61 insertions(+), 51 deletions(-) diff --git a/patch-alarm/patch-alarm/patch_alarm/patch_alarm_manager.py b/patch-alarm/patch-alarm/patch_alarm/patch_alarm_manager.py index 17c755fc..fce55a4f 100644 --- a/patch-alarm/patch-alarm/patch_alarm/patch_alarm_manager.py +++ b/patch-alarm/patch-alarm/patch_alarm/patch_alarm_manager.py @@ -1,5 +1,5 @@ """ -Copyright (c) 2014-2023 Wind River Systems, Inc. +Copyright (c) 2014-2024 Wind River Systems, Inc. SPDX-License-Identifier: Apache-2.0 @@ -18,14 +18,14 @@ from daemon import runner from fm_api import constants as fm_constants from fm_api import fm_api -import cgcs_patch.config as cfg -from cgcs_patch.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER -from cgcs_patch.patch_functions import configure_logging +import software.config as cfg +from software.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER +from software.software_functions import configure_logging +from software.software_functions import LOG ################### # CONSTANTS ################### -LOG_FILE = '/var/log/patch-alarms.log' PID_FILE = '/var/run/patch-alarm-manager.pid' @@ -91,7 +91,7 @@ class PatchAlarmDaemon(object): self._get_handle_failed_hosts() def _handle_patch_alarms(self): - url = "http://%s/patch/query" % self.api_addr + url = "http://%s/v1/release" % self.api_addr try: req = requests.get(url) @@ -100,67 +100,61 @@ class PatchAlarmDaemon(object): entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller") - raise_pip_alarm = False + raise_dip_alarm = False raise_obs_alarm = False raise_cert_alarm = False if req.status_code == 200: data = json.loads(req.text) - if 'pd' in data: - for patch_id, metadata in data['pd'].items(): - if 'patchstate' in metadata and \ - (metadata['patchstate'] == 'Partial-Apply' or metadata['patchstate'] == 'Partial-Remove'): - raise_pip_alarm = True - if 'status' in metadata and \ - (metadata['status'] == 'OBS' or metadata['status'] == 'Obsolete'): + for rel_metadata in data: + if 'state' in rel_metadata: + if rel_metadata['state'] in ['deploying', 'removing']: + raise_dip_alarm = True + elif rel_metadata['state'] == 'unavailable': raise_obs_alarm = True - # If there is a patch in the system (in any state) that is - # named some variation of "enable-dev-certificate", raise - # the 'developer certificate could allow for untrusted - # patches' alarm - if ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in patch_id: - raise_cert_alarm = True + if 'release_id' in rel_metadata and ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in rel_metadata['release_id']: + raise_cert_alarm = True - pip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, + dip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS, entity_instance_id) - if raise_pip_alarm and pip_alarm is None: - logging.info("Raising patch-in-progress alarm") - fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, + if raise_dip_alarm and dip_alarm is None: + LOG.info("Raising deploy-in-progress alarm") + fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS, alarm_type=fm_constants.FM_ALARM_TYPE_5, alarm_state=fm_constants.FM_ALARM_STATE_SET, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_instance_id=entity_instance_id, severity=fm_constants.FM_ALARM_SEVERITY_MINOR, - reason_text='Patching operation in progress', + reason_text='Software release deploy in progress', probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, - proposed_repair_action='Complete reboots of affected hosts', + proposed_repair_action='Complete release', service_affecting=False) self.fm_api.set_fault(fault) - elif not raise_pip_alarm and pip_alarm is not None: - logging.info("Clearing patch-in-progress alarm") - self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS, + elif not raise_dip_alarm and dip_alarm is not None: + LOG.info("Clearing deploy-in-progress alarm") + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS, entity_instance_id) - obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, + obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM, entity_instance_id) if raise_obs_alarm and obs_alarm is None: - logging.info("Raising obsolete-patch-in-system alarm") - fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, + LOG.info("Raising obsolete-patch-in-system alarm") + fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM, alarm_type=fm_constants.FM_ALARM_TYPE_5, alarm_state=fm_constants.FM_ALARM_STATE_SET, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, entity_instance_id=entity_instance_id, severity=fm_constants.FM_ALARM_SEVERITY_WARNING, - reason_text='Obsolete patch in system', + reason_text='Obsolete release in system', probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, - proposed_repair_action='Remove and delete obsolete patches', + proposed_repair_action='Delete unavailable releases', service_affecting=False) self.fm_api.set_fault(fault) elif not raise_obs_alarm and obs_alarm is not None: - logging.info("Clearing obsolete-patch-in-system alarm") - self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM, + LOG.info("Clearing obsolete-patch-in-system alarm") + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM, entity_instance_id) cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH, @@ -182,7 +176,7 @@ class PatchAlarmDaemon(object): self.fm_api.set_fault(fault) def _get_handle_failed_hosts(self): - url = "http://%s/patch/query_hosts" % self.api_addr + url = "http://%s/v1/deploy_host" % self.api_addr try: req = requests.get(url) @@ -195,27 +189,39 @@ class PatchAlarmDaemon(object): if req.status_code == 200: data = json.loads(req.text) - if 'data' in data: - for host in data['data']: - if 'hostname' in host and 'patch_failed' in host and host['patch_failed']: - failed_hosts.append(host['hostname']) + for host_metadata in data: + if 'host_state' in host_metadata: + if host_metadata['host_state'] in ['failed', 'rollback-failed']: + failed_hosts.append(host_metadata['hostname']) # Query existing alarms - patch_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED, - entity_instance_id) + deploy_host_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE, + entity_instance_id) if len(failed_hosts) > 0: - reason_text = "Patch installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts)) + reason_text = "Release installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts)) - if patch_failed_alarm is None or reason_text != patch_failed_alarm.reason_text: - if patch_failed_alarm is None: - logging.info("Raising patch-host-install-failure alarm") + if deploy_host_failed_alarm is None or reason_text != deploy_host_failed_alarm.reason_text: + if deploy_host_failed_alarm is None: + LOG.info("Raising deploy-host-failure alarm") else: - logging.info("Updating patch-host-install-failure alarm") + LOG.info("Updating deploy-host-failure alarm") - elif patch_failed_alarm is not None: - logging.info("Clearing patch-host-install-failure alarm") - self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED, + fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE, + alarm_type=fm_constants.FM_ALARM_TYPE_5, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, + entity_instance_id=entity_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MAJOR, + reason_text=reason_text, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, + proposed_repair_action='Undo software operation', + service_affecting=False) + self.fm_api.set_fault(fault) + + elif deploy_host_failed_alarm is not None: + LOG.info("Clearing patch-host-install-failure alarm") + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE, entity_instance_id) return False diff --git a/patch-alarm/patch-alarm/tox.ini b/patch-alarm/patch-alarm/tox.ini index ca048a6b..aeae68fa 100644 --- a/patch-alarm/patch-alarm/tox.ini +++ b/patch-alarm/patch-alarm/tox.ini @@ -36,10 +36,12 @@ install_command = pip install \ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt + -r{[tox]stxdir}/update/software/requirements.txt -e{[tox]stxdir}/config/sysinv/sysinv/sysinv -e{[tox]stxdir}/fault/fm-api/source -e{[tox]stxdir}/config/tsconfig/tsconfig -e{[tox]stxdir}/update/sw-patch/cgcs-patch + -e{[tox]stxdir}/update/software allowlist_externals = find sh diff --git a/software/software/constants.py b/software/software/constants.py index 954156ae..a6ad27e9 100644 --- a/software/software/constants.py +++ b/software/software/constants.py @@ -203,3 +203,5 @@ DC_VAULT_DIR = "/opt/dc-vault" DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks" + +ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'