Use patch-alarm to raise software alarms
The patch-alarm package was previously used to raise useful sw-patch alarms. This change modifies the package to raise software alarms when the following states are identified: Raise 900.023 alarm when a software release is in progress Raise 900.021 alarm when a deploy fails on a host Raise 900.024 alarm when an unavailable release is present Keeps mechanism for raising 500.101 alarm (dev patch enabled) in case it is needed for USM in the future. Depends-On: https://review.opendev.org/c/starlingx/fault/+/931963 Depends-On: https://review.opendev.org/c/starlingx/nfv/+/931964 Test Plan: PASS: Deploy software release with no unwanted alarms PASS: Force a failed deploy host and verify alarm PASS: Deploy a major release and verify 900.024 alarm PASS: Use sw-manager to deploy major & minor releases Story: 2010676 Task: 51120 Change-Id: I36701e66bc99aeadd22befd1b0bf82030047256a Signed-off-by: mmachado <mmachado@windriver.com>
This commit is contained in:
parent
94cc0843ab
commit
5985fd9ab6
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Copyright (c) 2014-2023 Wind River Systems, Inc.
|
||||
Copyright (c) 2014-2024 Wind River Systems, Inc.
|
||||
|
||||
SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
@ -18,14 +18,14 @@ from daemon import runner
|
||||
from fm_api import constants as fm_constants
|
||||
from fm_api import fm_api
|
||||
|
||||
import cgcs_patch.config as cfg
|
||||
from cgcs_patch.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
|
||||
from cgcs_patch.patch_functions import configure_logging
|
||||
import software.config as cfg
|
||||
from software.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
|
||||
from software.software_functions import configure_logging
|
||||
from software.software_functions import LOG
|
||||
|
||||
###################
|
||||
# CONSTANTS
|
||||
###################
|
||||
LOG_FILE = '/var/log/patch-alarms.log'
|
||||
PID_FILE = '/var/run/patch-alarm-manager.pid'
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ class PatchAlarmDaemon(object):
|
||||
self._get_handle_failed_hosts()
|
||||
|
||||
def _handle_patch_alarms(self):
|
||||
url = "http://%s/patch/query" % self.api_addr
|
||||
url = "http://%s/v1/release" % self.api_addr
|
||||
|
||||
try:
|
||||
req = requests.get(url)
|
||||
@ -100,67 +100,61 @@ class PatchAlarmDaemon(object):
|
||||
|
||||
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller")
|
||||
|
||||
raise_pip_alarm = False
|
||||
raise_dip_alarm = False
|
||||
raise_obs_alarm = False
|
||||
raise_cert_alarm = False
|
||||
if req.status_code == 200:
|
||||
data = json.loads(req.text)
|
||||
|
||||
if 'pd' in data:
|
||||
for patch_id, metadata in data['pd'].items():
|
||||
if 'patchstate' in metadata and \
|
||||
(metadata['patchstate'] == 'Partial-Apply' or metadata['patchstate'] == 'Partial-Remove'):
|
||||
raise_pip_alarm = True
|
||||
if 'status' in metadata and \
|
||||
(metadata['status'] == 'OBS' or metadata['status'] == 'Obsolete'):
|
||||
for rel_metadata in data:
|
||||
if 'state' in rel_metadata:
|
||||
if rel_metadata['state'] in ['deploying', 'removing']:
|
||||
raise_dip_alarm = True
|
||||
elif rel_metadata['state'] == 'unavailable':
|
||||
raise_obs_alarm = True
|
||||
# If there is a patch in the system (in any state) that is
|
||||
# named some variation of "enable-dev-certificate", raise
|
||||
# the 'developer certificate could allow for untrusted
|
||||
# patches' alarm
|
||||
if ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in patch_id:
|
||||
raise_cert_alarm = True
|
||||
if 'release_id' in rel_metadata and ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in rel_metadata['release_id']:
|
||||
raise_cert_alarm = True
|
||||
|
||||
pip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
||||
dip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||
entity_instance_id)
|
||||
if raise_pip_alarm and pip_alarm is None:
|
||||
logging.info("Raising patch-in-progress alarm")
|
||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
||||
if raise_dip_alarm and dip_alarm is None:
|
||||
LOG.info("Raising deploy-in-progress alarm")
|
||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
|
||||
reason_text='Patching operation in progress',
|
||||
reason_text='Software release deploy in progress',
|
||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||
proposed_repair_action='Complete reboots of affected hosts',
|
||||
proposed_repair_action='Complete release',
|
||||
service_affecting=False)
|
||||
|
||||
self.fm_api.set_fault(fault)
|
||||
elif not raise_pip_alarm and pip_alarm is not None:
|
||||
logging.info("Clearing patch-in-progress alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
||||
elif not raise_dip_alarm and dip_alarm is not None:
|
||||
LOG.info("Clearing deploy-in-progress alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||
entity_instance_id)
|
||||
|
||||
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
||||
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||
entity_instance_id)
|
||||
if raise_obs_alarm and obs_alarm is None:
|
||||
logging.info("Raising obsolete-patch-in-system alarm")
|
||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
||||
LOG.info("Raising obsolete-patch-in-system alarm")
|
||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_constants.FM_ALARM_SEVERITY_WARNING,
|
||||
reason_text='Obsolete patch in system',
|
||||
reason_text='Obsolete release in system',
|
||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||
proposed_repair_action='Remove and delete obsolete patches',
|
||||
proposed_repair_action='Delete unavailable releases',
|
||||
service_affecting=False)
|
||||
|
||||
self.fm_api.set_fault(fault)
|
||||
elif not raise_obs_alarm and obs_alarm is not None:
|
||||
logging.info("Clearing obsolete-patch-in-system alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
||||
LOG.info("Clearing obsolete-patch-in-system alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||
entity_instance_id)
|
||||
|
||||
cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH,
|
||||
@ -182,7 +176,7 @@ class PatchAlarmDaemon(object):
|
||||
self.fm_api.set_fault(fault)
|
||||
|
||||
def _get_handle_failed_hosts(self):
|
||||
url = "http://%s/patch/query_hosts" % self.api_addr
|
||||
url = "http://%s/v1/deploy_host" % self.api_addr
|
||||
|
||||
try:
|
||||
req = requests.get(url)
|
||||
@ -195,27 +189,39 @@ class PatchAlarmDaemon(object):
|
||||
if req.status_code == 200:
|
||||
data = json.loads(req.text)
|
||||
|
||||
if 'data' in data:
|
||||
for host in data['data']:
|
||||
if 'hostname' in host and 'patch_failed' in host and host['patch_failed']:
|
||||
failed_hosts.append(host['hostname'])
|
||||
for host_metadata in data:
|
||||
if 'host_state' in host_metadata:
|
||||
if host_metadata['host_state'] in ['failed', 'rollback-failed']:
|
||||
failed_hosts.append(host_metadata['hostname'])
|
||||
|
||||
# Query existing alarms
|
||||
patch_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
|
||||
entity_instance_id)
|
||||
deploy_host_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||
entity_instance_id)
|
||||
|
||||
if len(failed_hosts) > 0:
|
||||
reason_text = "Patch installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
|
||||
reason_text = "Release installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
|
||||
|
||||
if patch_failed_alarm is None or reason_text != patch_failed_alarm.reason_text:
|
||||
if patch_failed_alarm is None:
|
||||
logging.info("Raising patch-host-install-failure alarm")
|
||||
if deploy_host_failed_alarm is None or reason_text != deploy_host_failed_alarm.reason_text:
|
||||
if deploy_host_failed_alarm is None:
|
||||
LOG.info("Raising deploy-host-failure alarm")
|
||||
else:
|
||||
logging.info("Updating patch-host-install-failure alarm")
|
||||
LOG.info("Updating deploy-host-failure alarm")
|
||||
|
||||
elif patch_failed_alarm is not None:
|
||||
logging.info("Clearing patch-host-install-failure alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
|
||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||
entity_instance_id=entity_instance_id,
|
||||
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||
reason_text=reason_text,
|
||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||
proposed_repair_action='Undo software operation',
|
||||
service_affecting=False)
|
||||
self.fm_api.set_fault(fault)
|
||||
|
||||
elif deploy_host_failed_alarm is not None:
|
||||
LOG.info("Clearing patch-host-install-failure alarm")
|
||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||
entity_instance_id)
|
||||
|
||||
return False
|
||||
|
@ -36,10 +36,12 @@ install_command = pip install \
|
||||
|
||||
deps = -r{toxinidir}/requirements.txt
|
||||
-r{toxinidir}/test-requirements.txt
|
||||
-r{[tox]stxdir}/update/software/requirements.txt
|
||||
-e{[tox]stxdir}/config/sysinv/sysinv/sysinv
|
||||
-e{[tox]stxdir}/fault/fm-api/source
|
||||
-e{[tox]stxdir}/config/tsconfig/tsconfig
|
||||
-e{[tox]stxdir}/update/sw-patch/cgcs-patch
|
||||
-e{[tox]stxdir}/update/software
|
||||
|
||||
allowlist_externals = find
|
||||
sh
|
||||
|
@ -203,3 +203,5 @@ DC_VAULT_DIR = "/opt/dc-vault"
|
||||
DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR
|
||||
DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
|
||||
PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks"
|
||||
|
||||
ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'
|
||||
|
Loading…
Reference in New Issue
Block a user