Use patch-alarm to raise software alarms
The patch-alarm package was previously used to raise useful sw-patch alarms.
This change modifies the package to raise software alarms when the following
states are identified:

- Raise 900.023 alarm when a software release is in progress
- Raise 900.021 alarm when a deploy fails on a host
- Raise 900.024 alarm when an unavailable release is present

Keeps the mechanism for raising the 500.101 alarm (dev patch enabled) in case
it is needed for USM in the future.

Depends-On: https://review.opendev.org/c/starlingx/fault/+/931963
Depends-On: https://review.opendev.org/c/starlingx/nfv/+/931964

Test Plan:
PASS: Deploy software release with no unwanted alarms
PASS: Force a failed deploy host and verify alarm
PASS: Deploy a major release and verify 900.024 alarm
PASS: Use sw-manager to deploy major & minor releases

Story: 2010676
Task: 51120

Change-Id: I36701e66bc99aeadd22befd1b0bf82030047256a
Signed-off-by: mmachado <mmachado@windriver.com>
This commit is contained in:
parent
94cc0843ab
commit
5985fd9ab6
@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Copyright (c) 2014-2023 Wind River Systems, Inc.
|
Copyright (c) 2014-2024 Wind River Systems, Inc.
|
||||||
|
|
||||||
SPDX-License-Identifier: Apache-2.0
|
SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
@ -18,14 +18,14 @@ from daemon import runner
|
|||||||
from fm_api import constants as fm_constants
|
from fm_api import constants as fm_constants
|
||||||
from fm_api import fm_api
|
from fm_api import fm_api
|
||||||
|
|
||||||
import cgcs_patch.config as cfg
|
import software.config as cfg
|
||||||
from cgcs_patch.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
|
from software.constants import ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER
|
||||||
from cgcs_patch.patch_functions import configure_logging
|
from software.software_functions import configure_logging
|
||||||
|
from software.software_functions import LOG
|
||||||
|
|
||||||
###################
|
###################
|
||||||
# CONSTANTS
|
# CONSTANTS
|
||||||
###################
|
###################
|
||||||
LOG_FILE = '/var/log/patch-alarms.log'
|
|
||||||
PID_FILE = '/var/run/patch-alarm-manager.pid'
|
PID_FILE = '/var/run/patch-alarm-manager.pid'
|
||||||
|
|
||||||
|
|
||||||
@ -91,7 +91,7 @@ class PatchAlarmDaemon(object):
|
|||||||
self._get_handle_failed_hosts()
|
self._get_handle_failed_hosts()
|
||||||
|
|
||||||
def _handle_patch_alarms(self):
|
def _handle_patch_alarms(self):
|
||||||
url = "http://%s/patch/query" % self.api_addr
|
url = "http://%s/v1/release" % self.api_addr
|
||||||
|
|
||||||
try:
|
try:
|
||||||
req = requests.get(url)
|
req = requests.get(url)
|
||||||
@ -100,67 +100,61 @@ class PatchAlarmDaemon(object):
|
|||||||
|
|
||||||
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller")
|
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, "controller")
|
||||||
|
|
||||||
raise_pip_alarm = False
|
raise_dip_alarm = False
|
||||||
raise_obs_alarm = False
|
raise_obs_alarm = False
|
||||||
raise_cert_alarm = False
|
raise_cert_alarm = False
|
||||||
if req.status_code == 200:
|
if req.status_code == 200:
|
||||||
data = json.loads(req.text)
|
data = json.loads(req.text)
|
||||||
|
|
||||||
if 'pd' in data:
|
for rel_metadata in data:
|
||||||
for patch_id, metadata in data['pd'].items():
|
if 'state' in rel_metadata:
|
||||||
if 'patchstate' in metadata and \
|
if rel_metadata['state'] in ['deploying', 'removing']:
|
||||||
(metadata['patchstate'] == 'Partial-Apply' or metadata['patchstate'] == 'Partial-Remove'):
|
raise_dip_alarm = True
|
||||||
raise_pip_alarm = True
|
elif rel_metadata['state'] == 'unavailable':
|
||||||
if 'status' in metadata and \
|
|
||||||
(metadata['status'] == 'OBS' or metadata['status'] == 'Obsolete'):
|
|
||||||
raise_obs_alarm = True
|
raise_obs_alarm = True
|
||||||
# If there is a patch in the system (in any state) that is
|
if 'release_id' in rel_metadata and ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in rel_metadata['release_id']:
|
||||||
# named some variation of "enable-dev-certificate", raise
|
raise_cert_alarm = True
|
||||||
# the 'developer certificate could allow for untrusted
|
|
||||||
# patches' alarm
|
|
||||||
if ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER in patch_id:
|
|
||||||
raise_cert_alarm = True
|
|
||||||
|
|
||||||
pip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
dip_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
if raise_pip_alarm and pip_alarm is None:
|
if raise_dip_alarm and dip_alarm is None:
|
||||||
logging.info("Raising patch-in-progress alarm")
|
LOG.info("Raising deploy-in-progress alarm")
|
||||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||||
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
entity_instance_id=entity_instance_id,
|
entity_instance_id=entity_instance_id,
|
||||||
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
|
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
|
||||||
reason_text='Patching operation in progress',
|
reason_text='Software release deploy in progress',
|
||||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||||
proposed_repair_action='Complete reboots of affected hosts',
|
proposed_repair_action='Complete release',
|
||||||
service_affecting=False)
|
service_affecting=False)
|
||||||
|
|
||||||
self.fm_api.set_fault(fault)
|
self.fm_api.set_fault(fault)
|
||||||
elif not raise_pip_alarm and pip_alarm is not None:
|
elif not raise_dip_alarm and dip_alarm is not None:
|
||||||
logging.info("Clearing patch-in-progress alarm")
|
LOG.info("Clearing deploy-in-progress alarm")
|
||||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_IN_PROGRESS,
|
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_DEPLOY_IN_PROGRESS,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
|
|
||||||
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
obs_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
if raise_obs_alarm and obs_alarm is None:
|
if raise_obs_alarm and obs_alarm is None:
|
||||||
logging.info("Raising obsolete-patch-in-system alarm")
|
LOG.info("Raising obsolete-patch-in-system alarm")
|
||||||
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||||
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||||
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||||
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
entity_instance_id=entity_instance_id,
|
entity_instance_id=entity_instance_id,
|
||||||
severity=fm_constants.FM_ALARM_SEVERITY_WARNING,
|
severity=fm_constants.FM_ALARM_SEVERITY_WARNING,
|
||||||
reason_text='Obsolete patch in system',
|
reason_text='Obsolete release in system',
|
||||||
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||||
proposed_repair_action='Remove and delete obsolete patches',
|
proposed_repair_action='Delete unavailable releases',
|
||||||
service_affecting=False)
|
service_affecting=False)
|
||||||
|
|
||||||
self.fm_api.set_fault(fault)
|
self.fm_api.set_fault(fault)
|
||||||
elif not raise_obs_alarm and obs_alarm is not None:
|
elif not raise_obs_alarm and obs_alarm is not None:
|
||||||
logging.info("Clearing obsolete-patch-in-system alarm")
|
LOG.info("Clearing obsolete-patch-in-system alarm")
|
||||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_OBS_IN_SYSTEM,
|
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_RELEASE_OBS_IN_SYSTEM,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
|
|
||||||
cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH,
|
cert_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_NONSTANDARD_CERT_PATCH,
|
||||||
@ -182,7 +176,7 @@ class PatchAlarmDaemon(object):
|
|||||||
self.fm_api.set_fault(fault)
|
self.fm_api.set_fault(fault)
|
||||||
|
|
||||||
def _get_handle_failed_hosts(self):
|
def _get_handle_failed_hosts(self):
|
||||||
url = "http://%s/patch/query_hosts" % self.api_addr
|
url = "http://%s/v1/deploy_host" % self.api_addr
|
||||||
|
|
||||||
try:
|
try:
|
||||||
req = requests.get(url)
|
req = requests.get(url)
|
||||||
@ -195,27 +189,39 @@ class PatchAlarmDaemon(object):
|
|||||||
if req.status_code == 200:
|
if req.status_code == 200:
|
||||||
data = json.loads(req.text)
|
data = json.loads(req.text)
|
||||||
|
|
||||||
if 'data' in data:
|
for host_metadata in data:
|
||||||
for host in data['data']:
|
if 'host_state' in host_metadata:
|
||||||
if 'hostname' in host and 'patch_failed' in host and host['patch_failed']:
|
if host_metadata['host_state'] in ['failed', 'rollback-failed']:
|
||||||
failed_hosts.append(host['hostname'])
|
failed_hosts.append(host_metadata['hostname'])
|
||||||
|
|
||||||
# Query existing alarms
|
# Query existing alarms
|
||||||
patch_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
|
deploy_host_failed_alarm = self.fm_api.get_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
|
|
||||||
if len(failed_hosts) > 0:
|
if len(failed_hosts) > 0:
|
||||||
reason_text = "Patch installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
|
reason_text = "Release installation failed on the following hosts: %s" % ", ".join(sorted(failed_hosts))
|
||||||
|
|
||||||
if patch_failed_alarm is None or reason_text != patch_failed_alarm.reason_text:
|
if deploy_host_failed_alarm is None or reason_text != deploy_host_failed_alarm.reason_text:
|
||||||
if patch_failed_alarm is None:
|
if deploy_host_failed_alarm is None:
|
||||||
logging.info("Raising patch-host-install-failure alarm")
|
LOG.info("Raising deploy-host-failure alarm")
|
||||||
else:
|
else:
|
||||||
logging.info("Updating patch-host-install-failure alarm")
|
LOG.info("Updating deploy-host-failure alarm")
|
||||||
|
|
||||||
elif patch_failed_alarm is not None:
|
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||||
logging.info("Clearing patch-host-install-failure alarm")
|
alarm_type=fm_constants.FM_ALARM_TYPE_5,
|
||||||
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_PATCH_HOST_INSTALL_FAILED,
|
alarm_state=fm_constants.FM_ALARM_STATE_SET,
|
||||||
|
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
|
||||||
|
entity_instance_id=entity_instance_id,
|
||||||
|
severity=fm_constants.FM_ALARM_SEVERITY_MAJOR,
|
||||||
|
reason_text=reason_text,
|
||||||
|
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
|
||||||
|
proposed_repair_action='Undo software operation',
|
||||||
|
service_affecting=False)
|
||||||
|
self.fm_api.set_fault(fault)
|
||||||
|
|
||||||
|
elif deploy_host_failed_alarm is not None:
|
||||||
|
LOG.info("Clearing patch-host-install-failure alarm")
|
||||||
|
self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_USM_DEPLOY_HOST_FAILURE,
|
||||||
entity_instance_id)
|
entity_instance_id)
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
@ -36,10 +36,12 @@ install_command = pip install \
|
|||||||
|
|
||||||
deps = -r{toxinidir}/requirements.txt
|
deps = -r{toxinidir}/requirements.txt
|
||||||
-r{toxinidir}/test-requirements.txt
|
-r{toxinidir}/test-requirements.txt
|
||||||
|
-r{[tox]stxdir}/update/software/requirements.txt
|
||||||
-e{[tox]stxdir}/config/sysinv/sysinv/sysinv
|
-e{[tox]stxdir}/config/sysinv/sysinv/sysinv
|
||||||
-e{[tox]stxdir}/fault/fm-api/source
|
-e{[tox]stxdir}/fault/fm-api/source
|
||||||
-e{[tox]stxdir}/config/tsconfig/tsconfig
|
-e{[tox]stxdir}/config/tsconfig/tsconfig
|
||||||
-e{[tox]stxdir}/update/sw-patch/cgcs-patch
|
-e{[tox]stxdir}/update/sw-patch/cgcs-patch
|
||||||
|
-e{[tox]stxdir}/update/software
|
||||||
|
|
||||||
allowlist_externals = find
|
allowlist_externals = find
|
||||||
sh
|
sh
|
||||||
|
@ -203,3 +203,5 @@ DC_VAULT_DIR = "/opt/dc-vault"
|
|||||||
DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR
|
DC_VAULT_PLAYBOOK_DIR = "%s/playbooks" % DC_VAULT_DIR
|
||||||
DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
|
DC_VAULT_LOADS_DIR = "%s/loads" % DC_VAULT_DIR
|
||||||
PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks"
|
PLAYBOOKS_PATH = "/usr/share/ansible/stx-ansible/playbooks"
|
||||||
|
|
||||||
|
ENABLE_DEV_CERTIFICATE_PATCH_IDENTIFIER = 'ENABLE_DEV_CERTIFICATE'
|
||||||
|
Loading…
Reference in New Issue
Block a user