Audit for deleted certificates

Cert-alarm feature was missing an audit for deleted
certificate resources. If an installed certificate was
deleted, the alarm would stay forever. This code change
audits (during the full audit mode, i.e., once a day)
for such conditions and deletes the related alarm.

To achieve this feature, various internal function
definitions had to be changed, resulting in some
code refactoring.

Test Plan:
PASS: Existing feature works as intended/designed
PASS: Deleting a certificate resource will clean up
      the alarm during the full audit cycle

Story: 2008946
Task: 42852

Signed-off-by: Sabeel Ansari <Sabeel.Ansari@windriver.com>
Change-Id: Ie9f9c056c1af6c0e1887d294ed8d1765532ae8f9
This commit is contained in:
Sabeel Ansari 2021-11-15 14:37:13 -05:00
parent 8e6ce5bf11
commit f0843c40d9
3 changed files with 191 additions and 105 deletions

View File

@ -6,7 +6,6 @@
from datetime import datetime
from datetime import timedelta
from fm_api import constants as fm_constants
from itertools import chain
from oslo_log import log
import re
from sysinv.cert_alarm import fm as fm_mgr
@ -32,15 +31,29 @@ class CertAlarmAudit(object):
# Reset both CERT_SNAPSHOT & ALARM_SNAPSHOT
utils.reset_cert_snapshot()
self.fm_obj.reset_alarms_snapshot()
self.fm_obj.reset_entityid_to_certname_map()
# Collect CERT_SNAPSHOT
# Collect snapshots
self.collect_cert_snapshot()
self.fm_obj.collect_all_cert_alarms()
# Update snapshots
"""
In order to correlate alarms with CERT_SNAPSHOT,
we need references to entity_instance_id and
alarm_uuids (if any alarms present). This is needed
to audit for deleted certificates
"""
# Needs entity_id present before auditing deleted certificates
# Do not change order
self.update_entity_ids_in_cert_snapshot()
self.audit_for_deleted_certificates()
utils.print_cert_snapshot()
self.fm_obj.print_alarms_snapshot()
self.compute_action_full_audit()
self.apply_action_full_audit()
LOG.info('cert-alarm full completed')
LOG.info('cert-alarm full audit completed')
def collect_cert_snapshot(self):
"""
@ -94,9 +107,18 @@ class CertAlarmAudit(object):
if entry[1] is not None:
utils.add_cert_snapshot(entry[0], entry[1], entry[2], entry[3])
def compute_action_full_audit(self):
def update_entity_ids_in_cert_snapshot(self):
for cert_name in utils.CERT_SNAPSHOT:
self.compute_action(cert_name)
entity_id = self.fm_obj.get_entity_instance_id(cert_name)
utils.update_cert_snapshot_field(cert_name,
utils.ENTITY_ID,
entity_id)
def apply_action_full_audit(self):
    """
    Apply the expiry-based alarm action to every certificate in
    utils.CERT_SNAPSHOT.

    The entity id stored in the snapshot entry is used when the key is
    present. It is derived through fm_obj.get_entity_instance_id() only
    when the entry has no such key, so the lookup is not repeated for
    certificates that already carry one.
    """
    for cert_name in utils.CERT_SNAPSHOT:
        snapshot = utils.CERT_SNAPSHOT[cert_name]
        if utils.ENTITY_ID in snapshot:
            entity_id = snapshot[utils.ENTITY_ID]
        else:
            # dict.get(key, default) would evaluate the default eagerly,
            # so the fallback is computed explicitly and only when needed.
            entity_id = self.fm_obj.get_entity_instance_id(cert_name)
        self.apply_action(cert_name, entity_id)
# ============== Active Alarm audit ===================
def run_active_alarm_audit(self):
@ -106,22 +128,26 @@ class CertAlarmAudit(object):
LOG.info('Running cert-alarm active_alarm_audit')
# Collect ALARM_SNAPSHOT
self.fm_obj.reset_alarms_snapshot()
self.fm_obj.collect_all_cert_alarms()
self.compute_action_active_alarms()
self.apply_action_active_alarms()
utils.print_cert_snapshot()
self.fm_obj.print_alarms_snapshot()
LOG.info('cert-alarm active_alarm_audit completed')
def compute_action_active_alarms(self):
# Create single list of expiring_soon & expired certs
all_active_alarms = list(chain(*self.fm_obj.ALARMS_SNAPSHOT.values()))
def apply_action_active_alarms(self):
for alarm_instance in self.fm_obj.ALARMS_SNAPSHOT:
entity_id = self.fm_obj.ALARMS_SNAPSHOT[alarm_instance]['ENTITY_ID']
cert_name = utils.get_cert_name_with_entity_id(entity_id)
if cert_name is not None:
# 1. First refresh expiry date snapshot data
self.refresh_expiry_data(cert_name)
for cert_name in all_active_alarms:
# 1. First refresh expiry date snapshot data
self.refresh_expiry_data(cert_name)
# 2. Now check dates and compute_action
self.compute_action(cert_name)
# 2. Now check dates and apply_action
self.apply_action(cert_name, entity_id)
def refresh_expiry_data(self, cert_name):
if cert_name not in utils.CERT_SNAPSHOT:
@ -167,9 +193,9 @@ class CertAlarmAudit(object):
time_params[name] = int(param)
return timedelta(**time_params)
def compute_action(self, cert_name):
def apply_action(self, cert_name, entity_id):
"""
Computes any action required based on parameters passed and calls FM API
Applies any action required based on parameters passed and calls FM API
Input: cert_name: Certificate name
entity_id: entity instance id passed on to the FM API calls
"""
if cert_name not in utils.CERT_SNAPSHOT:
@ -183,8 +209,8 @@ class CertAlarmAudit(object):
renew_before = None
if utils.SNAPSHOT_KEY_RENEW_BEFORE in snapshot:
renew_before = self.parse_time(snapshot[utils.SNAPSHOT_KEY_RENEW_BEFORE])
LOG.debug('cert_name=%s, expiry=%s, alarm_before=%s, renew_before=%s'
% (cert_name, expiry.days, alarm_before.days, renew_before.days))
LOG.debug('cert_name=%s, entity_id=%s, expiry=%s, alarm_before=%s, renew_before=%s'
% (cert_name, entity_id, expiry.days, alarm_before.days, renew_before.days))
days_to_expiry = expiry.days
alarm_before_days = alarm_before.days
@ -198,32 +224,64 @@ class CertAlarmAudit(object):
threshold = alarm_before_days
if days_to_expiry > threshold:
self.clear_expiring_soon(cert_name)
self.clear_expired(cert_name)
self.clear_expiring_soon(cert_name, entity_id)
self.clear_expired(cert_name, entity_id)
else:
if days_to_expiry < 0:
# Expired. Clear expiring-soon & raise expired
self.clear_expiring_soon(cert_name)
self.raise_expired(cert_name)
self.clear_expiring_soon(cert_name, entity_id)
self.raise_expired(cert_name, entity_id)
else:
self.raise_expiring_soon(cert_name)
self.raise_expiring_soon(cert_name, entity_id)
def raise_expiring_soon(self, cert_name):
self.fm_obj.set_fault(cert_name,
False,
fm_constants.FM_ALARM_STATE_SET)
def raise_expiring_soon(self, cert_name, entity_id):
    """
    Set the 'expiring soon' alarm on entity_id, provided
    alarm_override_check_passed() allows alarms for cert_name.
    """
    if not self.alarm_override_check_passed(cert_name):
        return
    alarm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON
    self.fm_obj.set_fault(entity_id, alarm_id, fm_constants.FM_ALARM_STATE_SET)
def clear_expiring_soon(self, cert_name):
self.fm_obj.set_fault(cert_name,
False,
fm_constants.FM_ALARM_STATE_CLEAR)
def clear_expiring_soon(self, cert_name, entity_id):
    """
    Clear the 'expiring soon' alarm on entity_id, provided
    alarm_override_check_passed() allows alarms for cert_name.
    """
    if not self.alarm_override_check_passed(cert_name):
        return
    alarm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON
    self.fm_obj.set_fault(entity_id, alarm_id, fm_constants.FM_ALARM_STATE_CLEAR)
def raise_expired(self, cert_name):
self.fm_obj.set_fault(cert_name,
True,
fm_constants.FM_ALARM_STATE_SET)
def raise_expired(self, cert_name, entity_id):
    """
    Set the 'expired' alarm on entity_id, provided
    alarm_override_check_passed() allows alarms for cert_name.
    """
    if not self.alarm_override_check_passed(cert_name):
        return
    alarm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRED
    self.fm_obj.set_fault(entity_id, alarm_id, fm_constants.FM_ALARM_STATE_SET)
def clear_expired(self, cert_name):
self.fm_obj.set_fault(cert_name,
True,
fm_constants.FM_ALARM_STATE_CLEAR)
def clear_expired(self, cert_name, entity_id):
    """
    Clear the 'expired' alarm on entity_id, provided
    alarm_override_check_passed() allows alarms for cert_name.
    """
    if not self.alarm_override_check_passed(cert_name):
        return
    alarm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRED
    self.fm_obj.set_fault(entity_id, alarm_id, fm_constants.FM_ALARM_STATE_CLEAR)
def alarm_override_check_passed(self, cert_name):
    '''
    Look up the alarm annotation override for a certificate.
    Input: cert_name: Certificate name
    Return: False when the snapshot entry's alarm annotation is
            'disabled'; True otherwise, including when cert_name has
            no entry in utils.CERT_SNAPSHOT.
    '''
    if cert_name not in utils.CERT_SNAPSHOT:
        return True  # nothing recorded for this cert: alarms stay enabled

    alarm_setting = utils.CERT_SNAPSHOT[cert_name].get(
        constants.CERT_ALARM_ANNOTATION_ALARM,
        constants.CERT_ALARM_DEFAULT_ANNOTATION_ALARM)
    if alarm_setting != 'disabled':
        return True

    LOG.info('Found annotation override, disabling alarm. Suppressing %s' %
             cert_name)
    return False
def audit_for_deleted_certificates(self):
    """
    Clear alarms that no longer map to any certificate.

    Every entry in fm_obj.ALARMS_SNAPSHOT is matched against
    utils.CERT_SNAPSHOT through its entity id. When no certificate
    carries that entity id, the alarm is cleared via fm_obj.set_fault().
    """
    LOG.info('Auditing for deleted certificates')
    for alarm in self.fm_obj.ALARMS_SNAPSHOT.values():
        entity_id = alarm['ENTITY_ID']
        if utils.get_cert_name_with_entity_id(entity_id) is not None:
            continue  # certificate still present, nothing to clean up

        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, which would make any leading
        # whitespace of the continuation line part of the logged message.
        LOG.info('Found alarm for entity %s, but no related '
                 'certificate resource' % entity_id)
        self.fm_obj.set_fault(entity_id,
                              alarm['ALARM_ID'],
                              fm_constants.FM_ALARM_STATE_CLEAR)

View File

@ -14,6 +14,8 @@ from sysinv.common import constants
LOG = log.getLogger(__name__)
CONF = cfg.CONF
ALARM_ID = 'ALARM_ID'
ENTITY_ID = 'ENTITY_ID'
EXPIRING_SOON = 'EXPIRING_SOON'
EXPIRED = 'EXPIRED'
@ -24,19 +26,18 @@ class FaultApiMgr(object):
self.fm_api = fm_api.FaultAPIs()
"""
After an audit is completed, ALARMS_SNAPSHOT stores all active alarms
ALARMS_SNAPSHOT is a dict of list.
ALARMS_SNAPSHOT is a dict of dict. Each entry is per certificate.
{
EXPIRING_SOON: [certname1, certname2,...]
EXPIRED: [certname7, certname8,...]
alarm_uuid_1: {
ALARM_ID: FM_ALARM_ID_CERT_EXPIRED or FM_ALARM_ID_CERT_EXPIRING_SOON
ENTITY_ID: entity_instance_id returned from FM API
}
alarm_uuid_2: {
...
}
}
"""
self.ALARMS_SNAPSHOT = {}
"""
Entity ID to cert_name mapping
Due to the nature of entity_id strings generated, we need a map
to lookup cert_name's in utils.CERT_SNAPSHOT during audits
"""
self.ENTITYID_TO_CERTNAME_MAP = {}
def get_entity_instance_id(self, cert_name):
"""
@ -65,21 +66,20 @@ class FaultApiMgr(object):
tmp_id.append("system.certificate.%s" % cert_name)
entity_id = ''.join(tmp_id)
self.ENTITYID_TO_CERTNAME_MAP[entity_id] = cert_name
return entity_id
def get_cert_name_from_entity_instance_id(self, instance_id):
if instance_id in self.ENTITYID_TO_CERTNAME_MAP:
return self.ENTITYID_TO_CERTNAME_MAP[instance_id]
else:
return 'Unknown'
@staticmethod
def get_mode(cert_name):
return 'ssl_ca' if 'ssl_ca' in cert_name else cert_name
def get_reason_text(self, cert_name, expired_flag):
def get_reason_text(self, entity_id, alrm_id):
txt = []
cert_name = utils.get_cert_name_with_entity_id(entity_id)
if cert_name is None:
LOG.eror('Error retrieving certificate from snapshot. Returning entity_id')
txt.append(entity_id)
return ''.join(txt)
if cert_name in utils.CERT_SNAPSHOT:
# Add entity related text
snapshot = utils.CERT_SNAPSHOT[cert_name]
@ -106,7 +106,7 @@ class FaultApiMgr(object):
txt.append(' ')
# Add Expired or Expiring
if expired_flag:
if alrm_id == fm_constants.FM_ALARM_ID_CERT_EXPIRED:
txt.append("expired.")
else:
expiry_date = snapshot[utils.SNAPSHOT_KEY_EXPDATE]
@ -122,9 +122,15 @@ class FaultApiMgr(object):
LOG.debug('Alarm text: %s' % txt_str)
return txt_str
def get_severity(self, cert_name, expired_flag):
alarm_severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL if expired_flag \
else fm_constants.FM_ALARM_SEVERITY_MAJOR
def get_severity(self, entity_id, alrm_id):
alarm_severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL if \
alrm_id == fm_constants.FM_ALARM_ID_CERT_EXPIRED \
else fm_constants.FM_ALARM_SEVERITY_MAJOR
cert_name = utils.get_cert_name_with_entity_id(entity_id)
if cert_name is None:
LOG.error('Error retrieving certificate from snapshot. Using default severity')
return alarm_severity
# Check for annotation overrides
if cert_name in utils.CERT_SNAPSHOT:
@ -136,62 +142,57 @@ class FaultApiMgr(object):
return alarm_severity
def set_fault(self, cert_name, expired_flag, state):
"""
def set_fault(self, entity_inst_id, alrm_id, state):
'''
Set Fault calls the FM API to raise or clear alarm
Params: cert-name: certificate name
expired_flag: True/False
Determines whether 'Expired' (True) or 'Expiring Soon' (False)
Also determines the severity Critical (True) or Major (False)
Params: entity_inst_id: entity id for alarm
alrm_id: fm_constants.FM_ALARM_ID_CERT_EXPIRED or
fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON
state: will determine SET or CLEAR
"""
alrm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRED if expired_flag \
else fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON
entity_inst_id = self.get_entity_instance_id(cert_name)
'''
# In case of API errors during data collection, we do not want to raise alarms with
# "unknown" UUID (because we will need to clear such alarms manually). In such a case,
# we log the error and skip the alarm raise. Subsequent audit runs will raise the alarms.
if "uuid=unknown" in entity_inst_id:
if entity_inst_id is None or "uuid=unknown" in entity_inst_id:
LOG.error('set_fault called for certificate %s with unknown UUID. Suppressing alarm' %
cert_name)
entity_inst_id)
return
try:
if state == fm_constants.FM_ALARM_STATE_SET:
# Raise alarm only if alarm does not already exist
if not self.fm_api.get_fault(alrm_id, entity_inst_id):
# Check for annotation override
if cert_name in utils.CERT_SNAPSHOT:
snapshot = utils.CERT_SNAPSHOT[cert_name]
if snapshot.get(constants.CERT_ALARM_ANNOTATION_ALARM,
constants.CERT_ALARM_DEFAULT_ANNOTATION_ALARM) == 'disabled':
LOG.info('Found annotation override, disabling alarm. Suppressing %s' %
cert_name)
return
fault = fm_api.Fault(
alarm_id=alrm_id,
alarm_state=state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_CERTIFICATE,
entity_instance_id=entity_inst_id,
severity=self.get_severity(cert_name, expired_flag),
reason_text=self.get_reason_text(cert_name, expired_flag),
severity=self.get_severity(entity_inst_id, alrm_id),
reason_text=self.get_reason_text(entity_inst_id, alrm_id),
alarm_type=fm_constants.FM_ALARM_TYPE_9,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_77,
proposed_repair_action="Renew certificate for entity identified",
suppression=False,
service_affecting=False)
LOG.info('Setting fault for cert_name=%s, expired_flag=%s, state=%s' %
(cert_name, expired_flag, state))
self.fm_api.set_fault(fault)
LOG.info('Setting fault for entity_id=%s, alarm_type=%s, state=%s' %
(entity_inst_id, alrm_id, state))
alarm_uuid = self.fm_api.set_fault(fault)
# Update CERT_SNAPSHOT
utils.update_cert_snapshot_field_with_entity_id(entity_inst_id,
utils.ALARM_UUID,
alarm_uuid)
else:
if self.fm_api.get_fault(alrm_id, entity_inst_id):
LOG.info('Setting fault for cert_name=%s, expired_flag=%s, state=%s' %
(cert_name, expired_flag, state))
LOG.info('Setting fault for entity_id=%s, alarm_type=%s, state=%s' %
(entity_inst_id, alrm_id, state))
self.fm_api.clear_fault(alrm_id, entity_inst_id)
# Update CERT_SNAPSHOT
utils.update_cert_snapshot_field_with_entity_id(entity_inst_id,
utils.ALARM_UUID,
"")
except Exception as e:
LOG.exception(e)
@ -210,11 +211,13 @@ class FaultApiMgr(object):
# Expiring Soon alarms
exp_soon_alarms = self.get_faults(False)
self.add_alarms_snapshot(EXPIRING_SOON, exp_soon_alarms)
self.add_alarms_snapshot(fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON,
exp_soon_alarms)
# Expired alarms
exprd_alarms = self.get_faults(True)
self.add_alarms_snapshot(EXPIRED, exprd_alarms)
self.add_alarms_snapshot(fm_constants.FM_ALARM_ID_CERT_EXPIRED,
exprd_alarms)
def reset_alarms_snapshot(self):
self.ALARMS_SNAPSHOT = {}
@ -222,16 +225,10 @@ class FaultApiMgr(object):
def print_alarms_snapshot(self):
LOG.info('Alarms snapshot = %s' % self.ALARMS_SNAPSHOT)
def add_alarms_snapshot(self, key, alarms):
cert_names = []
def add_alarms_snapshot(self, alarm_type, alarms):
if alarms:
for item in alarms:
cert_names.append(self.get_cert_name_from_entity_instance_id(item.entity_instance_id))
self.ALARMS_SNAPSHOT[key] = cert_names
def reset_entityid_to_certname_map(self):
self.ENTITYID_TO_CERTNAME_MAP = {}
def print_entityid_to_certname_map(self):
LOG.info('Entityid_to_certname map = %s' % self.ENTITYID_TO_CERTNAME_MAP)
self.ALARMS_SNAPSHOT[item.uuid] = {
ALARM_ID: alarm_type,
ENTITY_ID: item.entity_instance_id
}

View File

@ -40,6 +40,9 @@ MODE_SECRET = 'secret'
MODE_CERT_MGR = 'certmgr'
MODE_OTHER = 'other'
ALARM_UUID = 'alarm_uuid'
ENTITY_ID = 'entity_id'
CERT_SNAPSHOT = {}
"""
CERT_SNAPSHOT is a dict of dict. Each entry is per certificate.
@ -58,6 +61,8 @@ CERT_SNAPSHOT is a dict of dict. Each entry is per certificate.
mode_other: <other>
file_location: <filepath>
renewBefore: <renewBefore>
alarm_uuid: <alarm-uuid>
entity_id: <entity-instance-id>
}
certname2: {
...
@ -296,6 +301,32 @@ def add_cert_snapshot(certname, expirydate, annotation_data, mode_metadata):
CERT_SNAPSHOT[certname] = internaldict
def update_cert_snapshot_field(cert_name, key, value):
    """
    Store value under key in the CERT_SNAPSHOT entry of cert_name.

    When cert_name has no entry in CERT_SNAPSHOT, an error is logged
    and nothing is modified.
    """
    if cert_name in CERT_SNAPSHOT:
        LOG.debug('Updating CERT_SNAPSHOT cert_name=%s, key=%s, val=%s' % (cert_name, key, value))
        CERT_SNAPSHOT[cert_name][key] = value
        return
    LOG.error('Cannot find certificate %s in CERT_SNAPSHOT' % cert_name)
def update_cert_snapshot_field_with_entity_id(entity_id, key, value):
    """
    Update one snapshot field for the certificate identified by entity_id.

    The certificate name is resolved with get_cert_name_with_entity_id().
    When that lookup returns None, an error is logged and no update is made.
    """
    matched_name = get_cert_name_with_entity_id(entity_id)
    if matched_name is not None:
        update_cert_snapshot_field(matched_name, key, value)
        return
    LOG.error('Cannot find certificate with entity_id %s' % entity_id)
def get_cert_name_with_entity_id(entity_id):
    """
    Find the certificate whose snapshot entry carries the given entity id.

    Input: entity_id: value to compare against each entry's ENTITY_ID field
    Return: the name of the first matching CERT_SNAPSHOT entry, or None
            when entity_id is None or no entry matches.
    """
    if entity_id is None:
        # Entries without an ENTITY_ID field yield None from .get(), so a
        # None lookup would otherwise "match" an unrelated certificate.
        return None
    for cert_name, snapshot in CERT_SNAPSHOT.items():
        if snapshot.get(ENTITY_ID) == entity_id:
            return cert_name
    return None
def get_default_annotation_values():
return {
constants.CERT_ALARM_ANNOTATION_ALARM: