From f0843c40d95aa82b4a9c75cedc970aade12e032b Mon Sep 17 00:00:00 2001 From: Sabeel Ansari Date: Mon, 15 Nov 2021 14:37:13 -0500 Subject: [PATCH] Audit for deleted certificates Cert-alarm feature was missing an audit for deleted certificate resources. If an installed certificate was deleted, the alarm would stay forever. This code change audits (during the full audit mode, i.e., once a day) for such conditions and deletes the related alarm. To achieve this feature, various internal function definitions had to be changed, resulting in some code refactoring. Test Plan: PASS: Existing feature works as intended/designed PASS: Deleting a certificate resource will cleanup the alarm during the full audit cycle Story: 2008946 Task: 42852 Signed-off-by: Sabeel Ansari Change-Id: Ie9f9c056c1af6c0e1887d294ed8d1765532ae8f9 --- .../sysinv/sysinv/sysinv/cert_alarm/audit.py | 142 ++++++++++++------ sysinv/sysinv/sysinv/sysinv/cert_alarm/fm.py | 123 ++++++++------- .../sysinv/sysinv/sysinv/cert_alarm/utils.py | 31 ++++ 3 files changed, 191 insertions(+), 105 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/cert_alarm/audit.py b/sysinv/sysinv/sysinv/sysinv/cert_alarm/audit.py index b6ea2b52fc..bbfaee7cd2 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_alarm/audit.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_alarm/audit.py @@ -6,7 +6,6 @@ from datetime import datetime from datetime import timedelta from fm_api import constants as fm_constants -from itertools import chain from oslo_log import log import re from sysinv.cert_alarm import fm as fm_mgr @@ -32,15 +31,29 @@ class CertAlarmAudit(object): # Reset both CERT_SNAPSHOT & ALARM_SNAPSHOT utils.reset_cert_snapshot() self.fm_obj.reset_alarms_snapshot() - self.fm_obj.reset_entityid_to_certname_map() - # Collect CERT_SNAPSHOT + # Collect snapshots self.collect_cert_snapshot() + self.fm_obj.collect_all_cert_alarms() + + # Update snapshots + """ + In order to correlate alarms with CERT_SNAPSHOT, + we need references to entity_instance_id 
and + alarm_uuids (if any alarms present). This is needed + to audit for deleted certificates + """ + # Needs entity_id present before auditing deleted certificates + # Do not change order + self.update_entity_ids_in_cert_snapshot() + self.audit_for_deleted_certificates() + utils.print_cert_snapshot() + self.fm_obj.print_alarms_snapshot() - self.compute_action_full_audit() + self.apply_action_full_audit() - LOG.info('cert-alarm full completed') + LOG.info('cert-alarm full audit completed') def collect_cert_snapshot(self): """ @@ -94,9 +107,18 @@ class CertAlarmAudit(object): if entry[1] is not None: utils.add_cert_snapshot(entry[0], entry[1], entry[2], entry[3]) - def compute_action_full_audit(self): + def update_entity_ids_in_cert_snapshot(self): for cert_name in utils.CERT_SNAPSHOT: - self.compute_action(cert_name) + entity_id = self.fm_obj.get_entity_instance_id(cert_name) + utils.update_cert_snapshot_field(cert_name, + utils.ENTITY_ID, + entity_id) + + def apply_action_full_audit(self): + for cert_name in utils.CERT_SNAPSHOT: + entity_id = utils.CERT_SNAPSHOT[cert_name].get(utils.ENTITY_ID, + self.fm_obj.get_entity_instance_id(cert_name)) + self.apply_action(cert_name, entity_id) # ============== Active Alarm audit =================== def run_active_alarm_audit(self): @@ -106,22 +128,26 @@ class CertAlarmAudit(object): LOG.info('Running cert-alarm active_alarm_audit') # Collect ALARM_SNAPSHOT + self.fm_obj.reset_alarms_snapshot() self.fm_obj.collect_all_cert_alarms() - self.compute_action_active_alarms() + self.apply_action_active_alarms() + + utils.print_cert_snapshot() + self.fm_obj.print_alarms_snapshot() LOG.info('cert-alarm active_alarm_audit completed') - def compute_action_active_alarms(self): - # Create single list of expiring_soon & expired certs - all_active_alarms = list(chain(*self.fm_obj.ALARMS_SNAPSHOT.values())) + def apply_action_active_alarms(self): + for alarm_instance in self.fm_obj.ALARMS_SNAPSHOT: + entity_id = 
self.fm_obj.ALARMS_SNAPSHOT[alarm_instance]['ENTITY_ID'] + cert_name = utils.get_cert_name_with_entity_id(entity_id) + if cert_name is not None: + # 1. First refresh expiry date snapshot data + self.refresh_expiry_data(cert_name) - for cert_name in all_active_alarms: - # 1. First refresh expiry date snapshot data - self.refresh_expiry_data(cert_name) - - # 2. Now check dates and compute_action - self.compute_action(cert_name) + # 2. Now check dates and apply_action + self.apply_action(cert_name, entity_id) def refresh_expiry_data(self, cert_name): if cert_name not in utils.CERT_SNAPSHOT: @@ -167,9 +193,9 @@ class CertAlarmAudit(object): time_params[name] = int(param) return timedelta(**time_params) - def compute_action(self, cert_name): + def apply_action(self, cert_name, entity_id): """ - Computes any action required based on parameters passed and calls FM API + Applies any action required based on parameters passed and calls FM API Input: cert_name: Certificate name """ if cert_name not in utils.CERT_SNAPSHOT: @@ -183,8 +209,8 @@ class CertAlarmAudit(object): renew_before = None if utils.SNAPSHOT_KEY_RENEW_BEFORE in snapshot: renew_before = self.parse_time(snapshot[utils.SNAPSHOT_KEY_RENEW_BEFORE]) - LOG.debug('cert_name=%s, expiry=%s, alarm_before=%s, renew_before=%s' - % (cert_name, expiry.days, alarm_before.days, renew_before.days)) + LOG.debug('cert_name=%s, entity_id=%s, expiry=%s, alarm_before=%s, renew_before=%s' + % (cert_name, entity_id, expiry.days, alarm_before.days, renew_before.days)) days_to_expiry = expiry.days alarm_before_days = alarm_before.days @@ -198,32 +224,64 @@ class CertAlarmAudit(object): threshold = alarm_before_days if days_to_expiry > threshold: - self.clear_expiring_soon(cert_name) - self.clear_expired(cert_name) + self.clear_expiring_soon(cert_name, entity_id) + self.clear_expired(cert_name, entity_id) else: if days_to_expiry < 0: # Expired. 
Clear expiring-soon & raise expired - self.clear_expiring_soon(cert_name) - self.raise_expired(cert_name) + self.clear_expiring_soon(cert_name, entity_id) + self.raise_expired(cert_name, entity_id) else: - self.raise_expiring_soon(cert_name) + self.raise_expiring_soon(cert_name, entity_id) - def raise_expiring_soon(self, cert_name): - self.fm_obj.set_fault(cert_name, - False, - fm_constants.FM_ALARM_STATE_SET) + def raise_expiring_soon(self, cert_name, entity_id): + if self.alarm_override_check_passed(cert_name): + self.fm_obj.set_fault(entity_id, + fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON, + fm_constants.FM_ALARM_STATE_SET) - def clear_expiring_soon(self, cert_name): - self.fm_obj.set_fault(cert_name, - False, - fm_constants.FM_ALARM_STATE_CLEAR) + def clear_expiring_soon(self, cert_name, entity_id): + if self.alarm_override_check_passed(cert_name): + self.fm_obj.set_fault(entity_id, + fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON, + fm_constants.FM_ALARM_STATE_CLEAR) - def raise_expired(self, cert_name): - self.fm_obj.set_fault(cert_name, - True, - fm_constants.FM_ALARM_STATE_SET) + def raise_expired(self, cert_name, entity_id): + if self.alarm_override_check_passed(cert_name): + self.fm_obj.set_fault(entity_id, + fm_constants.FM_ALARM_ID_CERT_EXPIRED, + fm_constants.FM_ALARM_STATE_SET) - def clear_expired(self, cert_name): - self.fm_obj.set_fault(cert_name, - True, - fm_constants.FM_ALARM_STATE_CLEAR) + def clear_expired(self, cert_name, entity_id): + if self.alarm_override_check_passed(cert_name): + self.fm_obj.set_fault(entity_id, + fm_constants.FM_ALARM_ID_CERT_EXPIRED, + fm_constants.FM_ALARM_STATE_CLEAR) + + def alarm_override_check_passed(self, cert_name): + ''' + Check for alarm overrides in annotation. 
+ Return: True for enabled, False for disabled alarms + ''' + if cert_name in utils.CERT_SNAPSHOT: + snapshot = utils.CERT_SNAPSHOT[cert_name] + if snapshot.get(constants.CERT_ALARM_ANNOTATION_ALARM, + constants.CERT_ALARM_DEFAULT_ANNOTATION_ALARM) == 'disabled': + LOG.info('Found annotation override, disabling alarm. Suppressing %s' % + cert_name) + return False + + return True # defaults to True (i.e., raise alarm) + + def audit_for_deleted_certificates(self): + LOG.info('Auditing for deleted certificates') + for alarm_instance in self.fm_obj.ALARMS_SNAPSHOT: + entity_id = self.fm_obj.ALARMS_SNAPSHOT[alarm_instance]['ENTITY_ID'] + cert_name = utils.get_cert_name_with_entity_id(entity_id) + if cert_name is None: + LOG.info('Found alarm for entity %s, but no related \ + certificate resource' % entity_id) + alarm_id = self.fm_obj.ALARMS_SNAPSHOT[alarm_instance]['ALARM_ID'] + self.fm_obj.set_fault(entity_id, + alarm_id, + fm_constants.FM_ALARM_STATE_CLEAR) diff --git a/sysinv/sysinv/sysinv/sysinv/cert_alarm/fm.py b/sysinv/sysinv/sysinv/sysinv/cert_alarm/fm.py index 1c987a0142..99d58a29e1 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_alarm/fm.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_alarm/fm.py @@ -14,6 +14,8 @@ from sysinv.common import constants LOG = log.getLogger(__name__) CONF = cfg.CONF +ALARM_ID = 'ALARM_ID' +ENTITY_ID = 'ENTITY_ID' EXPIRING_SOON = 'EXPIRING_SOON' EXPIRED = 'EXPIRED' @@ -24,19 +26,18 @@ class FaultApiMgr(object): self.fm_api = fm_api.FaultAPIs() """ After an audit is completed, ALARMS_SNAPSHOT stores all active alarms - ALARMS_SNAPSHOT is a dict of list. + ALARMS_SNAPSHOT is a dict of dict. Each entry is per certificate. { - EXPIRING_SOON: [certname1, certname2,...] - EXPIRED: [certname7, certname8,...] + alarm_uuid_1: { + ALARM_ID: FM_ALARM_ID_CERT_EXPIRED or FM_ALARM_ID_CERT_EXPIRING_SOON + ENTITY_ID: entity_instance_id returned from FM API + } + alarm_uuid_2: { + ... 
+ } } """ self.ALARMS_SNAPSHOT = {} - """ - Entity ID to cert_name mapping - Due to the nature of entity_id strings generated, we need a map - to lookup cert_name's in utils.CERT_SNAPSHOT during audits - """ - self.ENTITYID_TO_CERTNAME_MAP = {} def get_entity_instance_id(self, cert_name): """ @@ -65,21 +66,20 @@ class FaultApiMgr(object): tmp_id.append("system.certificate.%s" % cert_name) entity_id = ''.join(tmp_id) - self.ENTITYID_TO_CERTNAME_MAP[entity_id] = cert_name return entity_id - def get_cert_name_from_entity_instance_id(self, instance_id): - if instance_id in self.ENTITYID_TO_CERTNAME_MAP: - return self.ENTITYID_TO_CERTNAME_MAP[instance_id] - else: - return 'Unknown' - @staticmethod def get_mode(cert_name): return 'ssl_ca' if 'ssl_ca' in cert_name else cert_name - def get_reason_text(self, cert_name, expired_flag): + def get_reason_text(self, entity_id, alrm_id): txt = [] + cert_name = utils.get_cert_name_with_entity_id(entity_id) + if cert_name is None: + LOG.error('Error retrieving certificate from snapshot. 
Returning entity_id') + txt.append(entity_id) + return ''.join(txt) + if cert_name in utils.CERT_SNAPSHOT: # Add entity related text snapshot = utils.CERT_SNAPSHOT[cert_name] @@ -106,7 +106,7 @@ class FaultApiMgr(object): txt.append(' ') # Add Expired or Expiring - if expired_flag: + if alrm_id == fm_constants.FM_ALARM_ID_CERT_EXPIRED: txt.append("expired.") else: expiry_date = snapshot[utils.SNAPSHOT_KEY_EXPDATE] @@ -122,9 +122,15 @@ class FaultApiMgr(object): LOG.debug('Alarm text: %s' % txt_str) return txt_str - def get_severity(self, cert_name, expired_flag): - alarm_severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL if expired_flag \ - else fm_constants.FM_ALARM_SEVERITY_MAJOR + def get_severity(self, entity_id, alrm_id): + alarm_severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL if \ + alrm_id == fm_constants.FM_ALARM_ID_CERT_EXPIRED \ + else fm_constants.FM_ALARM_SEVERITY_MAJOR + + cert_name = utils.get_cert_name_with_entity_id(entity_id) + if cert_name is None: + LOG.error('Error retrieving certificate from snapshot. 
Using default severity') + return alarm_severity # Check for annotation overrides if cert_name in utils.CERT_SNAPSHOT: @@ -136,62 +142,57 @@ class FaultApiMgr(object): return alarm_severity - def set_fault(self, cert_name, expired_flag, state): - """ + def set_fault(self, entity_inst_id, alrm_id, state): + ''' Set Fault calls the FM API to raise or clear alarm - Params: cert-name: certificate name - expired_flag: True/False - Determines whether 'Expired' (True) or 'Expiring Soon' (False) - Also determines the severity Critical (True) or Major (False) + Params: entity_inst_id: entity id for alarm + alrm_id: fm_constants.FM_ALARM_ID_CERT_EXPIRED or + fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON state: will determine SET or CLEAR - """ - - alrm_id = fm_constants.FM_ALARM_ID_CERT_EXPIRED if expired_flag \ - else fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON - entity_inst_id = self.get_entity_instance_id(cert_name) + ''' # If case of api errors during data collection, we do not want to raise alarms with # "unknown" UUID (because we will need to clear such alarms manually). In such a case, # we log the error and skip the alarm raise. Subsequent audit runs will raise the alarms. - if "uuid=unknown" in entity_inst_id: + if entity_inst_id is None or "uuid=unknown" in entity_inst_id: LOG.error('set_fault called for certificate %s with unknown UUID. Suppressing alarm' % - cert_name) + entity_inst_id) return try: if state == fm_constants.FM_ALARM_STATE_SET: # Raise alarm only if alarm does not already exist if not self.fm_api.get_fault(alrm_id, entity_inst_id): - # Check for annotation override - if cert_name in utils.CERT_SNAPSHOT: - snapshot = utils.CERT_SNAPSHOT[cert_name] - if snapshot.get(constants.CERT_ALARM_ANNOTATION_ALARM, - constants.CERT_ALARM_DEFAULT_ANNOTATION_ALARM) == 'disabled': - LOG.info('Found annotation override, disabling alarm. 
Suppressing %s' % - cert_name) - return - fault = fm_api.Fault( alarm_id=alrm_id, alarm_state=state, entity_type_id=fm_constants.FM_ENTITY_TYPE_CERTIFICATE, entity_instance_id=entity_inst_id, - severity=self.get_severity(cert_name, expired_flag), - reason_text=self.get_reason_text(cert_name, expired_flag), + severity=self.get_severity(entity_inst_id, alrm_id), + reason_text=self.get_reason_text(entity_inst_id, alrm_id), alarm_type=fm_constants.FM_ALARM_TYPE_9, probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_77, proposed_repair_action="Renew certificate for entity identified", suppression=False, service_affecting=False) - LOG.info('Setting fault for cert_name=%s, expired_flag=%s, state=%s' % - (cert_name, expired_flag, state)) - self.fm_api.set_fault(fault) + LOG.info('Setting fault for entity_id=%s, alarm_type=%s, state=%s' % + (entity_inst_id, alrm_id, state)) + alarm_uuid = self.fm_api.set_fault(fault) + # Update CERT_SNAPSHOT + utils.update_cert_snapshot_field_with_entity_id(entity_inst_id, + utils.ALARM_UUID, + alarm_uuid) else: if self.fm_api.get_fault(alrm_id, entity_inst_id): - LOG.info('Setting fault for cert_name=%s, expired_flag=%s, state=%s' % - (cert_name, expired_flag, state)) + LOG.info('Setting fault for entity_id=%s, alarm_type=%s, state=%s' % + (entity_inst_id, alrm_id, state)) self.fm_api.clear_fault(alrm_id, entity_inst_id) + # Update CERT_SNAPSHOT + utils.update_cert_snapshot_field_with_entity_id(entity_inst_id, + utils.ALARM_UUID, + "") + except Exception as e: LOG.exception(e) @@ -210,11 +211,13 @@ class FaultApiMgr(object): # Expiring Soon alarms exp_soon_alarms = self.get_faults(False) - self.add_alarms_snapshot(EXPIRING_SOON, exp_soon_alarms) + self.add_alarms_snapshot(fm_constants.FM_ALARM_ID_CERT_EXPIRING_SOON, + exp_soon_alarms) # Expired alarms exprd_alarms = self.get_faults(True) - self.add_alarms_snapshot(EXPIRED, exprd_alarms) + self.add_alarms_snapshot(fm_constants.FM_ALARM_ID_CERT_EXPIRED, + exprd_alarms) def 
reset_alarms_snapshot(self): self.ALARMS_SNAPSHOT = {} @@ -222,16 +225,10 @@ class FaultApiMgr(object): def print_alarms_snapshot(self): LOG.info('Alarms snapshot = %s' % self.ALARMS_SNAPSHOT) - def add_alarms_snapshot(self, key, alarms): - cert_names = [] + def add_alarms_snapshot(self, alarm_type, alarms): if alarms: for item in alarms: - cert_names.append(self.get_cert_name_from_entity_instance_id(item.entity_instance_id)) - - self.ALARMS_SNAPSHOT[key] = cert_names - - def reset_entityid_to_certname_map(self): - self.ENTITYID_TO_CERTNAME_MAP = {} - - def print_entityid_to_certname_map(self): - LOG.info('Entityid_to_certname map = %s' % self.ENTITYID_TO_CERTNAME_MAP) + self.ALARMS_SNAPSHOT[item.uuid] = { + ALARM_ID: alarm_type, + ENTITY_ID: item.entity_instance_id + } diff --git a/sysinv/sysinv/sysinv/sysinv/cert_alarm/utils.py b/sysinv/sysinv/sysinv/sysinv/cert_alarm/utils.py index c888bdd112..470c40d7b2 100644 --- a/sysinv/sysinv/sysinv/sysinv/cert_alarm/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/cert_alarm/utils.py @@ -40,6 +40,9 @@ MODE_SECRET = 'secret' MODE_CERT_MGR = 'certmgr' MODE_OTHER = 'other' +ALARM_UUID = 'alarm_uuid' +ENTITY_ID = 'entity_id' + CERT_SNAPSHOT = {} """ CERT_SNAPSHOT is a dict of dict. Each entry is per certificate. @@ -58,6 +61,8 @@ CERT_SNAPSHOT is a dict of dict. Each entry is per certificate. mode_other: file_location: renewBefore: + alarm_uuid: + entity_id: } certname2: { ... 
@@ -296,6 +301,32 @@ def add_cert_snapshot(certname, expirydate, annotation_data, mode_metadata): CERT_SNAPSHOT[certname] = internaldict +def update_cert_snapshot_field(cert_name, key, value): + global CERT_SNAPSHOT + if cert_name not in CERT_SNAPSHOT: + LOG.error('Cannot find certificate %s in CERT_SNAPSHOT' % cert_name) + else: + LOG.debug('Updating CERT_SNAPSHOT cert_name=%s, key=%s, val=%s' % (cert_name, key, value)) + CERT_SNAPSHOT[cert_name][key] = value + + +def update_cert_snapshot_field_with_entity_id(entity_id, key, value): + cert_name = get_cert_name_with_entity_id(entity_id) + if cert_name is None: + LOG.error('Cannot find certificate with entity_id %s' % entity_id) + else: + update_cert_snapshot_field(cert_name, key, value) + + +def get_cert_name_with_entity_id(entity_id): + global CERT_SNAPSHOT + for cert_name in CERT_SNAPSHOT: + if CERT_SNAPSHOT[cert_name].get(ENTITY_ID) == entity_id: + return cert_name + + return None + + def get_default_annotation_values(): return { constants.CERT_ALARM_ANNOTATION_ALARM: