Fix CEPH_DOWN alarm clearing

After the changes in [1], it was possible to create an alarm without the "cluster" information in the 'entity_instance_id'. When the fsid cannot be obtained, a CEPH_DOWN alarm is created without setting the 'entity_instance_id', and when ceph becomes esponsive and the fsid is obtained, this initially created alarm is cleared. However, if the FM service is unavailable, the alarm is maintained and the 'entity_instance_id' is set to the cluster's fsid. With this, ceph-manager will always look for alarms with 'entity_instance_id: cluster=<fsid>' and ignore alarms created without 'entity_instance_id'. Therefore, to solve the issue, when ceph is HEALTH_OK, it will check if there is an alarm without the 'entity_instance_id', and if there is, it will be cleared. [1]: https://review.opendev.org/c/starlingx/utilities/+/953994 Test Plan: - PASS: Fresh install on STD. - PASS: Reboot the active controller many times. - PASS: Force CEPH_DOWN alarms. - PASS: Check if there is any CEPH_DOWN alarm when ceph is HEALTH_OK. Closes-Bug: 2129927 Change-Id: Ib4a1765caa7f38a7eb72b8d0b366048e98d82f1f Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
2025-10-24 16:34:46 -03:00
parent ecf7274598
commit ee48f23033
1 changed files with 25 additions and 19 deletions
--- a/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py
+++ b/ceph/ceph-manager/ceph-manager/ceph_manager/monitor.py
@@ -139,9 +139,10 @@ class Monitor(HandleUpgradesMixin):
                LOG.exception(
                    "Error getting fsid, will retry in %ss"
                    % constants.CEPH_HEALTH_CHECK_INTERVAL)
-            if self.service.entity_instance_id:
+                time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)
+                continue
+            else:
                break
-            time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)

        # Start monitoring ceph status
        while True:
@@ -158,15 +159,11 @@ class Monitor(HandleUpgradesMixin):
    def ceph_get_fsid(self):
        # Check whether an alarm has already been raised
        self._refresh_current_alarms()
-        if self.current_health_alarm:
-            LOG.info(_LI("Current alarm: %s") %
-                     str(self.current_health_alarm.__dict__))

        fsid = self._get_fsid()
-
        if fsid:
-            # Clear alarm with no entity_instance_id
-            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
+            # Clear alarm without entity_instance_id
+            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH, "")
            self.service.entity_instance_id = 'cluster=%s' % fsid
        else:
            health_info = {
@@ -176,15 +173,14 @@ class Monitor(HandleUpgradesMixin):
            }
            # Raise alarm - it will not have an entity_instance_id
            self._report_fault(health_info, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
+            # Throws exception to get fsid again
+            raise Exception("Could not get ceph fsid.")

    def ceph_poll_status(self):
        # get previous data every time in case:
        # * daemon restarted
        # * alarm was cleared manually but stored as raised in daemon
        self._refresh_current_alarms()
-        if self.current_health_alarm:
-            LOG.info(_LI("Current alarm: %s") %
-                     str(self.current_health_alarm.__dict__))

        health = self._get_health_detail()

@@ -194,6 +190,8 @@ class Monitor(HandleUpgradesMixin):
                self._report_fault(health_info, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            else:
                self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
+                # Clear alarm without entity_instance_id
+                self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH, "")

        # Report OSD down/out even if ceph health is OK
        self._report_alarm_osds_health()
@@ -572,13 +570,19 @@ class Monitor(HandleUpgradesMixin):
                self.detailed_health_reason = health['checks']

    def _clear_fault(self, alarm_id, entity_instance_id=None):
-        # Only clear alarm if there is one already raised
-        if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
-                self.current_health_alarm):
+        if entity_instance_id is None:
+            entity_instance_id = self.service.entity_instance_id
+
+        # Only clear the alarm if it exists
+        if self._get_fault(alarm_id, entity_instance_id):
            LOG.info(_LI("Clearing health alarm"))
+
            self.service.fm_api.clear_fault(
-                fm_constants.FM_ALARM_ID_STORAGE_CEPH,
-                self.service.entity_instance_id)
+                alarm_id,
+                entity_instance_id)
+
+    def _get_fault(self, alarm_id, entity_instance_id):
+        return self.service.fm_api.get_fault(alarm_id, entity_instance_id)

    def clear_critical_alarm(self, group_name):
        alarm_list = self.service.fm_api.get_faults_by_id(
@@ -596,6 +600,8 @@ class Monitor(HandleUpgradesMixin):

    def _refresh_current_alarms(self):
        """Retrieve currently raised alarm"""
-        self.current_health_alarm = self.service.fm_api.get_fault(
-            fm_constants.FM_ALARM_ID_STORAGE_CEPH,
-            self.service.entity_instance_id)
+        self.current_health_alarm = self._get_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH,
+                                                    self.service.entity_instance_id)
+        if self.current_health_alarm:
+            LOG.info(_LI("Current alarm: %s") %
+                     str(self.current_health_alarm.__dict__))