Fix CEPH_DOWN alarm clearing

After the changes in [1], it became possible to create an alarm
without the "cluster" information in the 'entity_instance_id'.

When the fsid cannot be obtained, a CEPH_DOWN alarm is created
without setting the 'entity_instance_id', and when ceph becomes
responsive and the fsid is obtained, this initially created
alarm is cleared.

However, if the FM service is unavailable, the alarm is maintained
and the 'entity_instance_id' is set to the cluster's fsid.
With this, ceph-manager will always look for alarms with
'entity_instance_id: cluster=<fsid>' and ignore alarms
created without 'entity_instance_id'.

Therefore, to solve the issue, when ceph is HEALTH_OK, ceph-manager
now also checks whether there is an alarm without the
'entity_instance_id', and if there is, that alarm is cleared.

[1]: https://review.opendev.org/c/starlingx/utilities/+/953994

Test Plan:
 - PASS: Fresh install on STD.
 - PASS: Reboot the active controller many times.
 - PASS: Force CEPH_DOWN alarms.
 - PASS: Check if there is any CEPH_DOWN alarm when
         ceph is HEALTH_OK.

Closes-Bug: 2129927

Change-Id: Ib4a1765caa7f38a7eb72b8d0b366048e98d82f1f
Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
This commit is contained in:
Erickson Silva de Oliveira
2025-10-24 16:34:46 -03:00
parent ecf7274598
commit ee48f23033

View File

@@ -139,9 +139,10 @@ class Monitor(HandleUpgradesMixin):
LOG.exception(
"Error getting fsid, will retry in %ss"
% constants.CEPH_HEALTH_CHECK_INTERVAL)
if self.service.entity_instance_id:
time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)
continue
else:
break
time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)
# Start monitoring ceph status
while True:
@@ -158,15 +159,11 @@ class Monitor(HandleUpgradesMixin):
def ceph_get_fsid(self):
# Check whether an alarm has already been raised
self._refresh_current_alarms()
if self.current_health_alarm:
LOG.info(_LI("Current alarm: %s") %
str(self.current_health_alarm.__dict__))
fsid = self._get_fsid()
if fsid:
# Clear alarm with no entity_instance_id
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
# Clear alarm without entity_instance_id
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH, "")
self.service.entity_instance_id = 'cluster=%s' % fsid
else:
health_info = {
@@ -176,15 +173,14 @@ class Monitor(HandleUpgradesMixin):
}
# Raise alarm - it will not have an entity_instance_id
self._report_fault(health_info, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
# Throws exception to get fsid again
raise Exception("Could not get ceph fsid.")
def ceph_poll_status(self):
# get previous data every time in case:
# * daemon restarted
# * alarm was cleared manually but stored as raised in daemon
self._refresh_current_alarms()
if self.current_health_alarm:
LOG.info(_LI("Current alarm: %s") %
str(self.current_health_alarm.__dict__))
health = self._get_health_detail()
@@ -194,6 +190,8 @@ class Monitor(HandleUpgradesMixin):
self._report_fault(health_info, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
else:
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
# Clear alarm without entity_instance_id
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH, "")
# Report OSD down/out even if ceph health is OK
self._report_alarm_osds_health()
@@ -572,13 +570,19 @@ class Monitor(HandleUpgradesMixin):
self.detailed_health_reason = health['checks']
def _clear_fault(self, alarm_id, entity_instance_id=None):
# Only clear alarm if there is one already raised
if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
self.current_health_alarm):
if entity_instance_id is None:
entity_instance_id = self.service.entity_instance_id
# Only clear the alarm if it exists
if self._get_fault(alarm_id, entity_instance_id):
LOG.info(_LI("Clearing health alarm"))
self.service.fm_api.clear_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH,
self.service.entity_instance_id)
alarm_id,
entity_instance_id)
def _get_fault(self, alarm_id, entity_instance_id):
return self.service.fm_api.get_fault(alarm_id, entity_instance_id)
def clear_critical_alarm(self, group_name):
alarm_list = self.service.fm_api.get_faults_by_id(
@@ -596,6 +600,8 @@ class Monitor(HandleUpgradesMixin):
def _refresh_current_alarms(self):
"""Retrieve currently raised alarm"""
self.current_health_alarm = self.service.fm_api.get_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH,
self.service.entity_instance_id)
self.current_health_alarm = self._get_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH,
self.service.entity_instance_id)
if self.current_health_alarm:
LOG.info(_LI("Current alarm: %s") %
str(self.current_health_alarm.__dict__))