ceph-manager: raise alarms when OSD is down even if health OK

If ceph status reports HEALTH_OK, the OSD down alarm is not raised.
The same applies to the OSD out alarm.

Example scenario:
A disk might fail, which puts its OSD in the down state.
Ceph status shows a health warning, for example HEALTH_WARN.
An alarm is raised by ceph-manager.
After some time the disk will be marked out. Ceph health becomes
HEALTH_OK and alarms are cleared.
The user might never replace the disk, so the OSD stays down,
yet no alarm is raised.

Raise alarms even when status is HEALTH_OK to let the user know that
the OSDs are still down or out.

Closes-Bug: 1841903
Change-Id: I4380183ce0cd2e41fbf12d0f9f20a4328293882c
Signed-off-by: Dan Voiculeasa <dan.voiculeasa@windriver.com>
This commit is contained in:
Dan Voiculeasa 2019-09-10 08:50:39 -04:00
parent ed1ba16650
commit e32b790684
1 changed files with 3 additions and 42 deletions

View File

@ -224,10 +224,11 @@ class Monitor(HandleUpgradesMixin):
health = self.filter_health_status(health)
if health['health'] != constants.CEPH_HEALTH_OK:
self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
self._report_alarm_osds_health()
else:
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
self.clear_all_major_critical()
# Report OSD down/out even if ceph health is OK
self._report_alarm_osds_health()
def filter_health_status(self, health):
    """Delegate filtering of the health report to the parent class.

    :param health: health report, handed unchanged to the parent
        implementation
    :returns: whatever the parent class's filter_health_status returns
    """
    parent = super(Monitor, self)
    return parent.filter_health_status(health)
@ -829,46 +830,6 @@ class Monitor(HandleUpgradesMixin):
fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
alarm_list[alarm].entity_instance_id)
def clear_all_major_critical(self, group_name=None):
    """Clear raised Ceph storage alarms of MAJOR and CRITICAL severity.

    Faults are looked up through ``self.service.fm_api`` by alarm id and
    cleared one by one using their entity instance id.

    :param group_name: when None, every raised major/critical alarm is
        cleared; otherwise only alarms whose entity instance id names
        this group (the text "group-" followed by one character) are
        cleared.
    """
    fm_api = self.service.fm_api
    marker = "group-"
    # Major alarms are processed first, then critical ones.
    for alarm_id in (fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                     fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL):
        # An empty/None result simply means nothing is raised for this id.
        for fault in fm_api.get_faults_by_id(alarm_id) or ():
            instance_id = fault.entity_instance_id
            if group_name is not None:
                start = instance_id.find(marker)
                if start < 0:
                    # No group marker in the id, so this alarm cannot
                    # belong to the requested group.
                    continue
                suffix_at = start + len(marker)
                # Slice instead of index so an id ending right after the
                # marker yields "" rather than raising IndexError.
                # NOTE(review): only a single character after "group-" is
                # compared, so "group-10" is seen as "group-1" - confirm
                # whether multi-character group suffixes can occur.
                instance_group = marker + instance_id[suffix_at:suffix_at + 1]
                if group_name != instance_group:
                    continue
            fm_api.clear_fault(alarm_id, instance_id)
def _get_current_alarms(self):
"""Retrieve currently raised alarm"""
self.current_health_alarm = self.service.fm_api.get_fault(