
Fix pep8 issues and remove py27 section because there is no test defined.

Depends-On: I7c6bff4d8986c1fd75c3c9d353557c5eafcdcde0
Change-Id: I7b534e31868e53ec479c2321d6883604c12aa6d3
Signed-off-by: Daniel Badea <daniel.badea@windriver.com>

#
# Copyright (c) 2013-2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import time

# noinspection PyUnresolvedReferences
from fm_api import fm_api
# noinspection PyUnresolvedReferences
from fm_api import constants as fm_constants
# noinspection PyUnresolvedReferences
from oslo_log import log as logging

# noinspection PyProtectedMember
from ceph_manager.i18n import _
from ceph_manager.i18n import _LI
from ceph_manager.i18n import _LW
from ceph_manager.i18n import _LE

from ceph_manager import constants
from ceph_manager import exception

LOG = logging.getLogger(__name__)


# In 18.03 R5, ceph cache tiering was disabled and prevented from being
# re-enabled. When upgrading from 18.03 (R5) to R6 we need to remove the
# cache-tier from the crushmap ceph-cache-tiering
#
# This class is needed only when upgrading from R5 to R6
# TODO: remove it after 1st R6 release
#
class HandleUpgradesMixin(object):

    def __init__(self, service):
        self.service = service
        self.wait_for_upgrade_complete = False

    def setup(self, config):
        self._set_upgrade(self.service.retry_get_software_upgrade_status())

    def _set_upgrade(self, upgrade):
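        """Flag that monitoring must wait for an in-progress 18.03 upgrade.

        Sets wait_for_upgrade_complete when an upgrade from 18.03 is
        reported and has not yet reached the completed state.
        """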
        state = upgrade.get('state')
        from_version = upgrade.get('from_version')
        if (state
                and state != constants.UPGRADE_COMPLETED
                and from_version == constants.TITANIUM_SERVER_VERSION_18_03):

            LOG.info(_LI("Wait for ceph upgrade to complete "
                         "before monitoring cluster."))
            self.wait_for_upgrade_complete = True

    def set_flag_require_jewel_osds(self):
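        """Set the Ceph 'require_jewel_osds' flag through the REST API.

        Raises CephApiFailure if the API call fails and CephSetKeyFailure
        if Ceph rejects the request.
        """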
        try:
            response, body = self.service.ceph_api.osd_set_key(
                constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
                body='json')
            LOG.info(_LI("Set require_jewel_osds flag"))
        except IOError as e:
            raise exception.CephApiFailure(
                call="osd_set_key",
                reason=str(e))
        else:
            if not response.ok:
                raise exception.CephSetKeyFailure(
                    flag=constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
                    extra=_("needed to complete upgrade to Jewel"),
                    response_status_code=response.status_code,
                    response_reason=response.reason,
                    status=body.get('status'),
                    output=body.get('output'))

    def filter_health_status(self, health):
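        """Run auto-heal, then hide the require_jewel_osds warning.

        While waiting for the 18.03 upgrade to complete, a HEALTH_WARN
        caused only by the missing require_jewel_osds flag is filtered
        out of the reported health.
        """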
        health = self.auto_heal(health)
        # filter out require_jewel_osds warning
        #
        if not self.wait_for_upgrade_complete:
            return health
        if health['health'] != constants.CEPH_HEALTH_WARN:
            return health
        if (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
                not in health['detail']):
            return health
        return self._remove_require_jewel_osds_warning(health)

    def _remove_require_jewel_osds_warning(self, health):
        reasons_list = []
        for reason in health['detail'].split(';'):
            reason = reason.strip()
            if len(reason) == 0:
                continue
            if constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET \
                    in reason:
                continue
            reasons_list.append(reason)
        if len(reasons_list) == 0:
            health = {
                'health': constants.CEPH_HEALTH_OK,
                'detail': ''}
        else:
            health['detail'] = '; '.join(reasons_list)
        return health

    def auto_heal(self, health):
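        """Set require_jewel_osds once the 18.03 upgrade has completed.

        While the upgrade is still in progress the warning is only
        suppressed; when no upgrade is in progress or it has completed,
        the flag is set and the warning is removed from the health report.
        """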
        if (health['health'] == constants.CEPH_HEALTH_WARN
                and (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
                     in health['detail'])):
            try:
                upgrade = self.service.get_software_upgrade_status()
            except Exception as ex:
                LOG.warn(_LW(
                    "Getting software upgrade status failed "
                    "with: %s. Skip auto-heal attempt "
                    "(will retry on next ceph status poll).") % str(ex))
                return health
            state = upgrade.get('state')
            # suppress require_jewel_osds in case upgrade is
            # in progress but not completed or aborting
            if (not self.wait_for_upgrade_complete
                    and (upgrade.get('from_version')
                         == constants.TITANIUM_SERVER_VERSION_18_03)
                    and state not in [
                        None,
                        constants.UPGRADE_COMPLETED,
                        constants.UPGRADE_ABORTING,
                        constants.UPGRADE_ABORT_COMPLETING,
                        constants.UPGRADE_ABORTING_ROLLBACK]):
                self.wait_for_upgrade_complete = True
            # set require_jewel_osds in case upgrade is
            # not in progress or completed
            if (state in [None, constants.UPGRADE_COMPLETED]):
                LOG.warn(_LW(
                    "No upgrade in progress or upgrade completed "
                    "and require_jewel_osds health warning raised. "
                    "Set require_jewel_osds flag."))
                self.set_flag_require_jewel_osds()
                health = self._remove_require_jewel_osds_warning(health)
                LOG.info(_LI("Unsuppress require_jewel_osds health warning"))
                self.wait_for_upgrade_complete = False
            # unsuppress require_jewel_osds in case upgrade
            # is aborting
            if (state in [
                    constants.UPGRADE_ABORTING,
                    constants.UPGRADE_ABORT_COMPLETING,
                    constants.UPGRADE_ABORTING_ROLLBACK]):
                self.wait_for_upgrade_complete = False
        return health


class Monitor(HandleUpgradesMixin):

    def __init__(self, service):
        self.service = service
        self.current_ceph_health = ""
        self.tiers_size = {}
        self.known_object_pool_name = None
        self.primary_tier_name = constants.SB_TIER_DEFAULT_NAMES[
            constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX
        self.cluster_is_up = False
        super(Monitor, self).__init__(service)

    def setup(self, config):
        super(Monitor, self).setup(config)

    def run(self):
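        """Main monitoring loop.

        Blocks until the Ceph cluster reports an fsid, then polls cluster
        status and pool quotas every CEPH_HEALTH_CHECK_INTERVAL seconds.
        """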
        # Wait until Ceph cluster is up and we can get the fsid
        while True:
            try:
                self.ceph_get_fsid()
            except Exception:
                LOG.exception(
                    "Error getting fsid, will retry in %ss"
                    % constants.CEPH_HEALTH_CHECK_INTERVAL)
            if self.service.entity_instance_id:
                break
            time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)

        # Start monitoring ceph status
        while True:
            try:
                self.ceph_poll_status()
                self.ceph_poll_quotas()
            except Exception:
                LOG.exception(
                    "Error running periodic monitoring of ceph status, "
                    "will retry in %ss"
                    % constants.CEPH_HEALTH_CHECK_INTERVAL)
            time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)

    def ceph_get_fsid(self):
        # Check whether an alarm has already been raised
        self._get_current_alarms()
        if self.current_health_alarm:
            LOG.info(_LI("Current alarm: %s") %
                     str(self.current_health_alarm.__dict__))

        fsid = self._get_fsid()
        if not fsid:
            # Raise alarm - it will not have an entity_instance_id
            self._report_fault({'health': constants.CEPH_HEALTH_DOWN,
                                'detail': 'Ceph cluster is down.'},
                               fm_constants.FM_ALARM_ID_STORAGE_CEPH)
        else:
            # Clear alarm with no entity_instance_id
            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self.service.entity_instance_id = 'cluster=%s' % fsid

    def ceph_poll_status(self):
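        """Poll Ceph health and raise or clear the cluster health alarm."""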
        # get previous data every time in case:
        # * daemon restarted
        # * alarm was cleared manually but stored as raised in daemon
        self._get_current_alarms()
        if self.current_health_alarm:
            LOG.info(_LI("Current alarm: %s") %
                     str(self.current_health_alarm.__dict__))

        # get ceph health
        health = self._get_health()
        LOG.info(_LI("Current Ceph health: "
                     "%(health)s detail: %(detail)s") % health)

        health = self.filter_health_status(health)
        if health['health'] != constants.CEPH_HEALTH_OK:
            self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self._report_alarm_osds_health()
        else:
            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self.clear_all_major_critical()

    def filter_health_status(self, health):
        return super(Monitor, self).filter_health_status(health)

    def ceph_poll_quotas(self):
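        """Compare per-tier pool quotas against tier sizes.

        Raises a free-space alarm for a tier when the sum of its pool
        quotas does not match the tier size, and clears the alarm when
        they match or the tier is removed.
        """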
        self._get_current_alarms()
        if self.current_quota_alarms:
            LOG.info(_LI("Current quota alarms %s") %
                     self.current_quota_alarms)

        # Get the current size of each tier
        previous_tiers_size = self.tiers_size
        self.tiers_size = self._get_tiers_size()

        # Make sure any removed tiers have the alarms cleared
        for t in (set(previous_tiers_size) - set(self.tiers_size)):
            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                              "{0}.tier={1}".format(
                                  self.service.entity_instance_id,
                                  t[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]))

        # Check the quotas on each tier
        for tier in self.tiers_size:
            # Extract the tier name from the crush equivalent
            tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]

            if self.tiers_size[tier] == 0:
                LOG.info(_LI("'%s' tier cluster size not yet available")
                         % tier_name)
                continue

            pools_quota_sum = 0
            if tier == self.primary_tier_name:
                for pool in constants.CEPH_POOLS:
                    if (pool['pool_name'] ==
                            constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or
                            pool['pool_name'] ==
                            constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                        object_pool_name = self._get_object_pool_name()
                        if object_pool_name is None:
                            LOG.error("Rados gateway object data pool does "
                                      "not exist.")
                        else:
                            pools_quota_sum += \
                                self._get_osd_pool_quota(object_pool_name)
                    else:
                        pools_quota_sum += self._get_osd_pool_quota(
                            pool['pool_name'])
            else:
                for pool in constants.SB_TIER_CEPH_POOLS:
                    pool_name = "{0}-{1}".format(pool['pool_name'], tier_name)
                    pools_quota_sum += self._get_osd_pool_quota(pool_name)

            # Currently, there is only one pool on the additional tier(s),
            # therefore allow a quota of 0
            if (pools_quota_sum != self.tiers_size[tier] and
                    pools_quota_sum != 0):
                self._report_fault(
                    {'tier_name': tier_name,
                     'tier_eid': "{0}.tier={1}".format(
                         self.service.entity_instance_id,
                         tier_name)},
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
            else:
                self._clear_fault(
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                    "{0}.tier={1}".format(self.service.entity_instance_id,
                                          tier_name))

    # CEPH HELPERS

    def _get_fsid(self):
        try:
            response, fsid = self.service.ceph_api.fsid(
                body='text', timeout=30)
        except IOError as e:
            LOG.warning(_LW("ceph_api.fsid failed: %s") % str(e))
            self.cluster_is_up = False
            return None

        if not response.ok:
            LOG.warning(_LW("Get fsid failed: %s") % response.reason)
            self.cluster_is_up = False
            return None

        self.cluster_is_up = True
        return fsid.strip()

    def _get_health(self):
        try:
            # we use text since it has all info
            response, body = self.service.ceph_api.health(
                body='text', timeout=30)
        except IOError as e:
            LOG.warning(_LW("ceph_api.health failed: %s") % str(e))
            self.cluster_is_up = False
            return {'health': constants.CEPH_HEALTH_DOWN,
                    'detail': 'Ceph cluster is down.'}

        if not response.ok:
            LOG.warning(_LW("CEPH health check failed: %s") % response.reason)
            health_info = [constants.CEPH_HEALTH_DOWN, response.reason]
            self.cluster_is_up = False
        else:
            health_info = body.split(' ', 1)
            self.cluster_is_up = True

        health = health_info[0]

        if len(health_info) > 1:
            detail = health_info[1]
        else:
            detail = health_info[0]

        return {'health': health.strip(),
                'detail': detail.strip()}

    def _get_object_pool_name(self):
        if self.known_object_pool_name is None:
            response, body = self.service.ceph_api.osd_pool_get(
                constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL,
                "pg_num",
                body='json')

            if response.ok:
                self.known_object_pool_name = \
                    constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
                return self.known_object_pool_name

            response, body = self.service.ceph_api.osd_pool_get(
                constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER,
                "pg_num",
                body='json')

            if response.ok:
                self.known_object_pool_name = \
                    constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER
                return self.known_object_pool_name

        return self.known_object_pool_name

    def _get_osd_pool_quota(self, pool_name):
        try:
            resp, quota = self.service.ceph_api.osd_get_pool_quota(
                pool_name, body='json')
        except IOError:
            return 0

        if not resp.ok:
            LOG.error(_LE("Getting the quota for "
                          "%(name)s pool failed: %(reason)s") %
                      {"name": pool_name, "reason": resp.reason})
            return 0
        else:
            try:
                quota_gib = int(quota["output"]["quota_max_bytes"]) / (1024**3)
                return quota_gib
            except IOError:
                return 0

    # We have two root nodes: 'cache-tier' and 'storage-tier'.
    # To calculate the space that is used by the pools, we must only
    # use 'storage-tier'.
    # This function determines if a certain node is under a certain
    # tree.
    def host_is_in_root(self, search_tree, node, root_name):
        if node['type'] == 'root':
            if node['name'] == root_name:
                return True
            else:
                return False
        return self.host_is_in_root(search_tree,
                                    search_tree[node['parent']],
                                    root_name)

    # The information received from ceph is not properly
    # structured for efficient parsing and searching, so
    # it must be processed and transformed into a more
    # structured form.
    #
    # Input received from ceph is an array of nodes with the
    # following structure:
    #   [{'id': <node_id>, 'children': <array_of_children_ids>, ....},
    #    ...]
    #
    # We process this array and transform it into a dictionary
    # (for efficient access). The transformed "search tree" is a
    # dictionary with the following structure:
    #   {<node_id>: {'children': <array_of_children_ids>}}
    def _get_tiers_size(self):
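        """Return a dict mapping each crush tier name to its size in GiB.

        The size of a tier is the sum over its chassis of the smallest
        host size in that chassis, as reported by 'ceph osd df tree'.
        """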
        try:
            resp, body = self.service.ceph_api.osd_df(
                body='json',
                output_method='tree')
        except IOError:
            return {}
        if not resp.ok:
            LOG.error(_LE("Getting the cluster usage "
                          "information failed: %(reason)s - "
                          "%(body)s") % {"reason": resp.reason,
                                         "body": body})
            return {}

        # A node is a crushmap element: root, chassis, host, osd. Create a
        # dictionary for the nodes with the key as the id used for efficient
        # searching through nodes.
        #
        # For example: storage-0's node has one child node => OSD 0
        # {
        #     "id": -4,
        #     "name": "storage-0",
        #     "type": "host",
        #     "type_id": 1,
        #     "reweight": -1.000000,
        #     "kb": 51354096,
        #     "kb_used": 1510348,
        #     "kb_avail": 49843748,
        #     "utilization": 2.941047,
        #     "var": 1.480470,
        #     "pgs": 0,
        #     "children": [
        #         0
        #     ]
        # },
        search_tree = {}
        for node in body['output']['nodes']:
            search_tree[node['id']] = node

        # Extract the tiers as we will return a dict for the size of each tier
        tiers = {k: v for k, v in search_tree.items() if v['type'] == 'root'}

        # For each tier, traverse the hierarchy from the root->chassis->host.
        # Sum the host sizes to determine the overall size of the tier
        tier_sizes = {}
        for tier in tiers.values():
            tier_size = 0
            for chassis_id in tier['children']:
                chassis_size = 0
                chassis = search_tree[chassis_id]
                for host_id in chassis['children']:
                    host = search_tree[host_id]
                    if (chassis_size == 0 or
                            chassis_size > host['kb']):
                        chassis_size = host['kb']
                tier_size += chassis_size / (1024**2)
            tier_sizes[tier['name']] = tier_size

        return tier_sizes

    # ALARM HELPERS

    @staticmethod
    def _check_storage_group(osd_tree, group_id,
                             hosts, osds, fn_report_alarm):
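        """Evaluate replication health for one peer group.

        Classifies hosts with missing, down or out OSDs, builds a reason
        string and reports a major or critical alarm through
        fn_report_alarm for the degraded host or the whole group.
        """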
        reasons = set()
        degraded_hosts = set()
        severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL
        for host_id in hosts:
            if len(osds[host_id]) == 0:
                reasons.add(constants.ALARM_REASON_NO_OSD)
                degraded_hosts.add(host_id)
            else:
                for osd_id in osds[host_id]:
                    if osd_tree[osd_id]['status'] == 'up':
                        if osd_tree[osd_id]['reweight'] == 0.0:
                            reasons.add(constants.ALARM_REASON_OSDS_OUT)
                            degraded_hosts.add(host_id)
                        else:
                            severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
                    elif osd_tree[osd_id]['status'] == 'down':
                        reasons.add(constants.ALARM_REASON_OSDS_DOWN)
                        degraded_hosts.add(host_id)
        if constants.ALARM_REASON_OSDS_OUT in reasons \
                and constants.ALARM_REASON_OSDS_DOWN in reasons:
            reasons.add(constants.ALARM_REASON_OSDS_DOWN_OUT)
            reasons.remove(constants.ALARM_REASON_OSDS_OUT)
        if constants.ALARM_REASON_OSDS_DOWN in reasons \
                and constants.ALARM_REASON_OSDS_DOWN_OUT in reasons:
            reasons.remove(constants.ALARM_REASON_OSDS_DOWN)
        reason = "/".join(list(reasons))
        if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
            reason = "{} {}: {}".format(
                fm_constants.ALARM_CRITICAL_REPLICATION,
                osd_tree[group_id]['name'],
                reason)
        elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
            reason = "{} {}: {}".format(
                fm_constants.ALARM_MAJOR_REPLICATION,
                osd_tree[group_id]['name'],
                reason)
        if len(degraded_hosts) == 0:
            if len(hosts) < 2:
                fn_report_alarm(
                    osd_tree[group_id]['name'],
                    "{} {}: {}".format(
                        fm_constants.ALARM_MAJOR_REPLICATION,
                        osd_tree[group_id]['name'],
                        constants.ALARM_REASON_PEER_HOST_DOWN),
                    fm_constants.FM_ALARM_SEVERITY_MAJOR)
        elif len(degraded_hosts) == 1:
            fn_report_alarm(
                "{}.host={}".format(
                    osd_tree[group_id]['name'],
                    osd_tree[list(degraded_hosts)[0]]['name']),
                reason, severity)
        else:
            fn_report_alarm(
                osd_tree[group_id]['name'],
                reason, severity)

    def _check_storage_tier(self, osd_tree, tier_name, fn_report_alarm):
        for tier_id in osd_tree:
            if osd_tree[tier_id]['type'] != 'root':
                continue
            if osd_tree[tier_id]['name'] != tier_name:
                continue
            for group_id in osd_tree[tier_id]['children']:
                if osd_tree[group_id]['type'] != 'chassis':
                    continue
                if not osd_tree[group_id]['name'].startswith('group-'):
                    continue
                hosts = []
                osds = {}
                for host_id in osd_tree[group_id]['children']:
                    if osd_tree[host_id]['type'] != 'host':
                        continue
                    hosts.append(host_id)
                    osds[host_id] = []
                    for osd_id in osd_tree[host_id]['children']:
                        if osd_tree[osd_id]['type'] == 'osd':
                            osds[host_id].append(osd_id)
                self._check_storage_group(osd_tree, group_id, hosts,
                                          osds, fn_report_alarm)
            break

    def _current_health_alarm_equals(self, reason, severity):
        if not self.current_health_alarm:
            return False
        if getattr(self.current_health_alarm, 'severity', None) != severity:
            return False
        if getattr(self.current_health_alarm, 'reason_text', None) != reason:
            return False
        return True

    def _report_alarm_osds_health(self):
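        """Check OSD replication health and raise/clear peer group alarms.

        Walks the OSD tree for the storage tier, reports major/critical
        replication alarms per peer group and clears alarms that no
        longer apply.
        """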
        response, osd_tree = self.service.ceph_api.osd_tree(body='json')
        if not response.ok:
            LOG.error(_LE("Failed to retrieve Ceph OSD tree: "
                          "status_code: %(status_code)s, reason: %(reason)s") %
                      {"status_code": response.status_code,
                       "reason": response.reason})
            return
        osd_tree = dict([(n['id'], n) for n in osd_tree['output']['nodes']])
        alarms = []

        self._check_storage_tier(osd_tree, "storage-tier",
                                 lambda *args: alarms.append(args))

        old_alarms = {}
        for alarm_id in [
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL]:
            alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
            if not alarm_list:
                continue
            for alarm in alarm_list:
                if alarm.entity_instance_id not in old_alarms:
                    old_alarms[alarm.entity_instance_id] = []
                old_alarms[alarm.entity_instance_id].append(
                    (alarm.alarm_id, alarm.reason_text))

        for peer_group, reason, severity in alarms:
            if self._current_health_alarm_equals(reason, severity):
                continue
            alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
            if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
                alarm_critical_major = (
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
            entity_instance_id = (
                self.service.entity_instance_id + '.peergroup=' + peer_group)
            alarm_already_exists = False
            if entity_instance_id in old_alarms:
                for alarm_id, old_reason in old_alarms[entity_instance_id]:
                    if (reason == old_reason and
                            alarm_id == alarm_critical_major):
                        # if the alarm is exactly the same, we don't need
                        # to recreate it
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))
                        alarm_already_exists = True
                    elif (alarm_id == alarm_critical_major):
                        # if we change just the reason, then we just
                        # remove the alarm from the list so we don't
                        # remove it at the end of the function
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))

                if (len(old_alarms[entity_instance_id]) == 0):
                    del old_alarms[entity_instance_id]

            # in case the alarm is exactly the same, we skip the alarm set
            if alarm_already_exists is True:
                continue
            major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
            fault = fm_api.Fault(
                alarm_id=alarm_critical_major,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=entity_instance_id,
                severity=severity,
                reason_text=reason,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=major_repair_action,
                service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(_LI(
                    "Created storage alarm %(alarm_uuid)s - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "alarm_uuid": str(alarm_uuid),
                    "severity": str(severity),
                    "reason": reason,
                    "service_affecting": str(
                        constants.SERVICE_AFFECTING['HEALTH_WARN'])})
            else:
                LOG.error(_LE(
                    "Failed to create storage alarm - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "severity": str(severity),
                    "reason": reason,
                    "service_affecting": str(
                        constants.SERVICE_AFFECTING['HEALTH_WARN'])})

        for entity_instance_id in old_alarms:
            for alarm_id, old_reason in old_alarms[entity_instance_id]:
                self.service.fm_api.clear_fault(alarm_id, entity_instance_id)

    @staticmethod
    def _parse_reason(health):
        """Parse reason strings received from Ceph"""
        if health['health'] in constants.CEPH_STATUS_CUSTOM:
            # Don't parse reason messages that we added
            return "Storage Alarm Condition: %(health)s. %(detail)s" % health

        reasons_lst = health['detail'].split(';')

        parsed_reasons_text = ""

        # Check if PGs have issues - we can't safely store the entire message
        # as it tends to be long
        for reason in reasons_lst:
            if "pgs" in reason:
                parsed_reasons_text += "PGs are degraded/stuck or undersized"
                break

        # Extract recovery status
        parsed_reasons = [r.strip() for r in reasons_lst if 'recovery' in r]
        if parsed_reasons:
            parsed_reasons_text += ";" + ";".join(parsed_reasons)

        # We need to keep the most important parts of the messages when
        # storing them to fm alarms, therefore text between [] brackets is
        # truncated if max size is reached.

        # Add brackets, if needed
        if len(parsed_reasons_text):
            lbracket = " ["
            rbracket = "]"
        else:
            lbracket = ""
            rbracket = ""

        msg = {"head": "Storage Alarm Condition: ",
               "tail": ". Please check 'ceph -s' for more details."}
        max_size = constants.FM_ALARM_REASON_MAX_SIZE - \
            len(msg["head"]) - len(msg["tail"])

        return (
            msg['head'] +
            (health['health'] + lbracket
             + parsed_reasons_text)[:max_size - 1] +
            rbracket + msg['tail'])

    def _report_fault(self, health, alarm_id):
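        """Raise a health or quota alarm based on alarm_id.

        For the cluster health alarm the severity and reason are derived
        from the Ceph health report; for the free-space alarm a minor
        quota mismatch alarm is raised for the affected tier.
        """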
        if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
            new_severity = constants.SEVERITY[health['health']]
            new_reason_text = self._parse_reason(health)
            new_service_affecting = \
                constants.SERVICE_AFFECTING[health['health']]

            # Raise or update alarm if necessary
            if ((not self.current_health_alarm) or
                    (self.current_health_alarm.__dict__['severity'] !=
                        new_severity) or
                    (self.current_health_alarm.__dict__['reason_text'] !=
                        new_reason_text) or
                    (self.current_health_alarm.__dict__['service_affecting'] !=
                        str(new_service_affecting))):

                fault = fm_api.Fault(
                    alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                    alarm_type=fm_constants.FM_ALARM_TYPE_4,
                    alarm_state=fm_constants.FM_ALARM_STATE_SET,
                    entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                    entity_instance_id=self.service.entity_instance_id,
                    severity=new_severity,
                    reason_text=new_reason_text,
                    probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                    proposed_repair_action=constants.REPAIR_ACTION,
                    service_affecting=new_service_affecting)

                alarm_uuid = self.service.fm_api.set_fault(fault)
                if alarm_uuid:
                    LOG.info(_LI(
                        "Created storage alarm %(alarm_uuid)s - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                        "alarm_uuid": alarm_uuid,
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting})
                else:
                    LOG.error(_LE(
                        "Failed to create storage alarm - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting})

            # Log detailed reason for later analysis
            if (self.current_ceph_health != health['health'] or
                    self.detailed_health_reason != health['detail']):
                LOG.info(_LI("Ceph status changed: %(health)s "
                             "detailed reason: %(detail)s") % health)
                self.current_ceph_health = health['health']
                self.detailed_health_reason = health['detail']

        elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
              health['tier_eid'] not in self.current_quota_alarms):

            quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
                                 "sum of Ceph pool quotas does not match the "
                                 "tier size." % health['tier_name'])
            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=health['tier_eid'],
                severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
                reason_text=quota_reason_text,
                alarm_type=fm_constants.FM_ALARM_TYPE_7,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
                proposed_repair_action=(
                    "Update ceph storage pool quotas to use all available "
                    "cluster space for the %s tier." % health['tier_name']),
                service_affecting=False)

            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(_LI(
                    "Created storage quota alarm %(alarm_uuid)s. "
                    "Reason: %(reason)s") % {
                    "alarm_uuid": alarm_uuid, "reason": quota_reason_text})
            else:
                LOG.error(_LE("Failed to create quota "
                              "storage alarm. Reason: %s") %
                          quota_reason_text)

    def _clear_fault(self, alarm_id, entity_instance_id=None):
        # Only clear alarm if there is one already raised
        if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
                self.current_health_alarm):
            LOG.info(_LI("Clearing health alarm"))
            self.service.fm_api.clear_fault(
                fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                self.service.entity_instance_id)
        elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
                entity_instance_id in self.current_quota_alarms):
            LOG.info(_LI("Clearing quota alarm with entity_instance_id %s")
                     % entity_instance_id)
            self.service.fm_api.clear_fault(
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                entity_instance_id)

    def clear_critical_alarm(self, group_name):
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        if alarm_list:
            for alarm in range(len(alarm_list)):
                group_id = alarm_list[alarm].entity_instance_id.find("group-")
                group_instance_name = (
                    "group-" +
                    alarm_list[alarm].entity_instance_id[group_id + 6])
                if group_name == group_instance_name:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                        alarm_list[alarm].entity_instance_id)

    def clear_all_major_critical(self, group_name=None):
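        """Clear major and critical replication alarms.

        When group_name is given, only alarms for that peer group are
        cleared; otherwise all major and critical storage alarms are
        cleared.
        """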
        # clear major alarms
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR)
        if alarm_list:
            for alarm in range(len(alarm_list)):
                if group_name is not None:
                    group_id = (
                        alarm_list[alarm].entity_instance_id.find("group-"))
                    group_instance_name = (
                        "group-" +
                        alarm_list[alarm].entity_instance_id[group_id + 6])
                    if group_name == group_instance_name:
                        self.service.fm_api.clear_fault(
                            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                            alarm_list[alarm].entity_instance_id)
                else:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                        alarm_list[alarm].entity_instance_id)
        # clear critical alarms
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        if alarm_list:
            for alarm in range(len(alarm_list)):
                if group_name is not None:
                    group_id = (
                        alarm_list[alarm].entity_instance_id.find("group-"))
                    group_instance_name = (
                        "group-" +
                        alarm_list[alarm].entity_instance_id[group_id + 6])
                    if group_name == group_instance_name:
                        self.service.fm_api.clear_fault(
                            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                            alarm_list[alarm].entity_instance_id)
                else:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                        alarm_list[alarm].entity_instance_id)

    def _get_current_alarms(self):
        """Retrieve the currently raised health and quota alarms"""
        self.current_health_alarm = self.service.fm_api.get_fault(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH,
            self.service.entity_instance_id)
        quota_faults = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
        if quota_faults:
            self.current_quota_alarms = [f.entity_instance_id
                                         for f in quota_faults]
        else:
            self.current_quota_alarms = []