#
# Copyright (c) 2013-2018 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import time
# noinspection PyUnresolvedReferences
from fm_api import fm_api
# noinspection PyUnresolvedReferences
from fm_api import constants as fm_constants
# noinspection PyUnresolvedReferences
from oslo_log import log as logging
# noinspection PyProtectedMember
from ceph_manager.i18n import _
from ceph_manager.i18n import _LI
from ceph_manager.i18n import _LW
from ceph_manager.i18n import _LE
from ceph_manager import constants
from ceph_manager import exception
LOG = logging.getLogger(__name__)

# In 18.03 R5, ceph cache tiering was disabled and prevented from being
# re-enabled. When upgrading from 18.03 (R5) to R6 we need to remove the
# cache-tier entries (ceph-cache-tiering) from the crushmap.
#
# This class is needed only when upgrading from R5 to R6
# TODO: remove it after 1st R6 release
#
class HandleUpgradesMixin(object):
def __init__(self, service):
self.service = service
self.wait_for_upgrade_complete = False
def setup(self, config):
self._set_upgrade(self.service.retry_get_software_upgrade_status())
def _set_upgrade(self, upgrade):
state = upgrade.get('state')
from_version = upgrade.get('from_version')
if (state
and state != constants.UPGRADE_COMPLETED
and from_version == constants.TITANIUM_SERVER_VERSION_18_03):
LOG.info(_LI("Wait for ceph upgrade to complete "
"before monitoring cluster."))
self.wait_for_upgrade_complete = True
def set_flag_require_jewel_osds(self):
try:
response, body = self.service.ceph_api.osd_set_key(
constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
body='json')
LOG.info(_LI("Set require_jewel_osds flag"))
except IOError as e:
raise exception.CephApiFailure(
call="osd_set_key",
reason=str(e))
else:
if not response.ok:
raise exception.CephSetKeyFailure(
flag=constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
extra=_("needed to complete upgrade to Jewel"),
response_status_code=response.status_code,
response_reason=response.reason,
status=body.get('status'),
output=body.get('output'))
def filter_health_status(self, health):
health = self.auto_heal(health)
# filter out require_jewel_osds warning
#
if not self.wait_for_upgrade_complete:
return health
if health['health'] != constants.CEPH_HEALTH_WARN:
return health
if (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
not in health['detail']):
return health
return self._remove_require_jewel_osds_warning(health)
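    # Example of the filtering performed by the helper below (detail text is
    # illustrative, not the exact Ceph message): with a health dict like
    #   {'health': 'HEALTH_WARN',
    #    'detail': '<require_jewel_osds warning>; 1 pgs degraded'}
    # the reason matching CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET is
    # dropped, leaving
    #   {'health': 'HEALTH_WARN', 'detail': '1 pgs degraded'}
    # and, if no other reason remains, the result collapses to
    #   {'health': 'HEALTH_OK', 'detail': ''}.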
def _remove_require_jewel_osds_warning(self, health):
reasons_list = []
for reason in health['detail'].split(';'):
reason = reason.strip()
if len(reason) == 0:
continue
if constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET \
in reason:
continue
reasons_list.append(reason)
if len(reasons_list) == 0:
health = {
'health': constants.CEPH_HEALTH_OK,
'detail': ''}
else:
health['detail'] = '; '.join(reasons_list)
return health
def auto_heal(self, health):
if (health['health'] == constants.CEPH_HEALTH_WARN
and (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
in health['detail'])):
try:
upgrade = self.service.get_software_upgrade_status()
except Exception as ex:
                LOG.warning(_LW(
                    "Getting software upgrade status failed "
                    "with: %s. Skip auto-heal attempt "
                    "(will retry on next ceph status poll).") % str(ex))
return health
state = upgrade.get('state')
            # suppress require_jewel_osds in case upgrade is
            # in progress but not completed or aborting
if (not self.wait_for_upgrade_complete
and (upgrade.get('from_version')
== constants.TITANIUM_SERVER_VERSION_18_03)
and state not in [
None,
constants.UPGRADE_COMPLETED,
constants.UPGRADE_ABORTING,
constants.UPGRADE_ABORT_COMPLETING,
constants.UPGRADE_ABORTING_ROLLBACK]):
self.wait_for_upgrade_complete = True
            # set require_jewel_osds in case no upgrade is in progress
            # or the upgrade has completed
            if (state in [None, constants.UPGRADE_COMPLETED]):
                LOG.warning(_LW(
                    "No upgrade in progress or upgrade completed "
                    "and require_jewel_osds health warning raised. "
                    "Set require_jewel_osds flag."))
self.set_flag_require_jewel_osds()
health = self._remove_require_jewel_osds_warning(health)
LOG.info(_LI("Unsurpress require_jewel_osds health warning"))
self.wait_for_upgrade_complete = False
            # stop suppressing require_jewel_osds in case upgrade
            # is aborting
if (state in [
constants.UPGRADE_ABORTING,
constants.UPGRADE_ABORT_COMPLETING,
constants.UPGRADE_ABORTING_ROLLBACK]):
self.wait_for_upgrade_complete = False
return health


class Monitor(HandleUpgradesMixin):
def __init__(self, service):
self.service = service
        self.current_ceph_health = ""
        self.detailed_health_reason = ""
        self.tiers_size = {}
self.known_object_pool_name = None
self.primary_tier_name = constants.SB_TIER_DEFAULT_NAMES[
constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX
self.cluster_is_up = False
super(Monitor, self).__init__(service)
def setup(self, config):
super(Monitor, self).setup(config)
def run(self):
# Wait until Ceph cluster is up and we can get the fsid
while True:
try:
self.ceph_get_fsid()
except Exception:
LOG.exception(
"Error getting fsid, will retry in %ss"
% constants.CEPH_HEALTH_CHECK_INTERVAL)
if self.service.entity_instance_id:
break
time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)
# Start monitoring ceph status
while True:
try:
self.ceph_poll_status()
self.ceph_poll_quotas()
except Exception:
LOG.exception(
"Error running periodic monitoring of ceph status, "
"will retry in %ss"
% constants.CEPH_HEALTH_CHECK_INTERVAL)
time.sleep(constants.CEPH_HEALTH_CHECK_INTERVAL)
def ceph_get_fsid(self):
# Check whether an alarm has already been raised
self._get_current_alarms()
if self.current_health_alarm:
LOG.info(_LI("Current alarm: %s") %
str(self.current_health_alarm.__dict__))
fsid = self._get_fsid()
if not fsid:
# Raise alarm - it will not have an entity_instance_id
self._report_fault({'health': constants.CEPH_HEALTH_DOWN,
'detail': 'Ceph cluster is down.'},
fm_constants.FM_ALARM_ID_STORAGE_CEPH)
else:
# Clear alarm with no entity_instance_id
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
self.service.entity_instance_id = 'cluster=%s' % fsid
def ceph_poll_status(self):
# get previous data every time in case:
# * daemon restarted
# * alarm was cleared manually but stored as raised in daemon
self._get_current_alarms()
if self.current_health_alarm:
LOG.info(_LI("Current alarm: %s") %
str(self.current_health_alarm.__dict__))
# get ceph health
health = self._get_health()
LOG.info(_LI("Current Ceph health: "
"%(health)s detail: %(detail)s") % health)
health = self.filter_health_status(health)
if health['health'] != constants.CEPH_HEALTH_OK:
self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
self._report_alarm_osds_health()
else:
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
self.clear_all_major_critical()
def filter_health_status(self, health):
return super(Monitor, self).filter_health_status(health)
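    # Quota alarms raised below are keyed by an entity id of the form
    # "<cluster entity id>.tier=<tier name>", e.g. (illustrative):
    #   cluster=<fsid>.tier=storage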
def ceph_poll_quotas(self):
self._get_current_alarms()
if self.current_quota_alarms:
LOG.info(_LI("Current quota alarms %s") %
self.current_quota_alarms)
        # Get the current size of each tier
previous_tiers_size = self.tiers_size
self.tiers_size = self._get_tiers_size()
# Make sure any removed tiers have the alarms cleared
for t in (set(previous_tiers_size) - set(self.tiers_size)):
self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
"{0}.tier={1}".format(
self.service.entity_instance_id,
t[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]))
# Check the quotas on each tier
for tier in self.tiers_size:
# Extract the tier name from the crush equivalent
tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]
if self.tiers_size[tier] == 0:
LOG.info(_LI("'%s' tier cluster size not yet available")
% tier_name)
continue
pools_quota_sum = 0
if tier == self.primary_tier_name:
for pool in constants.CEPH_POOLS:
if (pool['pool_name'] ==
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or
pool['pool_name'] ==
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
object_pool_name = self._get_object_pool_name()
if object_pool_name is None:
LOG.error("Rados gateway object data pool does "
"not exist.")
else:
pools_quota_sum += \
self._get_osd_pool_quota(object_pool_name)
else:
pools_quota_sum += self._get_osd_pool_quota(
pool['pool_name'])
else:
for pool in constants.SB_TIER_CEPH_POOLS:
pool_name = "{0}-{1}".format(pool['pool_name'], tier_name)
pools_quota_sum += self._get_osd_pool_quota(pool_name)
            # Currently, there is only one pool on the additional tier(s),
            # therefore allow a quota of 0
if (pools_quota_sum != self.tiers_size[tier] and
pools_quota_sum != 0):
self._report_fault(
{'tier_name': tier_name,
'tier_eid': "{0}.tier={1}".format(
self.service.entity_instance_id,
tier_name)},
fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
else:
self._clear_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
"{0}.tier={1}".format(self.service.entity_instance_id,
tier_name))
# CEPH HELPERS
def _get_fsid(self):
try:
response, fsid = self.service.ceph_api.fsid(
body='text', timeout=30)
except IOError as e:
LOG.warning(_LW("ceph_api.fsid failed: %s") % str(e))
self.cluster_is_up = False
return None
if not response.ok:
LOG.warning(_LW("Get fsid failed: %s") % response.reason)
self.cluster_is_up = False
return None
self.cluster_is_up = True
return fsid.strip()
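    # _get_health() parses the plain-text 'ceph health' output, which starts
    # with the overall status followed by the detail, e.g. (illustrative):
    #   "HEALTH_WARN 1 pgs degraded; recovery 2/20 objects degraded (10.000%)"
    # is split into
    #   {'health': 'HEALTH_WARN',
    #    'detail': '1 pgs degraded; recovery 2/20 objects degraded (10.000%)'}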
def _get_health(self):
try:
# we use text since it has all info
response, body = self.service.ceph_api.health(
body='text', timeout=30)
except IOError as e:
LOG.warning(_LW("ceph_api.health failed: %s") % str(e))
self.cluster_is_up = False
return {'health': constants.CEPH_HEALTH_DOWN,
'detail': 'Ceph cluster is down.'}
if not response.ok:
LOG.warning(_LW("CEPH health check failed: %s") % response.reason)
health_info = [constants.CEPH_HEALTH_DOWN, response.reason]
self.cluster_is_up = False
else:
health_info = body.split(' ', 1)
self.cluster_is_up = True
health = health_info[0]
if len(health_info) > 1:
detail = health_info[1]
else:
detail = health_info[0]
return {'health': health.strip(),
'detail': detail.strip()}
def _get_object_pool_name(self):
if self.known_object_pool_name is None:
response, body = self.service.ceph_api.osd_pool_get(
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL,
"pg_num",
body='json')
if response.ok:
self.known_object_pool_name = \
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
return self.known_object_pool_name
response, body = self.service.ceph_api.osd_pool_get(
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER,
"pg_num",
body='json')
if response.ok:
self.known_object_pool_name = \
constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER
return self.known_object_pool_name
return self.known_object_pool_name
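    # The pool quota is reported by Ceph in bytes (quota_max_bytes) and is
    # converted to GiB below, e.g. 536870912000 bytes / 1024**3 = 500 GiB
    # (value is illustrative).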
def _get_osd_pool_quota(self, pool_name):
try:
resp, quota = self.service.ceph_api.osd_get_pool_quota(
pool_name, body='json')
except IOError:
return 0
if not resp.ok:
LOG.error(_LE("Getting the quota for "
"%(name)s pool failed:%(reason)s)") %
{"name": pool_name, "reason": resp.reason})
return 0
else:
            try:
                quota_gib = int(quota["output"]["quota_max_bytes"]) / (1024**3)
                return quota_gib
            except (KeyError, TypeError, ValueError):
                # malformed or missing quota data; treat as no quota
                return 0
# we have two root nodes 'cache-tier' and 'storage-tier'
# to calculate the space that is used by the pools, we must only
# use 'storage-tier'
# this function determines if a certain node is under a certain
# tree
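    # Illustrative example (names are hypothetical): for a crushmap parent
    # chain
    #   osd.0 -> storage-0 -> group-0 -> storage-tier
    # calling host_is_in_root() with the node for storage-0 walks the parent
    # links up to the root and returns True for root_name 'storage-tier',
    # while the same call with root_name 'cache-tier' returns False.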
def host_is_in_root(self, search_tree, node, root_name):
if node['type'] == 'root':
if node['name'] == root_name:
return True
else:
return False
return self.host_is_in_root(search_tree,
search_tree[node['parent']],
root_name)
# The information received from ceph is not properly
# structured for efficient parsing and searching, so
# it must be processed and transformed into a more
# structured form.
#
# Input received from ceph is an array of nodes with the
# following structure:
# [{'id':<node_id>, 'children':<array_of_children_ids>, ....},
# ...]
#
    # We process this array and transform it into a dictionary
    # (for efficient access). The transformed "search tree" is a
    # dictionary with the following structure:
    # {<node_id>: {'children': <array_of_children_ids>, ...}, ...}
def _get_tiers_size(self):
try:
resp, body = self.service.ceph_api.osd_df(
body='json',
output_method='tree')
        except IOError as e:
            LOG.warning(_LW("ceph_api.osd_df failed: %s") % str(e))
            return {}
if not resp.ok:
LOG.error(_LE("Getting the cluster usage "
"information failed: %(reason)s - "
"%(body)s") % {"reason": resp.reason,
"body": body})
return {}
# A node is a crushmap element: root, chassis, host, osd. Create a
# dictionary for the nodes with the key as the id used for efficient
# searching through nodes.
#
# For example: storage-0's node has one child node => OSD 0
# {
# "id": -4,
# "name": "storage-0",
# "type": "host",
# "type_id": 1,
# "reweight": -1.000000,
# "kb": 51354096,
# "kb_used": 1510348,
# "kb_avail": 49843748,
# "utilization": 2.941047,
# "var": 1.480470,
# "pgs": 0,
# "children": [
# 0
# ]
# },
search_tree = {}
for node in body['output']['nodes']:
search_tree[node['id']] = node
# Extract the tiers as we will return a dict for the size of each tier
tiers = {k: v for k, v in search_tree.items() if v['type'] == 'root'}
        # For each tier, traverse the hierarchy root->chassis->host. A chassis
        # is limited by its smallest host, so take the minimum host size per
        # chassis and sum the chassis sizes to determine the overall size of
        # the tier.
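        # Illustrative numbers (hypothetical): a chassis whose hosts report
        # 2 TiB and 1 TiB of raw space contributes min(2, 1) = 1 TiB; two
        # such chassis give a 2 TiB tier size (host 'kb' values are converted
        # to GiB below via kb / 1024**2).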
tier_sizes = {}
for tier in tiers.values():
tier_size = 0
for chassis_id in tier['children']:
chassis_size = 0
chassis = search_tree[chassis_id]
for host_id in chassis['children']:
host = search_tree[host_id]
if (chassis_size == 0 or
chassis_size > host['kb']):
chassis_size = host['kb']
tier_size += chassis_size / (1024**2)
tier_sizes[tier['name']] = tier_size
return tier_sizes
# ALARM HELPERS
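    # The replication alarms below assume the crushmap layout walked by these
    # checks (names illustrative):
    #   root "storage-tier"
    #     chassis "group-0"
    #       host "storage-0"  -> osd.0, osd.1
    #       host "storage-1"  -> osd.2, osd.3
    # _check_storage_tier() walks this tree and, per peer group, calls
    # _check_storage_group() to derive a severity and reason from the
    # up/down/out state of each host's OSDs.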
@staticmethod
def _check_storage_group(osd_tree, group_id,
hosts, osds, fn_report_alarm):
reasons = set()
degraded_hosts = set()
severity = fm_constants.FM_ALARM_SEVERITY_CRITICAL
for host_id in hosts:
if len(osds[host_id]) == 0:
reasons.add(constants.ALARM_REASON_NO_OSD)
degraded_hosts.add(host_id)
else:
for osd_id in osds[host_id]:
if osd_tree[osd_id]['status'] == 'up':
if osd_tree[osd_id]['reweight'] == 0.0:
reasons.add(constants.ALARM_REASON_OSDS_OUT)
degraded_hosts.add(host_id)
else:
severity = fm_constants.FM_ALARM_SEVERITY_MAJOR
elif osd_tree[osd_id]['status'] == 'down':
reasons.add(constants.ALARM_REASON_OSDS_DOWN)
degraded_hosts.add(host_id)
if constants.ALARM_REASON_OSDS_OUT in reasons \
and constants.ALARM_REASON_OSDS_DOWN in reasons:
reasons.add(constants.ALARM_REASON_OSDS_DOWN_OUT)
reasons.remove(constants.ALARM_REASON_OSDS_OUT)
if constants.ALARM_REASON_OSDS_DOWN in reasons \
and constants.ALARM_REASON_OSDS_DOWN_OUT in reasons:
reasons.remove(constants.ALARM_REASON_OSDS_DOWN)
reason = "/".join(list(reasons))
if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
reason = "{} {}: {}".format(
fm_constants.ALARM_CRITICAL_REPLICATION,
osd_tree[group_id]['name'],
reason)
elif severity == fm_constants.FM_ALARM_SEVERITY_MAJOR:
reason = "{} {}: {}".format(
fm_constants.ALARM_MAJOR_REPLICATION,
osd_tree[group_id]['name'],
reason)
if len(degraded_hosts) == 0:
if len(hosts) < 2:
fn_report_alarm(
osd_tree[group_id]['name'],
"{} {}: {}".format(
fm_constants.ALARM_MAJOR_REPLICATION,
osd_tree[group_id]['name'],
constants.ALARM_REASON_PEER_HOST_DOWN),
fm_constants.FM_ALARM_SEVERITY_MAJOR)
elif len(degraded_hosts) == 1:
fn_report_alarm(
"{}.host={}".format(
osd_tree[group_id]['name'],
osd_tree[list(degraded_hosts)[0]]['name']),
reason, severity)
else:
fn_report_alarm(
osd_tree[group_id]['name'],
reason, severity)
def _check_storage_tier(self, osd_tree, tier_name, fn_report_alarm):
for tier_id in osd_tree:
if osd_tree[tier_id]['type'] != 'root':
continue
if osd_tree[tier_id]['name'] != tier_name:
continue
for group_id in osd_tree[tier_id]['children']:
if osd_tree[group_id]['type'] != 'chassis':
continue
if not osd_tree[group_id]['name'].startswith('group-'):
continue
hosts = []
osds = {}
for host_id in osd_tree[group_id]['children']:
if osd_tree[host_id]['type'] != 'host':
continue
hosts.append(host_id)
osds[host_id] = []
for osd_id in osd_tree[host_id]['children']:
if osd_tree[osd_id]['type'] == 'osd':
osds[host_id].append(osd_id)
self._check_storage_group(osd_tree, group_id, hosts,
osds, fn_report_alarm)
break
def _current_health_alarm_equals(self, reason, severity):
if not self.current_health_alarm:
return False
if getattr(self.current_health_alarm, 'severity', None) != severity:
return False
if getattr(self.current_health_alarm, 'reason_text', None) != reason:
return False
return True
def _report_alarm_osds_health(self):
response, osd_tree = self.service.ceph_api.osd_tree(body='json')
if not response.ok:
LOG.error(_LE("Failed to retrieve Ceph OSD tree: "
"status_code: %(status_code)s, reason: %(reason)s") %
{"status_code": response.status_code,
"reason": response.reason})
return
osd_tree = dict([(n['id'], n) for n in osd_tree['output']['nodes']])
alarms = []
self._check_storage_tier(osd_tree, "storage-tier",
lambda *args: alarms.append(args))
old_alarms = {}
for alarm_id in [
fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL]:
alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
if not alarm_list:
continue
for alarm in alarm_list:
if alarm.entity_instance_id not in old_alarms:
old_alarms[alarm.entity_instance_id] = []
old_alarms[alarm.entity_instance_id].append(
(alarm.alarm_id, alarm.reason_text))
for peer_group, reason, severity in alarms:
if self._current_health_alarm_equals(reason, severity):
continue
alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
alarm_critical_major = (
fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
entity_instance_id = (
self.service.entity_instance_id + '.peergroup=' + peer_group)
alarm_already_exists = False
if entity_instance_id in old_alarms:
for alarm_id, old_reason in old_alarms[entity_instance_id]:
if (reason == old_reason and
alarm_id == alarm_critical_major):
# if the alarm is exactly the same, we don't need
# to recreate it
old_alarms[entity_instance_id].remove(
(alarm_id, old_reason))
alarm_already_exists = True
elif (alarm_id == alarm_critical_major):
# if we change just the reason, then we just remove the
# alarm from the list so we don't remove it at the
# end of the function
old_alarms[entity_instance_id].remove(
(alarm_id, old_reason))
if (len(old_alarms[entity_instance_id]) == 0):
del old_alarms[entity_instance_id]
# in case the alarm is exactly the same, we skip the alarm set
if alarm_already_exists is True:
continue
major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
fault = fm_api.Fault(
alarm_id=alarm_critical_major,
alarm_type=fm_constants.FM_ALARM_TYPE_4,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
entity_instance_id=entity_instance_id,
severity=severity,
reason_text=reason,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
proposed_repair_action=major_repair_action,
service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
alarm_uuid = self.service.fm_api.set_fault(fault)
if alarm_uuid:
LOG.info(_LI(
"Created storage alarm %(alarm_uuid)s - "
"severity: %(severity)s, reason: %(reason)s, "
"service_affecting: %(service_affecting)s") % {
"alarm_uuid": str(alarm_uuid),
"severity": str(severity),
"reason": reason,
"service_affecting": str(
constants.SERVICE_AFFECTING['HEALTH_WARN'])})
else:
LOG.error(_LE(
"Failed to create storage alarm - "
"severity: %(severity)s, reason: %(reason)s, "
"service_affecting: %(service_affecting)s") % {
"severity": str(severity),
"reason": reason,
"service_affecting": str(
constants.SERVICE_AFFECTING['HEALTH_WARN'])})
for entity_instance_id in old_alarms:
for alarm_id, old_reason in old_alarms[entity_instance_id]:
self.service.fm_api.clear_fault(alarm_id, entity_instance_id)
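    # Example of the reason condensation done by _parse_reason() (detail text
    # is illustrative): a detail string such as
    #   "64 pgs degraded; 64 pgs stuck unclean; recovery 2/20 objects degraded"
    # becomes, subject to FM_ALARM_REASON_MAX_SIZE truncation:
    #   "Storage Alarm Condition: HEALTH_WARN [PGs are degraded/stuck or
    #   undersized;recovery 2/20 objects degraded]. Please check 'ceph -s'
    #   for more details."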
@staticmethod
def _parse_reason(health):
"""Parse reason strings received from Ceph"""
if health['health'] in constants.CEPH_STATUS_CUSTOM:
# Don't parse reason messages that we added
return "Storage Alarm Condition: %(health)s. %(detail)s" % health
reasons_lst = health['detail'].split(';')
parsed_reasons_text = ""
# Check if PGs have issues - we can't safely store the entire message
# as it tends to be long
for reason in reasons_lst:
if "pgs" in reason:
parsed_reasons_text += "PGs are degraded/stuck or undersized"
break
# Extract recovery status
parsed_reasons = [r.strip() for r in reasons_lst if 'recovery' in r]
if parsed_reasons:
parsed_reasons_text += ";" + ";".join(parsed_reasons)
# We need to keep the most important parts of the messages when storing
# them to fm alarms, therefore text between [] brackets is truncated if
# max size is reached.
# Add brackets, if needed
if len(parsed_reasons_text):
lbracket = " ["
rbracket = "]"
else:
lbracket = ""
rbracket = ""
msg = {"head": "Storage Alarm Condition: ",
"tail": ". Please check 'ceph -s' for more details."}
max_size = constants.FM_ALARM_REASON_MAX_SIZE - \
len(msg["head"]) - len(msg["tail"])
return (
msg['head'] +
(health['health'] + lbracket
+ parsed_reasons_text)[:max_size - 1] +
rbracket + msg['tail'])
def _report_fault(self, health, alarm_id):
if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
new_severity = constants.SEVERITY[health['health']]
new_reason_text = self._parse_reason(health)
new_service_affecting = \
constants.SERVICE_AFFECTING[health['health']]
# Raise or update alarm if necessary
if ((not self.current_health_alarm) or
(self.current_health_alarm.__dict__['severity'] !=
new_severity) or
(self.current_health_alarm.__dict__['reason_text'] !=
new_reason_text) or
(self.current_health_alarm.__dict__['service_affecting'] !=
str(new_service_affecting))):
fault = fm_api.Fault(
alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
alarm_type=fm_constants.FM_ALARM_TYPE_4,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
entity_instance_id=self.service.entity_instance_id,
severity=new_severity,
reason_text=new_reason_text,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
proposed_repair_action=constants.REPAIR_ACTION,
service_affecting=new_service_affecting)
alarm_uuid = self.service.fm_api.set_fault(fault)
if alarm_uuid:
LOG.info(_LI(
"Created storage alarm %(alarm_uuid)s - "
"severity: %(severity)s, reason: %(reason)s, "
"service_affecting: %(service_affecting)s") % {
"alarm_uuid": alarm_uuid,
"severity": new_severity,
"reason": new_reason_text,
"service_affecting": new_service_affecting})
else:
LOG.error(_LE(
"Failed to create storage alarm - "
"severity: %(severity)s, reason: %(reason)s "
"service_affecting: %(service_affecting)s") % {
"severity": new_severity,
"reason": new_reason_text,
"service_affecting": new_service_affecting})
# Log detailed reason for later analysis
if (self.current_ceph_health != health['health'] or
self.detailed_health_reason != health['detail']):
LOG.info(_LI("Ceph status changed: %(health)s "
"detailed reason: %(detail)s") % health)
self.current_ceph_health = health['health']
self.detailed_health_reason = health['detail']
        elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
                health['tier_eid'] not in self.current_quota_alarms):
quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
"sum of Ceph pool quotas does not match the "
"tier size." % health['tier_name'])
fault = fm_api.Fault(
alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
entity_instance_id=health['tier_eid'],
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=quota_reason_text,
alarm_type=fm_constants.FM_ALARM_TYPE_7,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
proposed_repair_action=(
"Update ceph storage pool quotas to use all available "
"cluster space for the %s tier." % health['tier_name']),
service_affecting=False)
alarm_uuid = self.service.fm_api.set_fault(fault)
if alarm_uuid:
LOG.info(_LI(
"Created storage quota storage alarm %(alarm_uuid)s. "
"Reason: %(reason)s") % {
"alarm_uuid": alarm_uuid, "reason": quota_reason_text})
else:
LOG.error(_LE("Failed to create quota "
"storage alarm. Reason: %s") % quota_reason_text)
def _clear_fault(self, alarm_id, entity_instance_id=None):
# Only clear alarm if there is one already raised
if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
self.current_health_alarm):
LOG.info(_LI("Clearing health alarm"))
self.service.fm_api.clear_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH,
self.service.entity_instance_id)
elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
entity_instance_id in self.current_quota_alarms):
LOG.info(_LI("Clearing quota alarm with entity_instance_id %s")
% entity_instance_id)
self.service.fm_api.clear_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
entity_instance_id)
    def clear_critical_alarm(self, group_name):
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        if alarm_list:
            for alarm in alarm_list:
                group_id = alarm.entity_instance_id.find("group-")
                group_instance_name = (
                    "group-" +
                    alarm.entity_instance_id[group_id + 6])
                if group_name == group_instance_name:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                        alarm.entity_instance_id)
    def clear_all_major_critical(self, group_name=None):
        # clear major alarms
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR)
        if alarm_list:
            for alarm in alarm_list:
                if group_name is not None:
                    group_id = alarm.entity_instance_id.find("group-")
                    group_instance_name = (
                        "group-" +
                        alarm.entity_instance_id[group_id + 6])
                    if group_name == group_instance_name:
                        self.service.fm_api.clear_fault(
                            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                            alarm.entity_instance_id)
                else:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                        alarm.entity_instance_id)
        # clear critical alarms
        alarm_list = self.service.fm_api.get_faults_by_id(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        if alarm_list:
            for alarm in alarm_list:
                if group_name is not None:
                    group_id = alarm.entity_instance_id.find("group-")
                    group_instance_name = (
                        "group-" +
                        alarm.entity_instance_id[group_id + 6])
                    if group_name == group_instance_name:
                        self.service.fm_api.clear_fault(
                            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                            alarm.entity_instance_id)
                else:
                    self.service.fm_api.clear_fault(
                        fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL,
                        alarm.entity_instance_id)
def _get_current_alarms(self):
"""Retrieve currently raised alarm"""
self.current_health_alarm = self.service.fm_api.get_fault(
fm_constants.FM_ALARM_ID_STORAGE_CEPH,
self.service.entity_instance_id)
quota_faults = self.service.fm_api.get_faults_by_id(
fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
if quota_faults:
self.current_quota_alarms = [f.entity_instance_id
for f in quota_faults]
else:
self.current_quota_alarms = []