distcloud/dcorch/engine/alarm_aggregate_manager.py
Tao Liu 38f08a0d98 Decouple Fault Management from stx-config
The fault management (FM) APIs has been removed from the syinv API
service and a new FM API service has been introduced. This update adds
a new fm openstack driver for retrieving each region's alarm summary,
and it also modifies the alarm aggregate manager to use the fm driver.
In addition, it removes get alarm summary routine from sysinv and
adds the  fm user to the subcloud user list

Story: 2002828
Task: 22747

Signed-off-by: Tao Liu <tao.liu@windriver.com>
2018-08-23 14:59:38 -04:00

162 lines
6.2 KiB
Python

# Copyright 2016 Ericsson AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
from dcmanager.common import consts as dcm_consts
from dcorch.common import consts
from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common.i18n import _
from dcorch.common import manager
from dcorch.db import api as db_api
from dcorch.drivers.openstack import sdk_platform
from dcorch.drivers.openstack import sdk
from oslo_config import cfg
from oslo_log import log as logging
import threading
import time
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class AlarmAggregateManager(manager.Manager):
"""Manages tasks related to alarm aggregation"""
def __init__(self, *args, **kwargs):
LOG.debug(_('AlarmAggregateManager initialization...'))
super(AlarmAggregateManager, self).\
__init__(service_name="alarm_aggregate_manager", *args, **kwargs)
self.context = context.get_admin_context()
self.alarm_update_thread = PeriodicAlarmUpdate(self)
self.alarm_update_thread.start()
def shutdown(self):
self.alarm_update_thread.stop()
self.alarm_update_thread.join()
def enable_snmp(self, ctxt, subcloud_name):
LOG.info("Enabling fm-aggregation trap for region_name=%s" %
subcloud_name)
os_client = sdk_platform.OpenStackDriver(subcloud_name)
payload = {"ip_address": CONF.snmp.snmp_ip,
"community": CONF.snmp.snmp_comm_str}
try:
os_client.sysinv_client.snmp_trapdest_create(payload)
self.update_alarm_summary(self.context, subcloud_name)
except (exceptions.ConnectionRefused, exceptions.NotAuthorized,
exceptions.TimeOut):
LOG.info("snmp_trapdest_create exception Timeout region_name=%s" %
subcloud_name)
pass
except AttributeError:
LOG.info("snmp_trapdest_create AttributeError region_name=%s" %
subcloud_name)
pass
except exceptions.TrapDestAlreadyExists:
LOG.info("snmp_trapdest_create TrapDestAlreadyExists "
"region_name=%s payload %s" % (subcloud_name, payload))
pass
except Exception:
LOG.info("snmp_trapdest_create exception region_name=%s" %
subcloud_name)
pass
def update_alarm_summary(self, cntx, region_name):
LOG.info("Updating alarm summary for %s" % region_name)
try:
os_client = sdk.OpenStackDriver(region_name)
alarms = os_client.fm_client.get_alarm_summary()
alarm_updates = {'critical_alarms': alarms[0].critical,
'major_alarms': alarms[0].major,
'minor_alarms': alarms[0].minor,
'warnings': alarms[0].warnings}
alarm_updates = self._set_cloud_status(alarm_updates)
db_api.subcloud_alarms_update(self.context, region_name,
alarm_updates)
except Exception:
LOG.error('Failed to update alarms for %s' % region_name)
def _set_cloud_status(self, alarm_dict):
status = consts.ALARM_OK_STATUS
if (alarm_dict.get('major_alarms') > 0) or\
(alarm_dict.get('minor_alarms') > 0):
status = consts.ALARM_DEGRADED_STATUS
if (alarm_dict.get('critical_alarms') > 0):
status = consts.ALARM_CRITICAL_STATUS
alarm_dict['cloud_status'] = status
return alarm_dict
def get_alarm_summary(self, ctxt):
alarms = db_api.subcloud_alarms_get_all(self.context)
summary = []
for alarm in alarms:
alarm_dict = {'region_name': alarm['region_name'],
'uuid': alarm['uuid'],
'critical_alarms': alarm['critical_alarms'],
'major_alarms': alarm['major_alarms'],
'minor_alarms': alarm['minor_alarms'],
'warnings': alarm['warnings'],
'cloud_status': alarm['cloud_status']}
summary.append(alarm_dict)
return summary
class PeriodicAlarmUpdate(threading.Thread):
def __init__(self, parent):
super(PeriodicAlarmUpdate, self).__init__()
self.parent = parent
self.context = context.get_admin_context()
self._stop = threading.Event()
self.interval = CONF.snmp.alarm_audit_interval_time
self.system_last_update = datetime.datetime.now()
def run_updates(self):
while not self.stopped():
delta = (datetime.datetime.now() -
self.system_last_update).total_seconds()
if delta < self.interval:
time.sleep(1.0)
continue
try:
LOG.info('Running alarm summary update sync')
self.system_last_update = datetime.datetime.now()
subclouds = db_api.subcloud_get_all(self.context)
for subcloud in subclouds:
if self.stopped():
break
if subcloud['availability_status'] ==\
dcm_consts.AVAILABILITY_ONLINE:
self.parent.\
update_alarm_summary(self.context,
subcloud['region_name'])
except Exception:
pass
time.sleep(1.0)
LOG.info("Periodic Alarm Update Thread Stopped")
def stopped(self):
return self._stop.isSet()
def stop(self):
LOG.info("Periodic Alarm Update Thread Stopping")
self._stop.set()
def run(self):
self.run_updates()