Merge "Add new collectd plugin to monitor a service status"

This commit is contained in:
Zuul 2021-12-01 20:00:24 +00:00 committed by Gerrit Code Review
commit 1fe30c7fe8
5 changed files with 222 additions and 1 deletions

View File

@ -23,5 +23,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/ovs_interface.py \
$PKG_BASE/src/ovs_interface.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
$PKG_BASE/src/example.conf \
$PKG_BASE/src/service_res.py \
$PKG_BASE/src/service_res.conf"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -25,6 +25,7 @@ Source16: interface.py
Source17: remotels.py
Source18: ptp.py
Source19: ovs_interface.py
Source20: service_res.py
# collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf
@ -36,6 +37,7 @@ Source106: interface.conf
Source107: remotels.conf
Source108: ptp.conf
Source109: ovs_interface.conf
Source110: service_res.conf
BuildRequires: systemd-devel
@ -91,6 +93,7 @@ install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE19} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE20} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d/starlingx
@ -103,6 +106,7 @@ install -m 600 %{SOURCE106} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE107} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE108} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE109} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE110} %{buildroot}%{local_starlingx_plugin_dir}
%clean
rm -rf $RPM_BUILD_ROOT

View File

@ -19,6 +19,7 @@ LoadPlugin python
</Module>
Import "ovs_interface"
Import "remotels"
Import "service_res"
LogTraces = true
Encoding "utf-8"
</Plugin>

View File

@ -0,0 +1,3 @@
# service_res plugin is self contained. It runs under the collectd
# extension framework but manages the alarm by itself.
# The empty config here allows the collectd to schedule the plugin.

View File

@ -0,0 +1,211 @@
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
import collectd
import os
import subprocess
import plugin_common as pc
from datetime import datetime
from fm_api import constants as fm_constants
from fm_api import fm_api
LSB_RETURN_WARNING = 160
LSB_RETURN_DEGRADE = 161
# Fault manager API Object
api = fm_api.FaultAPIsV2()
# name of the plugin - all logs produced by this plugin are prefixed with this
PLUGIN = 'service resource plugin'
# define a struct for monitoring individual service with service plugin
# and react (raise or clear alarm) with specified return code.
services = [
{
"service_name": "open-ldap",
"service_type": "lsb",
"service_plugin": "/etc/init.d/openldap",
"service_plugin_cmdline": "/etc/init.d/openldap status > 2; echo -n $?",
"service_plugin_env": {"SYSTEMCTL_SKIP_REDIRECT": "1"},
"alarm": None,
"current_status": 0, # init status, actual status will be string
"service_status": [
{
"status": "0",
"alarm": None,
},
{
"status": "160",
"alarm": {
"severity": "major",
"id": "100.150",
"entity_id": "resource_type=file-descriptor.service_name=open-ldap",
"reason": "Number of open file descriptor is approaching to its limit",
"repair": "Consider to swact to the other controller if available",
}
},
{
"status": "161",
"alarm": {
"severity": "critical",
"id": "100.150",
"entity_id": "resource_type=file-descriptor.service_name=open-ldap",
"reason": "Number of open file descriptor has reached its limit",
"repair": "Consider to swact to the other controller if available",
}
}
],
"alarms_to_clear": [
{
"id": "100.150",
"entity_id": "resource_type=file-descriptor.service_name=open-ldap"
}
],
"alarm_to_raise": None,
"alarm_raised": None
}
]
def clear_alarm(alarm):
alarm_id = alarm["id"]
eid = 'host=' + obj.hostname + "." + alarm["entity_id"]
try:
if api.clear_fault(alarm_id, eid) is False:
collectd.info("%s %s:%s alarm already cleared" %
(PLUGIN, alarm_id, eid))
else:
collectd.info("%s %s:%s alarm cleared" %
(PLUGIN, alarm_id, eid))
return True
except Exception as ex:
collectd.error("%s 'clear_fault' failed ; %s:%s ; %s" %
(PLUGIN, alarm_id, eid, ex))
return False
def raise_alarm(service_name, alarm):
"""raise alarms"""
alarm_id = alarm["id"]
alarm_state = fm_constants.FM_ALARM_STATE_SET
eid = 'host=' + obj.hostname + "." + alarm["entity_id"]
severity = alarm["severity"]
reason = alarm["reason"]
repair = alarm["repair"]
ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
fault = fm_api.Fault(
uuid="",
alarm_id=alarm_id,
alarm_state=alarm_state,
entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM,
entity_instance_id=eid,
severity=severity,
reason_text=reason,
alarm_type=fm_constants.FM_ALARM_TYPE_0,
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_42,
proposed_repair_action=repair,
service_affecting=True,
timestamp=ts,
suppression=True)
try:
alarm_uuid = api.set_fault(fault)
except Exception as ex:
collectd.error("%s 'set_fault' exception ; %s:%s ; %s" %
(PLUGIN, alarm_id, eid, ex))
return False
if pc.is_uuid_like(alarm_uuid) is False:
collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
(PLUGIN, alarm_id, eid, alarm_uuid))
return False
else:
return True
# Plugin Control Object
obj = pc.PluginObject(PLUGIN, "")
def init_func():
"""Init the plugin"""
# do nothing till config is complete.
if obj.config_complete() is False:
return 0
for service in services:
if not os.path.exists(service["service_plugin"]):
return 1
obj.hostname = obj.gethostname()
obj.init_completed()
return 0
def check_service_status(service):
cmd = service["service_plugin_cmdline"]
env = service["service_plugin_env"]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, shell=True)
proc.wait()
new_status = (proc.communicate()[0] or "").strip()
if service["current_status"] == new_status:
return
current_alarm = service["alarm_raised"]
if current_alarm is not None:
if current_alarm not in service["alarms_to_clear"]:
service["alarms_to_clear"].append(current_alarm)
for status in service["service_status"]:
if status["status"] == new_status:
alarm = status["alarm"]
if alarm is not None and alarm != service["alarm_to_raise"]:
service["alarm_to_raise"] = alarm
break
else:
collectd.error("undefined service status %s[%s]" %
(service["service_name"], new_status))
def process_service_alarm(service):
alarms_to_clear = service["alarms_to_clear"][:]
for alarm in alarms_to_clear:
if clear_alarm(alarm):
service["alarms_to_clear"].remove(alarm)
alarm = service["alarm_to_raise"]
if alarm is not None:
if raise_alarm(service["service_name"], alarm):
alarm_raised = {"id": alarm["id"], "entity_id": alarm["entity_id"]}
service["alarm_raised"] = alarm_raised
service["alarm_to_raise"] = None
# The read function - called on every audit interval
def read_func():
"""collectd service resource monitor plugin read function"""
if obj.init_complete is False:
init_func()
return 0
if obj._node_ready is False:
obj.node_ready()
return 0
for service in services:
check_service_status(service)
process_service_alarm(service)
collectd.register_init(init_func)
collectd.register_read(read_func)