Add new collectd plugin to monitor a service status

When the openldap service status returns 160, raise a major alarm
because the service is approaching its FD limit. When 161 is returned,
raise a critical alarm because the limit has been reached.

SM will degrade the node when the FD reaches the limit.
Ref SM changes:
https://review.opendev.org/c/starlingx/ha/+/819130

TC passed:
Alarm is raised when FD limit is reached, or above 95% (approaching).
Alarm is cleared when FD usage is below 95% threshold.
Upgrade test. New alarm raised on controller-1 (N+1).
Alarm is cleared when collectd restarts or the node reboots (the alarm
will be re-raised if the alarming situation is detected again).
SM detects the 161 status code and degrades the node with a service
degraded alarm.
Alarm is raised after fm comes back up after being unavailable.
Alarm is cleared after fm comes back up after being unavailable.

Closes-bug: 1952126
Depends-on: https://review.opendev.org/c/starlingx/fault/+/819132

Change-Id: I78bb6ed6f24570d68f62818e1242286d638fd835
Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
Bin Qian 2021-11-18 15:05:51 -05:00
parent 3ff8c48cc3
commit a9f84e13b1
5 changed files with 222 additions and 1 deletions

View File

@ -23,5 +23,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
$PKG_BASE/src/ovs_interface.py \
$PKG_BASE/src/ovs_interface.conf \
$PKG_BASE/src/example.py \
$PKG_BASE/src/example.conf"
$PKG_BASE/src/example.conf \
$PKG_BASE/src/service_res.py \
$PKG_BASE/src/service_res.conf"
TIS_PATCH_VER=PKG_GITREVCOUNT

View File

@ -25,6 +25,7 @@ Source16: interface.py
Source17: remotels.py
Source18: ptp.py
Source19: ovs_interface.py
Source20: service_res.py
# collectd plugin conf files into /etc/collectd.d
Source100: python_plugins.conf
@ -36,6 +37,7 @@ Source106: interface.conf
Source107: remotels.conf
Source108: ptp.conf
Source109: ovs_interface.conf
Source110: service_res.conf
BuildRequires: systemd-devel
@ -91,6 +93,7 @@ install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE19} %{buildroot}%{local_python_extensions_dir}
install -m 700 %{SOURCE20} %{buildroot}%{local_python_extensions_dir}
# collectd plugin conf files into /etc/collectd.d/starlingx
@ -103,6 +106,7 @@ install -m 600 %{SOURCE106} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE107} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE108} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE109} %{buildroot}%{local_starlingx_plugin_dir}
install -m 600 %{SOURCE110} %{buildroot}%{local_starlingx_plugin_dir}
%clean
rm -rf $RPM_BUILD_ROOT

View File

@ -19,6 +19,7 @@ LoadPlugin python
</Module>
Import "ovs_interface"
Import "remotels"
Import "service_res"
LogTraces = true
Encoding "utf-8"
</Plugin>

View File

@ -0,0 +1,3 @@
# The service_res plugin is self-contained. It runs under the collectd
# extension framework but manages its alarms by itself.
# This empty config allows collectd to schedule the plugin.

View File

@ -0,0 +1,211 @@
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
import collectd
import os
import subprocess
import plugin_common as pc
from datetime import datetime
from fm_api import constants as fm_constants
from fm_api import fm_api
# Status codes reported by a monitored service's status script.
# In the services table these appear as the strings "160" (major alarm,
# resource approaching its limit) and "161" (critical alarm, limit reached).
LSB_RETURN_WARNING = 160
LSB_RETURN_DEGRADE = 161
# Fault manager API Object
# Used by this plugin to set and clear faults.
api = fm_api.FaultAPIsV2()
# name of the plugin - all logs produced by this plugin are prefixed with this
PLUGIN = 'service resource plugin'
# Table of monitored services. Each entry describes how to query one
# service's status and which alarm (if any) each status value maps to.
#
# Keys:
#   service_name           - name used in log messages
#   service_type           - kind of service script (informational)
#   service_plugin         - status script; the plugin will not initialise
#                            until this path exists
#   service_plugin_cmdline - shell command whose stdout is the status string
#   service_plugin_env     - environment the command is run with
#   current_status         - last status seen; 0 until the first poll, then
#                            the status string printed by the command
#   service_status         - status string -> alarm definition (None = no alarm)
#   alarms_to_clear        - alarms waiting to be cleared; pre-seeded so that
#                            an alarm left over from a previous run is cleared
#                            on the first pass
#   alarm_to_raise         - alarm waiting to be raised, or None
#   alarm_raised           - id/entity_id of the last alarm raised, or None
services = [
    {
        "service_name": "open-ldap",
        "service_type": "lsb",
        "service_plugin": "/etc/init.d/openldap",
        # Discard the script's own output so that stdout carries only the
        # exit code. ("> 2" would create a file literally named "2" in the
        # working directory rather than discarding anything.)
        "service_plugin_cmdline":
            "/etc/init.d/openldap status > /dev/null; echo -n $?",
        "service_plugin_env": {"SYSTEMCTL_SKIP_REDIRECT": "1"},
        "alarm": None,
        "current_status": 0,  # init status, actual status will be string
        "service_status": [
            {
                "status": "0",
                "alarm": None,
            },
            {
                "status": "160",
                "alarm": {
                    "severity": "major",
                    "id": "100.150",
                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
                    "reason": "Number of open file descriptor is approaching to its limit",
                    "repair": "Consider to swact to the other controller if available",
                }
            },
            {
                "status": "161",
                "alarm": {
                    "severity": "critical",
                    "id": "100.150",
                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
                    "reason": "Number of open file descriptor has reached its limit",
                    "repair": "Consider to swact to the other controller if available",
                }
            }
        ],
        "alarms_to_clear": [
            {
                "id": "100.150",
                "entity_id": "resource_type=file-descriptor.service_name=open-ldap"
            }
        ],
        "alarm_to_raise": None,
        "alarm_raised": None
    }
]
def clear_alarm(alarm):
    """Clear one alarm for this host through the fault manager API.

    :param alarm: dict with at least "id" and "entity_id"
    :return: True when the clear call completed (whether or not the alarm
             was still asserted), False when the call raised an exception
    """
    fault_id = alarm["id"]
    entity = "".join(['host=', obj.hostname, ".", alarm["entity_id"]])

    try:
        # clear_fault reporting False is logged as "already cleared";
        # it is still a success from the caller's point of view.
        if api.clear_fault(fault_id, entity) is False:
            template = "%s %s:%s alarm already cleared"
        else:
            template = "%s %s:%s alarm cleared"
        collectd.info(template % (PLUGIN, fault_id, entity))
        return True
    except Exception as err:
        collectd.error("%s 'clear_fault' failed ; %s:%s ; %s" %
                       (PLUGIN, fault_id, entity, err))
        return False
def raise_alarm(service_name, alarm):
    """Raise one alarm for this host through the fault manager API.

    :param service_name: name of the service the alarm belongs to
                         (accepted for the caller's convenience; not used)
    :param alarm: dict with "id", "entity_id", "severity", "reason"
                  and "repair"
    :return: True when set_fault returned a uuid-like value, False when it
             raised or returned anything else
    """
    fault_id = alarm["id"]
    state = fm_constants.FM_ALARM_STATE_SET
    entity = "".join(['host=', obj.hostname, ".", alarm["entity_id"]])

    # Per-alarm details, gathered before the Fault object is built.
    details = dict(
        severity=alarm["severity"],
        reason_text=alarm["reason"],
        proposed_repair_action=alarm["repair"],
        timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    )

    fault = fm_api.Fault(
        uuid="",
        alarm_id=fault_id,
        alarm_state=state,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM,
        entity_instance_id=entity,
        alarm_type=fm_constants.FM_ALARM_TYPE_0,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_42,
        service_affecting=True,
        suppression=True,
        **details)

    try:
        result = api.set_fault(fault)
    except Exception as err:
        collectd.error("%s 'set_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, fault_id, entity, err))
        return False

    # Anything that is not uuid-like is treated as a failed set_fault.
    if pc.is_uuid_like(result) is not False:
        return True

    collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
                   (PLUGIN, fault_id, entity, result))
    return False
# Plugin Control Object
# Shared plugin state: the functions in this file use it to track config and
# init completion and node readiness, and to hold this host's name for
# building alarm entity ids.
obj = pc.PluginObject(PLUGIN, "")
def init_func():
    """Initialise the plugin once its configuration is complete.

    :return: 0 when initialisation was deferred (config not complete yet)
             or has finished; 1 when the status script of any monitored
             service does not exist
    """
    if obj.config_complete() is False:
        # Config is not complete yet; leave initialisation for a later call.
        return 0

    script_missing = any(not os.path.exists(entry["service_plugin"])
                         for entry in services)
    if script_missing:
        return 1

    obj.hostname = obj.gethostname()
    obj.init_completed()
    return 0
def check_service_status(service):
    """Poll one service's status command and queue the resulting alarm work.

    Runs the service's status command line through the shell and takes its
    stripped stdout as the new status string. When the status differs from
    the one recorded on the previous poll:

    * the new status is recorded in service["current_status"];
    * the alarm currently raised (if any) is queued in
      service["alarms_to_clear"];
    * service["alarm_to_raise"] is set to the alarm mapped to the new
      status, or to None when the status maps to no alarm or is unknown.

    Nothing is raised or cleared here; this only updates the queues kept in
    the service dict.

    :param service: one entry of the services table
    :return: None
    """
    cmd = service["service_plugin_cmdline"]
    env = service["service_plugin_env"]

    # universal_newlines=True yields text on both Python 2 and Python 3, so
    # the output compares equal to the string status values in the table.
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            env=env,
                            shell=True,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting for the process; calling
    # wait() first can deadlock once a pipe buffer fills up.
    stdout, _ = proc.communicate()
    new_status = (stdout or "").strip()

    if service["current_status"] == new_status:
        return

    # Remember the status so an unchanged status on the next poll does not
    # clear and re-raise the same alarm again.
    service["current_status"] = new_status

    current_alarm = service["alarm_raised"]
    if (current_alarm is not None and
            current_alarm not in service["alarms_to_clear"]):
        service["alarms_to_clear"].append(current_alarm)

    for status in service["service_status"]:
        if status["status"] == new_status:
            # May be None: a status without an alarm also cancels any alarm
            # that was queued for an earlier status but not raised yet.
            service["alarm_to_raise"] = status["alarm"]
            break
    else:
        # Unknown status: whatever alarm was pending no longer reflects the
        # service's state.
        service["alarm_to_raise"] = None
        collectd.error("undefined service status %s[%s]" %
                       (service["service_name"], new_status))
def process_service_alarm(service):
    """Carry out the alarm work queued on a service entry.

    First tries to clear every alarm in service["alarms_to_clear"]; alarms
    that fail to clear stay queued for the next call. Then, if an alarm is
    waiting in service["alarm_to_raise"], tries to raise it; on success the
    alarm's id/entity_id is recorded in service["alarm_raised"] and the
    pending slot is emptied. A failed raise is left pending for the next
    call.

    :param service: one entry of the services table
    :return: None
    """
    # Iterate over a copy: entries are removed from the live list as they
    # are cleared.
    for pending in list(service["alarms_to_clear"]):
        if clear_alarm(pending):
            service["alarms_to_clear"].remove(pending)

    alarm = service["alarm_to_raise"]
    if alarm is None:
        return

    if not raise_alarm(service["service_name"], alarm):
        return

    raised = {"id": alarm["id"], "entity_id": alarm["entity_id"]}
    service["alarm_raised"] = raised
    service["alarm_to_raise"] = None

    # A clear that failed above and targets the same id/entity is now
    # obsolete: running it later would remove the alarm just raised.
    service["alarms_to_clear"][:] = [
        pending for pending in service["alarms_to_clear"]
        if pending != raised
    ]
# The read function - called on every audit interval
def read_func():
    """Read callback: poll every monitored service and process its alarms.

    While the plugin is not initialised, or the node is not ready yet, the
    call only advances that step and returns 0 without polling anything.
    """
    if obj.init_complete is False:
        # Initialisation still outstanding; try it and leave polling for a
        # later interval.
        init_func()
    elif obj._node_ready is False:
        obj.node_ready()
    else:
        for entry in services:
            check_service_status(entry)
            process_service_alarm(entry)
        return None
    return 0
# Hook the plugin into collectd: init_func is registered as the init
# callback and read_func as the read callback.
collectd.register_init(init_func)
collectd.register_read(read_func)