Add new collectd plugin to monitor a service status
When the openldap service status returns 160, raise a major alarm because the service is approaching its FD limit. When 161 is returned, raise a critical alarm because the limit has been reached. SM will degrade the node when the FD usage reaches the limit. Ref SM changes: https://review.opendev.org/c/starlingx/ha/+/819130 TC passed: Alarm is raised when the FD limit is reached, or usage is above 95% (approaching). Alarm is cleared when FD usage is below the 95% threshold. Upgrade test. New alarm raised on controller-1 (N+1). Alarm is cleared when collectd restarts or the node reboots (the alarm will be re-raised if the alarming situation is detected again). SM detects the 161 status code and degrades the node with a service degraded alarm. Alarm is raised after fm comes back up after being unavailable. Alarm is cleared after fm comes back up after being unavailable. Closes-bug: 1952126 Depends-on: https://review.opendev.org/c/starlingx/fault/+/819132 Change-Id: I78bb6ed6f24570d68f62818e1242286d638fd835 Signed-off-by: Bin Qian <bin.qian@windriver.com>
This commit is contained in:
parent
3ff8c48cc3
commit
a9f84e13b1
@ -23,5 +23,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
|
||||
$PKG_BASE/src/ovs_interface.py \
|
||||
$PKG_BASE/src/ovs_interface.conf \
|
||||
$PKG_BASE/src/example.py \
|
||||
$PKG_BASE/src/example.conf"
|
||||
$PKG_BASE/src/example.conf \
|
||||
$PKG_BASE/src/service_res.py \
|
||||
$PKG_BASE/src/service_res.conf"
|
||||
TIS_PATCH_VER=PKG_GITREVCOUNT
|
||||
|
@ -25,6 +25,7 @@ Source16: interface.py
|
||||
Source17: remotels.py
|
||||
Source18: ptp.py
|
||||
Source19: ovs_interface.py
|
||||
Source20: service_res.py
|
||||
|
||||
# collectd plugin conf files into /etc/collectd.d
|
||||
Source100: python_plugins.conf
|
||||
@ -36,6 +37,7 @@ Source106: interface.conf
|
||||
Source107: remotels.conf
|
||||
Source108: ptp.conf
|
||||
Source109: ovs_interface.conf
|
||||
Source110: service_res.conf
|
||||
|
||||
BuildRequires: systemd-devel
|
||||
|
||||
@ -91,6 +93,7 @@ install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE19} %{buildroot}%{local_python_extensions_dir}
|
||||
install -m 700 %{SOURCE20} %{buildroot}%{local_python_extensions_dir}
|
||||
|
||||
|
||||
# collectd plugin conf files into /etc/collectd.d/starlingx
|
||||
@ -103,6 +106,7 @@ install -m 600 %{SOURCE106} %{buildroot}%{local_starlingx_plugin_dir}
|
||||
install -m 600 %{SOURCE107} %{buildroot}%{local_starlingx_plugin_dir}
|
||||
install -m 600 %{SOURCE108} %{buildroot}%{local_starlingx_plugin_dir}
|
||||
install -m 600 %{SOURCE109} %{buildroot}%{local_starlingx_plugin_dir}
|
||||
install -m 600 %{SOURCE110} %{buildroot}%{local_starlingx_plugin_dir}
|
||||
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
@ -19,6 +19,7 @@ LoadPlugin python
|
||||
</Module>
|
||||
Import "ovs_interface"
|
||||
Import "remotels"
|
||||
Import "service_res"
|
||||
LogTraces = true
|
||||
Encoding "utf-8"
|
||||
</Plugin>
|
||||
|
3
collectd-extensions/src/service_res.conf
Normal file
3
collectd-extensions/src/service_res.conf
Normal file
@ -0,0 +1,3 @@
|
||||
# service_res plugin is self contained. It runs under the collectd
|
||||
# extension framework but manages the alarm by itself.
|
||||
# The empty config here allows the collectd to schedule the plugin.
|
211
collectd-extensions/src/service_res.py
Normal file
211
collectd-extensions/src/service_res.py
Normal file
@ -0,0 +1,211 @@
|
||||
#
|
||||
# Copyright (c) 2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import collectd
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import plugin_common as pc
|
||||
from datetime import datetime
|
||||
|
||||
from fm_api import constants as fm_constants
|
||||
from fm_api import fm_api
|
||||
|
||||
|
||||
# Numeric forms of the two status codes this plugin reacts to; the service
# table in this file matches them as the strings "160" (major alarm) and
# "161" (critical alarm).
# NOTE(review): neither constant is referenced by the code visible in this
# file - confirm whether the table should use them.
LSB_RETURN_WARNING = 160
|
||||
LSB_RETURN_DEGRADE = 161
|
||||
|
||||
# Fault manager API Object
# Shared by clear_alarm() and raise_alarm() for clear_fault/set_fault calls.
|
||||
api = fm_api.FaultAPIsV2()
|
||||
|
||||
# name of the plugin - all logs produced by this plugin are prefixed with this
|
||||
PLUGIN = 'service resource plugin'
|
||||
|
||||
# Table of monitored services. Each entry describes how to query one service
# and which alarm (if any) corresponds to each status the query can report.
#
#   service_name            name used in log messages
#   service_plugin          script that must exist before init completes
#   service_plugin_cmdline  shell command run on every audit; its stdout must
#                           be exactly the status code, so the script's own
#                           output is discarded and only "$?" is echoed
#   service_plugin_env      environment the command is run with
#   current_status          last status seen (0 until the first audit, then
#                           the status as a string)
#   service_status          known statuses and the alarm each one requires
#                           (None means no alarm)
#   alarms_to_clear         alarms still waiting to be cleared; pre-seeded so
#                           the first audit clears any alarm that may be left
#                           over from before the plugin started
#   alarm_to_raise          alarm waiting to be raised, or None
#   alarm_raised            id/entity_id of the last alarm raised, or None
#
# NOTE(review): "service_type" and the top-level "alarm" key are not read by
# any code visible in this file.
services = [
    {
        "service_name": "open-ldap",
        "service_type": "lsb",
        "service_plugin": "/etc/init.d/openldap",
        # Send the script's output to /dev/null. The previous "> 2" wrote it
        # to a file literally named "2" in the working directory.
        "service_plugin_cmdline":
            "/etc/init.d/openldap status > /dev/null 2>&1; echo -n $?",
        "service_plugin_env": {"SYSTEMCTL_SKIP_REDIRECT": "1"},
        "alarm": None,
        "current_status": 0,  # init status, actual status will be string
        "service_status": [
            {
                "status": "0",
                "alarm": None,
            },
            {
                "status": "160",
                "alarm": {
                    "severity": "major",
                    "id": "100.150",
                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
                    "reason": "Number of open file descriptor is approaching to its limit",
                    "repair": "Consider to swact to the other controller if available",
                }
            },
            {
                "status": "161",
                "alarm": {
                    "severity": "critical",
                    "id": "100.150",
                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
                    "reason": "Number of open file descriptor has reached its limit",
                    "repair": "Consider to swact to the other controller if available",
                }
            }
        ],
        "alarms_to_clear": [
            {
                "id": "100.150",
                "entity_id": "resource_type=file-descriptor.service_name=open-ldap"
            }
        ],
        "alarm_to_raise": None,
        "alarm_raised": None
    }
]
|
||||
|
||||
|
||||
def clear_alarm(alarm):
    """Clear the given alarm for this host through the fault manager.

    :param alarm: dict holding the alarm "id" and "entity_id"
    :returns: True when the clear request completed (whether or not the
              alarm was still set), False when the request raised
    """
    fault_id = alarm["id"]
    entity = "".join(('host=', obj.hostname, ".", alarm["entity_id"]))

    try:
        cleared = api.clear_fault(fault_id, entity)
        # clear_fault returning False is logged as "already cleared" and
        # still counts as success.
        if cleared is False:
            template = "%s %s:%s alarm already cleared"
        else:
            template = "%s %s:%s alarm cleared"
        collectd.info(template % (PLUGIN, fault_id, entity))
    except Exception as ex:
        collectd.error("%s 'clear_fault' failed ; %s:%s ; %s" %
                       (PLUGIN, fault_id, entity, ex))
        return False

    return True
|
||||
|
||||
|
||||
def raise_alarm(service_name, alarm):
    """Raise the given alarm for this host through the fault manager.

    :param service_name: name of the affected service (not used in the body)
    :param alarm: dict with "id", "entity_id", "severity", "reason" and
                  "repair"
    :returns: True when set_fault returned a uuid-like value, False when it
              raised or returned anything else
    """
    fault_id = alarm["id"]
    state = fm_constants.FM_ALARM_STATE_SET
    entity = "".join(('host=', obj.hostname, ".", alarm["entity_id"]))

    # Per-alarm fields taken from the service table entry.
    details = {
        "severity": alarm["severity"],
        "reason_text": alarm["reason"],
        "proposed_repair_action": alarm["repair"],
        "timestamp": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    fault = fm_api.Fault(
        uuid="",
        alarm_id=fault_id,
        alarm_state=state,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM,
        entity_instance_id=entity,
        alarm_type=fm_constants.FM_ALARM_TYPE_0,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_42,
        service_affecting=True,
        suppression=True,
        **details)

    try:
        result = api.set_fault(fault)
    except Exception as ex:
        collectd.error("%s 'set_fault' exception ; %s:%s ; %s" %
                       (PLUGIN, fault_id, entity, ex))
        return False

    # Anything other than a uuid-like return value is treated as a failure.
    if pc.is_uuid_like(result) is not False:
        return True

    collectd.error("%s 'set_fault' failed ; %s:%s ; %s" %
                   (PLUGIN, fault_id, entity, result))
    return False
|
||||
|
||||
|
||||
# Plugin Control Object
obj = pc.PluginObject(PLUGIN, "")


def init_func():
    """Initialise the plugin.

    Does nothing (returns 0) until the plugin object reports configuration
    complete. Returns 1 without completing init if any monitored service's
    status script is missing; otherwise records the hostname, marks init
    complete and returns 0.
    """
    # do nothing till config is complete.
    if obj.config_complete() is False:
        return 0

    # Every monitored service needs its status script in place.
    if not all(os.path.exists(entry["service_plugin"]) for entry in services):
        return 1

    obj.hostname = obj.gethostname()
    obj.init_completed()
    return 0
|
||||
|
||||
|
||||
def check_service_status(service):
    """Query one service and queue the alarm changes its status requires.

    Runs the service's status command through the shell and treats its
    stripped stdout as the status. If the status is unchanged since the last
    audit nothing is done. Otherwise the new status is recorded, any alarm
    raised earlier is queued in "alarms_to_clear", and "alarm_to_raise" is
    set to the alarm defined for the new status (None when that status needs
    no alarm). An unrecognised status is logged and leaves "alarm_to_raise"
    untouched.

    :param service: one entry of the module-level service table; updated in
                    place
    """
    cmd = service["service_plugin_cmdline"]
    env = service["service_plugin_env"]
    # universal_newlines=True yields text on both Python 2 and 3, so the
    # status compares equal to the string keys in the service table.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, env=env, shell=True,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting for the process; calling
    # wait() first can deadlock once a pipe buffer fills.
    stdout, _ = proc.communicate()
    new_status = (stdout or "").strip()

    if service["current_status"] == new_status:
        return

    # Remember the status so an unchanged service is not re-processed (and
    # its alarm cleared and re-raised) on every audit.
    service["current_status"] = new_status

    current_alarm = service["alarm_raised"]
    if (current_alarm is not None and
            current_alarm not in service["alarms_to_clear"]):
        service["alarms_to_clear"].append(current_alarm)

    for status in service["service_status"]:
        if status["status"] == new_status:
            # Also resets a still-pending alarm when the new status needs
            # none, so a stale alarm is not raised after recovery.
            service["alarm_to_raise"] = status["alarm"]
            break
    else:
        collectd.error("undefined service status %s[%s]" %
                       (service["service_name"], new_status))
|
||||
|
||||
|
||||
def process_service_alarm(service):
    """Apply the alarm changes queued for one service.

    Tries to clear every alarm in "alarms_to_clear", dropping each one that
    clears successfully, then tries to raise "alarm_to_raise" if one is
    pending. On a successful raise the alarm's id/entity_id are stored in
    "alarm_raised" and the pending slot is reset. Anything that fails stays
    queued in the service entry.

    :param service: one entry of the module-level service table; updated in
                    place
    """
    # Walk a snapshot because entries are removed from the live list.
    for pending in list(service["alarms_to_clear"]):
        if clear_alarm(pending):
            service["alarms_to_clear"].remove(pending)

    wanted = service["alarm_to_raise"]
    if wanted is None:
        return

    if not raise_alarm(service["service_name"], wanted):
        return

    service["alarm_raised"] = {
        "id": wanted["id"],
        "entity_id": wanted["entity_id"],
    }
    service["alarm_to_raise"] = None
|
||||
|
||||
|
||||
# The read function - called on every audit interval
def read_func():
    """collectd service resource monitor plugin read function

    Retries plugin init until it completes, then waits for the plugin object
    to report the node ready; each of those audits returns 0 without checking
    services. After that, every audit checks each monitored service and
    applies its queued alarm changes.
    """
    if obj.init_complete is False:
        init_func()
        return 0

    if obj._node_ready is False:
        obj.node_ready()
        return 0

    for entry in services:
        # Work out what the current status requires, then act on it.
        check_service_status(entry)
        process_service_alarm(entry)


collectd.register_init(init_func)
collectd.register_read(read_func)
|
Loading…
Reference in New Issue
Block a user