From a9f84e13b13b0f20f1511d0503bbcf2df9f0fced Mon Sep 17 00:00:00 2001
From: Bin Qian
Date: Thu, 18 Nov 2021 15:05:51 -0500
Subject: [PATCH] Add new collectd plugin to monitor service status

When the openldap service status query returns 160, raise a major
alarm indicating the service is approaching its FD limit. When 161 is
returned, raise a critical alarm indicating the limit has been
reached. SM will degrade the node when the FD count reaches the limit.

Ref SM changes: https://review.opendev.org/c/starlingx/ha/+/819130

TC passed:
- Alarm is raised when the FD limit is reached, or when usage is above
  the 95% (approaching) threshold.
- Alarm is cleared when FD usage drops below the 95% threshold.
- Upgrade test: new alarm raised on controller-1 (N+1).
- Alarm is cleared when collectd restarts or the node reboots (the
  alarm is re-raised if the alarming condition is detected again).
- SM detects the 161 status code and degrades the node with a
  service-degraded alarm.
- Alarm is raised after fm comes back up after being unavailable.
- Alarm is cleared after fm comes back up after being unavailable.
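For illustration only (not part of the change): a minimal sketch of the
status-to-severity mapping described above. The init script path, the
SYSTEMCTL_SKIP_REDIRECT setting and the 160/161 codes come from this
patch; everything else is a simplified stand-in for the plugin logic.

    import os
    import subprocess

    # 0 = healthy, 160 = approaching the FD limit, 161 = limit reached
    SEVERITY_BY_STATUS = {"0": None, "160": "major", "161": "critical"}

    def query_openldap_status():
        # The LSB exit code, not the output, carries the FD-usage state.
        proc = subprocess.run(
            ["/etc/init.d/openldap", "status"],
            env=dict(os.environ, SYSTEMCTL_SKIP_REDIRECT="1"),
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return str(proc.returncode)

    status = query_openldap_status()
    # Unknown codes map to None here; the real plugin logs an error.
    severity = SEVERITY_BY_STATUS.get(status)
    print("status %s -> %s" % (status, severity or "no alarm (clear)"))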
Closes-bug: 1952126
Depends-on: https://review.opendev.org/c/starlingx/fault/+/819132

Change-Id: I78bb6ed6f24570d68f62818e1242286d638fd835
Signed-off-by: Bin Qian
---
 collectd-extensions/centos/build_srpm.data  |   4 +-
 .../centos/collectd-extensions.spec         |   4 +
 collectd-extensions/src/python_plugins.conf |   1 +
 collectd-extensions/src/service_res.conf    |   3 +
 collectd-extensions/src/service_res.py      | 211 ++++++++++++++++++
 5 files changed, 222 insertions(+), 1 deletion(-)
 create mode 100644 collectd-extensions/src/service_res.conf
 create mode 100644 collectd-extensions/src/service_res.py

diff --git a/collectd-extensions/centos/build_srpm.data b/collectd-extensions/centos/build_srpm.data
index 11618ac..45dd4d3 100644
--- a/collectd-extensions/centos/build_srpm.data
+++ b/collectd-extensions/centos/build_srpm.data
@@ -23,5 +23,7 @@ COPY_LIST="$PKG_BASE/src/LICENSE \
     $PKG_BASE/src/ovs_interface.py \
     $PKG_BASE/src/ovs_interface.conf \
     $PKG_BASE/src/example.py \
-    $PKG_BASE/src/example.conf"
+    $PKG_BASE/src/example.conf \
+    $PKG_BASE/src/service_res.py \
+    $PKG_BASE/src/service_res.conf"
 TIS_PATCH_VER=PKG_GITREVCOUNT
diff --git a/collectd-extensions/centos/collectd-extensions.spec b/collectd-extensions/centos/collectd-extensions.spec
index 71409cd..b564279 100644
--- a/collectd-extensions/centos/collectd-extensions.spec
+++ b/collectd-extensions/centos/collectd-extensions.spec
@@ -25,6 +25,7 @@ Source16: interface.py
 Source17: remotels.py
 Source18: ptp.py
 Source19: ovs_interface.py
+Source20: service_res.py
 
 # collectd plugin conf files into /etc/collectd.d
 Source100: python_plugins.conf
@@ -36,6 +37,7 @@ Source106: interface.conf
 Source107: remotels.conf
 Source108: ptp.conf
 Source109: ovs_interface.conf
+Source110: service_res.conf
 
 BuildRequires: systemd-devel
@@ -91,6 +93,7 @@ install -m 700 %{SOURCE16} %{buildroot}%{local_python_extensions_dir}
 install -m 700 %{SOURCE17} %{buildroot}%{local_python_extensions_dir}
 install -m 700 %{SOURCE18} %{buildroot}%{local_python_extensions_dir}
 install -m 700 %{SOURCE19} %{buildroot}%{local_python_extensions_dir}
+install -m 700 %{SOURCE20} %{buildroot}%{local_python_extensions_dir}
 
 # collectd plugin conf files into /etc/collectd.d/starlingx
@@ -103,6 +106,7 @@ install -m 600 %{SOURCE106} %{buildroot}%{local_starlingx_plugin_dir}
 install -m 600 %{SOURCE107} %{buildroot}%{local_starlingx_plugin_dir}
 install -m 600 %{SOURCE108} %{buildroot}%{local_starlingx_plugin_dir}
 install -m 600 %{SOURCE109} %{buildroot}%{local_starlingx_plugin_dir}
+install -m 600 %{SOURCE110} %{buildroot}%{local_starlingx_plugin_dir}
 
 %clean
 rm -rf $RPM_BUILD_ROOT
diff --git a/collectd-extensions/src/python_plugins.conf b/collectd-extensions/src/python_plugins.conf
index 49d626c..240c042 100644
--- a/collectd-extensions/src/python_plugins.conf
+++ b/collectd-extensions/src/python_plugins.conf
@@ -19,6 +19,7 @@ LoadPlugin python
     Import "ovs_interface"
     Import "remotels"
+    Import "service_res"
     LogTraces = true
     Encoding "utf-8"
diff --git a/collectd-extensions/src/service_res.conf b/collectd-extensions/src/service_res.conf
new file mode 100644
index 0000000..d2ffbc0
--- /dev/null
+++ b/collectd-extensions/src/service_res.conf
@@ -0,0 +1,3 @@
+# The service_res plugin is self-contained. It runs under the collectd
+# extension framework but manages its alarms itself.
+# The empty config here allows collectd to schedule the plugin.
diff --git a/collectd-extensions/src/service_res.py b/collectd-extensions/src/service_res.py
new file mode 100644
index 0000000..978d517
--- /dev/null
+++ b/collectd-extensions/src/service_res.py
@@ -0,0 +1,211 @@
+#
+# Copyright (c) 2021 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+import collectd
+
+import os
+import subprocess
+import plugin_common as pc
+from datetime import datetime
+
+from fm_api import constants as fm_constants
+from fm_api import fm_api
+
+
+LSB_RETURN_WARNING = 160
+LSB_RETURN_DEGRADE = 161
+
+# Fault manager API object
+api = fm_api.FaultAPIsV2()
+
+# name of the plugin - all logs produced by this plugin are prefixed with this
+PLUGIN = 'service resource plugin'
+
+# Define a structure for monitoring an individual service with a service
+# plugin, and react (raise or clear an alarm) to the specified return codes.
+services = [
+    {
+        "service_name": "open-ldap",
+        "service_type": "lsb",
+        "service_plugin": "/etc/init.d/openldap",
+        # send status output to stderr so stdout carries only the exit code
+        "service_plugin_cmdline": "/etc/init.d/openldap status >&2; echo -n $?",
+        "service_plugin_env": {"SYSTEMCTL_SKIP_REDIRECT": "1"},
+        "alarm": None,
+        "current_status": 0,  # initial value; the actual status is a string
+        "service_status": [
+            {
+                "status": "0",
+                "alarm": None,
+            },
+            {
+                "status": "160",
+                "alarm": {
+                    "severity": "major",
+                    "id": "100.150",
+                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
+                    "reason": "Number of open file descriptors is approaching its limit",
+                    "repair": "Consider swacting to the other controller if available",
+                }
+            },
+            {
+                "status": "161",
+                "alarm": {
+                    "severity": "critical",
+                    "id": "100.150",
+                    "entity_id": "resource_type=file-descriptor.service_name=open-ldap",
+                    "reason": "Number of open file descriptors has reached its limit",
+                    "repair": "Consider swacting to the other controller if available",
+                }
+            }
+        ],
+        "alarms_to_clear": [
+            {
+                "id": "100.150",
+                "entity_id": "resource_type=file-descriptor.service_name=open-ldap"
+            }
+        ],
+        "alarm_to_raise": None,
+        "alarm_raised": None
+    }
+]
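+
+# Each entry above is a small state machine:
+#   current_status  - last observed status code (a string once read)
+#   alarms_to_clear - clear requests, retried until FM accepts them;
+#                     pre-seeded so a stale alarm is cleared when the
+#                     plugin restarts
+#   alarm_to_raise  - alarm selected by a status change, raised on the
+#                     same read cycle by process_service_alarm()
+#   alarm_raised    - the alarm currently asserted, if any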
+ alarm["entity_id"] + severity = alarm["severity"] + reason = alarm["reason"] + repair = alarm["repair"] + ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + fault = fm_api.Fault( + uuid="", + alarm_id=alarm_id, + alarm_state=alarm_state, + entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM, + entity_instance_id=eid, + severity=severity, + reason_text=reason, + alarm_type=fm_constants.FM_ALARM_TYPE_0, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_42, + proposed_repair_action=repair, + service_affecting=True, + timestamp=ts, + suppression=True) + + try: + alarm_uuid = api.set_fault(fault) + except Exception as ex: + collectd.error("%s 'set_fault' exception ; %s:%s ; %s" % + (PLUGIN, alarm_id, eid, ex)) + return False + + if pc.is_uuid_like(alarm_uuid) is False: + collectd.error("%s 'set_fault' failed ; %s:%s ; %s" % + (PLUGIN, alarm_id, eid, alarm_uuid)) + return False + else: + return True + + +# Plugin Control Object +obj = pc.PluginObject(PLUGIN, "") + + +def init_func(): + """Init the plugin""" + + # do nothing till config is complete. + if obj.config_complete() is False: + return 0 + + for service in services: + if not os.path.exists(service["service_plugin"]): + return 1 + + obj.hostname = obj.gethostname() + obj.init_completed() + return 0 + + +def check_service_status(service): + cmd = service["service_plugin_cmdline"] + env = service["service_plugin_env"] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, shell=True) + proc.wait() + new_status = (proc.communicate()[0] or "").strip() + + if service["current_status"] == new_status: + return + + current_alarm = service["alarm_raised"] + if current_alarm is not None: + if current_alarm not in service["alarms_to_clear"]: + service["alarms_to_clear"].append(current_alarm) + + for status in service["service_status"]: + if status["status"] == new_status: + alarm = status["alarm"] + if alarm is not None and alarm != service["alarm_to_raise"]: + service["alarm_to_raise"] = alarm + break + else: + collectd.error("undefined service status %s[%s]" % + (service["service_name"], new_status)) + + +def process_service_alarm(service): + alarms_to_clear = service["alarms_to_clear"][:] + for alarm in alarms_to_clear: + if clear_alarm(alarm): + service["alarms_to_clear"].remove(alarm) + + alarm = service["alarm_to_raise"] + if alarm is not None: + if raise_alarm(service["service_name"], alarm): + alarm_raised = {"id": alarm["id"], "entity_id": alarm["entity_id"]} + service["alarm_raised"] = alarm_raised + service["alarm_to_raise"] = None + + +# The read function - called on every audit interval +def read_func(): + """collectd service resource monitor plugin read function""" + + if obj.init_complete is False: + init_func() + return 0 + + if obj._node_ready is False: + obj.node_ready() + return 0 + + for service in services: + check_service_status(service) + process_service_alarm(service) + +collectd.register_init(init_func) +collectd.register_read(read_func)