Porting Supervisord check from DataDog
DD's service_check function does not exist in Monasca; since these metrics are still valuable, they have been replaced with gauge checks. Also adding the corresponding monasca_setup detection plugin for supervisord. Change-Id: I0276a51aaea21b5684ec114b4e31e05ad0dddfdf
This commit is contained in:
parent
641b37b5bb
commit
c483d95b0b
66
conf.d/supervisord.yaml.example
Normal file
66
conf.d/supervisord.yaml.example
Normal file
@ -0,0 +1,66 @@
|
||||
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
#
|
||||
# There are two ways to get started with the supervisord check.
|
||||
#
|
||||
# You can configure inet_http_server in /etc/supervisord.conf. Below is an
|
||||
# example inet_http_server configuration:
|
||||
#
|
||||
# [inet_http_server]
|
||||
# port:localhost:9001
|
||||
# username:user # optional
|
||||
# password:pass # optional
|
||||
#
|
||||
# OR, you can use supervisorctl socket to communicate with supervisor.
|
||||
# If supervisor is running as root, make sure chmod property is set
|
||||
# to a permission accessible to non-root users. See the example below:
|
||||
#
|
||||
# [supervisorctl]
|
||||
# serverurl=unix:///var/run/supervisor.sock
|
||||
#
|
||||
# [unix_http_server]
|
||||
# file=/var/run/supervisor.sock
|
||||
# chmod=775
|
||||
#
|
||||
# Reload supervisor, specify the inet or unix socket server information
|
||||
# in this yaml file along with an optional list of the processes you want
|
||||
# to monitor per instance, and you're good to go!
|
||||
#
|
||||
# See http://supervisord.org/configuration.html for more information on
|
||||
# configuring supervisord sockets and inet http servers.
|
||||
#
|
||||
|
||||
init_config:
|
||||
|
||||
instances:
|
||||
# - name: server0 # Required. An arbitrary name to identify the supervisord server
|
||||
# host: localhost # Optional. Defaults to localhost. The host where supervisord server is running
|
||||
# port: 9001 # Optional. Defaults to 9001. The port number.
|
||||
# user: user # Optional. Required only if a username is configured.
|
||||
# pass: pass # Optional. Required only if a password is configured.
|
||||
# proc_regex: # Optional. Regex pattern[s] matching the names of processes to monitor
|
||||
# - 'myprocess-\d\d$'
|
||||
# proc_names: # Optional. The processes to monitor within this supervisord instance.
|
||||
# - apache2 # If not specified, the check will monitor all processes.
|
||||
# - webapp
|
||||
# - java
|
||||
# proc_uptime_check: False # Optional. Defaults to True.
|
||||
# proc_details_check: False # Optional. Defaults to True.
|
||||
# - name: server1
|
||||
# host: localhost
|
||||
# port: 9002
|
||||
# - name: server2
|
||||
# socket: unix:///var/run/supervisor.sock
|
||||
# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1
|
232
monasca_agent/collector/checks_d/supervisord.py
Normal file
232
monasca_agent/collector/checks_d/supervisord.py
Normal file
@ -0,0 +1,232 @@
|
||||
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# stdlib
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import re
|
||||
import socket
|
||||
import time
|
||||
import xmlrpclib
|
||||
|
||||
# 3p
|
||||
import supervisor.xmlrpc
|
||||
|
||||
# project
|
||||
import monasca_agent.collector.checks as checks
|
||||
|
||||
# Connection defaults applied when an instance omits the corresponding key.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '9001'
# URL handed to the XML-RPC proxy when the instance connects through a socket.
DEFAULT_SOCKET_IP = 'http://127.0.0.1'

# supervisord process state name -> coarse health level.
STATUS = {
    'STOPPED': 'CRITICAL',
    'STARTING': 'UNKNOWN',
    'RUNNING': 'OK',
    'BACKOFF': 'CRITICAL',
    'STOPPING': 'CRITICAL',
    'EXITED': 'CRITICAL',
    'FATAL': 'CRITICAL',
    'UNKNOWN': 'UNKNOWN'
}

# Health level -> short textual label.
PROCESS_STATUS = {
    'CRITICAL': 'down',
    'OK': 'up',
    'UNKNOWN': 'unknown'
}

# Health level -> numeric value reported for the per-process status gauge.
PROCESS_STATE = {
    'CRITICAL': 0,
    'OK': 1,
    'UNKNOWN': -1
}

# Numeric value reported for the server connectivity gauge.
# NOTE(review): polarity is the opposite of PROCESS_STATE (0 is the healthy
# value here, 1 is the healthy value there) - confirm this is intended.
SERVER_STATE = {
    'DOWN': 1,
    'UP': 0
}

SERVER_TAG = 'supervisord_server'

# Dimension key that carries the process name on per-process metrics.
PROCESS_TAG = 'supervisord_process'


def FORMAT_TIME(x):
    """Return epoch seconds ``x`` as a 'YYYY-MM-DD HH:MM:SS' local-time string."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x))


# Metric names emitted by the check.
SERVER_SERVICE_CHECK = 'supervisord.can_connect'
PROCESS_SERVICE_CHECK = 'supervisord.process.status'
PROCESS_UP_TIME_CHECK = 'supervisord.process.uptime'
PROCESS_COUNT_UP_CHECK = 'supervisord.process.count.status_up'
PROCESS_COUNT_DOWN_CHECK = 'supervisord.process.count.status_down'
PROCESS_COUNT_UNKNOWN_CHECK = 'supervisord.process.count.status_unknown'
|
||||
|
||||
|
||||
class Supervisord(checks.AgentCheck):
    """Report supervisord server and process state as gauge metrics.

    For each configured instance the check queries supervisord over XML-RPC
    (inet HTTP server or unix socket) and emits:

    - ``supervisord.can_connect`` (``SERVER_STATE`` value)
    - ``supervisord.process.status`` per monitored process
      (``PROCESS_STATE`` value)
    - ``supervisord.process.uptime`` per monitored process (seconds)
    - ``supervisord.process.count.status_up`` / ``status_down`` /
      ``status_unknown`` per instance
    """

    def check(self, instance):
        """Collect and report the metrics for one configured instance.

        :param instance: dict taken from the ``instances`` list of the yaml
            configuration. Keys read here: ``name`` (required),
            ``proc_regex``, ``proc_names``, ``proc_details_check`` and
            ``proc_uptime_check``; the connection keys are read by
            ``_connect``.
        :raises Exception: if ``name`` is missing or blank, if supervisord
            cannot be queried, or if ``proc_regex`` / ``proc_names`` is not
            a list.
        """
        server_name = instance.get('name')
        if not server_name or not server_name.strip():
            raise Exception("Supervisor server name not specified in yaml configuration.")

        proc_details_check = self._flag_enabled(instance, 'proc_details_check')
        proc_uptime_check = self._flag_enabled(instance, 'proc_uptime_check')

        dimensions = self._set_dimensions({'server_name': server_name}, instance)
        processes = self._fetch_processes(instance, server_name, dimensions)

        # If we're here, we were able to connect to the server
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['UP'], dimensions=dimensions)

        # Filter monitored processes on configuration directives
        proc_regex = instance.get('proc_regex', [])
        if not isinstance(proc_regex, list):
            raise Exception("Invalid proc_regex.")

        proc_names = instance.get('proc_names', [])
        if not isinstance(proc_names, list):
            raise Exception("Invalid proc_names.")

        monitored_processes = self._select_processes(processes, proc_regex, proc_names)

        # Report status and uptime for each monitored process
        count_by_status = defaultdict(int)
        for proc in monitored_processes:
            # Work on a per-process copy so the instance-level dimensions
            # are never modified, even if reporting fails part-way through.
            proc_dimensions = dict(dimensions)
            proc_dimensions[PROCESS_TAG] = proc['name']

            # A state name missing from STATUS is counted as UNKNOWN rather
            # than aborting the whole instance with a KeyError.
            status = STATUS.get(proc['statename'], 'UNKNOWN')
            count_by_status[status] += 1

            # NOTE(review): the status gauge is kept under the
            # proc_details_check flag, mirroring how proc_uptime_check
            # guards the uptime gauge - confirm against the intended
            # behaviour of the flag.
            if proc_details_check:
                self.log.info('process details: %s', self._build_message(proc))
                self.gauge(PROCESS_SERVICE_CHECK, PROCESS_STATE[status],
                           dimensions=proc_dimensions)

            if proc_uptime_check:
                self.gauge(PROCESS_UP_TIME_CHECK, self._extract_uptime(proc),
                           dimensions=proc_dimensions)

        # Report counts by status
        self.gauge(PROCESS_COUNT_UP_CHECK, count_by_status['OK'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_DOWN_CHECK, count_by_status['CRITICAL'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_UNKNOWN_CHECK, count_by_status['UNKNOWN'],
                   dimensions=dimensions)

    @staticmethod
    def _flag_enabled(instance, key):
        """Return the value of an on/off option, treating 'False'/'false' as off.

        The option defaults to True when absent. Any other value is returned
        unchanged and later evaluated for truthiness.
        """
        value = instance.get(key, True)
        if value in ('False', 'false'):
            return False
        return value

    def _fetch_processes(self, instance, server_name, dimensions):
        """Return supervisord's process list, reporting the server as down on failure.

        :raises Exception: on an XML-RPC fault, a socket error or an HTTP
            protocol error. For the latter two a DOWN value is reported on
            ``supervisord.can_connect`` before raising.
        """
        supe = self._connect(instance)
        try:
            return supe.getAllProcessInfo()
        except xmlrpclib.Fault as error:
            raise Exception(
                'An error occurred while reading process information: %s %s'
                % (error.faultCode, error.faultString)
            )
        except socket.error:
            sock = instance.get('socket')
            if sock is None:
                host = instance.get('host', DEFAULT_HOST)
                port = instance.get('port', DEFAULT_PORT)
                msg = 'Cannot connect to http://%s:%s. ' \
                      'Make sure supervisor is running and XML-RPC ' \
                      'inet interface is enabled.' % (host, port)
            else:
                msg = 'Cannot connect to %s. Make sure supervisor ' \
                      'is running and socket is enabled and socket file' \
                      ' has the right permissions.' % sock
            self._report_server_down(dimensions, msg)
            raise Exception(msg)
        except xmlrpclib.ProtocolError as error:
            if error.errcode == 401:  # authorization error
                msg = 'Username or password to %s are incorrect.' % server_name
            else:
                msg = "An error occurred while connecting to %s: "\
                      "%s %s " % (server_name, error.errcode, error.errmsg)
            self._report_server_down(dimensions, msg)
            raise Exception(msg)

    def _report_server_down(self, dimensions, msg):
        """Emit the connectivity gauge with the DOWN value and ``msg`` as metadata."""
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['DOWN'],
                   dimensions=dimensions, value_meta={'server_details': msg})

    @staticmethod
    def _select_processes(processes, proc_regex, proc_names):
        """Return the processes matching any regex or any listed name.

        With no filters at all, every process is returned. Regex matches
        come first (in pattern order), followed by name matches; a process
        is never listed twice. An invalid pattern raises ``re.error``.
        """
        # monitor all processes if no filters were specified
        if not proc_regex and not proc_names:
            return processes

        selected = []
        for pattern, process in itertools.product(proc_regex, processes):
            if re.match(pattern, process['name']) and process not in selected:
                selected.append(process)

        for process in processes:
            if process['name'] in proc_names and process not in selected:
                selected.append(process)
        return selected

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy for an instance and return its ``supervisor`` namespace.

        A ``socket`` entry selects the unix-socket transport; otherwise an
        HTTP URL is built from ``host``/``port`` and, when both are set,
        ``user``/``pass``.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            # NOTE(review): credentials are placed in the URL verbatim, so a
            # user or password containing '@', ':' or '/' yields a malformed
            # URL - consider URL-quoting them.
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return ``now - start`` in seconds for an active process, else 0.

        BACKOFF, RUNNING and STOPPING are treated as active states.
        """
        if proc['statename'] not in ('BACKOFF', 'RUNNING', 'STOPPING'):
            return 0
        return int(proc['now']) - int(proc['start'])

    @staticmethod
    def _build_message(proc):
        """Return a multi-line, human readable summary of one process entry.

        ``proc`` is left untouched: the formatted timestamps are added to a
        copy instead of being written back into the caller's dict.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])
        details = dict(proc)
        details['now_str'] = FORMAT_TIME(now)
        details['start_str'] = FORMAT_TIME(start)
        # A stop time of 0 is rendered as an empty string.
        details['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)

        return """Current time: %(now_str)s
Process name: %(name)s
Process group: %(group)s
Description: %(description)s
Error log file: %(stderr_logfile)s
Stdout log file: %(stdout_logfile)s
Log file: %(logfile)s
State: %(statename)s
Start time: %(start_str)s
Stop time: %(stop_str)s
Exit Status: %(exitstatus)s""" % details
|
167
monasca_setup/detection/plugins/supervisord.py
Normal file
167
monasca_setup/detection/plugins/supervisord.py
Normal file
@ -0,0 +1,167 @@
|
||||
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import logging
|
||||
|
||||
import monasca_setup.agent_config
|
||||
import monasca_setup.detection
|
||||
|
||||
|
||||
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Defaults
# File that _get_config() reads when the plugin is run without arguments.
supervisord_conf = '/root/.supervisord.cnf'
# Instance name used when no 'server' value is supplied.
supervisord_server_name = 'server0'
|
||||
|
||||
|
||||
class Supervisord(monasca_setup.detection.Plugin):
    """Detect supervisord process and setup configuration for monitoring.

    This plugin needs connection info for supervisord setup. There are two
    ways to provide it, either by a file placed in /root/.supervisord.cnf
    or by specifying the following arguments:
        - server (opt, arbitrary name to identify the supervisord server,
          defaults to server0)
        - socket (opt, required for socket connection type)
        - host (opt, defaults to localhost)
        - port (opt, defaults to 9001)
        - user (opt, only if username is configured)
        - pass (opt, only if password is configured)
        - proc_regex (opt, regex patterns for processes to monitor)
        - proc_names (opt, processes to monitor by name)
        - proc_details_check (opt, 'False' disables process details)
        - proc_uptime_check (opt, 'False' disables process uptime)
    proc_regex and proc_names are comma separated lists. The spellings
    password, process_regex and process_names are accepted as aliases of
    pass, proc_regex and proc_names.

    The file at /root/.supervisord.cnf should have this format:
        [client]
        server=server0
        socket=unix:///var/run/supervisor.sock
        proc_names=apache2,webapp,java
    """

    # (option name, attribute) pairs shared by the argument and file readers.
    # Aliases are listed before the canonical spelling so that the canonical
    # one wins when both are supplied as arguments.
    _OPTIONS = (
        ('server', 'server'),
        ('socket', 'socket'),
        ('host', 'host'),
        ('port', 'port'),
        ('user', 'user'),
        ('password', 'password'),
        ('pass', 'password'),
        ('process_regex', 'process_regex'),
        ('proc_regex', 'process_regex'),
        ('process_names', 'process_names'),
        ('proc_names', 'process_names'),
        ('proc_details_check', 'process_details_check'),
        ('proc_uptime_check', 'process_uptime_check'),
    )

    def _detect(self):
        """Run detection, set self.available True if the service is detected.

        """
        if monasca_setup.detection.find_process_cmdline('supervisord') is not None:
            self.available = True

    def _get_config(self):
        """Set the configuration to be used for connecting to supervisord

        Every option starts as None (``server`` starts as the module
        default). Without plugin arguments the values come from the file
        named by ``supervisord_conf``; otherwise from ``self.args``.

        :return:
        """
        for _, attr in self._OPTIONS:
            setattr(self, attr, None)
        self.server = supervisord_server_name

        if self.args is None:
            self._read_config(supervisord_conf)
            return

        for option, attr in self._OPTIONS:
            if option in self.args:
                setattr(self, attr, self.args[option])

    def _read_config(self, config_file):
        """Read the configuration setting member variables as appropriate.

        Only ``key=value`` lines inside the ``[client]`` section are used.
        Blank lines and lines starting with ``#`` or ``;`` are skipped, and
        any other section header ends the client section. The key must match
        an option name exactly, and the value is everything after the first
        ``=`` so values may themselves contain ``=``. An unreadable file is
        logged and otherwise ignored.

        :param config_file: The filename of the configuration to read and parse
        """
        option_attrs = dict(self._OPTIONS)
        in_client_section = False
        try:
            with open(config_file, "r") as conf:
                for row in conf:
                    line = row.strip()
                    if not line or line.startswith(('#', ';')):
                        continue
                    if line.startswith('['):
                        in_client_section = line.startswith('[client]')
                        if in_client_section:
                            log.info("\tUsing client credentials from %s", config_file)
                        continue
                    if not in_client_section:
                        continue
                    key, separator, value = line.partition('=')
                    attr = option_attrs.get(key.strip())
                    if separator and attr is not None:
                        setattr(self, attr, value.strip())
        except IOError:
            log.error("\tI/O error reading {:s}".format(config_file))

    @staticmethod
    def _split_list(to_split):
        """Split a comma separated string into stripped, non-empty items.

        Empty items are dropped because an empty regex would match every
        process name.
        """
        return [item for item in (part.strip() for part in to_split.split(','))
                if item]

    def build_config(self):
        """Build the config as a Plugins object and return.

        The supervisord process itself is always watched. The supervisord
        check instance is added on top of that; a failure while building it
        is logged and the process-watch configuration is still returned.
        """
        config = monasca_setup.agent_config.Plugins()
        # First watch the process
        config.merge(monasca_setup.detection.watch_process(['supervisord'],
                                                           'supervisord',
                                                           exact_match=False))
        log.info("\tWatching the supervisord process.")

        try:
            self._get_config()
            instance_config = {'name': self.server}

            # Options copied to the instance unchanged when they are set.
            for key, value in (('socket', self.socket),
                               ('host', self.host),
                               ('port', self.port),
                               ('user', self.user),
                               ('pass', self.password),
                               ('proc_details_check', self.process_details_check),
                               ('proc_uptime_check', self.process_uptime_check)):
                if value is not None:
                    instance_config[key] = value

            # The check expects these two options as lists.
            if self.process_regex is not None:
                instance_config['proc_regex'] = self._split_list(self.process_regex)
            if self.process_names is not None:
                instance_config['proc_names'] = self._split_list(self.process_names)

            config['supervisord'] = {'init_config': None, 'instances': [instance_config]}
        except Exception:
            log.exception('Error configuring the supervisord check plugin')

        return config

    def dependencies_installed(self):
        """Report whether the plugin's dependencies are present (always True)."""
        return True
|
Loading…
Reference in New Issue
Block a user