Porting Supervisord check from DataDog

DD's service_check function does not exist in Monasca. Since these
metrics are still valuable, they have been replaced with gauge checks.

Also adding corresponding monasca_setup detection plugin for supervisord

Change-Id: I0276a51aaea21b5684ec114b4e31e05ad0dddfdf
This commit is contained in:
dagnello 2016-01-12 12:09:40 -08:00 committed by Steve Leon
parent 641b37b5bb
commit c483d95b0b
3 changed files with 465 additions and 0 deletions

View File

@ -0,0 +1,66 @@
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# There are two ways to get started with the supervisord check.
#
# You can configure inet_http_server in /etc/supervisord.conf. Below is an
# example inet_http_server configuration:
#
# [inet_http_server]
# port:localhost:9001
# username:user # optional
# password:pass # optional
#
# OR, you can use supervisorctl socket to communicate with supervisor.
# If supervisor is running as root, make sure chmod property is set
# to a permission accessible to non-root users. See the example below:
#
# [supervisorctl]
# serverurl=unix:///var/run/supervisor.sock
#
# [unix_http_server]
# file=/var/run/supervisor.sock
# chmod=775
#
# Reload supervisor, specify the inet or unix socket server information
# in this yaml file along with an optional list of the processes you want
# to monitor per instance, and you're good to go!
#
# See http://supervisord.org/configuration.html for more information on
# configuring supervisord sockets and inet http servers.
#
init_config:
instances:
# - name: server0 # Required. An arbitrary name to identify the supervisord server
# host: localhost # Optional. Defaults to localhost. The host where supervisord server is running
# port: 9001 # Optional. Defaults to 9001. The port number.
# user: user # Optional. Required only if a username is configured.
# pass: pass # Optional. Required only if a password is configured.
# proc_regex: # Optional. Regex pattern[s] matching the names of processes to monitor
# - 'myprocess-\d\d$'
# proc_names: # Optional. The processes to monitor within this supervisord instance.
# - apache2 # If not specified, the check will monitor all processes.
# - webapp
# - java
# proc_uptime_check: False # Optional. Defaults to True.
# proc_details_check: False # Optional. Defaults to True.
# - name: server1
# host: localhost
# port: 9002
# - name: server2
# socket: unix:///var/run/supervisor.sock
# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1

View File

@ -0,0 +1,232 @@
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
# stdlib
from collections import defaultdict
import itertools
import re
import socket
import time
import xmlrpclib
# 3p
import supervisor.xmlrpc
# project
import monasca_agent.collector.checks as checks
# Connection defaults used when the instance configuration omits them.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = '9001'
DEFAULT_SOCKET_IP = 'http://127.0.0.1'

# Maps a supervisord process state name to a coarse health status.
STATUS = {
    'STOPPED': 'CRITICAL',
    'STARTING': 'UNKNOWN',
    'RUNNING': 'OK',
    'BACKOFF': 'CRITICAL',
    'STOPPING': 'CRITICAL',
    'EXITED': 'CRITICAL',
    'FATAL': 'CRITICAL',
    'UNKNOWN': 'UNKNOWN',
}

# Human readable label for each health status.
PROCESS_STATUS = {
    'CRITICAL': 'down',
    'OK': 'up',
    'UNKNOWN': 'unknown',
}

# Gauge value reported for each health status.
PROCESS_STATE = {
    'CRITICAL': 0,
    'OK': 1,
    'UNKNOWN': -1,
}

# Gauge value reported for the server connection metric.
# Note the polarity: 0 means the server is up, 1 means it is down.
SERVER_STATE = {
    'DOWN': 1,
    'UP': 0,
}

# Dimension names
SERVER_TAG = 'supervisord_server'
PROCESS_TAG = 'supervisord_process'


def FORMAT_TIME(x):
    """Format the epoch timestamp ``x`` as local 'YYYY-MM-DD HH:MM:SS'."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x))


# Metric names
SERVER_SERVICE_CHECK = 'supervisord.can_connect'
PROCESS_SERVICE_CHECK = 'supervisord.process.status'
PROCESS_UP_TIME_CHECK = 'supervisord.process.uptime'
PROCESS_COUNT_UP_CHECK = 'supervisord.process.count.status_up'
PROCESS_COUNT_DOWN_CHECK = 'supervisord.process.count.status_down'
PROCESS_COUNT_UNKNOWN_CHECK = 'supervisord.process.count.status_unknown'
class Supervisord(checks.AgentCheck):
    """Report supervisord server reachability and process state as gauges.

    For each configured instance the check calls ``getAllProcessInfo`` on
    the supervisord XML-RPC interface (inet HTTP server or unix socket) and
    emits:

    * ``supervisord.can_connect`` -- 0 when the server answered, 1 when the
      connection failed
    * ``supervisord.process.status`` -- 1 up, 0 down, -1 unknown
      (only when ``proc_details_check`` is enabled)
    * ``supervisord.process.uptime`` -- seconds since the process started
      (only when ``proc_uptime_check`` is enabled)
    * ``supervisord.process.count.status_up`` / ``status_down`` /
      ``status_unknown`` -- number of monitored processes per status
    """

    def check(self, instance):
        """Collect the supervisord metrics for one configured instance.

        :param instance: instance dict from the yaml configuration. ``name``
            is required; ``host``, ``port``, ``user``, ``pass``, ``socket``,
            ``proc_regex``, ``proc_names``, ``proc_details_check`` and
            ``proc_uptime_check`` are optional.
        :raises Exception: when the name is missing, the server cannot be
            queried, or ``proc_regex``/``proc_names`` is not a list.
        """
        server_name = instance.get('name')
        if not server_name or not server_name.strip():
            raise Exception("Supervisor server name not specified in yaml configuration.")

        proc_details_check = self._read_flag(instance, 'proc_details_check')
        proc_uptime_check = self._read_flag(instance, 'proc_uptime_check')

        dimensions = self._set_dimensions({'server_name': server_name}, instance)

        # Gather all process information
        processes = self._fetch_processes(instance, server_name, dimensions)

        # If we're here, we were able to connect to the server
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['UP'], dimensions=dimensions)

        # Filter monitored processes on configuration directives
        monitored_processes = self._select_processes(instance, processes)

        # Report service checks and uptime for each process
        count_by_status = defaultdict(int)
        for proc in monitored_processes:
            # Each process gets its own dimensions dict so the server-level
            # dict is never mutated while metrics referencing it are pending.
            proc_dimensions = dict(dimensions)
            proc_dimensions[PROCESS_TAG] = proc['name']

            # Retrieve status and update status count; a state name that is
            # not listed in STATUS is counted as UNKNOWN instead of failing.
            status = STATUS.get(proc['statename'], 'UNKNOWN')
            count_by_status[status] += 1

            # Report process details
            if proc_details_check:
                self.log.info('process details: %s', self._build_message(proc))
                self.gauge(PROCESS_SERVICE_CHECK, PROCESS_STATE[status],
                           dimensions=proc_dimensions)

            # Report Uptime
            if proc_uptime_check:
                self.gauge(PROCESS_UP_TIME_CHECK, self._extract_uptime(proc),
                           dimensions=proc_dimensions)

        # Report counts by status (defaultdict yields 0 for absent statuses)
        self.gauge(PROCESS_COUNT_UP_CHECK, count_by_status['OK'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_DOWN_CHECK, count_by_status['CRITICAL'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_UNKNOWN_CHECK, count_by_status['UNKNOWN'],
                   dimensions=dimensions)

    @staticmethod
    def _read_flag(instance, key):
        """Return the instance option ``key``, defaulting to True.

        The strings 'False' and 'false' are converted to the boolean False
        because the option may arrive as text rather than as a yaml boolean.
        """
        value = instance.get(key, True)
        if value in ['False', 'false']:
            return False
        return value

    def _fetch_processes(self, instance, server_name, dimensions):
        """Return the process list reported by ``getAllProcessInfo``.

        On a socket or protocol error the ``supervisord.can_connect`` gauge
        is reported as down (with the message as value_meta) before raising.

        :raises Exception: on XML-RPC faults, socket errors and protocol
            errors; the message describes the failure.
        """
        supe = self._connect(instance)
        try:
            return supe.getAllProcessInfo()
        except xmlrpclib.Fault as error:
            raise Exception(
                'An error occurred while reading process information: %s %s'
                % (error.faultCode, error.faultString)
            )
        except socket.error:
            sock = instance.get('socket')
            if sock is None:
                host = instance.get('host', DEFAULT_HOST)
                port = instance.get('port', DEFAULT_PORT)
                msg = ('Cannot connect to http://%s:%s. '
                       'Make sure supervisor is running and XML-RPC '
                       'inet interface is enabled.' % (host, port))
            else:
                msg = ('Cannot connect to %s. Make sure supervisor '
                       'is running and socket is enabled and socket file'
                       ' has the right permissions.' % sock)
        except xmlrpclib.ProtocolError as error:
            if error.errcode == 401:  # authorization error
                msg = 'Username or password to %s are incorrect.' % server_name
            else:
                msg = ("An error occurred while connecting to %s: "
                       "%s %s " % (server_name, error.errcode, error.errmsg))

        # Only the connection-failure handlers fall through to here.
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['DOWN'],
                   dimensions=dimensions, value_meta={'server_details': msg})
        raise Exception(msg)

    @staticmethod
    def _select_processes(instance, processes):
        """Return the processes selected by ``proc_regex`` and ``proc_names``.

        All processes are returned when neither filter is configured.
        Otherwise a process is selected when its name matches one of the
        regex patterns (``re.match``, i.e. anchored at the start) or is
        listed in ``proc_names``.

        :raises Exception: when a filter is configured but is not a list.
        :raises re.error: when a pattern in ``proc_regex`` is invalid.
        """
        proc_regex = instance.get('proc_regex', [])
        if proc_regex is None:
            # A key left empty in yaml is parsed as None: treat as no filter.
            proc_regex = []
        if not isinstance(proc_regex, list):
            raise Exception("Invalid proc_regex.")

        proc_names = instance.get('proc_names', [])
        if proc_names is None:
            proc_names = []
        if not isinstance(proc_names, list):
            raise Exception("Invalid proc_names.")

        # monitor all processes if no filters were specified
        if not proc_regex and not proc_names:
            return list(processes)

        selected = []
        # Compile once instead of re-parsing each pattern for every process.
        patterns = [re.compile(pattern) for pattern in proc_regex]
        for pattern, process in itertools.product(patterns, processes):
            if pattern.match(process['name']) and process not in selected:
                selected.append(process)
        for process in processes:
            if process['name'] in proc_names and process not in selected:
                selected.append(process)
        return selected

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy and return its ``supervisor`` namespace.

        A configured ``socket`` selects the unix socket transport; otherwise
        an HTTP URL is built from ``host``/``port`` and, when both ``user``
        and ``pass`` are set, the credentials.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return ``now - start`` in seconds for an active process, else 0.

        BACKOFF, RUNNING and STOPPING are treated as active states.
        """
        start, now = int(proc['start']), int(proc['now'])
        if proc['statename'] in ('BACKOFF', 'RUNNING', 'STOPPING'):
            return now - start
        return 0

    @staticmethod
    def _build_message(proc):
        """Return a multi-line, human readable description of ``proc``.

        The formatted timestamps are added to a copy, so the process dict
        passed in by the caller is left untouched.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])
        details = dict(proc)
        details['now_str'] = FORMAT_TIME(now)
        details['start_str'] = FORMAT_TIME(start)
        # A stop time of 0 is rendered as an empty string.
        details['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)
        return """Current time: %(now_str)s
Process name: %(name)s
Process group: %(group)s
Description: %(description)s
Error log file: %(stderr_logfile)s
Stdout log file: %(stdout_logfile)s
Log file: %(logfile)s
State: %(statename)s
Start time: %(start_str)s
Stop time: %(stop_str)s
Exit Status: %(exitstatus)s""" % details

View File

@ -0,0 +1,167 @@
# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import logging
import monasca_setup.agent_config
import monasca_setup.detection
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Defaults
# Path of the file that _get_config reads when no plugin arguments are given.
supervisord_conf = '/root/.supervisord.cnf'
# Instance name used when no server name is supplied.
supervisord_server_name = 'server0'
class Supervisord(monasca_setup.detection.Plugin):
    """Detect supervisord process and setup configuration for monitoring.

    This plugin needs connection info for supervisord setup. There are two
    ways to provide it, either by a file placed in /root/.supervisord.cnf
    or by specifying the following arguments:
        - server (opt, arbitrary name to identify the supervisord server,
          defaults to server0)
        - socket (opt, required for socket connection type)
        - host (opt)
        - port (opt)
        - user (opt, only if username is configured)
        - pass (opt, only if password is configured; "password" is
          accepted as an alias)
        - proc_regex (opt, regex patterns for processes to monitor;
          "process_regex" is accepted as an alias)
        - proc_names (opt, processes to monitor by name; "process_names"
          is accepted as an alias)
        - proc_details_check (opt)
        - proc_uptime_check (opt)
    proc_regex and proc_names are comma separated lists.

    The file at /root/.supervisord.cnf uses the same option names and
    should have this format:
        [client]
        server=server0
        socket=unix:///var/run/supervisor.sock
        proc_names=apache2,webapp,java
    """

    # (option name, attribute) pairs. Aliases come before the short names
    # so that the short name wins when both spellings are supplied.
    _OPTION_ATTRS = (
        ('password', 'password'),
        ('process_regex', 'process_regex'),
        ('process_names', 'process_names'),
        ('server', 'server'),
        ('socket', 'socket'),
        ('host', 'host'),
        ('port', 'port'),
        ('user', 'user'),
        ('pass', 'password'),
        ('proc_regex', 'process_regex'),
        ('proc_names', 'process_names'),
        ('proc_details_check', 'process_details_check'),
        ('proc_uptime_check', 'process_uptime_check'),
    )

    def _detect(self):
        """Run detection, set self.available True if the service is detected.

        """
        if monasca_setup.detection.find_process_cmdline('supervisord') is not None:
            self.available = True

    def _get_config(self):
        """Set the configuration to be used for connecting to supervisord

        Every option starts as None (the server name starts as the module
        default) and is then filled from the plugin arguments when they were
        given, otherwise from the configuration file.

        :return:
        """
        # Set defaults and read config or use arguments
        self.server = supervisord_server_name
        self.socket = None
        self.host = None
        self.port = None
        self.user = None
        self.password = None
        self.process_regex = None
        self.process_names = None
        self.process_details_check = None
        self.process_uptime_check = None

        if self.args is None:
            self._read_config(supervisord_conf)
            return

        for option, attr in self._OPTION_ATTRS:
            if option in self.args:
                setattr(self, attr, self.args[option])

    def _read_config(self, config_file):
        """Read the configuration setting member variables as appropriate.

        Only ``key=value`` lines inside the ``[client]`` section are used.
        Keys are matched exactly and values may themselves contain ``=``.
        Blank lines and lines starting with ``#`` or ``;`` are skipped. An
        unreadable file is logged and otherwise ignored.

        :param config_file: The filename of the configuration to read and parse
        """
        option_attrs = dict(self._OPTION_ATTRS)
        client_section = False
        try:
            with open(config_file, "r") as conf:
                for row in conf:
                    line = row.strip()
                    if not line or line.startswith(('#', ';')):
                        continue
                    if line.startswith('['):
                        # Any other section header ends the client section.
                        client_section = line.startswith('[client]')
                        if client_section:
                            log.info("\tUsing client credentials from {:s}".format(config_file))
                        continue
                    if not client_section:
                        continue
                    # partition keeps everything after the first '=' intact.
                    key, separator, value = line.partition('=')
                    attr = option_attrs.get(key.strip())
                    if separator and attr is not None:
                        setattr(self, attr, value.strip())
        except IOError:
            log.error("\tI/O error reading {:s}".format(config_file))

    @staticmethod
    def _split_list(to_split):
        """Split a comma separated string into a list of stripped items.

        Empty items (e.g. from a trailing comma) are dropped, because an
        empty regex pattern would match every process name.
        """
        return [x.strip() for x in to_split.split(',') if x.strip()]

    def build_config(self):
        """Build the config as a Plugins object and return.

        The supervisord process itself is always watched. The supervisord
        check instance is added on top of that; an error while assembling it
        is logged and the process watch is still returned.
        """
        config = monasca_setup.agent_config.Plugins()
        # First watch the process
        config.merge(monasca_setup.detection.watch_process(['supervisord'],
                                                           'supervisord',
                                                           exact_match=False))
        log.info("\tWatching the supervisord process.")

        try:
            self._get_config()
            instance_config = {'name': self.server}
            # Only options that were actually supplied are written out, so
            # the agent check can apply its own defaults for the rest.
            optional = (
                ('socket', self.socket),
                ('host', self.host),
                ('port', self.port),
                ('user', self.user),
                ('pass', self.password),
            )
            for key, value in optional:
                if value is not None:
                    instance_config[key] = value
            if self.process_regex is not None:
                instance_config['proc_regex'] = self._split_list(self.process_regex)
            if self.process_names is not None:
                instance_config['proc_names'] = self._split_list(self.process_names)
            if self.process_details_check is not None:
                instance_config['proc_details_check'] = self.process_details_check
            if self.process_uptime_check is not None:
                instance_config['proc_uptime_check'] = self.process_uptime_check

            config['supervisord'] = {'init_config': None, 'instances': [instance_config]}
        except Exception:
            log.exception('Error configuring the supervisord check plugin')
        return config

    def dependencies_installed(self):
        """Report that the plugin's dependencies are available.

        NOTE(review): the agent check imports supervisor.xmlrpc; consider
        verifying here that the supervisor package is importable.
        """
        return True