diff --git a/conf.d/supervisord.yaml.example b/conf.d/supervisord.yaml.example new file mode 100644 index 00000000..d9799400 --- /dev/null +++ b/conf.d/supervisord.yaml.example @@ -0,0 +1,66 @@ +# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# +# There are two ways to get started with the supervisord check. +# +# You can configure inet_http_server in /etc/supervisord.conf. Below is an +# example inet_http_server configuration: +# +# [inet_http_server] +# port:localhost:9001 +# username:user # optional +# password:pass # optional +# +# OR, you can use supervisorctl socket to communicate with supervisor. +# If supervisor is running as root, make sure chmod property is set +# to a permission accessible to non-root users. See the example below: +# +# [supervisorctl] +# serverurl=unix:///var/run//supervisor.sock +# +# [unix_http_server] +# file=/var/run/supervisor.sock +# chmod=775 +# +# Reload supervisor, specify the inet or unix socket server information +# in this yaml file along with an optional list of the processes you want +# to monitor per instance, and you're good to go! +# +# See http://supervisord.org/configuration.html for more information on +# configuring supervisord sockets and inet http servers. +# + +init_config: + +instances: +# - name: server0 # Required. An arbitrary name to identify the supervisord server +# host: localhost # Optional. Defaults to localhost. 
The host where supervisord server is running +# port: 9001 # Optional. Defaults to 9001. The port number. +# user: user # Optional. Required only if a username is configured. +# pass: pass # Optional. Required only if a password is configured. +# proc_regex: # Optional. Regex pattern[s] matching the names of processes to monitor +# - 'myprocess-\d\d$' +# proc_names: # Optional. The process to monitor within this supervisord instance. +# - apache2 # If not specified, the check will monitor all processes. +# - webapp +# - java +# proc_uptime_check: False # Optional. Defaults to True. +# proc_details_check: False # Optional. Defaults to True. +# - name: server1 +# host: localhost +# port: 9002 +# - name: server2 +# socket: unix:///var/run//supervisor.sock +# host: http://127.0.0.1 # Optional. Defaults to http://127.0.0.1 \ No newline at end of file diff --git a/monasca_agent/collector/checks_d/supervisord.py b/monasca_agent/collector/checks_d/supervisord.py new file mode 100644 index 00000000..cfa90a1c --- /dev/null +++ b/monasca_agent/collector/checks_d/supervisord.py @@ -0,0 +1,232 @@ +# (C) Copyright 2016 Hewlett Packard Enterprise Development Company LP +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
class Supervisord(checks.AgentCheck):
    """Agent check reporting supervisord server and process state.

    For each configured instance the check queries supervisord over XML-RPC
    (inet HTTP server, or unix socket when ``socket`` is configured) and
    emits:

    * SERVER_SERVICE_CHECK - whether the server could be queried,
    * PROCESS_SERVICE_CHECK / PROCESS_UP_TIME_CHECK - per monitored process,
    * PROCESS_COUNT_*_CHECK - number of monitored processes per status.
    """

    def check(self, instance):
        """Collect and report metrics for one supervisord instance.

        :param instance: dict for one entry under ``instances`` in the yaml
            configuration. Keys read: ``name`` (required), ``host``,
            ``port``, ``socket``, ``user``, ``pass``, ``proc_regex``,
            ``proc_names``, ``proc_details_check``, ``proc_uptime_check``.
        :raises Exception: if ``name`` is missing or blank, if supervisord
            cannot be queried, or if ``proc_regex`` / ``proc_names`` is not
            a list.
        :raises re.error: if an entry of ``proc_regex`` is not a valid
            regular expression.
        """
        server_name = instance.get('name')
        if not server_name or not server_name.strip():
            raise Exception("Supervisor server name not specified in yaml configuration.")

        proc_details_check = self._is_enabled(instance, 'proc_details_check')
        proc_uptime_check = self._is_enabled(instance, 'proc_uptime_check')

        dimensions = self._set_dimensions({'server_name': server_name}, instance)
        supe = self._connect(instance)

        # Gather all process information
        try:
            processes = supe.getAllProcessInfo()
        except xmlrpclib.Fault as error:
            raise Exception(
                'An error occurred while reading process information: %s %s'
                % (error.faultCode, error.faultString)
            )
        except socket.error:
            sock = instance.get('socket')
            if sock is None:
                host = instance.get('host', DEFAULT_HOST)
                port = instance.get('port', DEFAULT_PORT)
                msg = ('Cannot connect to http://%s:%s. '
                       'Make sure supervisor is running and XML-RPC '
                       'inet interface is enabled.' % (host, port))
            else:
                msg = ('Cannot connect to %s. Make sure supervisor '
                       'is running and socket is enabled and socket file'
                       ' has the right permissions.' % sock)
            self._report_server_down(dimensions, msg)
            raise Exception(msg)
        except xmlrpclib.ProtocolError as error:
            if error.errcode == 401:  # authorization error
                msg = 'Username or password to %s are incorrect.' % server_name
            else:
                msg = ("An error occurred while connecting to %s: "
                       "%s %s " % (server_name, error.errcode, error.errmsg))
            self._report_server_down(dimensions, msg)
            raise Exception(msg)

        # If we're here, we were able to connect to the server
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['UP'], dimensions=dimensions)

        # Filter monitored processes on configuration directives
        proc_regex = instance.get('proc_regex', [])
        if not isinstance(proc_regex, list):
            raise Exception("Invalid proc_regex.")

        proc_names = instance.get('proc_names', [])
        if not isinstance(proc_names, list):
            raise Exception("Invalid proc_names.")

        if not proc_regex and not proc_names:
            # monitor all processes if no filters were specified
            monitored_processes = processes
        else:
            # Compile once instead of once per (pattern, process) pair; an
            # invalid pattern surfaces here as re.error.
            patterns = [re.compile(pattern) for pattern in proc_regex]
            # A single pass selects each process at most once, so no
            # de-duplication against the result list is needed.
            monitored_processes = [
                process for process in processes
                if process['name'] in proc_names or
                any(pattern.match(process['name']) for pattern in patterns)
            ]

        # Report service checks and uptime for each process
        count_by_status = defaultdict(int)
        for proc in monitored_processes:
            # Per-process copy: the instance-level dimensions stay untouched
            # and nothing already handed to gauge() is modified afterwards.
            proc_dimensions = dict(dimensions)
            proc_dimensions[PROCESS_TAG] = proc['name']

            # A state name missing from STATUS is counted as UNKNOWN rather
            # than aborting the whole check with a KeyError.
            status = STATUS.get(proc['statename'], 'UNKNOWN')
            count_by_status[status] += 1

            # Report process details
            if proc_details_check:
                self.log.info('process details: %s', self._build_message(proc))
                self.gauge(PROCESS_SERVICE_CHECK, PROCESS_STATE[status],
                           dimensions=proc_dimensions)

            # Report Uptime
            if proc_uptime_check:
                self.gauge(PROCESS_UP_TIME_CHECK, self._extract_uptime(proc),
                           dimensions=proc_dimensions)

        # Report counts by status
        self.gauge(PROCESS_COUNT_UP_CHECK, count_by_status['OK'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_DOWN_CHECK, count_by_status['CRITICAL'],
                   dimensions=dimensions)
        self.gauge(PROCESS_COUNT_UNKNOWN_CHECK, count_by_status['UNKNOWN'],
                   dimensions=dimensions)

    def _report_server_down(self, dimensions, msg):
        """Emit the server gauge in its DOWN state with ``msg`` as value_meta."""
        self.gauge(SERVER_SERVICE_CHECK, SERVER_STATE['DOWN'],
                   dimensions=dimensions,
                   value_meta={'server_details': msg})

    @staticmethod
    def _is_enabled(instance, option):
        """Return whether the boolean ``option`` of ``instance`` is switched on.

        The option defaults to True. The value may arrive as text; in that
        case an empty string or any capitalisation of "false" disables it.
        Every other value is judged by its truthiness.
        """
        value = instance.get(option, True)
        if hasattr(value, 'strip'):
            return value.strip().lower() not in ('', 'false')
        return bool(value)

    @staticmethod
    def _connect(instance):
        """Build the XML-RPC proxy for ``instance`` and return its ``supervisor`` namespace.

        With a ``socket`` entry the proxy uses supervisor's own
        SupervisorTransport over that socket; otherwise it targets
        ``http://[user:pass@]host:port/RPC2``. Credentials are only put in
        the URL when both ``user`` and ``pass`` are set. No request is sent
        here.
        """
        sock = instance.get('socket')
        if sock is not None:
            host = instance.get('host', DEFAULT_SOCKET_IP)
            transport = supervisor.xmlrpc.SupervisorTransport(None, None, sock)
            server = xmlrpclib.ServerProxy(host, transport=transport)
        else:
            host = instance.get('host', DEFAULT_HOST)
            port = instance.get('port', DEFAULT_PORT)
            user = instance.get('user')
            password = instance.get('pass')
            auth = '%s:%s@' % (user, password) if user and password else ''
            server = xmlrpclib.Server('http://%s%s:%s/RPC2' % (auth, host, port))
        return server.supervisor

    @staticmethod
    def _extract_uptime(proc):
        """Return ``now - start`` in seconds for an active process, else 0.

        A process counts as active while its ``statename`` is BACKOFF,
        RUNNING or STOPPING.
        """
        start, now = int(proc['start']), int(proc['now'])
        active_state = proc['statename'] in ['BACKOFF', 'RUNNING', 'STOPPING']
        return now - start if active_state else 0

    @staticmethod
    def _build_message(proc):
        """Return a multi-line, human readable description of ``proc``.

        :param proc: process info dict; must provide ``start``, ``stop``,
            ``now``, ``name``, ``group``, ``description``,
            ``stderr_logfile``, ``stdout_logfile``, ``logfile``,
            ``statename`` and ``exitstatus``.
        """
        start, stop, now = int(proc['start']), int(proc['stop']), int(proc['now'])
        # Format from a copy so the caller's dict does not gain the *_str
        # helper keys as a side effect.
        details = dict(proc)
        details['now_str'] = FORMAT_TIME(now)
        details['start_str'] = FORMAT_TIME(start)
        # A stop time of 0 is rendered as an empty string.
        details['stop_str'] = '' if stop == 0 else FORMAT_TIME(stop)

        template = '\n'.join([
            'Current time: %(now_str)s',
            'Process name: %(name)s',
            'Process group: %(group)s',
            'Description: %(description)s',
            'Error log file: %(stderr_logfile)s',
            'Stdout log file: %(stdout_logfile)s',
            'Log file: %(logfile)s',
            'State: %(statename)s',
            'Start time: %(start_str)s',
            'Stop time: %(stop_str)s',
            'Exit Status: %(exitstatus)s',
        ])
        return template % details
def _read_config(self, config_file):
    """Read connection settings from the ``[client]`` section of a file.

    Only ``key=value`` lines inside the ``[client]`` section are used; blank
    lines, lines starting with ``#`` or ``;``, unknown keys and every other
    section are ignored. Recognised keys and the attribute each one sets:

    * ``server``, ``socket``, ``host``, ``port``, ``user`` - same name
    * ``pass`` or ``password`` - ``self.password``
    * ``proc_regex`` or ``process_regex`` - ``self.process_regex``
    * ``proc_names`` or ``process_names`` - ``self.process_names``
    * ``proc_details_check`` - ``self.process_details_check``
    * ``proc_uptime_check`` - ``self.process_uptime_check``

    Values are stored as stripped strings. An unreadable file is logged and
    leaves the attributes untouched.

    :param config_file: The filename of the configuration to read and parse
    """
    key_to_attribute = {
        'server': 'server',
        'socket': 'socket',
        'host': 'host',
        'port': 'port',
        'user': 'user',
        'pass': 'password',
        'password': 'password',
        'proc_regex': 'process_regex',
        'process_regex': 'process_regex',
        'proc_names': 'process_names',
        'process_names': 'process_names',
        'proc_details_check': 'process_details_check',
        'proc_uptime_check': 'process_uptime_check',
    }
    client_section = False
    try:
        with open(config_file, "r") as conf:
            for row in conf:
                line = row.strip()
                if not line or line.startswith(('#', ';')):
                    continue
                if line.startswith('[') and line.endswith(']'):
                    # Any section header ends the previous section, so keys
                    # that follow a later section are not picked up.
                    client_section = line == '[client]'
                    if client_section:
                        log.info("\tUsing client credentials from {:s}".format(config_file))
                    continue
                if not client_section:
                    continue
                # Split on the first '=' only: values such as passwords or
                # regular expressions may themselves contain '='.
                key, separator, value = line.partition('=')
                attribute = key_to_attribute.get(key.strip())
                if separator and attribute is not None:
                    setattr(self, attribute, value.strip())
    except IOError:
        log.error("\tI/O error reading {:s}".format(config_file))
+ """ + config = monasca_setup.agent_config.Plugins() + # First watch the process + config.merge(monasca_setup.detection.watch_process(['supervisord'], + 'supervisord', + exact_match=False)) + log.info("\tWatching the supervisord process.") + + try: + self._get_config() + instance_config = {'name': self.server} + if self.socket is not None: + instance_config['socket'] = self.socket + if self.host is not None: + instance_config['host'] = self.host + if self.port is not None: + instance_config['port'] = self.port + if self.user is not None: + instance_config['user'] = self.user + if self.password is not None: + instance_config['pass'] = self.password + if self.process_regex is not None: + instance_config['proc_regex'] = self._split_list(self.process_regex) + if self.process_names is not None: + instance_config['proc_names'] = self._split_list(self.process_names) + if self.process_details_check is not None: + instance_config['proc_details_check'] = self.process_details_check + if self.process_uptime_check is not None: + instance_config['proc_uptime_check'] = self.process_uptime_check + + config['supervisord'] = {'init_config': None, 'instances': [instance_config]} + except Exception: + log.exception('Error configuring the supervisord check plugin') + + return config + + def dependencies_installed(self): + return True