monasca-agent/monasca_agent/common/check_status.py

713 lines
23 KiB
Python

"""
This module contains classes which are used to occasionally persist the status
of checks.
"""
# stdlib
import collections
import datetime
import logging
import logging.handlers
import os
import pickle
import platform
import sys
import tempfile
import time

# 3rd party
import ntplib
import yaml

# project
import config
import util
# Status values shared by the instance, check, emitter and collector statuses
# defined in this module.
STATUS_OK = 'OK'
STATUS_ERROR = 'ERROR'
STATUS_WARNING = 'WARNING'
# get_ntp_info() highlights the NTP offset (red, bold) when its absolute value
# exceeds this threshold; the offset is rendered with an "s" suffix.
NTP_OFFSET_THRESHOLD = 600
# Module-level logger.
log = logging.getLogger(__name__)
class Stylizer(object):
    """Decorate text with ANSI escape sequences for terminal output.

    Styling is a no-op unless the class-level ``ENABLED`` flag is true.
    """

    # Style name -> numeric code used inside the "\033[<code>m" sequence.
    STYLES = {
        'bold': 1,
        'grey': 30,
        'red': 31,
        'green': 32,
        'yellow': 33,
        'blue': 34,
        'magenta': 35,
        'cyan': 36,
        'white': 37,
    }

    # Ready-made escape sequences.
    HEADER = '\033[1m'
    # NOTE(review): code 2 is usually "faint" in ANSI SGR (underline is 4) --
    # confirm the intent before relying on this constant.
    UNDERLINE = '\033[2m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    RESET = '\033[0m'

    # Global switch; stylize() returns its input untouched while this is False.
    ENABLED = False

    @classmethod
    def stylize(cls, text, *styles):
        """Return ``text`` wrapped in the escape codes for ``styles``.

        Each style name is looked up in ``STYLES`` (an unknown name raises
        ``KeyError``) and prepended in turn, so the last style given ends up
        outermost.  A reset sequence is appended at the end.  When styling is
        disabled the text is returned as-is.
        """
        if not cls.ENABLED:
            return text
        # No escaping of the text itself: not worth the complexity here.
        escape = '\033[%dm%s'
        styled = text
        for name in styles:
            styled = escape % (cls.STYLES[name], styled)
        reset = escape % (0, '')
        return styled + reset
# A small convenience wrapper.
def style(*args):
    """Shorthand for ``Stylizer.stylize``; forwards every positional argument."""
    stylize = Stylizer.stylize
    return stylize(*args)
def logger_info():
    """Describe where the root logger sends its output.

    Returns:
        A comma-separated string with one entry per recognised root-logger
        handler: the stream name for ``StreamHandler`` instances (this
        includes file handlers) and ``syslog:<address>`` for
        ``SysLogHandler`` instances.  ``"No loggers configured"`` is returned
        when the root logger has no handlers at all.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    root_handlers = logging.getLogger().handlers
    if not root_handlers:
        return "No loggers configured"

    loggers = []
    for handler in root_handlers:
        if isinstance(handler, logging.StreamHandler):
            stream = handler.stream
            # Not every stream has a ``name`` (e.g. in-memory buffers), and a
            # delayed FileHandler has no stream yet: fall back gracefully
            # instead of raising AttributeError.
            name = getattr(stream, 'name', None)
            if name is None:
                name = getattr(handler, 'baseFilename', None)
            if name is None:
                name = repr(stream)
            # Streams opened from a file descriptor carry an integer name.
            loggers.append("%s" % (name,))
        if isinstance(handler, logging.handlers.SysLogHandler):
            address = handler.address
            if isinstance(address, string_types):
                # Unix domain socket path.
                loggers.append('syslog:%s' % address)
            else:
                # (host, port) pair.
                loggers.append('syslog:(%s, %s)' % tuple(address))
    return ', '.join(loggers)
def get_ntp_info():
    """Query ``pool.ntp.org`` and report the local clock offset.

    Returns:
        A ``(offset, styles)`` tuple: the offset taken from the NTP response
        and the list of style names to render it with (``['red', 'bold']``
        when its absolute value exceeds ``NTP_OFFSET_THRESHOLD``, otherwise an
        empty list).
    """
    response = ntplib.NTPClient().request('pool.ntp.org', version=3)
    offset = response.offset
    drift_too_large = abs(offset) > NTP_OFFSET_THRESHOLD
    styles = ['red', 'bold'] if drift_too_large else []
    return offset, styles
class AgentStatus(object):
    """Base class used to load and save status messages to the filesystem.

    A status object is pickled into the system temp directory by
    ``persist()`` and read back by ``load_latest_status()``.  Subclasses set
    ``NAME`` and implement ``has_error()`` and ``body_lines()``.
    """

    NAME = None
    agent_config = config.Config()
    # Overwritten by print_latest_status(); the default keeps rendering
    # usable when that entry point was never called.
    verbose = False

    def __init__(self):
        self.created_at = datetime.datetime.now()
        self.created_by_pid = os.getpid()

    def has_error(self):
        """Return True when the status reports an error (subclass hook)."""
        raise NotImplementedError

    def persist(self):
        """Pickle this status to its per-class file; failures are only logged."""
        try:
            path = self._get_pickle_path()
            log.debug("Persisting status to %s", path)
            # Binary mode: pickle data is bytes, and text mode can corrupt
            # it on platforms that translate newlines.
            with open(path, 'wb') as f:
                pickle.dump(self, f)
        except Exception:
            log.exception("Error persisting status")

    def created_seconds_ago(self):
        """Return the whole number of seconds elapsed since creation."""
        td = datetime.datetime.now() - self.created_at
        # ``td.seconds`` alone drops whole days, which would make a status
        # older than a day look fresh.
        return td.days * 86400 + td.seconds

    def render(self):
        """Return the full textual report (header followed by indented body)."""
        indent = " "
        lines = self._header_lines(indent)
        lines += [indent + l for l in self.body_lines()]
        lines += ["", ""]
        return "\n".join(lines)

    @classmethod
    def _title_lines(cls):
        """Return the boxed "<NAME> (v <version>)" title lines."""
        name_line = "%s (v %s)" % (cls.NAME, AgentStatus.agent_config.get_version())
        rule = "=" * len(name_line)
        return [
            rule,
            "%s" % name_line,
            rule,
            "",
        ]

    def _header_lines(self, indent):
        """Return the title plus the indented "key: value" header fields."""
        # Don't indent the title.
        lines = self._title_lines()
        age = self.created_seconds_ago()
        # The status date is coloured red when the status is too old.
        styles = ['red', 'bold'] if age > 120 else []
        fields = [
            (
                style("Status date", *styles),
                style("%s (%ss ago)" %
                      (self.created_at.strftime('%Y-%m-%d %H:%M:%S'), age),
                      *styles)
            ),
            ("Pid", self.created_by_pid),
            ("Platform", platform.platform()),
            ("Python Version", platform.python_version()),
            ("Logs", logger_info()),
        ]
        for key, value in fields:
            lines.append(indent + "%s: %s" % (key, value))
        return lines + [""]

    def to_dict(self):
        """Return the creating pid and the formatted status date as a dict."""
        return {
            'pid': self.created_by_pid,
            'status_date': "%s (%ss ago)" % (self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                                             self.created_seconds_ago())}

    @classmethod
    def _not_running_message(cls):
        """Return the message shown when no persisted status is available."""
        lines = cls._title_lines() + [
            style(" %s is not running." % cls.NAME, 'red'),
            style(""" You can get more details in the logs: %s""" % logger_info(), 'red'),
            "",
            ""
        ]
        return "\n".join(lines)

    @classmethod
    def remove_latest_status(cls):
        """Delete the persisted status file; a missing file is not an error."""
        log.debug("Removing latest status")
        try:
            os.remove(cls._get_pickle_path())
        except OSError:
            pass

    @classmethod
    def load_latest_status(cls):
        """Return the last persisted status, or None if the file can't be read."""
        try:
            # SECURITY NOTE: the file lives in the shared temp directory and
            # unpickling executes whatever the pickle describes; this is only
            # safe while nobody else can write to that path.
            with open(cls._get_pickle_path(), 'rb') as f:
                return pickle.load(f)
        except IOError:
            log.info("Couldn't load latest status")
            return None

    @classmethod
    def print_latest_status(cls, verbose=False):
        """Write the latest status (or a "not running" notice) to stdout.

        Returns:
            0 when a status was loaded and reports no error, 1 when it
            reports an error, and -1 when no status could be loaded.
        """
        cls.verbose = verbose
        Stylizer.ENABLED = False
        try:
            if sys.stdout.isatty():
                Stylizer.ENABLED = True
        except Exception:
            # Don't worry if we can't enable the stylizer.
            pass
        message = cls._not_running_message()
        exit_code = -1
        module_status = cls.load_latest_status()
        if module_status:
            message = module_status.render()
            exit_code = 0
            if module_status.has_error():
                exit_code = 1
        sys.stdout.write(message)
        return exit_code

    @classmethod
    def _get_pickle_path(cls):
        """Return the temp-directory path of this class's pickle file."""
        return os.path.join(tempfile.gettempdir(), cls.__name__ + '.pickle')
class InstanceStatus(object):
    """Outcome of running a single check instance.

    Args:
        instance_id: identifier of the instance within its check.
        status: one of the module's ``STATUS_*`` values.
        error: optional error object; its ``repr()`` is stored.
        tb: optional traceback text.
        warnings: optional list of warning messages.
        metric_count: optional number of metrics collected by the instance.
    """

    def __init__(self, instance_id, status, error=None, tb=None, warnings=None, metric_count=None):
        self.instance_id = instance_id
        self.status = status
        # Keep "no error" as None: repr(None) would store the truthy string
        # 'None', which reads like a real error message.
        self.error = repr(error) if error is not None else None
        self.traceback = tb
        self.warnings = warnings
        self.metric_count = metric_count

    def has_error(self):
        """Return True when the instance status is STATUS_ERROR."""
        return self.status == STATUS_ERROR

    def has_warnings(self):
        """Return True when the instance status is STATUS_WARNING."""
        return self.status == STATUS_WARNING
class CheckStatus(object):
    """Aggregated result of one check: its instances plus collection totals."""

    def __init__(self, check_name, instance_statuses, metric_count,
                 event_count, init_failed_error=None,
                 init_failed_traceback=None, library_versions=None):
        self.name = check_name
        self.instance_statuses = instance_statuses
        self.metric_count, self.event_count = metric_count, event_count
        self.init_failed_error = init_failed_error
        self.init_failed_traceback = init_failed_traceback
        self.library_versions = library_versions

    @property
    def status(self):
        """STATUS_ERROR if initialisation or any instance failed, else STATUS_OK."""
        if self.init_failed_error:
            return STATUS_ERROR
        any_failed = any(entry.status == STATUS_ERROR
                         for entry in self.instance_statuses)
        return STATUS_ERROR if any_failed else STATUS_OK

    def has_error(self):
        """Return True when ``status`` is STATUS_ERROR."""
        return self.status == STATUS_ERROR
class EmitterStatus(object):
    """Result of one emitter run; a truthy ``error`` marks it as failed."""

    def __init__(self, name, error=None):
        self.name = name
        # Only a truthy error is recorded (as its repr); anything else is None.
        self.error = repr(error) if error else None

    @property
    def status(self):
        """STATUS_ERROR when an error was recorded, otherwise STATUS_OK."""
        return STATUS_ERROR if self.error else STATUS_OK

    def has_error(self):
        """Return True when ``status`` is anything other than STATUS_OK."""
        return self.status != STATUS_OK
class CollectorStatus(AgentStatus):
    """Status of the collector: per-check and per-emitter results."""

    NAME = 'Collector'

    def __init__(self, check_statuses=None, emitter_statuses=None):
        AgentStatus.__init__(self)
        self.check_statuses = check_statuses or []
        self.emitter_statuses = emitter_statuses or []

    @property
    def status(self):
        """STATUS_ERROR if any check is in error, otherwise STATUS_OK."""
        if any(cs.status == STATUS_ERROR for cs in self.check_statuses):
            return STATUS_ERROR
        return STATUS_OK

    def has_error(self):
        """Return True when ``status`` is anything other than STATUS_OK."""
        return self.status != STATUS_OK

    def body_lines(self):
        """Return the report body: clocks, paths, hostnames, checks, emitters."""
        lines = []
        lines += self._clock_lines()
        lines += self._path_lines()
        # The hostnames section is a bare heading.
        lines += ['Hostnames', '=========', '']
        lines += self._check_lines()
        lines += self._emitter_lines()
        return lines

    @staticmethod
    def _resolve_paths():
        """Return (confd_path, checksd_path); 'Not found' replaces a missing one."""
        paths = util.Paths()
        try:
            confd_path = paths.get_confd_path()
        except util.PathNotFound:
            confd_path = 'Not found'
        try:
            checksd_path = paths.get_checksd_path()
        except util.PathNotFound:
            checksd_path = 'Not found'
        return confd_path, checksd_path

    @staticmethod
    def _clock_lines():
        """Return the 'Clocks' section: NTP offset and current UTC time."""
        lines = ['Clocks', '======', '']
        try:
            ntp_offset, ntp_styles = get_ntp_info()
            lines.append(' ' + style('NTP offset', *ntp_styles) + ': ' +
                         style('%s s' % round(ntp_offset, 4), *ntp_styles))
        except Exception as e:
            # The NTP lookup is best-effort; report the failure inline.
            lines.append(' NTP offset: Unkwown (%s)' % str(e))
        lines.append(' System UTC time: ' + str(datetime.datetime.utcnow()))
        lines.append('')
        return lines

    def _path_lines(self):
        """Return the 'Paths' section (conf.d and checks_d locations)."""
        confd_path, checksd_path = self._resolve_paths()
        return [
            'Paths',
            '=====',
            '',
            ' conf.d: ' + confd_path,
            ' checks_d: ' + checksd_path,
            '',
        ]

    @staticmethod
    def _instance_lines(s, verbose):
        """Return the lines describing one instance status ``s``."""
        c = 'green'
        if s.has_warnings():
            c = 'yellow'
        if s.has_error():
            c = 'red'
        line = " - instance #%s [%s]" % (s.instance_id, style(s.status, c))
        if s.has_error():
            line += u": %s" % s.error
        if s.metric_count is not None:
            line += " collected %s metrics" % s.metric_count
        out = [line]
        if s.has_warnings():
            # ``warnings`` may be None even when the status is WARNING.
            for warning in s.warnings or []:
                warn = warning.split('\n')
                out.append(u" %s: %s" % (style("Warning", 'yellow'), warn[0]))
                out.extend(u" %s" % l for l in warn[1:])
        if verbose and s.traceback is not None:
            out.extend(' ' + tb_line for tb_line in s.traceback.split('\n'))
        return out

    def _check_lines(self):
        """Return the 'Checks' section, one sub-block per check."""
        lines = ['Checks', '======', '']
        check_statuses = self.check_statuses + get_jmx_status()
        if not check_statuses:
            lines.append(" No checks have run yet.")
            return lines
        # ``verbose`` is normally set by print_latest_status(); default to a
        # terse report when it was not.
        verbose = getattr(self, 'verbose', False)
        for cs in check_statuses:
            check_lines = [
                ' ' + cs.name,
                ' ' + '-' * len(cs.name)
            ]
            if cs.init_failed_error:
                check_lines.append(" - initialize check class [%s]: %s" %
                                   (style(STATUS_ERROR, 'red'),
                                    repr(cs.init_failed_error)))
                if verbose and cs.init_failed_traceback:
                    check_lines.extend(' ' + line for line in
                                       cs.init_failed_traceback.split('\n'))
            else:
                for s in cs.instance_statuses:
                    check_lines.extend(self._instance_lines(s, verbose))
                check_lines.append(" - Collected %s metrics & %s events" % (
                    cs.metric_count, cs.event_count))
                if cs.library_versions is not None:
                    check_lines.append(" - Dependencies:")
                    for library, version in cs.library_versions.items():
                        check_lines.append(" - %s: %s" % (library, version))
                check_lines.append("")
            lines += check_lines
        return lines

    def _emitter_lines(self):
        """Return the 'Emitters' section, one line per emitter."""
        lines = [
            "",
            "Emitters",
            "========",
            ""
        ]
        if not self.emitter_statuses:
            lines.append(" No emitters have run yet.")
            return lines
        for es in self.emitter_statuses:
            c = 'red' if es.has_error() else 'green'
            line = " - %s [%s]" % (es.name, style(es.status, c))
            if es.status != STATUS_OK:
                line += ": %s" % es.error
            lines.append(line)
        return lines

    def to_dict(self):
        """Return the collector status as a plain dictionary."""
        status_info = AgentStatus.to_dict(self)

        # Hostnames
        status_info['hostnames'] = {}

        # Checks.d Status
        checks = status_info['checks'] = {}
        for cs in self.check_statuses + get_jmx_status():
            check_info = {'instances': {}}
            checks[cs.name] = check_info
            if cs.init_failed_error:
                check_info['init_failed'] = True
                check_info['traceback'] = cs.init_failed_traceback
            else:
                check_info['init_failed'] = False
                for s in cs.instance_statuses:
                    instance_info = {
                        'status': s.status,
                        'has_error': s.has_error(),
                        'has_warnings': s.has_warnings(),
                    }
                    if s.has_error():
                        instance_info['error'] = s.error
                    if s.has_warnings():
                        instance_info['warnings'] = s.warnings
                    check_info['instances'][s.instance_id] = instance_info
            check_info['metric_count'] = cs.metric_count
            check_info['event_count'] = cs.event_count

        # Emitter status
        status_info['emitter'] = []
        for es in self.emitter_statuses:
            check_status = {'name': es.name,
                            'status': es.status,
                            'has_error': es.has_error()}
            if es.has_error():
                check_status['error'] = es.error
            status_info['emitter'].append(check_status)

        # Same lookup (and same exception type) as the textual report.
        confd_path, checksd_path = self._resolve_paths()
        status_info['confd_path'] = confd_path
        status_info['checksd_path'] = checksd_path
        return status_info
class MonascaStatsdStatus(AgentStatus):
    """Status of the statsd daemon: flush, packet, metric and event counters."""

    NAME = 'Monasca_Statsd'

    def __init__(self, flush_count=0, packet_count=0,
                 packets_per_second=0, metric_count=0, event_count=0):
        AgentStatus.__init__(self)
        self.flush_count = flush_count
        self.packet_count = packet_count
        self.packets_per_second = packets_per_second
        self.metric_count = metric_count
        self.event_count = event_count

    def has_error(self):
        """Return True when nothing was flushed, received or counted."""
        counters = (self.flush_count, self.packet_count, self.metric_count)
        return all(counter == 0 for counter in counters)

    def body_lines(self):
        """Return one "<label>: <value>" line per counter."""
        rows = (
            ("Flush count", self.flush_count),
            ("Packet Count", self.packet_count),
            ("Packets per second", self.packets_per_second),
            ("Metric count", self.metric_count),
            ("Event count", self.event_count),
        )
        return ["%s: %s" % row for row in rows]

    def to_dict(self):
        """Return the base status fields plus every counter."""
        counters = {
            'flush_count': self.flush_count,
            'packet_count': self.packet_count,
            'packets_per_second': self.packets_per_second,
            'metric_count': self.metric_count,
            'event_count': self.event_count,
        }
        status_info = AgentStatus.to_dict(self)
        status_info.update(counters)
        return status_info
class ForwarderStatus(AgentStatus):
    """Status of the forwarder: queue statistics and transaction counters."""

    NAME = 'Forwarder'

    def __init__(self, queue_length=0, queue_size=0, flush_count=0, transactions_received=0,
                 transactions_flushed=0):
        AgentStatus.__init__(self)
        self.queue_length = queue_length
        self.queue_size = queue_size
        self.flush_count = flush_count
        self.transactions_received = transactions_received
        self.transactions_flushed = transactions_flushed

    def body_lines(self):
        """Return the counter lines followed by a one-line transaction verdict."""
        report = [
            "Queue Size: %s bytes" % self.queue_size,
            "Queue Length: %s" % self.queue_length,
            "Flush Count: %s" % self.flush_count,
            "Transactions received: %s" % self.transactions_received,
            "Transactions flushed: %s" % self.transactions_flushed
        ]
        flushed = self.transactions_flushed
        if flushed == 0:
            verdict = "[%s]: Unable to flush transactions\n %s" % (
                style(STATUS_ERROR, 'red'),
                "Please verify monasca-api is running as configured")
        elif flushed != self.transactions_received:
            verdict = "[%s]: Transactions out of sync\n %s" % (
                style(STATUS_WARNING, 'yellow'),
                "Likely contact interruption with monasca-api")
        else:
            verdict = "[%s]: Transactions up to date" % style(STATUS_OK, 'green')
        report.append(verdict)
        return report

    def has_error(self):
        """Return True when the forwarder has never flushed."""
        return self.flush_count == 0

    def to_dict(self):
        """Return the base status fields plus the flush and queue counters."""
        # NOTE(review): the transaction counters are rendered by body_lines()
        # but are not part of this dictionary -- confirm that is intended.
        queue_info = {
            'flush_count': self.flush_count,
            'queue_length': self.queue_length,
            'queue_size': self.queue_size,
        }
        status_info = AgentStatus.to_dict(self)
        status_info.update(queue_info)
        return status_info
def get_jmx_instance_status(instance_name, status, message, metric_count):
    """Build an ``InstanceStatus`` from one entry of a jmxfetch status file.

    Args:
        instance_name: name used as the instance id.
        status: status string read from the file.
        message: message attached to the entry; stored as the error for
            STATUS_ERROR and as the single warning for STATUS_WARNING.
        metric_count: number of metrics reported for the instance.

    Returns:
        An ``InstanceStatus``.  A status that is none of the known
        ``STATUS_*`` values is reported as an error that names the unexpected
        value, rather than failing with ``UnboundLocalError``.
    """
    if status == STATUS_ERROR:
        return InstanceStatus(
            instance_name, STATUS_ERROR, error=message, metric_count=metric_count)
    if status == STATUS_WARNING:
        return InstanceStatus(
            instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)
    if status == STATUS_OK:
        return InstanceStatus(instance_name, STATUS_OK, metric_count=metric_count)
    # Unknown or missing status: surface it instead of crashing the caller,
    # which would otherwise discard every JMX status.
    return InstanceStatus(
        instance_name, STATUS_ERROR,
        error="Unrecognized JMX instance status %r (message: %s)" % (status, message),
        metric_count=metric_count)
def _load_jmx_status_file(path):
    """Parse the YAML status file at ``path``, closing it afterwards."""
    with open(path) as status_file:
        # SECURITY NOTE: the documented file format relies on
        # ``!!python/object/apply`` tags, so the full (unsafe) loader is
        # required.  It can construct arbitrary objects; this is only
        # acceptable while the temp-directory files are written by the agent
        # itself.
        return yaml.load(status_file, Loader=yaml.Loader)


def get_jmx_status():
    """Read the two jmxfetch status files and convert them to check statuses.

    Both files are YAML files located in the temp directory:

    - One generated by the Agent itself, for jmx checks that can't be
      initialized because something is missing.
      Its format is as follows:

    ###
    invalid_checks:
    jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration [You need to have at
    least one instance defined in the YAML file for this check]
    timestamp: 1391040927.136523
    ###

    - One generated by jmxfetch that returns information about the collection
      of metrics. Its format is as follows:

    ###
    timestamp: 1391037347435
    checks:
    failed_checks:
    jmx:
    - {message: Unable to create instance. Please check your yaml file, status: ERROR}
    initialized_checks:
    tomcat:
    - {message: null, status: OK, metric_count: 7, instance_name: jmx-remihakim.fr-3000}
    ###

    Returns:
        A list of ``CheckStatus`` objects.  The list is empty when neither
        file exists or when anything goes wrong while reading them (the
        failure is logged).
    """
    java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
    python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s",
                  java_status_path, python_status_path)
        return []

    check_statuses = []
    try:
        if os.path.exists(java_status_path):
            java_jmx_stats = _load_jmx_status_file(java_status_path)
            # JMX timestamp is saved in milliseconds
            status_age = time.time() - java_jmx_stats.get('timestamp') / 1000.0
            jmx_checks = java_jmx_stats.get('checks') or {}
            if status_age > 60:
                check_statuses.append(
                    CheckStatus(
                        "jmx", [
                            InstanceStatus(
                                0, STATUS_ERROR,
                                error="JMXfetch didn't return any metrics during the last minute")],
                        0, 0))
            else:
                # check name -> {'statuses': [...], 'metric_count': [...]}
                check_data = collections.defaultdict(lambda: collections.defaultdict(list))
                # Failed checks first, then initialized ones; both sections
                # share the same entry layout.
                for section in ('failed_checks', 'initialized_checks'):
                    for check_name, instances in (jmx_checks.get(section) or {}).items():
                        for info in instances:
                            metric_count = info.get('metric_count', 0)
                            check_data[check_name]['statuses'].append(
                                get_jmx_instance_status(
                                    info.get('instance_name', None),
                                    info.get('status'),
                                    info.get('message', None),
                                    metric_count))
                            check_data[check_name]['metric_count'].append(metric_count)
                for check_name, data in check_data.items():
                    check_statuses.append(CheckStatus(
                        check_name, data['statuses'], sum(data['metric_count']), 0))

        if os.path.exists(python_status_path):
            python_jmx_stats = _load_jmx_status_file(python_status_path)
            invalid_checks = python_jmx_stats.get('invalid_checks') or {}
            for check_name, excep in invalid_checks.items():
                check_statuses.append(
                    CheckStatus(check_name, [], 0, 0, init_failed_error=excep))
        return check_statuses
    except Exception:
        # Status reporting is best-effort: never let a bad file break it.
        log.exception("Couldn't load latest jmx status")
        return []