Adjusted whitespace for pep8 compliance

parent 9014f20a6d
commit 194c3625dd

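Not part of the original commit: a minimal before/after sketch, with hypothetical names, of the PEP 8 conventions the hunks below apply — no spaces around "=" in defaults (E251), one statement per line (E701), single-"#" comments (E262), and long lines wrapped under 80 columns (E501).

## E262: double-hash comment style, replaced below with a single "#"
def get_result_before(results, idx = 0, timeout = None):
    if timeout is None: timeout = 10
    return results.get(idx, "result %s not ready after %ss" % (idx, timeout))

# After the cleanup: compliant defaults, one statement per line, and
# the long call wrapped with an implicit line continuation.
def get_result_after(results, idx=0, timeout=None):
    if timeout is None:
        timeout = 10
    return results.get(idx,
                       "result %s not ready after %ss" % (idx, timeout))
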
@@ -24,6 +24,7 @@ FLUSH_LOGGING_INITIAL = 5
class Collector(object):
"""
The collector is responsible for collecting data from each check and
passing it along to the emitters, who send it to their final destination.
@@ -43,12 +44,13 @@ class Collector(object):
self._checks = []
self._legacy_checks = [
# todo dogstreams should be removed or moved over to a standard output type
# Dogstreams.init(log, self.agent_config) # dogstreams
# todo dogstreams should be removed or moved over to a standard output type
# Dogstreams.init(log, self.agent_config) # dogstreams
]
# add system checks
# todo all these (legacy and system) should be moved to the newer AgentCheck class rather than check
# todo all these (legacy and system) should be moved to the newer
# AgentCheck class rather than check
if self.os == 'windows':
legacy_checks = [w32.Disk(log),
w32.IO(log),
@@ -66,8 +68,10 @@ class Collector(object):
self._legacy_checks.extend(legacy_checks)
if checksd:
self.initialized_checks_d = checksd['initialized_checks'] # is of type {check_name: check}
self.init_failed_checks_d = checksd['init_failed_checks'] # is of type {check_name: {error, traceback}}
# is of type {check_name: check}
self.initialized_checks_d = checksd['initialized_checks']
# is of type {check_name: {error, traceback}}
self.init_failed_checks_d = checksd['init_failed_checks']
def _emit(self, payload):
""" Send the payload via the emitter. """
@@ -94,7 +98,8 @@ class Collector(object):
log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
(self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
if self.run_count == FLUSH_LOGGING_INITIAL:
log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
log.info("First flushes done, next flushes will be logged every %s flushes." %
FLUSH_LOGGING_PERIOD)
else:
log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
@@ -110,12 +115,12 @@ class Collector(object):
metrics['monagent.collector.collection.time'] = collection_time
if collection_time > MAX_COLLECTION_TIME:
log.info("Collection time (s) is high: %.1f, metrics count: %d, events count: %d" %
(collection_time, num_metrics, num_events))
(collection_time, num_metrics, num_events))
metrics['monagent.collector.emit.time'] = emit_time
if emit_time is not None and emit_time > MAX_EMIT_TIME:
log.info("Emit time (s) is high: %.1f, metrics count: %d, events count: %d" %
(emit_time, num_metrics, num_events))
(emit_time, num_metrics, num_events))
return metrics

@@ -33,6 +33,7 @@ class EventDefaults(object):
class Dogstreams(object):
@classmethod
def init(cls, logger, config):
dogstreams_config = config.get('dogstreams', None)
@@ -87,7 +88,8 @@ class Dogstreams(object):
try:
result = dogstream.check(agentConfig, move_end)
# result may contain {"dogstream": [new]}.
# If output contains {"dogstream": [old]}, that old value will get concatenated with the new value
# If output contains {"dogstream": [old]}, that old value will get
# concatenated with the new value
assert type(result) == type(output), "dogstream.check must return a dictionary"
for k in result:
if k in output:
@@ -128,7 +130,8 @@ class Dogstream(object):
parser_spec,
os.environ.get('PYTHONPATH', ''))
)
logger.info("dogstream: parsing %s with %s (requested %s)" % (log_path, parse_func, parser_spec))
logger.info("dogstream: parsing %s with %s (requested %s)" %
(log_path, parse_func, parser_spec))
else:
logger.info("dogstream: parsing %s with default parser" % log_path)
@@ -221,7 +224,8 @@ class Dogstream(object):
# FIXME when the backend treats those as true synonyms, we can
# deprecate event_object.
if 'event_object' in datum or 'aggregation_key' in datum:
datum['aggregation_key'] = datum.get('event_object', datum.get('aggregation_key'))
datum['aggregation_key'] = datum.get(
'event_object', datum.get('aggregation_key'))
else:
datum['aggregation_key'] = EventDefaults.EVENT_OBJECT
datum['event_object'] = datum['aggregation_key']
@@ -511,4 +515,4 @@ class NagiosServicePerfData(NagiosPerfData):
return metric
if __name__ == '__main__':
testddForwarder()
testddForwarder()

@@ -1,4 +1,4 @@
## {{{ http://code.activestate.com/recipes/576519/ (r9)
# {{{ http://code.activestate.com/recipes/576519/ (r9)
# Author: David Decotigny, Oct 1 2008
# @brief Pool of threads similar to multiprocessing.Pool
# See http://docs.python.org/dev/library/multiprocessing.html
@@ -26,7 +26,7 @@ import threading
import traceback
## Item pushed on the work queue to tell the worker threads to terminate
# Item pushed on the work queue to tell the worker threads to terminate
SENTINEL = "QUIT"
@@ -37,12 +37,15 @@ def is_sentinel(obj):
class TimeoutError(Exception):
"""Raised when a result is not available within the given timeout"""
pass
class PoolWorker(threading.Thread):
"""Thread that consumes WorkUnits from a queue to process them"""
def __init__(self, workq, *args, **kwds):
"""\param workq: Queue object to consume the work units from"""
threading.Thread.__init__(self, *args, **kwds)
@@ -64,6 +67,7 @@ class PoolWorker(threading.Thread):
class Pool(object):
"""
The Pool class represents a pool of worker threads. It has methods
which allows tasks to be offloaded to the worker processes in a
@@ -75,8 +79,8 @@ class Pool(object):
\param nworkers (integer) number of worker threads to start
\param name (string) prefix for the worker threads' name
"""
self._workq = Queue.Queue()
self._closed = False
self._workq = Queue.Queue()
self._closed = False
self._workers = []
for idx in xrange(nworkers):
thr = PoolWorker(self._workq, name="Worker-%s-%d" % (name, idx))
@@ -95,7 +99,8 @@ class Pool(object):
def apply(self, func, args=(), kwds=None):
"""Equivalent of the apply() builtin function. It blocks till
the result is ready."""
if not kwds: kwds = dict()
if not kwds:
kwds = dict()
return self.apply_async(func, args, kwds).get()
def map(self, func, iterable, chunksize=None):
@@ -134,7 +139,7 @@ class Pool(object):
collector = UnorderedResultCollector()
self._create_sequences(func, iterable, chunksize, collector)
return iter(collector)
def apply_async(self, func, args=(), kwds=None, callback=None):
"""A variant of the apply() method which returns an
ApplyResult object.
@@ -144,8 +149,9 @@ class Pool(object):
callback is applied to it (unless the call failed). callback
should complete immediately since otherwise the thread which
handles the results will get blocked."""
if not kwds: kwds = dict()
assert not self._closed # No lock here. We assume it's atomic...
if not kwds:
kwds = dict()
assert not self._closed # No lock here. We assume it's atomic...
apply_result = ApplyResult(callback=callback)
job = Job(func, args, kwds, apply_result)
self._workq.put(job)
@@ -161,7 +167,7 @@ class Pool(object):
should complete immediately since otherwise the thread which
handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = OrderedResultCollector(apply_result, as_iterator=False)
collector = OrderedResultCollector(apply_result, as_iterator=False)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@@ -176,7 +182,7 @@ class Pool(object):
failed). callback should complete immediately since otherwise
the thread which handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = OrderedResultCollector(apply_result, as_iterator=True)
collector = OrderedResultCollector(apply_result, as_iterator=True)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@@ -192,7 +198,7 @@ class Pool(object):
failed). callback should complete immediately since otherwise
the thread which handles the results will get blocked."""
apply_result = ApplyResult(callback=callback)
collector = UnorderedResultCollector(apply_result)
collector = UnorderedResultCollector(apply_result)
self._create_sequences(func, iterable, chunksize, collector)
return apply_result
@@ -227,7 +233,7 @@ class Pool(object):
for thr in self._workers:
thr.join()
def _create_sequences(self, func, iterable, chunksize, collector = None):
def _create_sequences(self, func, iterable, chunksize, collector=None):
"""
Create the WorkUnit objects to process and pushes them on the
work queue. Each work unit is meant to process a slice of
@@ -238,9 +244,9 @@ class Pool(object):
\return the list of WorkUnit objects (basically: JobSequences)
pushed onto the work queue
"""
assert not self._closed # No lock here. We assume it's atomic...
assert not self._closed # No lock here. We assume it's atomic...
sequences = []
results = []
results = []
it_ = iter(iterable)
exit_loop = False
while not exit_loop:
@@ -264,15 +270,19 @@ class Pool(object):
class WorkUnit(object):
"""ABC for a unit of work submitted to the worker threads. It's
basically just an object equipped with a process() method"""
def process(self):
"""Do the work. Shouldn't raise any exception"""
raise NotImplementedError("Children must override Process")
class Job(WorkUnit):
"""A work unit that corresponds to the execution of a single function"""
def __init__(self, func, args, kwds, apply_result):
"""
\param func/args/kwds used to call the function
@@ -280,9 +290,9 @@ class Job(WorkUnit):
of the function call
"""
WorkUnit.__init__(self)
self._func = func
self._args = args
self._kwds = kwds
self._func = func
self._args = args
self._kwds = kwds
self._result = apply_result
def process(self):
@@ -300,8 +310,10 @@ class Job(WorkUnit):
class JobSequence(WorkUnit):
"""A work unit that corresponds to the processing of a continuous
sequence of Job objects"""
def __init__(self, jobs):
WorkUnit.__init__(self)
self._jobs = jobs
@@ -315,6 +327,7 @@ class JobSequence(WorkUnit):
class ApplyResult(object):
"""An object associated with a Job object that holds its result:
it's available during the whole life the Job and after, even when
the Job didn't process yet. It's possible to use this object to
@@ -322,6 +335,7 @@ class ApplyResult(object):
The result objects returns by the Pool::*_async() methods are of
this type"""
def __init__(self, collector=None, callback=None):
"""
\param collector when not None, the notify_ready() method of
@@ -331,11 +345,11 @@ class ApplyResult(object):
result becomes available (this is the paramater passed to the
Pool::*_async() methods.
"""
self._success = False
self._event = threading.Event()
self._data = None
self._success = False
self._event = threading.Event()
self._data = None
self._collector = None
self._callback = callback
self._callback = callback
if collector is not None:
collector.register_result(self)
@@ -354,7 +368,7 @@ class ApplyResult(object):
return self._data
raise self._data[0], self._data[1], self._data[2]
def wait(self, timeout = None):
def wait(self, timeout=None):
"""Waits until the result is available or until timeout
seconds pass."""
self._event.wait(timeout)
@@ -377,7 +391,7 @@ class ApplyResult(object):
ready and successful. The collector's notify_ready() method
will be called, and the callback method too"""
assert not self.ready()
self._data = value
self._data = value
self._success = True
self._event.set()
if self._collector is not None:
@@ -394,7 +408,7 @@ class ApplyResult(object):
ready but not successful. The collector's notify_ready()
method will be called, but NOT the callback method"""
assert not self.ready()
self._data = sys.exc_info()
self._data = sys.exc_info()
self._success = False
self._event.set()
if self._collector is not None:
@@ -402,6 +416,7 @@ class ApplyResult(object):
class AbstractResultCollector(object):
"""ABC to define the interface of a ResultCollector object. It is
basically an object which knows whuich results it's waiting for,
and which is able to get notify when they get available. It is
@@ -414,7 +429,7 @@ class AbstractResultCollector(object):
results we're waiting for become available. Can be None.
"""
self._to_notify = to_notify
def register_result(self, apply_result):
"""Used to identify which results we're waiting for. Will
always be called BEFORE the Jobs get submitted to the work
@@ -433,7 +448,7 @@ class AbstractResultCollector(object):
"""
raise NotImplementedError("Children classes must implement it")
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another (order defined by the
implementation)
@@ -450,19 +465,21 @@ class AbstractResultCollector(object):
class CollectorIterator(object):
"""An iterator that allows to iterate over the result values
available in the given collector object. Equipped with an extended
next() method accepting a timeout argument. Created by the
AbstractResultCollector::__iter__() method"""
def __init__(self, collector):
"""\param AbstractResultCollector instance"""
self._collector = collector
self._idx = 0
self._idx = 0
def __iter__(self):
return self
def next(self, timeout = None):
def next(self, timeout=None):
"""Return the next result value in the sequence. Raise
StopIteration at the end. Can raise the exception raised by
the Job"""
@@ -481,20 +498,21 @@ class CollectorIterator(object):
class UnorderedResultCollector(AbstractResultCollector):
"""An AbstractResultCollector implementation that collects the
values of the ApplyResult objects in the order they become ready. The
CollectorIterator object returned by __iter__() will iterate over
them in the order they become ready"""
def __init__(self, to_notify = None):
def __init__(self, to_notify=None):
"""
\param to_notify ApplyResult object to notify when all the
results we're waiting for become available. Can be None.
"""
AbstractResultCollector.__init__(self, to_notify)
self._cond = threading.Condition()
self._cond = threading.Condition()
self._collection = []
self._expected = 0
self._expected = 0
def register_result(self, apply_result):
"""Used to identify which results we're waiting for. Will
@@ -505,7 +523,7 @@ class UnorderedResultCollector(AbstractResultCollector):
"""
self._expected += 1
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another, in the order the results have
become available.
@@ -549,18 +567,19 @@ class UnorderedResultCollector(AbstractResultCollector):
self._cond.notifyAll()
finally:
self._cond.release()
if first_item and self._to_notify is not None:
self._to_notify._set_value(iter(self))
class OrderedResultCollector(AbstractResultCollector):
"""An AbstractResultCollector implementation that collects the
values of the ApplyResult objects in the order they have been
submitted. The CollectorIterator object returned by __iter__()
will iterate over them in the order they have been submitted"""
def __init__(self, to_notify = None, as_iterator = True):
def __init__(self, to_notify=None, as_iterator=True):
"""
\param to_notify ApplyResult object to notify when all the
results we're waiting for become available. Can be None.
@@ -570,9 +589,9 @@ class OrderedResultCollector(AbstractResultCollector):
result arrived)
"""
AbstractResultCollector.__init__(self, to_notify)
self._results = []
self._lock = threading.Lock()
self._remaining = 0
self._results = []
self._lock = threading.Lock()
self._remaining = 0
self._as_iterator = as_iterator
def register_result(self, apply_result):
@@ -585,7 +604,7 @@ class OrderedResultCollector(AbstractResultCollector):
self._results.append(apply_result)
self._remaining += 1
def _get_result(self, idx, timeout = None):
def _get_result(self, idx, timeout=None):
"""Called by the CollectorIterator object to retrieve the
result's values one after another (order defined by the
implementation)
@@ -606,13 +625,13 @@ class OrderedResultCollector(AbstractResultCollector):
has been processed
"""
got_first = False
got_last = False
got_last = False
self._lock.acquire()
try:
assert self._remaining > 0
got_first = (len(self._results) == self._remaining)
self._remaining -= 1
got_last = (self._remaining == 0)
got_last = (self._remaining == 0)
finally:
self._lock.release()
@@ -630,18 +649,19 @@ class OrderedResultCollector(AbstractResultCollector):
def _test():
"""Some tests"""
import thread, time
import thread
import time
def f(x):
return x*x
return x * x
def work(seconds):
print "[%d] Start to work for %fs..." % (thread.get_ident(), seconds)
time.sleep(seconds)
print "[%d] Work done (%fs)." % (thread.get_ident(), seconds)
return "%d slept %fs" % (thread.get_ident(), seconds)
### Test copy/pasted from multiprocessing
# Test copy/pasted from multiprocessing
pool = Pool(9) # start 4 worker threads
result = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously
@@ -749,4 +769,4 @@ def _test():
if __name__ == "__main__":
_test()
## end of http://code.activestate.com/recipes/576519/ }}}
# end of http://code.activestate.com/recipes/576519/ }}}

@@ -25,16 +25,16 @@ class ServicesCheck(AgentCheck):
This class should never be directly instanciated.
Work flow:
The main agent loop will call the check function for each instance for
The main agent loop will call the check function for each instance for
each iteration of the loop.
The check method will make an asynchronous call to the _process method in
The check method will make an asynchronous call to the _process method in
one of the thread initiated in the thread pool created in this class constructor.
The _process method will call the _check method of the inherited class
which will perform the actual check.
The _check method must return a tuple which first element is either
Status.UP or Status.DOWN.
The second element is a short error message that will be displayed
The second element is a short error message that will be displayed
when the service turns down.
"""
@@ -81,9 +81,11 @@ class ServicesCheck(AgentCheck):
def check(self, instance):
if not self.pool_started:
self.start_pool()
# On Windows the agent runs on multiple threads so we need to have an offset of 5 in case the pool_size is 1
# On Windows the agent runs on multiple threads so we need to have an
# offset of 5 in case the pool_size is 1
if threading.activeCount() > 5 * self.pool_size + 5:
raise Exception("Thread number (%s) is exploding. Skipping this check" % threading.activeCount())
raise Exception("Thread number (%s) is exploding. Skipping this check" %
threading.activeCount())
self._process_results()
self._clean()
name = instance.get('name', None)
@@ -91,7 +93,7 @@ class ServicesCheck(AgentCheck):
self.log.error('Each service check must have a name')
return
if name not in self.jobs_status:
if name not in self.jobs_status:
# A given instance should be processed one at a time
self.jobs_status[name] = time.time()
self.pool.apply_async(self._process, args=(instance,))
@@ -175,5 +177,3 @@ class ServicesCheck(AgentCheck):
if now - start_time > TIMEOUT:
self.log.critical("Restarting Pool. One check is stuck.")
self.restart_pool()

@@ -22,6 +22,7 @@ to_float = lambda s: float(s.replace(",", "."))
class Disk(Check):
""" Collects metrics about the machine's disks. """
def check(self):
@@ -67,7 +68,7 @@ class Disk(Check):
def parse_df_output(self, df_output, platform_name, inodes=False, use_mount=False, blacklist_re=None):
"""
Parse the output of the df command. If use_volume is true the volume
is used to anchor the metric, otherwise false the mount
is used to anchor the metric, otherwise false the mount
point is used. Returns a tuple of (disk, inode).
"""
usage_data = {}
@@ -181,7 +182,7 @@ class Disk(Check):
if blacklist_re and blacklist_re.match(device[0]):
return False
return True
devices = filter(keep_device, flattened_devices)
return devices
@@ -233,7 +234,7 @@ class IO(Check):
io_stats[device][self.xlate(header_name, "linux")] = values[header_index]
return io_stats
@staticmethod
def _parse_darwin(output):
lines = [l.split() for l in output.split("\n") if len(l) > 0]
@@ -243,10 +244,10 @@ class IO(Check):
for idx, disk in enumerate(disks):
kb_t, tps, mb_s = map(float, lastline[(3 * idx):(3 * idx) + 3]) # 3 cols at a time
io[disk] = {
'system.io.bytes_per_s': mb_s * 10**6,
'system.io.bytes_per_s': mb_s * 10 ** 6,
}
return io
@staticmethod
def xlate(metric_name, os_name):
"""Standardize on linux metric names"""
@@ -284,7 +285,7 @@ class IO(Check):
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# Linux 2.6.32-343-ec2 (ip-10-35-95-10) 12/11/2012 _x86_64_ (2 CPU)
# Linux 2.6.32-343-ec2 (ip-10-35-95-10) 12/11/2012 _x86_64_ (2 CPU)
#
# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util
# sda1 0.00 17.61 0.26 32.63 4.23 201.04 12.48 0.16 4.81 0.53 1.73
@@ -298,7 +299,8 @@ class IO(Check):
# sdb 0.00 0.00 0.00 2.97 0.00 11.88 8.00 0.00 0.00 0.00 0.00
# sdg 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# sdf 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# 0.00 0.00 0.00 0.00
io.update(self._parse_linux2(stdout))
elif sys.platform == "sunos5":
@@ -315,12 +317,12 @@ class IO(Check):
# device r/s w/s kr/s kw/s wait actv svc_t %w %b
# ramdisk1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0
# sd0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0
# sd1 0.0 139.0 0.0 1850.6 0.0 0.0 0.1 0 1
# sd1 0.0 139.0 0.0 1850.6 0.0 0.0 0.1 0 1
# discard the first half of the display (stats since boot)
lines = [l for l in iostat.split("\n") if len(l) > 0]
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
assert "extended device statistics" in lines[0]
headers = lines[1].split()
assert "device" in headers
@@ -331,25 +333,25 @@ class IO(Check):
io[cols[0]] = {}
for i in range(1, len(cols)):
io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]
elif sys.platform.startswith("freebsd"):
iostat = sp.Popen(["iostat", "-x", "-d", "1", "2"],
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# Be careful!
# Be careful!
# It looks like SunOS, but some columms (wait, svc_t) have different meaning
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# ad0 3.1 1.3 49.9 18.8 0 0.7 0
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# extended device statistics
# device r/s w/s kr/s kw/s wait svc_t %b
# ad0 0.0 2.0 0.0 31.8 0 0.2 0
# discard the first half of the display (stats since boot)
lines = [l for l in iostat.split("\n") if len(l) > 0]
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
assert "extended device statistics" in lines[0]
headers = lines[1].split()
assert "device" in headers
@@ -361,12 +363,12 @@ class IO(Check):
for i in range(1, len(cols)):
io[cols[0]][self.xlate(headers[i], "freebsd")] = cols[i]
elif sys.platform == 'darwin':
iostat = sp.Popen(['iostat', '-d', '-c', '2', '-w', '1'],
iostat = sp.Popen(['iostat', '-d', '-c', '2', '-w', '1'],
stdout=sp.PIPE,
close_fds=True).communicate()[0]
# disk0 disk1 <-- number of disks
# KB/t tps MB/s KB/t tps MB/s
# 21.11 23 0.47 20.01 0 0.00
# KB/t tps MB/s KB/t tps MB/s
# 21.11 23 0.47 20.01 0 0.00
# 6.67 3 0.02 0.00 0 0.00 <-- line of interest
io = self._parse_darwin(iostat)
else:
@@ -388,7 +390,8 @@ class IO(Check):
measurements = []
timestamp = time.time()
for dev_name, stats in filtered_io.iteritems():
filtered_stats = {stat: stats[stat] for stat in stats.iterkeys() if stat not in self.stat_blacklist}
filtered_stats = {stat: stats[stat]
for stat in stats.iterkeys() if stat not in self.stat_blacklist}
m_list = [Measurement(key, timestamp, value, {'device': dev_name})
for key, value in filtered_stats.iteritems()]
measurements.extend(m_list)
@@ -401,7 +404,7 @@ class IO(Check):
class Load(Check):
def check(self):
if Platform.is_linux():
try:
@@ -411,9 +414,9 @@ class Load(Check):
except Exception:
self.logger.exception('Cannot extract load')
return {}
uptime = uptime[0] # readlines() provides a list but we want a string
elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
# Get output from uptime
try:
@@ -423,7 +426,7 @@ class Load(Check):
except Exception:
self.logger.exception('Cannot extract load')
return {}
# Split out the 3 load average values
load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]
return {'load_avg_1_min': float(load[0]),
@@ -433,13 +436,14 @@ class Load(Check):
class Memory(Check):
def __init__(self, logger):
Check.__init__(self, logger)
macV = None
if sys.platform == 'darwin':
macV = platform.mac_ver()
macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))
# Output from top is slightly modified on OS X 10.6 (case #28239) and greater
if macV and (macV_minor_version >= 6):
self.topIndex = 6
@@ -456,7 +460,7 @@ class Memory(Check):
except Exception:
# No page size available
pass
def check(self):
if Platform.is_linux():
try:
@@ -466,7 +470,7 @@ class Memory(Check):
except Exception:
self.logger.exception('Cannot get memory metrics from /proc/meminfo')
return {}
# $ cat /proc/meminfo
# MemTotal: 7995360 kB
# MemFree: 1045120 kB
@@ -509,8 +513,9 @@ class Memory(Check):
# Hugepagesize: 2048 kB
# DirectMap4k: 10112 kB
# DirectMap2M: 8243200 kB
regexp = re.compile(r'^(\w+):\s+([0-9]+)') # We run this several times so one-time compile now
# We run this several times so one-time compile now
regexp = re.compile(r'^(\w+):\s+([0-9]+)')
meminfo = {}
for line in lines:
@@ -520,7 +525,7 @@ class Memory(Check):
meminfo[match.group(1)] = match.group(2)
except Exception:
self.logger.exception("Cannot parse /proc/meminfo")
memData = {}
# Physical memory
@@ -534,43 +539,48 @@ class Memory(Check):
memData['mem_usable_perc'] = memData['mem_total_mb'] - memData['mem_free_mb']
# Usable is relative since cached and buffers are actually used to speed things up.
memData['mem_usable_mb'] = memData['mem_free_mb'] + memData['memphysBuffers'] + memData['memphysCached']
memData['mem_usable_mb'] = memData['mem_free_mb'] + \
memData['memphysBuffers'] + memData['memphysCached']
if memData['mem_total_mb'] > 0:
memData['mem_usable_perc'] = float(memData['mem_usable_mb']) / float(memData['mem_total_mb'])
memData['mem_usable_perc'] = float(
memData['mem_usable_mb']) / float(memData['mem_total_mb'])
except Exception:
self.logger.exception('Cannot compute stats from /proc/meminfo')
# Swap
# FIXME units are in MB, we should use bytes instead
try:
memData['mem_swap_total_mb'] = int(meminfo.get('SwapTotal', 0)) / 1024
memData['mem_swap_free_mb'] = int(meminfo.get('SwapFree', 0)) / 1024
memData['mem_swap_used_mb'] = memData['mem_swap_total_mb'] - memData['mem_swap_free_mb']
memData['mem_swap_used_mb'] = memData[
'mem_swap_total_mb'] - memData['mem_swap_free_mb']
if memData['mem_swap_total_mb'] > 0:
memData['mem_swap_free_perc'] = float(memData['mem_swap_free_mb']) / float(memData['mem_swap_total_mb'])
memData['mem_swap_free_perc'] = float(
memData['mem_swap_free_mb']) / float(memData['mem_swap_total_mb'])
except Exception:
self.logger.exception('Cannot compute swap stats')
return memData
return memData
elif sys.platform == 'darwin':
macV = platform.mac_ver()
macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))
try:
top = sp.Popen(['top', '-l 1'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except StandardError:
self.logger.exception('getMemoryUsage')
return {}
# Deal with top
lines = top.split('\n')
physParts = re.findall(r'([0-9]\d+)', lines[self.topIndex])
# Deal with sysctl
swapParts = re.findall(r'([0-9]+\.\d+)', sysctl)
@@ -585,10 +595,11 @@ class Memory(Check):
'physFree': physParts[physFreePartIndex],
'swapUsed': swapParts[1],
'swapFree': swapParts[2]}
elif sys.platform.startswith("freebsd"):
try:
sysctl = sp.Popen(['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except Exception:
self.logger.exception('getMemoryUsage')
return {}
@@ -638,13 +649,15 @@ class Memory(Check):
pageSize) / 1048576
if memData['physTotal'] > 0:
memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
memData['physPctUsable'] = float(
memData['physUsable']) / float(memData['physTotal'])
except Exception:
self.logger.exception('Cannot compute stats from /proc/meminfo')
# Swap
try:
sysctl = sp.Popen(['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
sysctl = sp.Popen(
['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
except Exception:
self.logger.exception('getMemoryUsage')
return {}
@@ -669,7 +682,7 @@ class Memory(Check):
memData['swapUsed'] += int(line[2])
except Exception:
self.logger.exception('Cannot compute stats from swapinfo')
return memData
elif sys.platform == 'sunos5':
try:
@@ -694,14 +707,15 @@ class Memory(Check):
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime 16787393.9439095
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap 91828224 <--
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap 1073741824 <--
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c
# memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename
# 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c
# turn memory_cap:360:zone_name:key value
# into { "key": value, ...}
kv = [l.strip().split() for l in kmem.split("\n") if len(l) > 0]
entries = dict([(k.split(":")[-1], v) for (k, v) in kv])
# extract rss, physcap, swap, swapcap, turn into MB
convert = lambda v: int(long(v))/2**20
convert = lambda v: int(long(v)) / 2 ** 20
memData["physTotal"] = convert(entries["physcap"])
memData["physUsed"] = convert(entries["rss"])
memData["physFree"] = memData["physTotal"] - memData["physUsed"]
@@ -710,7 +724,8 @@ class Memory(Check):
memData["swapFree"] = memData["swapTotal"] - memData["swapUsed"]
if memData['swapTotal'] > 0:
memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
memData['swapPctFree'] = float(
memData['swapFree']) / float(memData['swapTotal'])
return memData
except Exception:
self.logger.exception("Cannot compute mem stats from kstat -c zone_memory_cap")
@@ -778,7 +793,7 @@ class Cpu(Check):
data = avg[0].split()
# Userland
# Debian lenny says %user so we look for both
# Debian lenny says %user so we look for both
# One of them will be 0
cpu_metrics = {"%usr": None, "%user": None, "%nice": None,
"%iowait": None, "%idle": None, "%sys": None,
@@ -798,16 +813,17 @@ class Cpu(Check):
return format_results(cpu_user,
cpu_system,
cpu_wait,
cpu_wait,
cpu_idle,
cpu_stolen)
else:
return {}
elif sys.platform == 'darwin':
# generate 3 seconds of data
# [' disk0 disk1 cpu load average', ' KB/t tps MB/s KB/t tps MB/s us sy id 1m 5m 15m', ' 21.23 13 0.27 17.85 7 0.13 14 7 79 1.04 1.27 1.31', ' 4.00 3 0.01 5.00 8 0.04 12 10 78 1.04 1.27 1.31', '']
iostats = sp.Popen(['iostat', '-C', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
iostats = sp.Popen(
['iostat', '-C', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in iostats.split("\n") if len(l) > 0]
legend = [l for l in lines if "us" in l]
if len(legend) == 1:
@@ -830,7 +846,8 @@ class Cpu(Check):
# tin tout KB/t tps MB/s KB/t tps MB/s KB/t tps MB/s us ni sy in id
# 0 69 26.71 0 0.01 0.00 0 0.00 0.00 0 0.00 2 0 0 1 97
# 0 78 0.00 0 0.00 0.00 0 0.00 0.00 0 0.00 0 0 0 0 100
iostats = sp.Popen(['iostat', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
iostats = sp.Popen(
['iostat', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in iostats.split("\n") if len(l) > 0]
legend = [l for l in lines if "us" in l]
if len(legend) == 1:
@@ -862,10 +879,11 @@ class Cpu(Check):
#
# Will aggregate over all processor sets
try:
mpstat = sp.Popen(['mpstat', '-aq', '1', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
mpstat = sp.Popen(
['mpstat', '-aq', '1', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
lines = [l for l in mpstat.split("\n") if len(l) > 0]
# discard the first len(lines)/2 lines
lines = lines[len(lines)/2:]
lines = lines[len(lines) / 2:]
legend = [l for l in lines if "SET" in l]
assert len(legend) == 1
if len(legend) == 1:
@@ -879,7 +897,7 @@ class Cpu(Check):
idle = [get_value(headers, l.split(), "idl") for l in d_lines]
size = [get_value(headers, l.split(), "sze") for l in d_lines]
count = sum(size)
rel_size = [s/count for s in size]
rel_size = [s / count for s in size]
dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2))
return format_results(dot(user, rel_size),
dot(kern, rel_size),
@@ -929,5 +947,3 @@ if __name__ == '__main__':
print(mem.check(config))
print("\n\n\n")
time.sleep(1)

@@ -16,6 +16,7 @@ KB2MB = B2KB = float(1024)
class Processes(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.gauge('system.proc.queue_length')
@@ -44,6 +45,7 @@ class Processes(Check):
class Memory(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@@ -83,6 +85,7 @@ class Memory(Check):
class Cpu(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@@ -140,6 +143,7 @@ class Cpu(Check):
class Network(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@@ -166,6 +170,7 @@ class Network(Check):
class Disk(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger
@@ -199,6 +204,7 @@ class Disk(Check):
class IO(Check):
def __init__(self, logger):
Check.__init__(self, logger)
self.logger = logger

@@ -41,11 +41,11 @@ class TailFile(object):
self._crc = None
self._log = logger
self._callback = callback
def _open_file(self, move_end=False, pos=False):
already_open = False
#close and reopen to handle logrotate
# close and reopen to handle logrotate
if self._f is not None:
self._f.close()
self._f = None

@@ -6,6 +6,7 @@ from monagent.collector.checks.utils import add_basic_auth
class Apache(AgentCheck):
"""Tracks basic connection/requests/workers metrics
See http://httpd.apache.org/docs/2.2/mod/mod_status.html for more details
@@ -70,12 +71,12 @@ class Apache(AgentCheck):
if metric_count == 0:
if self.assumed_url.get(instance['apache_status_url'], None) is None and url[-5:] != '?auto':
self.assumed_url[instance['apache_status_url']]= '%s?auto' % url
self.assumed_url[instance['apache_status_url']] = '%s?auto' % url
self.warning("Assuming url was not correct. Trying to add ?auto suffix to the url")
self.check(instance)
else:
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance['apache_status_url'])
raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance[
'apache_status_url'])
@staticmethod
def parse_agent_config(agentConfig):

@@ -27,6 +27,7 @@ CACTI_TO_DD = {
class Cacti(AgentCheck):
def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.last_ts = {}
@@ -41,10 +42,10 @@ class Cacti(AgentCheck):
except AttributeError:
version = "Unknown"
return {"rrdtool": version}
return {"rrdtool": version}
def check(self, instance):
# Load the instance config
config = self._get_config(instance)
@@ -52,13 +53,15 @@ class Cacti(AgentCheck):
try:
import rrdtool
except ImportError, e:
raise Exception("Cannot import rrdtool module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
raise Exception(
"Cannot import rrdtool module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
# Try importing MySQL
try:
import MySQLdb
except ImportError, e:
raise Exception("Cannot import MySQLdb module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
raise Exception(
"Cannot import MySQLdb module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/cacti")
connection = MySQLdb.connect(config.host, config.user, config.password, config.db)
@@ -132,7 +135,7 @@ class Cacti(AgentCheck):
return metric_count
# Find the consolidation functions for the RRD metrics
c_funcs = set([v for k,v in info.items() if k.endswith('.cf')])
c_funcs = set([v for k, v in info.items() if k.endswith('.cf')])
for c in list(c_funcs):
last_ts_key = '%s.%s' % (rrd_path, c)
@@ -165,7 +168,7 @@ class Cacti(AgentCheck):
# Save this metric as a gauge
val = self._transform_metric(m_name, p[k])
self.gauge(m_name, val, hostname=hostname,
device_name=device_name, timestamp=ts)
device_name=device_name, timestamp=ts)
metric_count += 1
last_ts = (ts + interval)
@@ -178,7 +181,7 @@ class Cacti(AgentCheck):
tuples of (hostname, device_name, rrd_path)
'''
def _in_whitelist(rrd):
path = rrd.replace('<path_rra>/','')
path = rrd.replace('<path_rra>/', '')
for p in whitelist:
if fnmatch(path, p):
return True
@@ -186,7 +189,8 @@ class Cacti(AgentCheck):
c = connection.cursor()
and_parameters = " OR ".join(["hsc.field_name = '%s'" % field_name for field_name in field_names])
and_parameters = " OR ".join(
["hsc.field_name = '%s'" % field_name for field_name in field_names])
# Check for the existence of the `host_snmp_cache` table
rrd_query = """
@@ -202,7 +206,7 @@ class Cacti(AgentCheck):
WHERE dt.data_source_path IS NOT NULL
AND dt.data_source_path != ''
AND (%s OR hsc.field_name is NULL) """ % and_parameters
c.execute(rrd_query)
res = []
for hostname, device_name, rrd_path in c.fetchall():
@@ -244,7 +248,6 @@ class Cacti(AgentCheck):
return val / 1024
return val
'''
For backwards compatability with pre-checks_d configuration.
Convert old-style config to new-style config.

@@ -6,9 +6,11 @@ from monagent.collector.checks import AgentCheck
class CouchDb(AgentCheck):
"""Extracts stats from CouchDB via its REST API
http://wiki.apache.org/couchdb/Runtime_Statistics
"""
def _create_metric(self, data, dimensions=None):
overall_stats = data.get('stats', {})
for key, stats in overall_stats.items():
@@ -16,7 +18,7 @@ class CouchDb(AgentCheck):
if val['current'] is not None:
metric_name = '.'.join(['couchdb', key, metric])
self.gauge(metric_name, val['current'], dimensions=dimensions)
for db_name, db_stats in data.get('databases', {}).items():
for name, val in db_stats.items():
if name in ['doc_count', 'disk_size'] and val is not None:
@@ -80,7 +82,6 @@ class CouchDb(AgentCheck):
if not agentConfig.get('couchdb_server'):
return False
return {
'instances': [{
'server': agentConfig.get('couchdb_server'),

@@ -8,13 +8,13 @@ from monagent.collector.checks import AgentCheck
from monagent.collector.checks.utils import add_basic_auth
#Constants
# Constants
COUCHBASE_STATS_PATH = '/pools/nodes'
DEFAULT_TIMEOUT = 10
class Couchbase(AgentCheck):
"""Extracts stats from Couchbase via its REST API
http://docs.couchbase.com/couchbase-manual-2.0/#using-the-rest-api
"""
@@ -24,25 +24,29 @@ class Couchbase(AgentCheck):
for key, storage_type in storage_totals.items():
for metric_name, val in storage_type.items():
if val is not None:
metric_name = '.'.join(['couchbase', key, self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', key, self.camel_case_to_joined_lower(metric_name)])
self.gauge(metric_name, val, dimensions=dimensions)
for bucket_name, bucket_stats in data['buckets'].items():
for metric_name, val in bucket_stats.items():
if val is not None:
metric_name = '.'.join(['couchbase', 'by_bucket', self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', 'by_bucket', self.camel_case_to_joined_lower(metric_name)])
metric_dimensions = dimensions.copy()
metric_dimensions['bucket'] = bucket_name
self.gauge(metric_name, val[0], dimensions=metric_dimensions, device_name=bucket_name)
self.gauge(
metric_name, val[0], dimensions=metric_dimensions, device_name=bucket_name)
for node_name, node_stats in data['nodes'].items():
for metric_name, val in node_stats['interestingStats'].items():
if val is not None:
metric_name = '.'.join(['couchbase', 'by_node', self.camel_case_to_joined_lower(metric_name)])
metric_name = '.'.join(
['couchbase', 'by_node', self.camel_case_to_joined_lower(metric_name)])
metric_dimensions = dimensions.copy()
metric_dimensions['node'] = node_name
self.gauge(metric_name, val, dimensions=metric_dimensions, device_name=node_name)
self.gauge(
metric_name, val, dimensions=metric_dimensions, device_name=node_name)
def _get_stats(self, url, instance):
"Hit a given URL and return the parsed json"
@@ -52,8 +56,8 @@ class Couchbase(AgentCheck):
add_basic_auth(req, instance['user'], instance['password'])
if instance['is_recent_python']:
timeout = instance.get('timeout' , DEFAULT_TIMEOUT)
request = urllib2.urlopen(req,timeout=timeout)
timeout = instance.get('timeout', DEFAULT_TIMEOUT)
request = urllib2.urlopen(req, timeout=timeout)
else:
request = urllib2.urlopen(req)
@@ -73,9 +77,9 @@ class Couchbase(AgentCheck):
def get_data(self, server, instance):
# The dictionary to be returned.
couchbase = {'stats': None,
'buckets': {},
'nodes': {}
}
'buckets': {},
'nodes': {}
}
# build couchbase stats entry point
url = '%s%s' % (server, COUCHBASE_STATS_PATH)
@@ -84,7 +88,7 @@ class Couchbase(AgentCheck):
# No overall stats? bail out now
if overall_stats is None:
raise Exception("No data returned from couchbase endpoint: %s" % url)
couchbase['stats'] = overall_stats
nodes = overall_stats['nodes']
@@ -104,7 +108,8 @@ class Couchbase(AgentCheck):
for bucket in buckets:
bucket_name = bucket['name']
# We have to manually build the URI for the stats bucket, as this is not auto discoverable
# We have to manually build the URI for the stats bucket, as this is not
# auto discoverable
url = '%s/pools/nodes/buckets/%s/stats' % (server, bucket_name)
bucket_stats = self._get_stats(url, instance)
bucket_samples = bucket_stats['op']['samples']
@@ -124,9 +129,8 @@ class Couchbase(AgentCheck):
# remove duplicate _
converted_variable = re.sub('_+', '_', converted_variable)
# handle special case of starting/ending underscores
converted_variable = re.sub('^_|_$', '', converted_variable)
return converted_variable

@@ -7,6 +7,7 @@ from monagent.collector.checks import AgentCheck
class DirectoryCheck(AgentCheck):
"""This check is for monitoring and reporting metrics on the files for a provided directory
WARNING: the user/group that dd-agent runs as must have access to stat the files in the desired directory
@@ -17,6 +18,7 @@ class DirectoryCheck(AgentCheck):
"pattern" - string, the `fnmatch` pattern to use when reading the "directory"'s files. default "*"
"recursive" - boolean, when true the stats will recurse into directories. default False
"""
def check(self, instance):
if "directory" not in instance:
raise Exception('DirectoryCheck: missing "directory" in config')
@@ -51,7 +53,8 @@ class DirectoryCheck(AgentCheck):
directory_files += 1
directory_bytes += file_stat.st_size
# file specific metrics
self.histogram("system.disk.directory.file.bytes", file_stat.st_size, dimensions=dimensions)
self.histogram(
"system.disk.directory.file.bytes", file_stat.st_size, dimensions=dimensions)
self.histogram("system.disk.directory.file.modified_sec_ago", time.time() - file_stat.st_mtime,
dimensions=dimensions)
self.histogram("system.disk.directory.file.created_sec_ago", time.time() - file_stat.st_ctime,

@ -68,8 +68,10 @@ DOCKER_TAGS = [
|
|||
|
||||
|
||||
class UnixHTTPConnection(httplib.HTTPConnection, object):
|
||||
|
||||
"""Class used in conjuction with UnixSocketHandler to make urllib2
|
||||
compatible with Unix sockets."""
|
||||
|
||||
def __init__(self, unix_socket):
|
||||
self._unix_socket = unix_socket
|
||||
|
||||
|
@ -84,8 +86,10 @@ class UnixHTTPConnection(httplib.HTTPConnection, object):
|
|||
|
||||
|
||||
class UnixSocketHandler(urllib2.AbstractHTTPHandler):
|
||||
|
    """Class that makes Unix sockets work with urllib2 without any additional
    dependencies."""

    def unix_open(self, req):
        full_path = "%s%s" % urlsplit(req.get_full_url())[1:3]
        path = os.path.sep

@@ -104,6 +108,7 @@ class UnixSocketHandler(urllib2.AbstractHTTPHandler):


class Docker(AgentCheck):
+
    def __init__(self, *args, **kwargs):
        super(Docker, self).__init__(*args, **kwargs)
        urllib2.install_opener(urllib2.build_opener(UnixSocketHandler()))

@@ -121,7 +126,8 @@ class Docker(AgentCheck):
        if not instance.get("exclude") or not instance.get("include"):
            if len(containers) > max_containers:
-               self.warning("Too many containers to collect. Please refine the containers to collect by editing the configuration file. Truncating to %s containers" % max_containers)
+               self.warning(
+                   "Too many containers to collect. Please refine the containers to collect by editing the configuration file. Truncating to %s containers" % max_containers)
                containers = containers[:max_containers]

        collected_containers = 0

@@ -136,19 +142,22 @@ class Docker(AgentCheck):
            collected_containers += 1
            if collected_containers > max_containers:
-               self.warning("Too many containers are matching the current configuration. Some containers will not be collected. Please refine your configuration")
+               self.warning(
+                   "Too many containers are matching the current configuration. Some containers will not be collected. Please refine your configuration")
                break

            for key, (dd_key, metric_type) in DOCKER_METRICS.items():
                if key in container:
-                   getattr(self, metric_type)(dd_key, int(container[key]), dimensions=container_dimensions)
+                   getattr(self, metric_type)(
+                       dd_key, int(container[key]), dimensions=container_dimensions)
            for metric in LXC_METRICS:
                mountpoint = self._mounpoints[metric["cgroup"]]
                stat_file = os.path.join(mountpoint, metric["file"] % container["Id"])
                stats = self._parse_cgroup_file(stat_file)
                for key, (dd_key, metric_type) in metric["metrics"].items():
                    if key in stats:
-                       getattr(self, metric_type)(dd_key, int(stats[key]), dimensions=container_dimensions)
+                       getattr(self, metric_type)(
+                           dd_key, int(stats[key]), dimensions=container_dimensions)

    @staticmethod
    def _make_tag(key, value):

@@ -187,7 +196,8 @@ class Docker(AgentCheck):
            request = urllib2.urlopen(req)
        except urllib2.URLError, e:
            if "Errno 13" in str(e):
-               raise Exception("Unable to connect to socket. dd-agent user must be part of the 'docker' group")
+               raise Exception(
+                   "Unable to connect to socket. dd-agent user must be part of the 'docker' group")
            raise
        response = request.read()
        return json.loads(response)

@@ -217,7 +227,8 @@ class Docker(AgentCheck):
            try:
                fp = open(file_)
            except IOError:
-               raise IOError("Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
+               raise IOError(
+                   "Can't open %s. If you are using Docker 0.9.0 or higher, the Datadog agent is not yet compatible with these versions. Please get in touch with Datadog Support for more information" % file_)
            return dict(map(lambda x: x.split(), fp.read().splitlines()))

        finally:

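The recurring fix in this file is PEP8 E501 (line too long): a call with one over-long string argument is reopened across two lines with a hanging indent. A minimal standalone sketch of the pattern, assuming a warn() stand-in for AgentCheck.warning() (both names here are illustrative, not from this repo):

    def warn(message):
        # Stand-in for the check's warning() method; just prints.
        print("WARNING: %s" % message)

    max_containers = 20
    # The argument moves to its own continuation line, indented one
    # level past the opening parenthesis.
    warn(
        "Too many containers to collect. Truncating to %s containers"
        % max_containers)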
@@ -21,35 +21,35 @@ class ElasticSearch(AgentCheck):
        "elasticsearch.docs.deleted": ("gauge", "indices.docs.deleted"),
        "elasticsearch.store.size": ("gauge", "indices.store.size_in_bytes"),
        "elasticsearch.indexing.index.total": ("gauge", "indices.indexing.index_total"),
-       "elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.indexing.index.time": ("gauge", "indices.indexing.index_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.indexing.index.current": ("gauge", "indices.indexing.index_current"),
        "elasticsearch.indexing.delete.total": ("gauge", "indices.indexing.delete_total"),
-       "elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.indexing.delete.time": ("gauge", "indices.indexing.delete_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.indexing.delete.current": ("gauge", "indices.indexing.delete_current"),
        "elasticsearch.get.total": ("gauge", "indices.get.total"),
-       "elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.get.time": ("gauge", "indices.get.time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.get.current": ("gauge", "indices.get.current"),
        "elasticsearch.get.exists.total": ("gauge", "indices.get.exists_total"),
-       "elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.get.exists.time": ("gauge", "indices.get.exists_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.get.missing.total": ("gauge", "indices.get.missing_total"),
-       "elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.get.missing.time": ("gauge", "indices.get.missing_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.search.query.total": ("gauge", "indices.search.query_total"),
-       "elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.search.query.time": ("gauge", "indices.search.query_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.search.query.current": ("gauge", "indices.search.query_current"),
        "elasticsearch.search.fetch.total": ("gauge", "indices.search.fetch_total"),
-       "elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.search.fetch.time": ("gauge", "indices.search.fetch_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.search.fetch.current": ("gauge", "indices.search.fetch_current"),
        "elasticsearch.merges.current": ("gauge", "indices.merges.current"),
        "elasticsearch.merges.current.docs": ("gauge", "indices.merges.current_docs"),
        "elasticsearch.merges.current.size": ("gauge", "indices.merges.current_size_in_bytes"),
        "elasticsearch.merges.total": ("gauge", "indices.merges.total"),
-       "elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.merges.total.time": ("gauge", "indices.merges.total_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.merges.total.docs": ("gauge", "indices.merges.total_docs"),
        "elasticsearch.merges.total.size": ("gauge", "indices.merges.total_size_in_bytes"),
        "elasticsearch.refresh.total": ("gauge", "indices.refresh.total"),
-       "elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.refresh.total.time": ("gauge", "indices.refresh.total_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.flush.total": ("gauge", "indices.flush.total"),
-       "elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v)/1000),
+       "elasticsearch.flush.total.time": ("gauge", "indices.flush.total_time_in_millis", lambda v: float(v) / 1000),
        "elasticsearch.process.open_fd": ("gauge", "process.open_file_descriptors"),
        "elasticsearch.transport.rx_count": ("gauge", "transport.rx_count"),
        "elasticsearch.transport.tx_count": ("gauge", "transport.tx_count"),

@@ -92,9 +92,9 @@ class ElasticSearch(AgentCheck):
        "elasticsearch.http.current_open": ("gauge", "http.current_open"),
        "elasticsearch.http.total_opened": ("gauge", "http.total_opened"),
        "jvm.gc.concurrent_mark_sweep.count": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_count"),
-       "jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v)/1000),
+       "jvm.gc.concurrent_mark_sweep.collection_time": ("gauge", "jvm.gc.collectors.ConcurrentMarkSweep.collection_time_in_millis", lambda v: float(v) / 1000),
        "jvm.gc.par_new.count": ("gauge", "jvm.gc.collectors.ParNew.collection_count"),
-       "jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v)/1000),
+       "jvm.gc.par_new.collection_time": ("gauge", "jvm.gc.collectors.ParNew.collection_time_in_millis", lambda v: float(v) / 1000),
        "jvm.mem.heap_committed": ("gauge", "jvm.mem.heap_committed_in_bytes"),
        "jvm.mem.heap_used": ("gauge", "jvm.mem.heap_used_in_bytes"),
        "jvm.mem.non_heap_committed": ("gauge", "jvm.mem.non_heap_committed_in_bytes"),

@@ -153,7 +153,6 @@ class ElasticSearch(AgentCheck):
        health_data = self._get_data(url, auth)
        self._process_health_data(config_url, health_data, dimensions=dimensions)
-
    def _get_es_version(self, config_url, auth=None):
        """
        Get the running version of Elastic Search

@@ -163,7 +162,8 @@ class ElasticSearch(AgentCheck):
            data = self._get_data(config_url, auth)
            version = map(int, data['version']['number'].split('.'))
        except Exception, e:
-           self.warning("Error while trying to get Elasticsearch version from %s %s" % (config_url, str(e)))
+           self.warning("Error while trying to get Elasticsearch version from %s %s" %
+                        (config_url, str(e)))
            version = [0, 0, 0]

        self.log.debug("Elasticsearch version is %s" % version)

@@ -174,7 +174,7 @@ class ElasticSearch(AgentCheck):
        Define the set of URLs and METRICS to use depending on the running ES version
        """

-       if version >= [0,90,10]:
+       if version >= [0, 90, 10]:
            # ES versions 0.90.10 and above
            # Metrics architecture changed starting with version 0.90.10
            self.HEALTH_URL = "/_cluster/health?pretty=true"

@@ -185,9 +185,9 @@ class ElasticSearch(AgentCheck):
                "elasticsearch.search.fetch.open_contexts": ("gauge", "indices.search.open_contexts"),
                "elasticsearch.cache.filter.evictions": ("gauge", "indices.filter_cache.evictions"),
                "elasticsearch.cache.filter.size": ("gauge", "indices.filter_cache.memory_size_in_bytes"),
-               "elasticsearch.id_cache.size": ("gauge","indices.id_cache.memory_size_in_bytes"),
-               "elasticsearch.fielddata.size": ("gauge","indices.fielddata.memory_size_in_bytes"),
-               "elasticsearch.fielddata.evictions": ("gauge","indices.fielddata.evictions")
+               "elasticsearch.id_cache.size": ("gauge", "indices.id_cache.memory_size_in_bytes"),
+               "elasticsearch.fielddata.size": ("gauge", "indices.fielddata.memory_size_in_bytes"),
+               "elasticsearch.fielddata.evictions": ("gauge", "indices.fielddata.evictions")
            }

        else:

@@ -206,9 +206,9 @@ class ElasticSearch(AgentCheck):
                "elasticsearch.thread_pool.cache.threads": ("gauge", "thread_pool.cache.threads"),
                "elasticsearch.thread_pool.cache.queue": ("gauge", "thread_pool.cache.queue"),
                "jvm.gc.collection_count": ("gauge", "jvm.gc.collection_count"),
-               "jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: float(v)/1000),
+               "jvm.gc.collection_time": ("gauge", "jvm.gc.collection_time_in_millis", lambda v: float(v) / 1000),
                "jvm.gc.copy.count": ("gauge", "jvm.gc.collectors.Copy.collection_count"),
-               "jvm.gc.copy.collection_time": ("gauge", "jvm.gc.collectors.Copy.collection_time_in_millis", lambda v: float(v)/1000)
+               "jvm.gc.copy.collection_time": ("gauge", "jvm.gc.collectors.Copy.collection_time_in_millis", lambda v: float(v) / 1000)
            }

        self.METRICS.update(additional_metrics)

@@ -278,8 +278,8 @@ class ElasticSearch(AgentCheck):
        if node_name in data['nodes']:
            node = data['nodes'][node_name]
            if 'network' in node\
-              and 'primary_interface' in node['network']\
-              and 'address' in node['network']['primary_interface']:
+                   and 'primary_interface' in node['network']\
+                   and 'address' in node['network']['primary_interface']:
                return node['network']['primary_interface']['address']

        raise NodeNotFound()

@@ -295,7 +295,7 @@ class ElasticSearch(AgentCheck):
        else:
            ifaces = subprocess.Popen(['ip', 'addr'], stdout=subprocess.PIPE)
            grepper = subprocess.Popen(['grep', 'inet'], stdin=ifaces.stdout,
-                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            ifaces.stdout.close()
            out, err = grepper.communicate()

@@ -305,7 +305,7 @@ class ElasticSearch(AgentCheck):
            for iface in out.split("\n"):
                iface = iface.strip()
                if iface:
-                   ips.append( iface.split(' ')[1].split('/')[0] )
+                   ips.append(iface.split(' ')[1].split('/')[0])

            # Check the interface addresses against the primary address
            return primary_addrs in ips

@@ -326,7 +326,8 @@ class ElasticSearch(AgentCheck):
                    break

            if value is not None:
-               if xform: value = xform(value)
+               if xform:
+                   value = xform(value)
                if self.METRICS[metric][0] == "gauge":
                    self.gauge(metric, value, dimensions=dimensions)
                else:

@@ -346,7 +347,6 @@ class ElasticSearch(AgentCheck):
        event = self._create_event(data['status'])
        self.event(event)
-
        def process_metric(metric, xtype, path, xform=None):
            # closure over data
            self._process_metric(data, metric, path, xform, dimensions=dimensions)

@@ -356,7 +356,6 @@ class ElasticSearch(AgentCheck):
            desc = self.METRICS[metric]
            process_metric(metric, *desc)
-
    def _metric_not_found(self, metric, path):
        self.log.debug("Metric not found: %s -> %s", path, metric)

@@ -377,15 +376,15 @@ class ElasticSearch(AgentCheck):

        msg = "ElasticSearch: %s just reported as %s" % (hostname, status)

-       return { 'timestamp': int(time.time()),
-                'event_type': 'elasticsearch',
-                'host': hostname,
-                'msg_text':msg,
-                'msg_title': msg_title,
-                "alert_type": alert_type,
-                "source_type_name": "elasticsearch",
-                "event_object": hostname
-              }
+       return {'timestamp': int(time.time()),
+               'event_type': 'elasticsearch',
+               'host': hostname,
+               'msg_text': msg,
+               'msg_title': msg_title,
+               "alert_type": alert_type,
+               "source_type_name": "elasticsearch",
+               "event_object": hostname
+               }

    @staticmethod
    def parse_agent_config(agentConfig):

@@ -397,4 +396,3 @@ class ElasticSearch(AgentCheck):
            'url': agentConfig.get('elasticsearch'),
        }]
    }
-

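Most of the elasticsearch hunks only add spaces around the division operator (PEP8's operator-spacing rules, E225/E226): float(v)/1000 becomes float(v) / 1000. The two spellings are identical at runtime, as this small sketch shows (the value is made up):

    millis = 1500
    # Same result either way; PEP8 simply prefers spaces around
    # binary operators for readability.
    assert float(millis)/1000 == float(millis) / 1000 == 1.5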
@@ -30,7 +30,7 @@ class Gearman(AgentCheck):
        running = 0
        queued = 0
        workers = 0

        for stat in data:
            running += stat['running']
            queued += stat['queued']

@@ -81,11 +81,11 @@ class GUnicornCheck(AgentCheck):
            except psutil.NoSuchProcess:
                self.warning('Process %s disappeared while scanning' % proc.name)
                continue

        # Let them do a little bit more work.
        time.sleep(self.CPU_SLEEP_SECS)

-       # Processes which have used more CPU are considered active (this is a very
+       # Processes which have used more CPU are considered active (this is a very
        # naive check, but gunicorn exposes no stats API)
        for proc in worker_procs:
            if proc.pid not in cpu_time_by_pid:

@@ -109,11 +109,13 @@ class GUnicornCheck(AgentCheck):
    def _get_master_proc_by_name(name):
        """ Return a psutil process for the master gunicorn process with the given name. """
        master_name = GUnicornCheck._get_master_proc_name(name)
-       master_procs = [p for p in psutil.process_iter() if p.cmdline and p.cmdline[0] == master_name]
+       master_procs = [
+           p for p in psutil.process_iter() if p.cmdline and p.cmdline[0] == master_name]
        if len(master_procs) == 0:
            raise GUnicornCheckError("Found no master process with name: %s" % master_name)
        elif len(master_procs) > 1:
-           raise GUnicornCheckError("Found more than one master process with name: %s" % master_name)
+           raise GUnicornCheckError(
+               "Found more than one master process with name: %s" % master_name)
        else:
            return master_procs[0]

@@ -124,9 +126,8 @@ class GUnicornCheck(AgentCheck):
        # root 22976 0.1 0.1 60364 13424 ? Ss 19:30 0:00 gunicorn: master [web1]
        # web 22984 20.7 2.3 521924 176136 ? Sl 19:30 1:58 gunicorn: worker [web1]
        # web 22985 26.4 6.1 795288 449596 ? Sl 19:30 2:32 gunicorn: worker [web1]
-       return "gunicorn: master [%s]" % name
+       return "gunicorn: master [%s]" % name


class GUnicornCheckError(Exception):
    pass

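The gunicorn hunk breaks a long list comprehension immediately after the opening bracket. A sketch of the same shape with dummy tuples standing in for psutil process objects (psutil itself is not needed to show the formatting; all names are illustrative):

    candidates = [('gunicorn: master [web1]',), ('nginx',), (None,)]
    master_name = 'gunicorn: master [web1]'
    # Splitting after '[' lets the long filter expression fit the line limit.
    master_procs = [
        c for c in candidates if c[0] and c[0] == master_name]
    assert len(master_procs) == 1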
@@ -17,6 +17,7 @@ class Services(object):


class HAProxy(AgentCheck):
+
    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)

@@ -60,7 +61,8 @@ class HAProxy(AgentCheck):

        process_events = instance.get('status_check', self.init_config.get('status_check', False))

-       self._process_data(data, collect_aggregates_only, process_events, url=url, collect_status_metrics=collect_status_metrics)
+       self._process_data(data, collect_aggregates_only, process_events,
+                          url=url, collect_status_metrics=collect_status_metrics)

    def _fetch_data(self, url, username, password):
        ''' Hit a given URL and return the parsed json '''

@@ -95,7 +97,7 @@ class HAProxy(AgentCheck):
        # Holds a list of dictionaries describing each system
        data_list = []

-       for line in data[1:]: # Skip the first line
+       for line in data[1:]:  # Skip the first line
            if not line.strip():
                continue
            data_dict = {}

@@ -123,7 +125,6 @@ class HAProxy(AgentCheck):
            if collect_status_metrics and 'status' in data_dict and 'pxname' in data_dict:
                hosts_statuses[(data_dict['pxname'], data_dict['status'])] += 1
-
            if data_dict['svname'] in Services.ALL:
                data_list.append(data_dict)

@@ -143,7 +144,7 @@ class HAProxy(AgentCheck):
        return data

    def _process_status_metric(self, hosts_statuses):
-       agg_statuses = defaultdict(lambda:{'available':0, 'unavailable':0})
+       agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0})
        for (service, status), count in hosts_statuses.iteritems():
            status = status.lower()

@@ -196,7 +197,7 @@ class HAProxy(AgentCheck):
        for data in data_list:
            hostname = data['svname']
            service_name = data['pxname']
-           key = "%s:%s" % (hostname,service_name)
+           key = "%s:%s" % (hostname, service_name)
            status = self.host_status[url][key]

            if status is None:

@@ -227,17 +228,18 @@ class HAProxy(AgentCheck):
            alert_type = "success"
        else:
            alert_type = "info"
-       title = "HAProxy %s front-end reported %s back and %s" % (service_name, hostname, status)
+       title = "HAProxy %s front-end reported %s back and %s" % (
+           service_name, hostname, status)

        return {
-           'timestamp': int(time.time() - lastchg),
-           'event_type': EVENT_TYPE,
-           'host': hostname,
-           'msg_title': title,
-           'alert_type': alert_type,
-           "source_type_name": SOURCE_TYPE_NAME,
-           "event_object": hostname,
-           "dimensions": {"frontend": service_name, "host": hostname}
+           'timestamp': int(time.time() - lastchg),
+           'event_type': EVENT_TYPE,
+           'host': hostname,
+           'msg_title': title,
+           'alert_type': alert_type,
+           "source_type_name": SOURCE_TYPE_NAME,
+           "event_object": hostname,
+           "dimensions": {"frontend": service_name, "host": hostname}
        }

    @staticmethod

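The smallest haproxy hunks add the space PEP8 E231 requires after ':' and ',', including inside the defaultdict factory. A runnable sketch of the corrected line (the status keys come from the hunk; the 'web' service is illustrative):

    from collections import defaultdict

    agg_statuses = defaultdict(lambda: {'available': 0, 'unavailable': 0})
    agg_statuses['web']['available'] += 1
    assert agg_statuses['web'] == {'available': 1, 'unavailable': 0}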
@@ -2,6 +2,7 @@ from monagent.collector.checks import AgentCheck


class HDFSCheck(AgentCheck):
+
    """Report on free space and space used in HDFS.
    """

@@ -9,6 +9,7 @@ from monagent.collector.checks.services_checks import ServicesCheck, Status


class HostAlive(ServicesCheck):
+
    """Inherit ServicesCheck class to test if a host is alive or not"""

    def __init__(self, name, init_config, agent_config, instances=None):

@@ -84,11 +85,11 @@ class HostAlive(ServicesCheck):

        if instance['alive_test'] == 'ssh':
            success = self._test_ssh(instance['host_name'],
-                   self.init_config.get('ssh_port'),
-                   self.init_config.get('ssh_timeout'))
+                                     self.init_config.get('ssh_port'),
+                                     self.init_config.get('ssh_timeout'))
        elif instance['alive_test'] == 'ping':
            success = self._test_ping(instance['host_name'],
-                   self.init_config.get('ping_timeout'))
+                                      self.init_config.get('ping_timeout'))
        else:
            self.log.info("Unrecognized alive_test " + instance['alive_test'])

@@ -99,4 +100,3 @@ class HostAlive(ServicesCheck):
            self.gauge('host_alive', 1, dimensions=dimensions)
        self.log.error("Host down: " + instance['host_name'])
        return Status.DOWN, "DOWN"
-

@@ -89,8 +89,8 @@ class IIS(AgentCheck):

        for metric, mtype, wmi_val in self.METRICS:
            if not hasattr(iis_site, wmi_val):
-               self.warning('Unable to fetch metric %s. Missing %s in Win32_PerfFormattedData_W3SVC_WebService' \
-                   % (metric, wmi_val))
+               self.warning('Unable to fetch metric %s. Missing %s in Win32_PerfFormattedData_W3SVC_WebService'
+                            % (metric, wmi_val))
                continue

            # Submit the metric value with the correct type

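The IIS hunk drops a backslash that is redundant once the statement continues inside the call's parentheses: an open bracket already continues the line, so only the '%' operator needs to start the next one. A sketch of that style (the metric names are illustrative):

    metric, wmi_val = 'iis.requests', 'TotalGetRequests'
    message = ('Unable to fetch metric %s. Missing %s counter'
               % (metric, wmi_val))
    print(message)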
@@ -22,6 +22,7 @@ class Skip(Exception):
    Raised by :class:`Jenkins` when it comes across
    a build or job that should be excluded from being checked.
    """
+
    def __init__(self, reason, dir_name):
        message = 'skipping build or job at %s because %s' % (dir_name, reason)
        Exception.__init__(self, message)

@@ -66,15 +67,15 @@ class Jenkins(AgentCheck):
        d = dict([(k, v.text) for k, v in kv_pairs if v is not None])

        try:
-           d['branch'] = tree.find('actions')\
-               .find('hudson.plugins.git.util.BuildData')\
-               .find('buildsByBranchName')\
-               .find('entry')\
-               .find('hudson.plugins.git.util.Build')\
-               .find('revision')\
-               .find('branches')\
-               .find('hudson.plugins.git.Branch')\
-               .find('name')\
+           d['branch'] = tree.find('actions') \
+               .find('hudson.plugins.git.util.BuildData') \
+               .find('buildsByBranchName') \
+               .find('entry') \
+               .find('hudson.plugins.git.util.Build') \
+               .find('revision') \
+               .find('branches') \
+               .find('hudson.plugins.git.Branch') \
+               .find('name') \
                .text
        except Exception:
            pass

@@ -104,9 +105,9 @@ class Jenkins(AgentCheck):
                continue

            output = {
-               'job_name': job_name,
-               'timestamp': timestamp,
-               'event_type': 'build result'
+               'job_name': job_name,
+               'timestamp': timestamp,
+               'event_type': 'build result'
            }
            output.update(build_metadata)
            self.high_watermarks[instance_key][job_name] = timestamp

@@ -123,8 +124,8 @@ class Jenkins(AgentCheck):
            # so that we only send events that occured after the agent
            # started.
            # (Setting high_watermarks in the next statement prevents
-           # any kind of infinite loop (assuming nothing ever sets
-           # high_watermarks to None again!))
+           # any kind of infinite loop (assuming nothing ever sets
+           # high_watermarks to None again!))
            self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0)
            self.check(instance, create_event=False)

@@ -150,7 +151,8 @@ class Jenkins(AgentCheck):
        dimensions = {'job_name': output['job_name']}
        if 'branch' in output:
            dimensions['branch'] = output['branch']
-       self.gauge("jenkins.job.duration", float(output['duration'])/1000.0, dimensions=dimensions)
+       self.gauge("jenkins.job.duration", float(
+           output['duration']) / 1000.0, dimensions=dimensions)

        if output['result'] == 'SUCCESS':
            self.increment('jenkins.job.success', dimensions=dimensions)

@@ -168,4 +170,3 @@ class Jenkins(AgentCheck):
            'jenkins_home': agentConfig.get('hudson_home'),
        }]
    }
-

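The jenkins hunk keeps the backslash continuations of the long .find() chain (they are still required, since no bracket is open) but puts a space before each backslash. A short runnable sketch of the same chained-find shape, using xml.etree on a made-up document instead of Jenkins build XML:

    import xml.etree.ElementTree as ET

    tree = ET.fromstring('<build><actions><name>origin/master</name></actions></build>')
    branch = tree.find('actions') \
        .find('name') \
        .text
    assert branch == 'origin/master'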
@@ -22,6 +22,7 @@ import random


class KafkaCheck(AgentCheck):
+
    def check(self, instance):
        consumer_groups = self.read_config(instance, 'consumer_groups',
                                           cast=self._validate_consumer_groups)

@@ -131,4 +132,5 @@ consumer_groups:
            return host_ports
        except Exception, e:
            self.log.exception(e)
-           raise Exception('Could not parse %s. Must be in the form of `host0:port0,host1:port1,host2:port2`' % val)
+           raise Exception(
+               'Could not parse %s. Must be in the form of `host0:port0,host1:port1,host2:port2`' % val)

@@ -15,31 +15,31 @@ class KyotoTycoonCheck(AgentCheck):
    """

    GAUGES = {
-       'repl_delay': 'replication.delay',
-       'serv_thread_count': 'threads',
+       'repl_delay': 'replication.delay',
+       'serv_thread_count': 'threads',
    }

    RATES = {
-       'serv_conn_count': 'connections',
-       'cnt_get': 'ops.get.hits',
-       'cnt_get_misses': 'ops.get.misses',
-       'cnt_set': 'ops.set.hits',
-       'cnt_set_misses': 'ops.set.misses',
-       'cnt_remove': 'ops.del.hits',
-       'cnt_remove_misses': 'ops.del.misses',
+       'serv_conn_count': 'connections',
+       'cnt_get': 'ops.get.hits',
+       'cnt_get_misses': 'ops.get.misses',
+       'cnt_set': 'ops.set.hits',
+       'cnt_set_misses': 'ops.set.misses',
+       'cnt_remove': 'ops.del.hits',
+       'cnt_remove_misses': 'ops.del.misses',
    }

    DB_GAUGES = {
-       'count': 'records',
-       'size': 'size',
+       'count': 'records',
+       'size': 'size',
    }
    TOTALS = {
-       'cnt_get': 'ops.get.total',
-       'cnt_get_misses': 'ops.get.total',
-       'cnt_set': 'ops.set.total',
-       'cnt_set_misses': 'ops.set.total',
-       'cnt_remove': 'ops.del.total',
-       'cnt_remove_misses': 'ops.del.total',
+       'cnt_get': 'ops.get.total',
+       'cnt_get_misses': 'ops.get.total',
+       'cnt_set': 'ops.set.total',
+       'cnt_set_misses': 'ops.set.total',
+       'cnt_remove': 'ops.del.total',
+       'cnt_remove_misses': 'ops.del.total',
    }

    def check(self, instance):

@@ -65,7 +65,7 @@ class KyotoTycoonCheck(AgentCheck):
            if key in self.GAUGES:
                name = self.GAUGES[key]
                self.gauge('kyototycoon.%s' % name, float(value), dimensions=dimensions)

            elif key in self.RATES:
                name = self.RATES[key]
                self.rate('kyototycoon.%s_per_s' % name, float(value), dimensions=dimensions)

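The kyototycoon hunks are pure re-indentation: the class-level mapping dicts move to a uniform four-space indent (the old column alignment was lost in this extraction, so the removed and added lines render identically above). The resulting shape, sketched with two keys from the hunk under an illustrative class name:

    class Maps(object):
        GAUGES = {
            'repl_delay': 'replication.delay',
            'serv_thread_count': 'threads',
        }

    assert Maps.GAUGES['serv_thread_count'] == 'threads'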
@@ -6,6 +6,7 @@ from monagent.collector.checks.utils import add_basic_auth


class Lighttpd(AgentCheck):
+
    """Tracks basic connection/requests/workers metrics

    See http://redmine.lighttpd.net/projects/1/wiki/Docs_ModStatus for Lighttpd details

@@ -114,10 +115,12 @@ class Lighttpd(AgentCheck):
            url_suffix = self.URL_SUFFIX_PER_VERSION[server_version]
            if self.assumed_url.get(instance['lighttpd_status_url'], None) is None and url[-len(url_suffix):] != url_suffix:
                self.assumed_url[instance['lighttpd_status_url']] = '%s%s' % (url, url_suffix)
-               self.warning("Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix)
+               self.warning(
+                   "Assuming url was not correct. Trying to add %s suffix to the url" % url_suffix)
                self.check(instance)
            else:
-               raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance['lighttpd_status_url'])
+               raise Exception("No metrics were fetched for this instance. Make sure that %s is the proper url." % instance[
+                   'lighttpd_status_url'])

    def _get_server_version(self, headers):
        for h in headers:

@@ -133,4 +136,3 @@ class Lighttpd(AgentCheck):

        self.log.debug("Lighttpd server version is Unknown")
        return "Unknown"
-

@@ -9,34 +9,34 @@ from monagent.collector.checks import *
# version string Version string of this server
# pointer_size 32 Default size of pointers on the host OS
# (generally 32 or 64)
-# rusage_user 32u:32u Accumulated user time for this process
+# rusage_user 32u:32u Accumulated user time for this process
# (seconds:microseconds)
-# rusage_system 32u:32u Accumulated system time for this process
+# rusage_system 32u:32u Accumulated system time for this process
# (seconds:microseconds)
# curr_items 32u Current number of items stored by the server
-# total_items 32u Total number of items stored by this server
+# total_items 32u Total number of items stored by this server
# ever since it started
-# bytes 64u Current number of bytes used by this server
+# bytes 64u Current number of bytes used by this server
# to store items
# curr_connections 32u Number of open connections
-# total_connections 32u Total number of connections opened since
+# total_connections 32u Total number of connections opened since
# the server started running
-# connection_structures 32u Number of connection structures allocated
+# connection_structures 32u Number of connection structures allocated
# by the server
# cmd_get 64u Cumulative number of retrieval requests
# cmd_set 64u Cumulative number of storage requests
-# get_hits 64u Number of keys that have been requested and
+# get_hits 64u Number of keys that have been requested and
# found present
-# get_misses 64u Number of items that have been requested
+# get_misses 64u Number of items that have been requested
# and not found
# evictions 64u Number of valid items removed from cache
# to free memory for new items
-# bytes_read 64u Total number of bytes read by this server
+# bytes_read 64u Total number of bytes read by this server
# from network
-# bytes_written 64u Total number of bytes sent by this server to
+# bytes_written 64u Total number of bytes sent by this server to
# network
# limit_maxbytes 32u Number of bytes this server is allowed to
-# use for storage.
+# use for storage.
# threads 32u Number of worker threads requested.
# (see doc/threads.txt)
# >>> mc.get_stats()

@@ -54,6 +54,7 @@ from monagent.collector.checks import *
# http://www.couchbase.org/wiki/display/membase/Membase+Statistics
# https://github.com/membase/ep-engine/blob/master/docs/stats.org


class Memcache(AgentCheck):
+
    DEFAULT_PORT = 11211

@@ -102,7 +103,8 @@ class Memcache(AgentCheck):
        mc = memcache.Client(["%s:%d" % (server, port)])
        raw_stats = mc.get_stats()

-       assert len(raw_stats) == 1 and len(raw_stats[0]) == 2, "Malformed response: %s" % raw_stats
+       assert len(raw_stats) == 1 and len(
+           raw_stats[0]) == 2, "Malformed response: %s" % raw_stats
        # Access the dict
        stats = raw_stats[0][1]
        for metric in stats:

@@ -147,7 +149,8 @@ class Memcache(AgentCheck):
        except ZeroDivisionError:
            pass
        except AssertionError:
-           raise Exception("Unable to retrieve stats from memcache instance: " + server + ":" + str(port) + ". Please check your configuration")
+           raise Exception("Unable to retrieve stats from memcache instance: " +
+                           server + ":" + str(port) + ". Please check your configuration")

        if mc is not None:
            mc.disconnect_all()

@@ -162,7 +165,8 @@ class Memcache(AgentCheck):
        try:
            import memcache
        except ImportError:
-           raise Exception("Cannot import memcache module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/mcache")
+           raise Exception(
+               "Cannot import memcache module. Check the instructions to install this module at https://app.datadoghq.com/account/settings#integrations/mcache")

        # Hacky monkeypatch to fix a memory leak in the memcache library.
        # See https://github.com/DataDog/dd-agent/issues/278 for details.

@@ -192,9 +196,9 @@ class Memcache(AgentCheck):
            all_instances.append(instance)

        # Load the conf according to the new schema
-       #memcache_instance_1: first_host:first_port:first_tag
-       #memcache_instance_2: second_host:second_port:second_tag
-       #memcache_instance_3: third_host:third_port:third_tag
+       # memcache_instance_1: first_host:first_port:first_tag
+       # memcache_instance_2: second_host:second_port:second_tag
+       # memcache_instance_3: third_host:third_port:third_tag
        index = 1
        instance = agentConfig.get("memcache_instance_%s" % index, None)
        while instance:

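The last memcache hunk is PEP8 E265: a block comment must start with '# ' (hash, then a space). The fix is mechanical, as in this two-line sketch:

    #memcache_instance_1: host:port:tag   <- flagged as E265 by the pep8 tool
    # memcache_instance_1: host:port:tag  <- accepted
    pass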
@@ -6,7 +6,6 @@ from monagent.collector.checks import AgentCheck
from monagent.common.util import get_hostname

-
# When running with pymongo < 2.0
# Not the full spec for mongo URIs -- just extract username and password
# http://www.mongodb.org/display/DOCS/connections6

@@ -113,16 +112,26 @@ class MongoDb(AgentCheck):
        state of a mongo node"""

        def get_state_description(state):
-           if state == 0: return 'Starting Up'
-           elif state == 1: return 'Primary'
-           elif state == 2: return 'Secondary'
-           elif state == 3: return 'Recovering'
-           elif state == 4: return 'Fatal'
-           elif state == 5: return 'Starting up (forking threads)'
-           elif state == 6: return 'Unknown'
-           elif state == 7: return 'Arbiter'
-           elif state == 8: return 'Down'
-           elif state == 9: return 'Rollback'
+           if state == 0:
+               return 'Starting Up'
+           elif state == 1:
+               return 'Primary'
+           elif state == 2:
+               return 'Secondary'
+           elif state == 3:
+               return 'Recovering'
+           elif state == 4:
+               return 'Fatal'
+           elif state == 5:
+               return 'Starting up (forking threads)'
+           elif state == 6:
+               return 'Unknown'
+           elif state == 7:
+               return 'Arbiter'
+           elif state == 8:
+               return 'Down'
+           elif state == 9:
+               return 'Rollback'

        status = get_state_description(state)
        hostname = get_hostname(agentConfig)

@@ -152,7 +161,7 @@ class MongoDb(AgentCheck):
            'ssl': instance.get('ssl', None),
            'ssl_keyfile': instance.get('ssl_keyfile', None),
            'ssl_certfile': instance.get('ssl_certfile', None),
-           'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
+           'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
            'ssl_ca_certs': instance.get('ssl_ca_certs', None)
        }

@@ -166,8 +175,10 @@ class MongoDb(AgentCheck):
        try:
            from pymongo import Connection
        except ImportError:
-           self.log.error('mongo.yaml exists but pymongo module can not be imported. Skipping check.')
-           raise Exception('Python PyMongo Module can not be imported. Please check the installation instruction on the Datadog Website')
+           self.log.error(
+               'mongo.yaml exists but pymongo module can not be imported. Skipping check.')
+           raise Exception(
+               'Python PyMongo Module can not be imported. Please check the installation instruction on the Datadog Website')

        try:
            from pymongo import uri_parser

@@ -194,7 +205,7 @@ class MongoDb(AgentCheck):
            do_auth = False

        conn = Connection(server, network_timeout=DEFAULT_TIMEOUT,
-               **ssl_params)
+                          **ssl_params)
        db = conn[db_name]
        if do_auth:
            if not db.authenticate(username, password):

@@ -204,7 +215,8 @@ class MongoDb(AgentCheck):
        status['stats'] = db.command('dbstats')

        # Handle replica data, if any
-       # See http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus
+       # See
+       # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus
        try:
            data = {}

@@ -224,11 +236,11 @@ class MongoDb(AgentCheck):
            if current is not None and primary is not None:
                lag = current['optimeDate'] - primary['optimeDate']
                # Python 2.7 has this built in, python < 2.7 don't...
-               if hasattr(lag,'total_seconds'):
+               if hasattr(lag, 'total_seconds'):
                    data['replicationLag'] = lag.total_seconds()
                else:
-                   data['replicationLag'] = (lag.microseconds + \
-                       (lag.seconds + lag.days * 24 * 3600) * 10**6) / 10.0**6
+                   data['replicationLag'] = (lag.microseconds +
+                                             (lag.seconds + lag.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6

            if current is not None:
                data['health'] = current['health']

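The largest mongo hunk rewrites one-liner conditionals ('if state == 0: return ...') as one statement per line, which is PEP8 E701. A dict lookup is a compact equivalent of the same mapping; a sketch under that assumption, with the state names copied from the hunk:

    STATES = {0: 'Starting Up', 1: 'Primary', 2: 'Secondary',
              3: 'Recovering', 4: 'Fatal', 5: 'Starting up (forking threads)',
              6: 'Unknown', 7: 'Arbiter', 8: 'Down', 9: 'Rollback'}

    def get_state_description(state):
        # Same result as the if/elif chain, one statement per line.
        return STATES.get(state)

    assert get_state_description(1) == 'Primary'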
@@ -45,6 +45,7 @@ STATUS_VARS = {


class MySql(AgentCheck):
+
    def __init__(self, name, init_config, agent_config):
        AgentCheck.__init__(self, name, init_config, agent_config)
        self.mysql_version = {}

@@ -63,7 +64,8 @@ class MySql(AgentCheck):
        return {"MySQLdb": version}

    def check(self, instance):
-       host, port, user, password, mysql_sock, defaults_file, dimensions, options = self._get_config(instance)
+       host, port, user, password, mysql_sock, defaults_file, dimensions, options = self._get_config(
+           instance)

        if (not host or not user) and not defaults_file:
            raise Exception("Mysql host and user are needed.")

@@ -92,23 +94,23 @@ class MySql(AgentCheck):
            import MySQLdb
        except ImportError:
            raise Exception("Cannot import MySQLdb module. Check the instructions "
-               "to install this module at https://app.datadoghq.com/account/settings#integrations/mysql")
+                           "to install this module at https://app.datadoghq.com/account/settings#integrations/mysql")

        if defaults_file != '':
            db = MySQLdb.connect(read_default_file=defaults_file)
-       elif mysql_sock != '':
+       elif mysql_sock != '':
            db = MySQLdb.connect(unix_socket=mysql_sock,
-               user=user,
-               passwd=password)
+                                user=user,
+                                passwd=password)
        elif port:
            db = MySQLdb.connect(host=host,
-               port=port,
-               user=user,
-               passwd=password)
+                                port=port,
+                                user=user,
+                                passwd=password)
        else:
            db = MySQLdb.connect(host=host,
-               user=user,
-               passwd=password)
+                                user=user,
+                                passwd=password)
        self.log.debug("Connected to MySQL")

        return db

@@ -125,15 +127,21 @@ class MySql(AgentCheck):
        # Be sure InnoDB is enabled
        if 'Innodb_page_size' in results:
            page_size = self._collect_scalar('Innodb_page_size', results)
-           innodb_buffer_pool_pages_total = self._collect_scalar('Innodb_buffer_pool_pages_total', results)
-           innodb_buffer_pool_pages_free = self._collect_scalar('Innodb_buffer_pool_pages_free', results)
+           innodb_buffer_pool_pages_total = self._collect_scalar(
+               'Innodb_buffer_pool_pages_total', results)
+           innodb_buffer_pool_pages_free = self._collect_scalar(
+               'Innodb_buffer_pool_pages_free', results)
            innodb_buffer_pool_pages_total = innodb_buffer_pool_pages_total * page_size
            innodb_buffer_pool_pages_free = innodb_buffer_pool_pages_free * page_size
-           innodb_buffer_pool_pages_used = innodb_buffer_pool_pages_total - innodb_buffer_pool_pages_free
+           innodb_buffer_pool_pages_used = innodb_buffer_pool_pages_total - \
+               innodb_buffer_pool_pages_free

-           self.gauge("mysql.innodb.buffer_pool_free", innodb_buffer_pool_pages_free, dimensions=dimensions)
-           self.gauge("mysql.innodb.buffer_pool_used", innodb_buffer_pool_pages_used, dimensions=dimensions)
-           self.gauge("mysql.innodb.buffer_pool_total", innodb_buffer_pool_pages_total, dimensions=dimensions)
+           self.gauge("mysql.innodb.buffer_pool_free",
+                      innodb_buffer_pool_pages_free, dimensions=dimensions)
+           self.gauge("mysql.innodb.buffer_pool_used",
+                      innodb_buffer_pool_pages_used, dimensions=dimensions)
+           self.gauge("mysql.innodb.buffer_pool_total",
+                      innodb_buffer_pool_pages_total, dimensions=dimensions)

        if 'galera_cluster' in options and options['galera_cluster']:
            value = self._collect_scalar('wsrep_cluster_size', results)

@@ -181,7 +189,8 @@ class MySql(AgentCheck):
                greater_502 = True

        except Exception, exception:
-           self.warning("Cannot compute mysql version, assuming older than 5.0.2: %s" % str(exception))
+           self.warning("Cannot compute mysql version, assuming older than 5.0.2: %s" %
+                        str(exception))

        self.greater_502[host] = greater_502

@@ -250,7 +259,8 @@ class MySql(AgentCheck):
                else:
                    self.log.debug("Received value is None for index %d" % col_idx)
            except ValueError:
-               self.log.exception("Cannot find %s in the columns %s" % (field, cursor.description))
+               self.log.exception("Cannot find %s in the columns %s" %
+                                  (field, cursor.description))
            cursor.close()
            del cursor
        except Exception:

@@ -281,10 +291,13 @@ class MySql(AgentCheck):
            # Convert time to s (number of second of CPU used by mysql)
            # It's a counter, it will be divided by the period, multiply by 100
            # to get the percentage of CPU used by mysql over the period
-           self.rate("mysql.performance.user_time", int((float(ucpu)/float(clk_tck)) * 100), dimensions=dimensions)
-           self.rate("mysql.performance.kernel_time", int((float(kcpu)/float(clk_tck)) * 100), dimensions=dimensions)
+           self.rate("mysql.performance.user_time", int(
+               (float(ucpu) / float(clk_tck)) * 100), dimensions=dimensions)
+           self.rate("mysql.performance.kernel_time", int(
+               (float(kcpu) / float(clk_tck)) * 100), dimensions=dimensions)
        except Exception:
-           self.warning("Error while reading mysql (pid: %s) procfs data\n%s" % (pid, traceback.format_exc()))
+           self.warning("Error while reading mysql (pid: %s) procfs data\n%s" %
+                        (pid, traceback.format_exc()))

    def _get_server_pid(self, db):
        pid = None

@@ -331,10 +344,10 @@ class MySql(AgentCheck):

        return {
            'instances': [{
-               'server': agent_config.get('mysql_server',''),
-               'sock': agent_config.get('mysql_sock',''),
-               'user': agent_config.get('mysql_user',''),
-               'pass': agent_config.get('mysql_pass',''),
+               'server': agent_config.get('mysql_server', ''),
+               'sock': agent_config.get('mysql_sock', ''),
+               'user': agent_config.get('mysql_user', ''),
+               'pass': agent_config.get('mysql_pass', ''),
                'options': {'replication': True},
            }]
        }

|
|||
|
||||
|
||||
class WrapNagios(ServicesCheck):
|
||||
|
||||
"""Inherit ServicesCheck class to process Nagios checks"""
|
||||
|
||||
def __init__(self, name, init_config, agent_config, instances=None):
|
||||
|
@ -55,7 +56,8 @@ class WrapNagios(ServicesCheck):
|
|||
|
||||
if last_run_path.endswith('/') is False:
|
||||
last_run_path += '/'
|
||||
last_run_file = (last_run_path + 'nagios_wrapper_' + hashlib.md5(instance['service_name']).hexdigest() + '.pck')
|
||||
last_run_file = (
|
||||
last_run_path + 'nagios_wrapper_' + hashlib.md5(instance['service_name']).hexdigest() + '.pck')
|
||||
|
||||
# Load last-run data from shared memory file
|
||||
last_run_data = {}
|
||||
|
|
|
@@ -86,7 +86,7 @@ class Network(AgentCheck):

        # For reasons i don't understand only these metrics are skipped if a
        # particular interface is in the `excluded_interfaces` config list.
-       # Not sure why the others aren't included. Until I understand why, I'm
+       # Not sure why the others aren't included. Until I understand why, I'm
        # going to keep the same behaviour.
        exclude_iface_metrics = [
            'packets_in',

@@ -104,7 +104,6 @@ class Network(AgentCheck):
            count += 1
        self.log.debug("tracked %s network metrics for interface %s" % (count, iface))
-
    @staticmethod
    def _parse_value(v):
        if v == "-":

@@ -150,7 +149,6 @@ class Network(AgentCheck):
        for metric, value in metrics.iteritems():
            self.gauge(metric, value)
-
        proc = open('/proc/net/dev', 'r')
        try:
            lines = proc.readlines()

@@ -160,7 +158,8 @@ class Network(AgentCheck):
        # face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed
        # lo:45890956 112797 0 0 0 0 0 0 45890956 112797 0 0 0 0 0 0
        # eth0:631947052 1042233 0 19 0 184 0 1206 1208625538 1320529 0 0 0 0 0 0
-       # eth1: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+       # eth1: 0 0 0 0 0 0 0 0
+       # 0 0 0 0 0 0 0 0
        for l in lines[2:]:
            cols = l.split(':', 1)
            x = cols[1].split()

@@ -198,7 +197,8 @@ class Network(AgentCheck):
        # ham0 1404 <Link#6> 7a:79:05:4d:bf:f5 30100 0 6815204 18742 0 8494811 0
        # ham0 1404 5 5.77.191.245 30100 - 6815204 18742 - 8494811 -
        # ham0 1404 seneca.loca fe80:6::7879:5ff: 30100 - 6815204 18742 - 8494811 -
-       # ham0 1404 2620:9b::54 2620:9b::54d:bff5 30100 - 6815204 18742 - 8494811 -
+       # ham0 1404 2620:9b::54 2620:9b::54d:bff5 30100 - 6815204
+       # 18742 - 8494811 -

        lines = netstat.split("\n")
        headers = lines[0].split()

|
|||
|
||||
|
||||
class Nginx(AgentCheck):
|
||||
|
||||
"""Tracks basic nginx metrics via the status module
|
||||
* number of connections
|
||||
* number of requets per second
|
||||
|
@ -21,6 +22,7 @@ class Nginx(AgentCheck):
|
|||
Reading: 0 Writing: 2 Waiting: 6
|
||||
|
||||
"""
|
||||
|
||||
def check(self, instance):
|
||||
if 'nginx_status_url' not in instance:
|
||||
raise Exception('NginX instance missing "nginx_status_url" value.')
|
||||
|
@ -37,7 +39,6 @@ class Nginx(AgentCheck):
|
|||
request = urllib2.urlopen(req)
|
||||
return request.read()
|
||||
|
||||
|
||||
def _get_metrics(self, response, dimensions):
|
||||
# Thanks to http://hostingfu.com/files/nginx/nginxstats.py for this code
|
||||
# Connections
|
||||
|
@ -85,7 +86,7 @@ class Nginx(AgentCheck):
|
|||
'nginx_status_url': ":".join(instance[:-1]),
|
||||
'dimensions': {'instance': instance[-1]}
|
||||
})
|
||||
load_conf(index+1)
|
||||
load_conf(index + 1)
|
||||
|
||||
load_conf()
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from monagent.collector.checks import AgentCheck
|
|||
|
||||
|
||||
class PostfixCheck(AgentCheck):
|
||||
|
||||
"""This check provides metrics on the number of messages in a given postfix queue
|
||||
|
||||
WARNING: the user that dd-agent runs as must have sudo access for the 'find' command
|
||||
|
@ -16,6 +17,7 @@ class PostfixCheck(AgentCheck):
|
|||
"directory" - the value of 'postconf -h queue_directory'
|
||||
"queues" - the postfix mail queues you would like to get message count totals for
|
||||
"""
|
||||
|
||||
def check(self, instance):
|
||||
config = self._get_config(instance)
|
||||
|
||||
|
@ -67,4 +69,3 @@ class PostfixCheck(AgentCheck):
|
|||
# these can be retrieved in a single graph statement
|
||||
# for example:
|
||||
# sum:postfix.queue.size{instance:postfix-2,queue:incoming,host:hostname.domain.tld}
|
||||
|
||||
|
|
|
@ -7,12 +7,13 @@ class ShouldRestartException(Exception):
|
|||
|
||||
|
||||
class PostgreSql(AgentCheck):
|
||||
|
||||
"""Collects per-database, and optionally per-relation metrics
|
||||
"""
|
||||
|
||||
RATE = AgentCheck.rate
|
||||
GAUGE = AgentCheck.gauge
|
||||
|
||||
|
||||
# turning columns into dimensions
|
||||
DB_METRICS = {
|
||||
'descriptors': [('datname', 'db')],
|
||||
|
@ -133,7 +134,7 @@ SELECT relname,
|
|||
metric_scope = (self.DB_METRICS,)
|
||||
else:
|
||||
metric_scope = (self.DB_METRICS, self.REL_METRICS, self.IDX_METRICS)
|
||||
|
||||
|
||||
for scope in metric_scope:
|
||||
# build query
|
||||
cols = scope['metrics'].keys() # list of metrics to query, in some order
|
||||
|
@ -143,7 +144,7 @@ SELECT relname,
|
|||
except InterfaceError, e:
|
||||
self.log.error("Connection seems broken: %s" % str(e))
|
||||
raise ShouldRestartException
|
||||
|
||||
|
||||
# if this is a relation-specific query, we need to list all relations last
|
||||
if scope['relation'] and len(relations) > 0:
|
||||
query = scope['query'] % (", ".join(cols), "%s") # Keep the last %s intact
|
||||
|
@ -156,7 +157,7 @@ SELECT relname,
|
|||
|
||||
results = cursor.fetchall()
|
||||
cursor.close()
|
||||
|
||||
|
||||
# parse & submit results
|
||||
# A row should look like this
|
||||
# (descriptor, descriptor, ..., value, value, value, value, ...)
|
||||
|
@ -166,7 +167,7 @@ SELECT relname,
|
|||
desc = scope['descriptors']
|
||||
# Check that all columns will be processed
|
||||
assert len(row) == len(cols) + len(desc)
|
||||
|
||||
|
||||
# Build dimensions
|
||||
# descriptors are: (pg_name, dd_tag_name): value
|
||||
# Special-case the "db" tag, which overrides the one that is passed as instance_dimensions
|
||||
|
@ -181,13 +182,13 @@ SELECT relname,
|
|||
# metric-map is: (dd_name, "rate"|"gauge")
|
||||
# shift the results since the first columns will be the "descriptors"
|
||||
values = zip([scope['metrics'][c] for c in cols], row[len(desc):])
|
||||
|
||||
|
||||
# To submit simply call the function for each value v
|
||||
# v[0] == (metric_name, submit_function)
|
||||
# v[1] == the actual value
|
||||
# dimensions are
|
||||
[v[0][1](self, v[0][0], v[1], dimensions=dimensions) for v in values]
|
||||
|
||||
|
||||
def get_connection(self, key, host, port, user, password, dbname, use_cached=True):
|
||||
"Get and memoize connections to instances"
|
||||
if key in self.dbs and use_cached:
|
||||
|
@ -197,17 +198,18 @@ SELECT relname,
|
|||
try:
|
||||
import psycopg2 as pg
|
||||
except ImportError:
|
||||
raise ImportError("psycopg2 library cannot be imported. Please check the installation instruction on the Datadog Website.")
|
||||
|
||||
raise ImportError(
|
||||
"psycopg2 library cannot be imported. Please check the installation instruction on the Datadog Website.")
|
||||
|
||||
if host == 'localhost' and password == '':
|
||||
# Use ident method
|
||||
connection = pg.connect("user=%s dbname=%s" % (user, dbname))
|
||||
elif port != '':
|
||||
connection = pg.connect(host=host, port=port, user=user,
|
||||
password=password, database=dbname)
|
||||
password=password, database=dbname)
|
||||
else:
|
||||
connection = pg.connect(host=host, user=user, password=password,
|
||||
database=dbname)
|
||||
database=dbname)
|
||||
else:
|
||||
if not host:
|
||||
raise CheckException("Please specify a Postgres host to connect to.")
|
||||
|
@ -220,7 +222,7 @@ SELECT relname,
|
|||
# connection.autocommit was added in version 2.4.2
|
||||
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
|
||||
connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
|
||||
|
||||
self.dbs[key] = connection
|
||||
return connection
|
||||
|
||||
|
@ -233,21 +235,21 @@ SELECT relname,
|
|||
dbname = instance.get('dbname', 'postgres')
|
||||
relations = instance.get('relations', [])
|
||||
|
||||
key = '%s:%s:%s' % (host, port,dbname)
|
||||
key = '%s:%s:%s' % (host, port, dbname)
|
||||
db = self.get_connection(key, host, port, user, password, dbname)
|
||||
|
||||
# Clean up dimensions in case there was a None entry in the instance
|
||||
# e.g. if the yaml contains dimensions: but no actual dimensions
|
||||
if dimensions is None:
|
||||
dimensions = {}
|
||||
|
||||
|
||||
# preset dimensions to the database name
|
||||
dimensions["db"] = dbname
|
||||
|
||||
# Check version
|
||||
version = self._get_version(key, db)
|
||||
self.log.debug("Running check against version %s" % version)
|
||||
|
||||
|
||||
# Collect metrics
|
||||
try:
|
||||
self._collect_stats(key, db, dimensions, relations)
|
||||
|
@ -258,10 +260,10 @@ SELECT relname,
|
|||
|
||||
@staticmethod
|
||||
def parse_agent_config(agentConfig):
|
||||
server = agentConfig.get('postgresql_server','')
|
||||
port = agentConfig.get('postgresql_port','')
|
||||
user = agentConfig.get('postgresql_user','')
|
||||
passwd = agentConfig.get('postgresql_pass','')
|
||||
server = agentConfig.get('postgresql_server', '')
|
||||
port = agentConfig.get('postgresql_port', '')
|
||||
user = agentConfig.get('postgresql_user', '')
|
||||
passwd = agentConfig.get('postgresql_pass', '')
|
||||
|
||||
if server != '' and user != '':
|
||||
return {
|
||||
|
|
|
@@ -93,7 +93,7 @@ class ProcessCheck(AgentCheck):

        # process metrics available for psutil versions 0.5.0 and later on UNIX
        extended_metrics_0_5_0_unix = self.is_psutil_version_later_than((0, 5, 0)) and \
-           Platform.is_unix()
+           Platform.is_unix()
        if extended_metrics_0_5_0_unix:
            open_file_descriptors = 0
        else:

@@ -158,11 +158,12 @@ class ProcessCheck(AgentCheck):
                pass

        if got_denied:
-           self.warning("The Monitoring Agent was denied access when trying to get the number of file descriptors")
+           self.warning(
+               "The Monitoring Agent was denied access when trying to get the number of file descriptors")

-       #Memory values are in Byte
+       # Memory values are in Byte
        return (thr, cpu, rss, vms, real, open_file_descriptors,
-           read_count, write_count, read_bytes, write_bytes, voluntary_ctx_switches, involuntary_ctx_switches)
+               read_count, write_count, read_bytes, write_bytes, voluntary_ctx_switches, involuntary_ctx_switches)

    def check(self, instance):
        try:

@@ -194,7 +195,7 @@ class ProcessCheck(AgentCheck):
        self.gauge('processes_pid_count', len(pids), dimensions=dimensions)

        metrics = dict(zip(ProcessCheck.PROCESS_GAUGE, self.get_process_metrics(pids,
-           psutil, cpu_check_interval)))
+                                                                               psutil, cpu_check_interval)))

        for metric, value in metrics.iteritems():
            if value is not None:

|
|||
NODE_TYPE = 'nodes'
|
||||
MAX_DETAILED_QUEUES = 200
|
||||
MAX_DETAILED_NODES = 100
|
||||
# Post an event in the stream when the number of queues or nodes to collect is above 90% of the limit
|
||||
# Post an event in the stream when the number of queues or nodes to
|
||||
# collect is above 90% of the limit
|
||||
ALERT_THRESHOLD = 0.9
|
||||
QUEUE_ATTRIBUTES = ['active_consumers',
|
||||
'consumers',
|
||||
|
@ -39,6 +40,7 @@ METRIC_SUFFIX = {QUEUE_TYPE: "queue", NODE_TYPE: "node"}
|
|||
|
||||
|
||||
class RabbitMQ(AgentCheck):
|
||||
|
||||
"""This check is for gathering statistics from the RabbitMQ
|
||||
Management Plugin (http://www.rabbitmq.com/management.html)
|
||||
"""
|
||||
|
@ -67,7 +69,7 @@ class RabbitMQ(AgentCheck):
|
|||
}
|
||||
|
||||
# List of queues/nodes to collect metrics from
|
||||
specified = {
|
||||
specified = {
|
||||
QUEUE_TYPE: instance.get('queues', []),
|
||||
NODE_TYPE: instance.get('nodes', []),
|
||||
}
|
||||
|
@ -78,16 +80,17 @@ class RabbitMQ(AgentCheck):
|
|||
|
||||
# setup urllib2 for Basic Auth
|
||||
auth_handler = urllib2.HTTPBasicAuthHandler()
|
||||
auth_handler.add_password(realm='RabbitMQ Management', uri=base_url, user=username, passwd=password)
|
||||
auth_handler.add_password(
|
||||
realm='RabbitMQ Management', uri=base_url, user=username, passwd=password)
|
||||
opener = urllib2.build_opener(auth_handler)
|
||||
urllib2.install_opener(opener)
|
||||
|
||||
return base_url, max_detailed, specified
|
||||
|
||||
|
||||
def check(self, instance):
|
||||
base_url, max_detailed, specified = self._get_config(instance)
|
||||
self.get_stats(instance, base_url, QUEUE_TYPE, max_detailed[QUEUE_TYPE], specified[QUEUE_TYPE])
|
||||
self.get_stats(
|
||||
instance, base_url, QUEUE_TYPE, max_detailed[QUEUE_TYPE], specified[QUEUE_TYPE])
|
||||
self.get_stats(instance, base_url, NODE_TYPE, max_detailed[NODE_TYPE], specified[NODE_TYPE])
|
||||
|
||||
@staticmethod
|
||||
|
@ -100,7 +103,6 @@ class RabbitMQ(AgentCheck):
|
|||
raise Exception('Cannot parse JSON response from API url: %s %s' % (url, str(e)))
|
||||
return data
|
||||
|
||||
|
||||
def get_stats(self, instance, base_url, object_type, max_detailed, specified_list):
|
||||
"""
|
||||
instance: the check instance
|
||||
|
@ -111,20 +113,23 @@ class RabbitMQ(AgentCheck):
|
|||
"""
|
||||
|
||||
data = self._get_data(urlparse.urljoin(base_url, object_type))
|
||||
specified_items = list(specified_list) # Make a copy of this list as we will remove items from it at each iteration
|
||||
# Make a copy of this list as we will remove items from it at each iteration
|
||||
specified_items = list(specified_list)
|
||||
|
||||
""" data is a list of nodes or queues:
|
||||
data = [
|
||||
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue10', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue1', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue10', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
{'status': 'running', 'node': 'rabbit@host', 'name': 'queue11', 'consumers': 0, 'vhost': '/', 'backing_queue_status': {'q1': 0, 'q3': 0, 'q2': 0, 'q4': 0, 'avg_ack_egress_rate': 0.0, 'ram_msg_count': 0, 'ram_ack_count': 0, 'len': 0, 'persistent_count': 0, 'target_ram_count': 'infinity', 'next_seq_id': 0, 'delta': ['delta', 'undefined', 0, 'undefined'], 'pending_acks': 0, 'avg_ack_ingress_rate': 0.0, 'avg_egress_rate': 0.0, 'avg_ingress_rate': 0.0}, 'durable': True, 'idle_since': '2013-10-03 13:38:18', 'exclusive_consumer_tag': '', 'arguments': {}, 'memory': 10956, 'policy': '', 'auto_delete': False},
...
]
"""
if len(specified_items) > max_detailed:
raise Exception("The maximum number of %s you can specify is %d." % (object_type, max_detailed))
raise Exception("The maximum number of %s you can specify is %d." %
(object_type, max_detailed))

if specified_items is not None and len(specified_items) > 0:  # a list of queues/nodes is specified. We process only those
# a list of queues/nodes is specified. We process only those
if specified_items is not None and len(specified_items) > 0:
if object_type == NODE_TYPE:
for data_line in data:
name = data_line.get("name")

@@ -132,7 +137,7 @@ class RabbitMQ(AgentCheck):
self._get_metrics(data_line, object_type)
specified_items.remove(name)

else: # object_type == QUEUE_TYPE
else:  # object_type == QUEUE_TYPE
for data_line in data:
name = data_line.get("name")
absolute_name = '%s/%s' % (data_line.get("vhost"), name)

@@ -143,14 +148,16 @@ class RabbitMQ(AgentCheck):
self._get_metrics(data_line, object_type)
specified_items.remove(absolute_name)

else:  # No queues/node are specified. We will process every queue/node if it's under the limit
# No queues/node are specified. We will process every queue/node if it's under the limit
else:
if len(data) > ALERT_THRESHOLD * max_detailed:
# Post a message on the dogweb stream to warn
self.alert(base_url, max_detailed, len(data), object_type)

if len(data) > max_detailed:
# Display a warning in the info page
self.warning("Too many queues to fetch. You must choose the %s you are interested in by editing the rabbitmq.yaml configuration file or get in touch with Datadog Support" % object_type)
self.warning(
"Too many queues to fetch. You must choose the %s you are interested in by editing the rabbitmq.yaml configuration file or get in touch with Datadog Support" % object_type)

for data_line in data[:max_detailed]:
# We truncate the list of nodes/queues if it's above the limit

@@ -168,9 +175,11 @@ class RabbitMQ(AgentCheck):
value = data.get(attribute, None)
if value is not None:
try:
self.gauge('rabbitmq.%s.%s' % (METRIC_SUFFIX[object_type], attribute), float(value), dimensions=dimensions)
self.gauge('rabbitmq.%s.%s' % (METRIC_SUFFIX[object_type], attribute), float(
value), dimensions=dimensions)
except ValueError:
self.log.debug("Caught ValueError for %s %s = %s with dimensions: %s" % (METRIC_SUFFIX[object_type], attribute, value, dimensions))
self.log.debug("Caught ValueError for %s %s = %s with dimensions: %s" % (
METRIC_SUFFIX[object_type], attribute, value, dimensions))

def alert(self, base_url, max_detailed, size, object_type):
key = "%s%s" % (base_url, object_type)

@@ -180,20 +189,21 @@ class RabbitMQ(AgentCheck):

self.already_alerted.append(key)

title = "RabbitMQ integration is approaching the limit on the number of %s that can be collected from on %s" % (object_type, self.hostname)
msg = """%s %s are present. The limit is %s.
title = "RabbitMQ integration is approaching the limit on the number of %s that can be collected from on %s" % (
object_type, self.hostname)
msg = """%s %s are present. The limit is %s.
Please get in touch with Datadog support to increase the limit.""" % (size, object_type, max_detailed)

event = {
"timestamp": int(time.time()),
"event_type": EVENT_TYPE,
"msg_title": title,
"msg_text": msg,
"alert_type": 'warning',
"source_type_name": SOURCE_TYPE_NAME,
"host": self.hostname,
"dimensions": {"base_url": base_url, "host": self.hostname},
"event_object": "rabbitmq.limit.%s" % object_type,
}
"timestamp": int(time.time()),
"event_type": EVENT_TYPE,
"msg_title": title,
"msg_text": msg,
"alert_type": 'warning',
"source_type_name": SOURCE_TYPE_NAME,
"host": self.hostname,
"dimensions": {"base_url": base_url, "host": self.hostname},
"event_object": "rabbitmq.limit.%s" % object_type,
}

self.event(event)

@@ -12,59 +12,58 @@ class Redis(AgentCheck):
subkeys = ['keys', 'expires']
GAUGE_KEYS = {
# Append-only metrics
'aof_last_rewrite_time_sec': 'redis.aof.last_rewrite_time',
'aof_rewrite_in_progress': 'redis.aof.rewrite',
'aof_current_size': 'redis.aof.size',
'aof_buffer_length': 'redis.aof.buffer_length',
'aof_last_rewrite_time_sec': 'redis.aof.last_rewrite_time',
'aof_rewrite_in_progress': 'redis.aof.rewrite',
'aof_current_size': 'redis.aof.size',
'aof_buffer_length': 'redis.aof.buffer_length',

# Network
'connected_clients': 'redis.net.clients',
'connected_slaves': 'redis.net.slaves',
'rejected_connections': 'redis.net.rejected',
'connected_clients': 'redis.net.clients',
'connected_slaves': 'redis.net.slaves',
'rejected_connections': 'redis.net.rejected',

# clients
'blocked_clients': 'redis.clients.blocked',
'client_biggest_input_buf': 'redis.clients.biggest_input_buf',
'client_longest_output_list': 'redis.clients.longest_output_list',
'blocked_clients': 'redis.clients.blocked',
'client_biggest_input_buf': 'redis.clients.biggest_input_buf',
'client_longest_output_list': 'redis.clients.longest_output_list',

# Keys
'evicted_keys': 'redis.keys.evicted',
'expired_keys': 'redis.keys.expired',
'evicted_keys': 'redis.keys.evicted',
'expired_keys': 'redis.keys.expired',

# stats
'keyspace_hits': 'redis.stats.keyspace_hits',
'keyspace_misses': 'redis.stats.keyspace_misses',
'latest_fork_usec': 'redis.perf.latest_fork_usec',
'keyspace_hits': 'redis.stats.keyspace_hits',
'keyspace_misses': 'redis.stats.keyspace_misses',
'latest_fork_usec': 'redis.perf.latest_fork_usec',

# pubsub
'pubsub_channels': 'redis.pubsub.channels',
'pubsub_patterns': 'redis.pubsub.patterns',
'pubsub_channels': 'redis.pubsub.channels',
'pubsub_patterns': 'redis.pubsub.patterns',

# rdb
'rdb_bgsave_in_progress': 'redis.rdb.bgsave',
'rdb_changes_since_last_save': 'redis.rdb.changes_since_last',
'rdb_last_bgsave_time_sec': 'redis.rdb.last_bgsave_time',
'rdb_bgsave_in_progress': 'redis.rdb.bgsave',
'rdb_changes_since_last_save': 'redis.rdb.changes_since_last',
'rdb_last_bgsave_time_sec': 'redis.rdb.last_bgsave_time',

# memory
'mem_fragmentation_ratio': 'redis.mem.fragmentation_ratio',
'used_memory': 'redis.mem.used',
'used_memory_lua': 'redis.mem.lua',
'used_memory_peak': 'redis.mem.peak',
'used_memory_rss': 'redis.mem.rss',
'mem_fragmentation_ratio': 'redis.mem.fragmentation_ratio',
'used_memory': 'redis.mem.used',
'used_memory_lua': 'redis.mem.lua',
'used_memory_peak': 'redis.mem.peak',
'used_memory_rss': 'redis.mem.rss',

# replication
'master_last_io_seconds_ago': 'redis.replication.last_io_seconds_ago',
'master_sync_in_progress': 'redis.replication.sync',
'master_sync_left_bytes': 'redis.replication.sync_left_bytes',

'master_last_io_seconds_ago': 'redis.replication.last_io_seconds_ago',
'master_sync_in_progress': 'redis.replication.sync',
'master_sync_left_bytes': 'redis.replication.sync_left_bytes',
}

RATE_KEYS = {
# cpu
'used_cpu_sys': 'redis.cpu.sys',
'used_cpu_sys_children': 'redis.cpu.sys_children',
'used_cpu_user': 'redis.cpu.user',
'used_cpu_user_children': 'redis.cpu.user_children',
'used_cpu_sys': 'redis.cpu.sys',
'used_cpu_sys_children': 'redis.cpu.sys_children',
'used_cpu_user': 'redis.cpu.user',
'used_cpu_user_children': 'redis.cpu.user_children',
}

def __init__(self, name, init_config, agent_config):

@@ -75,6 +74,7 @@ class Redis(AgentCheck):
def get_library_versions():
try:
import redis

version = redis.__version__
except ImportError:
version = "Not Found"

@@ -107,20 +107,22 @@ class Redis(AgentCheck):

def _get_conn(self, instance):
import redis

key = self._generate_instance_key(instance)
if key not in self.connections:
try:

# Only send useful parameters to the redis client constructor
list_params = ['host', 'port', 'db', 'password', 'socket_timeout',
'connection_pool', 'charset', 'errors', 'unix_socket_path']
'connection_pool', 'charset', 'errors', 'unix_socket_path']

connection_params = dict((k, instance[k]) for k in list_params if k in instance)

self.connections[key] = redis.Redis(**connection_params)

except TypeError:
raise Exception("You need a redis library that supports authenticated connections. Try sudo easy_install redis.")
raise Exception(
"You need a redis library that supports authenticated connections. Try sudo easy_install redis.")

return self.connections[key]

@@ -143,12 +145,13 @@ class Redis(AgentCheck):
try:
info = conn.info()
except ValueError, e:
# This is likely a known issue with redis library 2.0.0
# This is likely a known issue with redis library 2.0.0
# See https://github.com/DataDog/dd-agent/issues/374 for details
import redis

raise Exception("""Unable to run the info command. This is probably an issue with your version of the python-redis library.
Minimum required version: 2.4.11
Your current version: %s
Your current version: %s
Please upgrade to a newer version by running sudo easy_install redis""" % redis.__version__)

latency_ms = round((time.time() - start) * 1000, 2)

@@ -172,8 +175,10 @@ class Redis(AgentCheck):
self.gauge(metric, val, dimensions=db_dimensions)

# Save a subset of db-wide statistics
[self.gauge(self.GAUGE_KEYS[k], info[k], dimensions=dimensions) for k in self.GAUGE_KEYS if k in info]
[self.rate (self.RATE_KEYS[k], info[k], dimensions=dimensions) for k in self.RATE_KEYS if k in info]
[self.gauge(self.GAUGE_KEYS[k], info[k], dimensions=dimensions)
for k in self.GAUGE_KEYS if k in info]
[self.rate(self.RATE_KEYS[k], info[k], dimensions=dimensions)
for k in self.RATE_KEYS if k in info]

# Save the number of commands.
self.rate('redis.net.commands', info['total_commands_processed'], dimensions=dimensions)

@@ -182,7 +187,8 @@ class Redis(AgentCheck):
try:
import redis
except ImportError:
raise Exception('Python Redis Module can not be imported. Please check the installation instruction on the Datadog Website')
raise Exception(
'Python Redis Module can not be imported. Please check the installation instruction on the Datadog Website')

if (not "host" in instance or not "port" in instance) and not "unix_socket_path" in instance:
raise Exception("You must specify a host/port couple or a unix_socket_path")

@@ -49,11 +49,10 @@ class Riak(AgentCheck):

self.prev_coord_redirs_total = -1

def check(self, instance):
url = instance['url']
url = instance['url']
default_timeout = self.init_config.get('default_timeout', 5)
timeout = float(instance.get('timeout', default_timeout))
timeout = float(instance.get('timeout', default_timeout))

aggregation_key = md5(url).hexdigest()

@@ -32,7 +32,7 @@ class SQLServer(AgentCheck):
if row['type'] not in VALID_METRIC_TYPES:
self.log.error('%s has an invalid metric type: %s' % (row['name'], row['type']))
self.METRICS.append((row['name'], row['type'], row['counter_name'],
row.get('instance_name', ''), row.get('tag_by', None)))
row.get('instance_name', ''), row.get('tag_by', None)))

# Cache connections
self.connections = {}

@@ -76,8 +76,8 @@ class SQLServer(AgentCheck):
self.connections[conn_key] = conn
except Exception, e:
cx = "%s - %s" % (host, database)
raise Exception("Unable to connect to SQL Server for instance %s.\n %s" \
% (cx, traceback.format_exc()))
raise Exception("Unable to connect to SQL Server for instance %s.\n %s"
% (cx, traceback.format_exc()))

conn = self.connections[conn_key]
cursor = conn.cursor()

@@ -137,4 +137,3 @@ class SQLServer(AgentCheck):
dimensions[tag_by] = instance_name.strip()
metric_func = getattr(self, mtype)
metric_func(mname, value, dimensions=dimensions)

@@ -61,9 +61,11 @@ class TCPCheck(ServicesCheck):
sock.close()

except socket.timeout, e:
# The connection timed out because it took more time than the specified value in the yaml config file
# The connection timed out because it took more time than the specified
# value in the yaml config file
length = int((time.time() - start) * 1000)
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)

except socket.error, e:

@@ -71,18 +73,21 @@ class TCPCheck(ServicesCheck):
if "timed out" in str(e):

# The connection timed out because it took more time than the system tcp stack allows
self.log.warning("The connection timed out because it took more time than the system tcp stack allows. You might want to change this setting to allow longer timeouts")
self.log.warning(
"The connection timed out because it took more time than the system tcp stack allows. You might want to change this setting to allow longer timeouts")
self.log.info("System tcp timeout. Assuming that the checked system is down")
return Status.DOWN, """Socket error: %s.
The connection timed out after %s ms because it took more time than the system tcp stack allows.
You might want to change this setting to allow longer timeouts""" % (str(e), length)
else:
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)

except Exception, e:
length = int((time.time() - start) * 1000)
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" % (addr, port, str(e), length))
self.log.info("%s:%s is DOWN (%s). Connection failed after %s ms" %
(addr, port, str(e), length))
return Status.DOWN, "%s. Connection failed after %s ms" % (str(e), length)

if response_time:

@@ -92,7 +97,6 @@ class TCPCheck(ServicesCheck):
self.log.debug("%s:%s is UP" % (addr, port))
return Status.UP, "UP"

def _create_status_event(self, status, msg, instance):
# Get the instance settings
host = instance.get('host', None)

@@ -106,7 +110,6 @@ class TCPCheck(ServicesCheck):
if custom_message:
custom_message += " \n"

# Let the possibility to override the source type name
instance_source_type_name = instance.get('source_type', None)
if instance_source_type_name is None:

@@ -123,29 +126,28 @@ class TCPCheck(ServicesCheck):
notify_list.append("@%s" % handle.strip())
notify_message = " ".join(notify_list) + " \n"

if status == Status.DOWN:
title = "[Alert] %s reported that %s is down" % (self.hostname, name)
alert_type = "error"
msg = """%s %s %s reported that %s (%s:%s) failed %s time(s) within %s last attempt(s).
Last error: %s""" % (notify_message,
custom_message, self.hostname, name, host, port, nb_failures, nb_tries, msg)
custom_message, self.hostname, name, host, port, nb_failures, nb_tries, msg)
event_type = EventType.DOWN

else: # Status is UP
else:  # Status is UP
title = "[Recovered] %s reported that %s is up" % (self.hostname, name)
alert_type = "success"
msg = "%s %s %s reported that %s (%s:%s) recovered." % (notify_message,
custom_message, self.hostname, name, host, port)
custom_message, self.hostname, name, host, port)
event_type = EventType.UP

return {
'timestamp': int(time.time()),
'event_type': event_type,
'host': self.hostname,
'msg_text': msg,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": source_type,
"event_object": name,
'timestamp': int(time.time()),
'event_type': event_type,
'host': self.hostname,
'msg_text': msg,
'msg_title': title,
'alert_type': alert_type,
"source_type_name": source_type,
"event_object": name,
}

@@ -7,6 +7,7 @@ from monagent.collector.checks import AgentCheck

class Varnish(AgentCheck):
# XML parsing bits, a.k.a. Kafka in Code

def _reset(self):
self._current_element = ""
self._current_metric = "varnish"

@@ -92,7 +93,7 @@ class Varnish(AgentCheck):

# Assumptions regarding varnish's version
use_xml = True
arg = "-x" # varnishstat argument
arg = "-x"  # varnishstat argument
version = 3

m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)

@@ -123,7 +124,7 @@ class Varnish(AgentCheck):
dimensions[u'varnish_name'] = 'default'
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE)
output, error = proc.communicate()
except Exception:
self.log.error(u"Failed to run %s" % repr(cmd))

@@ -15,6 +15,7 @@ EVENT_TYPE = 'win32_log_event'

class Win32EventLog(AgentCheck):

def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.last_ts = {}

@@ -81,8 +82,9 @@ class Win32EventLog(AgentCheck):

class EventLogQuery(object):

def __init__(self, ltype=None, user=None, source_name=None, log_file=None,
start_ts=None, message_filters=None):
start_ts=None, message_filters=None):
self.filters = [
('Type', self._convert_event_types(ltype)),
('User', user),

@@ -139,8 +141,8 @@ class EventLogQuery(object):
time struct.
'''
return wmi.from_time(year=dt.year, month=dt.month, day=dt.day,
hours=dt.hour, minutes=dt.minute, seconds=dt.second, microseconds=0,
timezone=0)
hours=dt.hour, minutes=dt.minute, seconds=dt.second, microseconds=0,
timezone=0)

@staticmethod
def _convert_event_types(types):

@@ -149,7 +151,9 @@ class EventLogQuery(object):
'''
return types

class LogEvent(object):

def __init__(self, ev, api_key, hostname, tags, notify_list):
self.event = ev
self.api_key = api_key

@@ -183,9 +187,9 @@ class LogEvent(object):
''' Convert a wmi formatted timestamp into an epoch using wmi.to_time().
'''
year, month, day, hour, minute, second, microsecond, tz = \
wmi.to_time(wmi_ts)
wmi.to_time(wmi_ts)
dt = datetime(year=year, month=month, day=day, hour=hour, minute=minute,
second=second, microsecond=microsecond)
second=second, microsecond=microsecond)
return int(calendar.timegm(dt.timetuple()))

@staticmethod

@@ -17,6 +17,7 @@ SEARCH_WILDCARD = '*'

class WMICheck(AgentCheck):

def __init__(self, name, init_config, agent_config):
AgentCheck.__init__(self, name, init_config, agent_config)
self.wmi_conns = {}

@@ -30,7 +31,7 @@ class WMICheck(AgentCheck):
def check(self, instance):
if wmi is None:
raise Exception("Missing 'wmi' module")

host = instance.get('host', None)
user = instance.get('username', None)
password = instance.get('password', None)

@@ -62,7 +63,8 @@ class WMICheck(AgentCheck):

def _extract_metrics(self, results, metrics, tag_by):
if len(results) > 1 and tag_by is None:
raise Exception('WMI query returned multiple rows but no `tag_by` value was given. metrics=%s' % metrics)
raise Exception(
'WMI query returned multiple rows but no `tag_by` value was given. metrics=%s' % metrics)

for wmi_property, name, mtype in metrics:
for res in results:

@@ -60,7 +60,8 @@ class Zookeeper(AgentCheck):
while chunk:
if num_reads > max_reads:
# Safeguard against an infinite loop
raise Exception("Read %s bytes before exceeding max reads of %s. " % (buf.tell(), max_reads))
raise Exception(
"Read %s bytes before exceeding max reads of %s. " % (buf.tell(), max_reads))
chunk = sock.recv(chunk_size)
buf.write(chunk)
num_reads += 1

@@ -101,7 +102,7 @@ class Zookeeper(AgentCheck):
has_connections_val = version_tuple >= ('3', '4', '4')

# Clients:
buf.readline() # skip the Clients: header
buf.readline()  # skip the Clients: header
connections = 0
client_line = buf.readline().strip()
if client_line:

@@ -51,8 +51,10 @@ START_COMMANDS = ['start', 'restart', 'foreground']
log = logging.getLogger('collector')

# todo the collector has daemon code but is always run in foreground mode from the supervisor, is there a reason for the daemon code then?
# todo the collector has daemon code but is always run in foreground mode
# from the supervisor, is there a reason for the daemon code then?
class CollectorDaemon(Daemon):

"""
The agent class is a daemon that runs the collector in a background process.
"""

@@ -115,7 +117,7 @@ class CollectorDaemon(Daemon):

# Run the main loop.
while self.run_forever:

# enable profiler if needed
profiled = False
if config.get('profile', False) and config.get('profile').lower() == 'yes':

@@ -127,7 +129,7 @@ class CollectorDaemon(Daemon):
log.debug("Agent profiling is enabled")
except Exception:
log.warn("Cannot enable profiler")

# Do the work.
self.collector.run()

@@ -171,7 +173,7 @@ class CollectorDaemon(Daemon):
watchdog = None
if agentConfig.get("watchdog", True):
watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
max_mem_mb=agentConfig.get('limit_memory_consumption', None))
max_mem_mb=agentConfig.get('limit_memory_consumption', None))
watchdog.reset()
return watchdog

@@ -186,6 +188,7 @@ class CollectorDaemon(Daemon):
self.collector.stop()
sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)

def main():
options, args = get_parsed_args()
agentConfig = get_config(options=options)

@@ -295,20 +298,20 @@ def main():
return 0
else:
print("Fix the invalid yaml files above in order to start the Monitoring agent. "
"A useful external tool for yaml parsing can be found at "
"http://yaml-online-parser.appspot.com/")
"A useful external tool for yaml parsing can be found at "
"http://yaml-online-parser.appspot.com/")
return 1

elif 'jmx' == command:
from collector.jmxfetch import JMX_LIST_COMMANDS, JMXFetch

if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys():
print "#" * 80
print "JMX tool to be used to help configuring your JMX checks."
print "See http://docs.datadoghq.com/integrations/java/ for more information"
print "#" * 80
print "\n"
print "You have to specify one of the following command:"
print "You have to specify one of the following command:"
for command, desc in JMX_LIST_COMMANDS.iteritems():
print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc)
print "Example: sudo /etc/init.d/mon-agent jmx list_matching_attributes tomcat jmx solr"

@@ -318,12 +321,12 @@ def main():
jmx_command = args[1]
checks_list = args[2:]
confd_directory = get_confd_path(get_os())
should_run = JMXFetch.init(confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console")
should_run = JMXFetch.init(
confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console")
if not should_run:
print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory
print "Have you enabled any JMX check ?"

return 0

@@ -36,7 +36,7 @@ LOG_PATTERN = re.compile(r"".join([
r"\s*(?P<priority>%s)\s+" % "|".join("(%s)" % p for p in LOG4J_PRIORITY),
r"(\[CompactionExecutor:\d*\]\s+)?",  # optional thread name and number
r"((?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d*)|",
r"(?P<time>\d{2}:\d{2}:\d{2},\d*))\s+",
r"(?P<time>\d{2}:\d{2}:\d{2},\d*))\s+",
r"(\w+\.java \(line \d+\)\s+)?",  # optional source file and line
r"(?P<msg>Compact(ed|ing) .*)\s*",
]))

@@ -50,6 +50,7 @@ def parse_date(timestamp):
timestamp, _ = timestamp.split(',')
return common.parse_date(timestamp, LEGACY_DATE_FORMAT)

def parse_cassandra(log, line):
matched = LOG_PATTERN.match(line)
if matched:

@@ -58,7 +59,8 @@ def parse_cassandra(log, line):
# Convert the timestamp string into an epoch timestamp
time_val = event.get('time', None)
if time_val:
event['timestamp'] = parse_date("%s %s" % (datetime.utcnow().strftime("%Y-%m-%d"), time_val))
event['timestamp'] = parse_date(
"%s %s" % (datetime.utcnow().strftime("%Y-%m-%d"), time_val))
else:
try:
event['timestamp'] = parse_date(event['timestamp'])

@@ -20,9 +20,9 @@ SUPERVISORD_LEVELS = [
'INFO',  # normal informational output

# IGNORED...
#'DEBG', # messages useful for users trying to debug configurations
#'TRAC', # messages useful to developers trying to debug plugins
#'BLAT', # messages useful for developers trying to debug supervisor
# 'DEBG', # messages useful for users trying to debug configurations
# 'TRAC', # messages useful to developers trying to debug plugins
# 'BLAT', # messages useful for developers trying to debug supervisor

]

@@ -35,6 +35,7 @@ ALERT_TYPES_MAPPING = {"CRIT": "error",
# regex to extract the 'program' supervisord is managing from the text
program_matcher = re.compile("^\w+:? '?(?P<program>\w+)'?")

def parse_supervisord(log, line):
"""
Parse the supervisord.log line into a dogstream event

@@ -42,7 +43,8 @@ def parse_supervisord(log, line):
if len(line) == 0:
log.info("Skipping empty line of supervisord.log")
return None
if log: log.debug('PARSE supervisord:%s' % line)
if log:
log.debug('PARSE supervisord:%s' % line)
line_items = line.split(' ', 3)
timestamp = ' '.join(line_items[:2])
timestamp_parts = timestamp.split(',')

@@ -52,7 +54,7 @@ def parse_supervisord(log, line):
event_type = line_items[2]
msg = line_items[3]
if event_type in SUPERVISORD_LEVELS:
alert_type=ALERT_TYPES_MAPPING.get(event_type, 'info')
alert_type = ALERT_TYPES_MAPPING.get(event_type, 'info')
if alert_type == 'info' and 'success' in msg:
alert_type = 'success'
event = dict(timestamp=date,

@@ -62,7 +64,8 @@ def parse_supervisord(log, line):
program_result = program_matcher.match(msg)
if program_result:
event['event_object'] = program_result.groupdict()['program']
if log: log.debug('RESULT supervisord:%s' %event)
if log:
log.debug('RESULT supervisord:%s' % event)
return [event]
else:
return None

@@ -1,5 +1,6 @@
# std
import yaml

try:
from yaml import CLoader as Loader
except ImportError:

@@ -37,13 +38,13 @@ JMX_CHECKS = [
]
JMX_COLLECT_COMMAND = 'collect'
JMX_LIST_COMMANDS = {
'list_everything': 'List every attributes available that has a type supported by JMXFetch',
'list_collected_attributes': 'List attributes that will actually be collected by your current instances configuration',
'list_matching_attributes': 'List attributes that match at least one of your instances configuration',
'list_not_matching_attributes': "List attributes that don't match any of your instances configuration",
'list_limited_attributes': "List attributes that do match one of your instances configuration but that are not being collected because it would exceed the number of metrics that can be collected",
JMX_COLLECT_COMMAND: "Start the collection of metrics based on your current configuration and display them in the console"
}
'list_everything': 'List every attributes available that has a type supported by JMXFetch',
'list_collected_attributes': 'List attributes that will actually be collected by your current instances configuration',
'list_matching_attributes': 'List attributes that match at least one of your instances configuration',
'list_not_matching_attributes': "List attributes that don't match any of your instances configuration",
'list_limited_attributes': "List attributes that do match one of your instances configuration but that are not being collected because it would exceed the number of metrics that can be collected",
JMX_COLLECT_COMMAND: "Start the collection of metrics based on your current configuration and display them in the console"
}

PYTHON_JMX_STATUS_FILE = 'jmx_status_python.yaml'

@@ -55,7 +56,6 @@ class InvalidJMXConfiguration(Exception):

class JMXFetch(object):

pid_file = PidFile("jmxfetch")
pid_file_path = pid_file.get_path()

@@ -64,7 +64,8 @@ class JMXFetch(object):
default_check_frequency, command=None, checks_list=None, reporter=None):
try:
command = command or JMX_COLLECT_COMMAND
jmx_checks, invalid_checks, java_bin_path, java_options = JMXFetch.should_run(confd_path, checks_list)
jmx_checks, invalid_checks, java_bin_path, java_options = JMXFetch.should_run(
confd_path, checks_list)
if len(invalid_checks) > 0:
try:
JMXFetch.write_status_file(invalid_checks)

@@ -77,8 +78,8 @@ class JMXFetch(object):
JMXFetch.stop()

JMXFetch.start(confd_path, agentConfig, logging_config,
java_bin_path, java_options, default_check_frequency,
jmx_checks, command, reporter)
java_bin_path, java_options, default_check_frequency,
jmx_checks, command, reporter)
return True
except Exception:
log.exception("Error while initiating JMXFetch")

@@ -86,7 +87,7 @@ class JMXFetch(object):
@classmethod
def write_status_file(cls, invalid_checks):
data = {
'timestamp': time.time(),
'timestamp': time.time(),
'invalid_checks': invalid_checks
}
stream = file(os.path.join(tempfile.gettempdir(), PYTHON_JMX_STATUS_FILE), 'w')

@@ -138,7 +139,8 @@ class JMXFetch(object):
continue

try:
is_jmx, check_java_bin_path, check_java_options = JMXFetch.is_jmx_check(check_config, check_name, checks_list)
is_jmx, check_java_bin_path, check_java_options = JMXFetch.is_jmx_check(
check_config, check_name, checks_list)
if is_jmx:
jmx_checks.append(filename)
if java_bin_path is None and check_java_bin_path is not None:

@@ -170,11 +172,13 @@ class JMXFetch(object):
if is_jmx:
instances = check_config.get('instances', [])
if type(instances) != list or len(instances) == 0:
raise InvalidJMXConfiguration('You need to have at least one instance defined in the YAML file for this check')
raise InvalidJMXConfiguration(
'You need to have at least one instance defined in the YAML file for this check')

for inst in instances:
if type(inst) != dict:
raise InvalidJMXConfiguration("Each instance should be a dictionary. %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"Each instance should be a dictionary. %s" % LINK_TO_DOC)
host = inst.get('host', None)
port = inst.get('port', None)
conf = inst.get('conf', init_config.get('conf', None))

@@ -184,22 +188,28 @@ class JMXFetch(object):
raise InvalidJMXConfiguration("A numeric port must be specified")

if conf is None:
log.warning("%s doesn't have a 'conf' section. Only basic JVM metrics will be collected. %s" % (inst, LINK_TO_DOC))
log.warning(
"%s doesn't have a 'conf' section. Only basic JVM metrics will be collected. %s" % (
inst, LINK_TO_DOC))
else:
if type(conf) != list or len(conf) == 0:
raise InvalidJMXConfiguration("'conf' section should be a list of configurations %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"'conf' section should be a list of configurations %s" % LINK_TO_DOC)

for config in conf:
include = config.get('include', None)
if include is None:
raise InvalidJMXConfiguration("Each configuration must have an 'include' section. %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"Each configuration must have an 'include' section. %s" % LINK_TO_DOC)

if type(include) != dict:
raise InvalidJMXConfiguration("'include' section must be a dictionary %s" % LINK_TO_DOC)
raise InvalidJMXConfiguration(
"'include' section must be a dictionary %s" % LINK_TO_DOC)

if java_bin_path is None:
if init_config and init_config.get('java_bin_path'):
# We get the java bin path from the yaml file for backward compatibility purposes
# We get the java bin path from the yaml file for backward compatibility
# purposes
java_bin_path = init_config.get('java_bin_path')

else:

@@ -235,12 +245,15 @@ class JMXFetch(object):
return True
except Exception, e:
if "Errno 3" not in str(e):
log.debug("Couldn't determine if JMXFetch is running. We suppose it's not. %s" % str(e))
log.debug(
"Couldn't determine if JMXFetch is running. We suppose it's not. %s" % str(
e))
return False

# Else we are on windows, we need another way to check if it's running
try:
import ctypes # Available from python2.5
import ctypes  # Available from python2.5

kernel32 = ctypes.windll.kernel32
SYNCHRONIZE = 0x100000

@@ -281,13 +294,16 @@ class JMXFetch(object):
@classmethod
def get_path_to_jmxfetch(cls):
if get_os() != 'windows':
return os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "../collector/checks", "libs", JMX_FETCH_JAR_NAME))
return os.path.realpath(
os.path.join(os.path.abspath(__file__), "..", "../collector/checks", "libs",
JMX_FETCH_JAR_NAME))

return os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "../../", "jmxfetch", JMX_FETCH_JAR_NAME))
return os.path.realpath(
os.path.join(os.path.abspath(__file__), "..", "../../", "jmxfetch", JMX_FETCH_JAR_NAME))

@classmethod
def start(cls, confd_path, agentConfig, logging_config, path_to_java, java_run_opts,
default_check_frequency, jmx_checks, command, reporter=None):
default_check_frequency, jmx_checks, command, reporter=None):
statsd_port = agentConfig.get('monstatsd_port', "8125")

if reporter is None:

@@ -302,16 +318,21 @@ class JMXFetch(object):
path_to_status_file = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")

subprocess_args = [
path_to_java, # Path to the java bin
path_to_java,  # Path to the java bin
'-jar',
r"%s" % path_to_jmxfetch, # Path to the jmxfetch jar
'--check_period', str(default_check_frequency * 1000), # Period of the main loop of jmxfetch in ms
'--conf_directory', r"%s" % confd_path, # Path of the conf.d directory that will be read by jmxfetch,
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"), # Log Level: Mapping from Python log level to log4j log levels
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'), # Path of the log file
'--reporter', reporter, # Reporter to use
'--status_location', r"%s" % path_to_status_file, # Path to the status file to write
command, # Name of the command
r"%s" % path_to_jmxfetch,  # Path to the jmxfetch jar
# Period of the main loop of jmxfetch in ms
'--check_period', str(default_check_frequency * 1000),
# Path of the conf.d directory that will be read by jmxfetch,
'--conf_directory', r"%s" % confd_path,
# Log Level: Mapping from Python log level to log4j log levels
'--log_level', JAVA_LOGGING_LEVEL.get(logging_config.get("log_level"), "INFO"),
# Path of the log file
'--log_location', r"%s" % logging_config.get('jmxfetch_log_file'),
'--reporter', reporter,  # Reporter to use
# Path to the status file to write
'--status_location', r"%s" % path_to_status_file,
command,  # Name of the command
]

subprocess_args.insert(3, '--check')

@@ -320,7 +341,7 @@ class JMXFetch(object):

if java_run_opts:
for opt in java_run_opts.split():
subprocess_args.insert(1,opt)
subprocess_args.insert(1, opt)

log.info("Running %s" % " ".join(subprocess_args))
if reporter != "console":

@@ -38,7 +38,7 @@ def load_qualified_module(full_module_name, path=None):

def module_name_for_filename(filename):
"""Given the name of a Python file, find an appropropriate module name.
|
||||
|
||||
|
||||
This involves determining whether the file is within a package, and
|
||||
determining the name of same."""
|
||||
all_segments = filename.split(os.sep)
|
||||
|
@ -54,7 +54,7 @@ def module_name_for_filename(filename):
|
|||
def get_module(name):
|
||||
"""Given either an absolute path to a Python file or a module name, load
|
||||
and return a Python module.
|
||||
|
||||
|
||||
If the module is already loaded, takes no action."""
|
||||
if name.startswith('/'):
|
||||
basename, modulename = module_name_for_filename(name)
|
||||
|
|
|
@ -22,6 +22,7 @@ RECENT_POINT_THRESHOLD_DEFAULT = 3600
|
|||
|
||||
|
||||
class Aggregator(object):
|
||||
|
||||
"""
|
||||
Abstract metric aggregator class.
|
||||
"""
|
||||
|
@ -57,7 +58,7 @@ class Aggregator(object):
|
|||
def packets_per_second(self, interval):
|
||||
if interval == 0:
|
||||
return 0
|
||||
return round(float(self.count)/interval, 2)
|
||||
return round(float(self.count) / interval, 2)
|
||||
|
||||
def submit_metric(self, name, value, mtype, dimensions=None, hostname=None, device_name=None, timestamp=None,
|
||||
sample_rate=1):
|
||||
|
@ -108,12 +109,14 @@ class Aggregator(object):
|
|||
|
||||
|
||||
class MetricsBucketAggregator(Aggregator):
|
||||
|
||||
"""
|
||||
A metric aggregator class.
|
||||
"""
|
||||
|
||||
def __init__(self, hostname, interval=1.0, expiry_seconds=300, recent_point_threshold=None):
|
||||
super(MetricsBucketAggregator, self).__init__(hostname, interval, expiry_seconds, recent_point_threshold)
|
||||
super(MetricsBucketAggregator, self).__init__(
|
||||
hostname, interval, expiry_seconds, recent_point_threshold)
|
||||
self.metric_by_bucket = {}
|
||||
self.last_sample_time_by_context = {}
|
||||
self.current_bucket = None
|
||||
|
@ -171,7 +174,8 @@ class MetricsBucketAggregator(Aggregator):
|
|||
# (Set, Gauge, Histogram) do not report if no data is submitted
|
||||
for context, last_sample_time in sample_time_by_context.items():
|
||||
if last_sample_time < expiry_timestamp:
|
||||
log.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
|
||||
log.debug("%s hasn't been submitted in %ss. Expiring." %
|
||||
(context, self.expiry_seconds))
|
||||
self.last_sample_time_by_context.pop(context, None)
|
||||
else:
|
||||
# The expiration currently only applies to Counters
|
||||
|
@ -197,7 +201,8 @@ class MetricsBucketAggregator(Aggregator):
|
|||
for context, metric in metric_by_context.items():
|
||||
if metric.last_sample_time < expiry_timestamp:
|
||||
# This should never happen
|
||||
log.warning("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
|
||||
log.warning("%s hasn't been submitted in %ss. Expiring." %
|
||||
(context, self.expiry_seconds))
|
||||
not_sampled_in_this_bucket.pop(context, None)
|
||||
self.last_sample_time_by_context.pop(context, None)
|
||||
else:
|
||||
|
@ -205,21 +210,24 @@ class MetricsBucketAggregator(Aggregator):
|
|||
if isinstance(metric, Counter):
|
||||
self.last_sample_time_by_context[context] = metric.last_sample_time
|
||||
not_sampled_in_this_bucket.pop(context, None)
|
||||
# We need to account for Metrics that have not expired and were not flushed for this bucket
|
||||
# We need to account for Metrics that have not expired and were not
|
||||
# flushed for this bucket
|
||||
self.create_empty_metrics(not_sampled_in_this_bucket, expiry_timestamp, bucket_start_timestamp,
|
||||
metrics)
|
||||
|
||||
del self.metric_by_bucket[bucket_start_timestamp]
|
||||
else:
|
||||
# Even if there are no metrics in this flush, there may be some non-expired counters
|
||||
# We should only create these non-expired metrics if we've passed an interval since the last flush
|
||||
# We should only create these non-expired metrics if we've passed an
|
||||
# interval since the last flush
|
||||
if flush_cutoff_time >= self.last_flush_cutoff_time + self.interval:
|
||||
self.create_empty_metrics(self.last_sample_time_by_context.copy(), expiry_timestamp,
|
||||
flush_cutoff_time-self.interval, metrics)
|
||||
flush_cutoff_time - self.interval, metrics)
|
||||
|
||||
# Log a warning regarding metrics with old timestamps being submitted
|
||||
if self.num_discarded_old_points > 0:
|
||||
log.warn('%s points were discarded as a result of having an old timestamp' % self.num_discarded_old_points)
|
||||
log.warn('%s points were discarded as a result of having an old timestamp' %
|
||||
self.num_discarded_old_points)
|
||||
self.num_discarded_old_points = 0
|
||||
|
||||
# Save some stats.
|
||||
|
@ -233,12 +241,14 @@ class MetricsBucketAggregator(Aggregator):
|
|||
|
||||
|
||||
class MetricsAggregator(Aggregator):
|
||||
|
||||
"""
|
||||
A metric aggregator class.
|
||||
"""
|
||||
|
||||
def __init__(self, hostname, interval=1.0, expiry_seconds=300, recent_point_threshold=None):
|
||||
super(MetricsAggregator, self).__init__(hostname, interval, expiry_seconds, recent_point_threshold)
|
||||
super(MetricsAggregator, self).__init__(
|
||||
hostname, interval, expiry_seconds, recent_point_threshold)
|
||||
self.metrics = {}
|
||||
self.metric_type_to_class = {
|
||||
'g': Gauge,
|
||||
|
@ -294,14 +304,16 @@ class MetricsAggregator(Aggregator):
|
|||
metrics = []
|
||||
for context, metric in self.metrics.items():
|
||||
if metric.last_sample_time < expiry_timestamp:
|
||||
log.debug("%s hasn't been submitted in %ss. Expiring." % (context, self.expiry_seconds))
|
||||
log.debug("%s hasn't been submitted in %ss. Expiring." %
|
||||
(context, self.expiry_seconds))
|
||||
del self.metrics[context]
|
||||
else:
|
||||
metrics += metric.flush(timestamp, self.interval)
|
||||
|
||||
# Log a warning regarding metrics with old timestamps being submitted
|
||||
if self.num_discarded_old_points > 0:
|
||||
log.warn('%s points were discarded as a result of having an old timestamp' % self.num_discarded_old_points)
|
||||
log.warn('%s points were discarded as a result of having an old timestamp' %
|
||||
self.num_discarded_old_points)
|
||||
self.num_discarded_old_points = 0
|
||||
|
||||
# Save some stats.
|
||||
|
@ -309,4 +321,3 @@ class MetricsAggregator(Aggregator):
|
|||
self.total_count += self.count
|
||||
self.count = 0
|
||||
return metrics
|
||||
|
||||
|
|
|
@ -102,6 +102,7 @@ def get_ntp_info():
|
|||
|
||||
|
||||
class AgentStatus(object):
|
||||
|
||||
"""
|
||||
A small class used to load and save status messages to the filesystem.
|
||||
"""
|
||||
|
@ -162,7 +163,7 @@ class AgentStatus(object):
|
|||
style("Status date", *styles),
|
||||
style("%s (%ss ago)" %
|
||||
(self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
self.created_seconds_ago()), *styles)
|
||||
self.created_seconds_ago()), *styles)
|
||||
)
|
||||
]
|
||||
|
||||
|
@ -188,15 +189,12 @@ class AgentStatus(object):
|
|||
def _not_running_message(cls):
|
||||
lines = cls._title_lines() + [
|
||||
style(" %s is not running." % cls.NAME, 'red'),
|
||||
style(""" You can get more details in the logs:
|
||||
%s""" % logger_info(), 'red'),
|
||||
style(""" You can get more details in the logs: %s""" % logger_info(), 'red'),
|
||||
"",
|
||||
""
|
||||
]
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
|
||||
@classmethod
|
||||
def remove_latest_status(cls):
|
||||
log.debug("Removing latest status")
|
||||
|
@ -205,7 +203,6 @@ class AgentStatus(object):
|
|||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
@classmethod
|
||||
def load_latest_status(cls):
|
||||
try:
|
||||
|
@ -218,7 +215,6 @@ class AgentStatus(object):
|
|||
log.info("Couldn't load latest status")
|
||||
return None
|
||||
|
||||
|
||||
@classmethod
|
||||
def print_latest_status(cls, verbose=False):
|
||||
cls.verbose = verbose
|
||||
|
@ -358,7 +354,7 @@ class CollectorStatus(AgentStatus):
|
|||
confd_path = config.get_confd_path(osname)
|
||||
except config.PathNotFound:
|
||||
confd_path = 'Not found'
|
||||
|
||||
|
||||
try:
|
||||
checksd_path = config.get_checksd_path(osname)
|
||||
except config.PathNotFound:
|
||||
|
@ -393,7 +389,7 @@ class CollectorStatus(AgentStatus):
|
|||
if cs.init_failed_error:
|
||||
check_lines.append(" - initialize check class [%s]: %s" %
|
||||
(style(STATUS_ERROR, 'red'),
|
||||
repr(cs.init_failed_error)))
|
||||
repr(cs.init_failed_error)))
|
||||
if self.verbose and cs.init_failed_traceback:
|
||||
check_lines.extend(' ' + line for line in
|
||||
cs.init_failed_traceback.split('\n'))
|
||||
|
@ -417,13 +413,15 @@ class CollectorStatus(AgentStatus):
|
|||
warn = warning.split('\n')
|
||||
if not len(warn):
|
||||
continue
|
||||
check_lines.append(u" %s: %s" % (style("Warning", 'yellow'), warn[0]))
|
||||
check_lines.append(u" %s: %s" %
|
||||
(style("Warning", 'yellow'), warn[0]))
|
||||
check_lines.extend(u" %s" % l for l in warn[1:])
|
||||
if self.verbose and s.traceback is not None:
|
||||
check_lines.extend(' ' + line for line in s.traceback.split('\n'))
|
||||
|
||||
check_lines += [
|
||||
" - Collected %s metrics & %s events" % (cs.metric_count, cs.event_count),
|
||||
" - Collected %s metrics & %s events" % (
|
||||
cs.metric_count, cs.event_count),
|
||||
]
|
||||
|
||||
if cs.library_versions is not None:
|
||||
|
@ -481,9 +479,11 @@ class CollectorStatus(AgentStatus):
|
|||
'has_warnings': s.has_warnings(),
|
||||
}
|
||||
if s.has_error():
|
||||
status_info['checks'][cs.name]['instances'][s.instance_id]['error'] = s.error
|
||||
status_info['checks'][cs.name]['instances'][
|
||||
s.instance_id]['error'] = s.error
|
||||
if s.has_warnings():
|
||||
status_info['checks'][cs.name]['instances'][s.instance_id]['warnings'] = s.warnings
|
||||
status_info['checks'][cs.name]['instances'][
|
||||
s.instance_id]['warnings'] = s.warnings
|
||||
status_info['checks'][cs.name]['metric_count'] = cs.metric_count
|
||||
status_info['checks'][cs.name]['event_count'] = cs.event_count
|
||||
|
||||
|
@ -503,7 +503,7 @@ class CollectorStatus(AgentStatus):
|
|||
status_info['confd_path'] = config.get_confd_path(osname)
|
||||
except config.PathNotFound:
|
||||
status_info['confd_path'] = 'Not found'
|
||||
|
||||
|
||||
try:
|
||||
status_info['checksd_path'] = config.get_checksd_path(osname)
|
||||
except config.PathNotFound:
|
||||
|
@ -554,7 +554,7 @@ class ForwarderStatus(AgentStatus):
|
|||
NAME = 'Forwarder'
|
||||
|
||||
def __init__(self, queue_length=0, queue_size=0, flush_count=0, transactions_received=0,
|
||||
transactions_flushed=0):
|
||||
transactions_flushed=0):
|
||||
AgentStatus.__init__(self)
|
||||
self.queue_length = queue_length
|
||||
self.queue_size = queue_size
|
||||
|
@ -587,10 +587,12 @@ class ForwarderStatus(AgentStatus):
|
|||
|
||||
def get_jmx_instance_status(instance_name, status, message, metric_count):
|
||||
if status == STATUS_ERROR:
|
||||
instance_status = InstanceStatus(instance_name, STATUS_ERROR, error=message, metric_count=metric_count)
|
||||
instance_status = InstanceStatus(
|
||||
instance_name, STATUS_ERROR, error=message, metric_count=metric_count)
|
||||
|
||||
elif status == STATUS_WARNING:
|
||||
instance_status = InstanceStatus(instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)
|
||||
instance_status = InstanceStatus(
|
||||
instance_name, STATUS_WARNING, warnings=[message], metric_count=metric_count)
|
||||
|
||||
elif status == STATUS_OK:
|
||||
instance_status = InstanceStatus(instance_name, STATUS_OK, metric_count=metric_count)
|
||||
|
@ -616,7 +618,7 @@ def get_jmx_status():
|
|||
|
||||
- One generated by jmxfetch that return information about the collection of metrics
|
||||
its format is as following:
|
||||
|
||||
|
||||
###
|
||||
timestamp: 1391037347435
|
||||
checks:
|
||||
|
@ -632,7 +634,8 @@ def get_jmx_status():
|
|||
java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
|
||||
python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
|
||||
if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
|
||||
log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
|
||||
log.debug("There is no jmx_status file at: %s or at: %s" %
|
||||
(java_status_path, python_status_path))
|
||||
return []
|
||||
|
||||
check_data = defaultdict(lambda: defaultdict(list))
|
||||
|
@ -640,12 +643,13 @@ def get_jmx_status():
|
|||
if os.path.exists(java_status_path):
|
||||
java_jmx_stats = yaml.load(file(java_status_path))
|
||||
|
||||
status_age = time.time() - java_jmx_stats.get('timestamp')/1000 # JMX timestamp is saved in milliseconds
|
||||
# JMX timestamp is saved in milliseconds
|
||||
status_age = time.time() - java_jmx_stats.get('timestamp') / 1000
|
||||
jmx_checks = java_jmx_stats.get('checks', {})
|
||||
|
||||
if status_age > 60:
|
||||
check_statuses.append(CheckStatus("jmx", [InstanceStatus(0, STATUS_ERROR,
|
||||
error="JMXfetch didn't return any metrics during the last minute")],
|
||||
error="JMXfetch didn't return any metrics during the last minute")],
|
||||
0, 0))
|
||||
else:
|
||||
|
||||
|
@ -658,7 +662,7 @@ def get_jmx_status():
|
|||
check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status,
|
||||
message, metric_count))
|
||||
check_data[check_name]['metric_count'].append(metric_count)
|
||||
|
||||
|
||||
for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
|
||||
for info in instances:
|
||||
message = info.get('message', None)
|
||||
|
@ -670,7 +674,8 @@ def get_jmx_status():
|
|||
check_data[check_name]['metric_count'].append(metric_count)
|
||||
|
||||
for check_name, data in check_data.iteritems():
|
||||
check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']), 0)
|
||||
check_status = CheckStatus(
|
||||
check_name, data['statuses'], sum(data['metric_count']), 0)
|
||||
check_statuses.append(check_status)
|
||||
|
||||
if os.path.exists(python_status_path):
|
||||
|
|
|
@@ -59,6 +59,7 @@ def get_parsed_args():
def get_version():
return __version__

def skip_leading_wsp(f):
"Works on a file, returns a file-like object"
return StringIO("\n".join(map(string.strip, f.readlines())))

@@ -177,7 +178,8 @@ def get_config_path(cfg_path=None, os_name=None):
return os.path.join(path, AGENT_CONF)

# If all searches fail, exit the agent with an error
sys.stderr.write("Please supply a configuration file at %s or in the directory where the Agent is currently deployed.\n" % bad_path)
sys.stderr.write(
"Please supply a configuration file at %s or in the directory where the Agent is currently deployed.\n" % bad_path)
sys.exit(3)

@@ -234,14 +236,16 @@ def get_config(parse_args=True, cfg_path=None, options=None):

# Concerns only Windows
if config.has_option('Main', 'use_web_info_page'):
agent_config['use_web_info_page'] = config.get('Main', 'use_web_info_page').lower() in ("yes", "true")
agent_config['use_web_info_page'] = config.get(
'Main', 'use_web_info_page').lower() in ("yes", "true")
else:
agent_config['use_web_info_page'] = True

# local traffic only? Default to no
agent_config['non_local_traffic'] = False
if config.has_option('Main', 'non_local_traffic'):
agent_config['non_local_traffic'] = config.get('Main', 'non_local_traffic').lower() in ("yes", "true")
agent_config['non_local_traffic'] = config.get(
'Main', 'non_local_traffic').lower() in ("yes", "true")

if config.has_option('Main', 'check_freq'):
try:

@@ -267,14 +271,15 @@ def get_config(parse_args=True, cfg_path=None, options=None):
else:
agent_config[key] = value

#Forwarding to external statsd server
# Forwarding to external statsd server
if config.has_option('Main', 'statsd_forward_host'):
agent_config['statsd_forward_host'] = config.get('Main', 'statsd_forward_host')
if config.has_option('Main', 'statsd_forward_port'):
agent_config['statsd_forward_port'] = int(config.get('Main', 'statsd_forward_port'))

# normalize 'yes'/'no' to boolean
monstatsd_defaults['monstatsd_normalize'] = _is_affirmative(monstatsd_defaults['monstatsd_normalize'])
monstatsd_defaults['monstatsd_normalize'] = _is_affirmative(
monstatsd_defaults['monstatsd_normalize'])

# Optional config
# FIXME not the prettiest code ever...

@@ -298,7 +303,8 @@ def get_config(parse_args=True, cfg_path=None, options=None):
# Older version, single log support
log_path = config.get("Main", "dogstream_log")
if config.has_option("Main", "dogstream_line_parser"):
agent_config["dogstreams"] = ':'.join([log_path, config.get("Main", "dogstream_line_parser")])
agent_config["dogstreams"] = ':'.join(
[log_path, config.get("Main", "dogstream_line_parser")])
else:
agent_config["dogstreams"] = log_path

@@ -314,13 +320,15 @@ def get_config(parse_args=True, cfg_path=None, options=None):
agent_config['WMI'][key] = value

if config.has_option("Main", "limit_memory_consumption") and \
config.get("Main", "limit_memory_consumption") is not None:
agent_config["limit_memory_consumption"] = int(config.get("Main", "limit_memory_consumption"))
config.get("Main", "limit_memory_consumption") is not None:
agent_config["limit_memory_consumption"] = int(
config.get("Main", "limit_memory_consumption"))
else:
agent_config["limit_memory_consumption"] = None

if config.has_option("Main", "skip_ssl_validation"):
agent_config["skip_ssl_validation"] = _is_affirmative(config.get("Main", "skip_ssl_validation"))
agent_config["skip_ssl_validation"] = _is_affirmative(
config.get("Main", "skip_ssl_validation"))

agent_config['Api'] = get_mon_api_config(config)

@@ -333,7 +341,8 @@ def get_config(parse_args=True, cfg_path=None, options=None):
sys.exit(2)

except ConfigParser.NoOptionError, e:
sys.stderr.write('There are some items missing from your config file, but nothing fatal [%s]' % e)
sys.stderr.write(
'There are some items missing from your config file, but nothing fatal [%s]' % e)

# Storing proxy settings in the agent_config
agent_config['proxy_settings'] = get_proxy(agent_config)

@@ -357,11 +366,12 @@ def set_win32_cert_path():
else:
cur_path = os.path.dirname(__file__)
crt_path = os.path.join(cur_path, 'packaging', 'mon-agent', 'win32',
'install_files', 'ca-certificates.crt')
'install_files', 'ca-certificates.crt')
import tornado.simple_httpclient
log.info("Windows certificate path: %s" % crt_path)
tornado.simple_httpclient._DEFAULT_CA_CERTS = crt_path

def get_proxy(agent_config, use_system_settings=False):
proxy_settings = {}

@@ -378,7 +388,8 @@ def get_proxy(agent_config, use_system_settings=False):
proxy_settings['user'] = agent_config.get('proxy_user', None)
proxy_settings['password'] = agent_config.get('proxy_password', None)
proxy_settings['system_settings'] = False
log.debug("Proxy Settings: %s:%s@%s:%s" % (proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
log.debug("Proxy Settings: %s:%s@%s:%s" %
(proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
return proxy_settings

# If no proxy configuration was specified in agent.conf

@@ -404,11 +415,13 @@ def get_proxy(agent_config, use_system_settings=False):
if len(creds) == 2:
proxy_settings['password'] = creds[1]

log.debug("Proxy Settings: %s:%s@%s:%s" % (proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
log.debug("Proxy Settings: %s:%s@%s:%s" % (
proxy_settings['user'], "*****", proxy_settings['host'], proxy_settings['port']))
return proxy_settings

except Exception, e:
log.debug("Error while trying to fetch proxy settings using urllib %s. Proxy is probably not set" % str(e))
log.debug(
"Error while trying to fetch proxy settings using urllib %s. Proxy is probably not set" % str(e))

log.debug("No proxy configured")

@@ -486,12 +499,14 @@ def check_yaml(conf_path):
valid_instances = False
break
if not valid_instances:
raise Exception('You need to have at least one instance defined in the YAML file for this check')
raise Exception(
'You need to have at least one instance defined in the YAML file for this check')
else:
return check_config
finally:
f.close()

def load_check_directory(agent_config):
''' Return the initialized checks from checks_d, and a mapping of checks that failed to
initialize. Only checks that have a configuration

@@ -514,11 +529,13 @@ def load_check_directory(agent_config):
try:
confd_path = get_confd_path(osname)
except PathNotFound, e:
log.error("No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0])
log.error(
"No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0])
sys.exit(3)

# Start JMXFetch if needed
JMXFetch.init(confd_path, agent_config, get_logging_config(), DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND)
JMXFetch.init(confd_path, agent_config, get_logging_config(),
DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND)

# For backwards-compatability with old style checks, we have to load every
# checks_d module and check for a corresponding config OR check if the old

@@ -529,7 +546,8 @@ def load_check_directory(agent_config):
for check in itertools.chain(*checks_paths):
check_name = os.path.basename(check).split('.')[0]
if check_name in initialized_checks or check_name in init_failed_checks:
log.debug('Skipping check %s because it has already been loaded from another location', check)
log.debug(
'Skipping check %s because it has already been loaded from another location', check)
continue
try:
check_module = imp.load_source('checksd_%s' % check_name, check)

@@ -540,9 +558,9 @@ def load_check_directory(agent_config):
conf_path = os.path.join(confd_path, '%s.yaml' % check_name)
if os.path.exists(conf_path):
# There is a configuration file for that check but the module can't be imported
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
log.exception('Unable to import check module %s.py from checks_d' % check_name)
else: # There is no conf for that check. Let's not spam the logs for it.
else: # There is no conf for that check. Let's not spam the logs for it.
log.debug('Unable to import check module %s.py from checks_d' % check_name)
continue

@@ -571,7 +589,7 @@ def load_check_directory(agent_config):
except Exception, e:
log.exception("Unable to parse yaml config in %s" % conf_path)
traceback_message = traceback.format_exc()
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
continue
elif hasattr(check_class, 'parse_agent_config'):
# FIXME: Remove this check once all old-style checks are gone

@@ -618,7 +636,7 @@ def load_check_directory(agent_config):
except Exception, e:
log.exception('Unable to initialize check %s' % check_name)
traceback_message = traceback.format_exc()
init_failed_checks[check_name] = {'error':e, 'traceback':traceback_message}
init_failed_checks[check_name] = {'error': e, 'traceback': traceback_message}
else:
initialized_checks[check_name] = c

@@ -633,8 +651,8 @@ def load_check_directory(agent_config):

log.info('initialized checks_d checks: %s' % initialized_checks.keys())
log.info('initialization failed checks_d checks: %s' % init_failed_checks.keys())
return {'initialized_checks':initialized_checks.values(),
'init_failed_checks':init_failed_checks,
return {'initialized_checks': initialized_checks.values(),
'init_failed_checks': init_failed_checks,
}

@@ -644,6 +662,7 @@ def load_check_directory(agent_config):
def get_log_date_format():
return "%Y-%m-%d %H:%M:%S %Z"

def get_log_format(logger_name):
if get_os() != 'windows':
return '%%(asctime)s | %%(levelname)s | %s | %%(name)s(%%(filename)s:%%(lineno)s) | %%(message)s' % logger_name

@@ -713,10 +732,12 @@ def get_logging_config(cfg_path=None):
logging_config['log_level'] = levels.get(config.get('Main', 'log_level'))

if config.has_option('Main', 'log_to_syslog'):
logging_config['log_to_syslog'] = config.get('Main', 'log_to_syslog').strip().lower() in ['yes', 'true', 1]
logging_config['log_to_syslog'] = config.get(
'Main', 'log_to_syslog').strip().lower() in ['yes', 'true', 1]

if config.has_option('Main', 'log_to_event_viewer'):
logging_config['log_to_event_viewer'] = config.get('Main', 'log_to_event_viewer').strip().lower() in ['yes', 'true', 1]
logging_config['log_to_event_viewer'] = config.get(
'Main', 'log_to_event_viewer').strip().lower() in ['yes', 'true', 1]

if config.has_option('Main', 'syslog_host'):
host = config.get('Main', 'syslog_host').strip()

@@ -733,7 +754,8 @@ def get_logging_config(cfg_path=None):
logging_config['syslog_port'] = None

if config.has_option('Main', 'disable_file_logging'):
logging_config['disable_file_logging'] = config.get('Main', 'disable_file_logging').strip().lower() in ['yes', 'true', 1]
logging_config['disable_file_logging'] = config.get(
'Main', 'disable_file_logging').strip().lower() in ['yes', 'true', 1]
else:
logging_config['disable_file_logging'] = False

@@ -760,7 +782,8 @@ def initialize_logging(logger_name):
# make sure the log directory is writeable
# NOTE: the entire directory needs to be writable so that rotation works
if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK):
file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
file_handler = logging.handlers.RotatingFileHandler(
log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
formatter = logging.Formatter(get_log_format(logger_name), get_log_date_format())
file_handler.setFormatter(formatter)

@@ -783,7 +806,8 @@ def initialize_logging(logger_name):
sys_log_addr = "/var/run/syslog"

handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON)
handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
handler.setFormatter(
logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
root_log = logging.getLogger()
root_log.addHandler(handler)
except Exception, e:

@@ -794,8 +818,10 @@ def initialize_logging(logger_name):
if get_os() == 'windows' and logging_config['log_to_event_viewer']:
try:
from logging.handlers import NTEventLogHandler
nt_event_handler = NTEventLogHandler(logger_name,get_win32service_file('windows', 'win32service.pyd'), 'Application')
nt_event_handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
nt_event_handler = NTEventLogHandler(
logger_name, get_win32service_file('windows', 'win32service.pyd'), 'Application')
nt_event_handler.setFormatter(
logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
nt_event_handler.setLevel(logging.ERROR)
app_log = logging.getLogger(logger_name)
app_log.addHandler(nt_event_handler)

@@ -834,8 +860,7 @@ def get_mon_api_config(config):
dim_list = [dim.split(':') for dim in config.get('Main', 'dimensions').split(',')]
mon_api_config['dimensions'] = {key.strip(): value.strip() for key, value in dim_list}
except ValueError:
mon_api_config['dimensions'] = { }

mon_api_config['dimensions'] = {}

if config.has_section("Api"):
options = {"url": config.get,
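A pattern repeated through the config hunks above is turning INI values such as 'yes' or 'true' into booleans by lowercasing and membership-testing, sometimes inline and sometimes through _is_affirmative. The helper's body is not shown in this diff, so the following is only an assumed sketch of its behavior, not the project's implementation:

def _is_affirmative(value):
    # Assumption: 'yes'/'true'/'1' in any case count as True, anything else as False.
    return str(value).strip().lower() in ('yes', 'true', '1')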
@@ -2,11 +2,11 @@
***
Modified generic daemon class
***

Author: http://www.jejik.com/articles/2007/02/a_simple_unix_linux_daemon_in_python/
www.boxedice.com
www.datadoghq.com

License: http://creativecommons.org/licenses/by-sa/3.0/
"""

@@ -23,6 +23,7 @@ log = logging.getLogger(__name__)

class AgentSupervisor(object):

''' A simple supervisor to keep a restart a child on expected auto-restarts
'''
RESTART_EXIT_STATUS = 5

@@ -71,40 +72,42 @@ class AgentSupervisor(object):

class Daemon(object):

"""
A generic daemon class.

Usage: subclass the Daemon class and override the run() method
"""

def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull, autorestart=False):
self.autorestart = autorestart
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
self.pidfile = pidfile

def daemonize(self):
"""
Do the UNIX double-fork magic, see Stevens' "Advanced
Do the UNIX double-fork magic, see Stevens' "Advanced
Programming in the UNIX Environment" for details (ISBN 0201563177)
http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
"""
try:
pid = os.fork()
try:
pid = os.fork()
if pid > 0:
# Exit first parent
sys.exit(0)
except OSError, e:
sys.exit(0)
except OSError, e:
msg = "fork #1 failed: %d (%s)" % (e.errno, e.strerror)
log.error(msg)
sys.stderr.write(msg + "\n")
sys.exit(1)

log.debug("Fork 1 ok")

log.debug("Fork 1 ok")

# Decouple from parent environment
os.chdir("/")
os.setsid()
os.chdir("/")
os.setsid()

if self.autorestart:
# Set up the supervisor callbacks and put a fork in it.

@@ -123,7 +126,7 @@ class Daemon(object):
sys.stderr.write(msg + "\n")
sys.exit(1)

if sys.platform != 'darwin': # This block breaks on OS X
if sys.platform != 'darwin': # This block breaks on OS X
# Redirect standard file descriptors
sys.stdout.flush()
sys.stderr.flush()

@@ -133,11 +136,11 @@ class Daemon(object):
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(se.fileno(), sys.stderr.fileno())

log.info("Daemon started")

# Write pidfile
atexit.register(self.delpid) # Make sure pid file is removed if we quit
atexit.register(self.delpid) # Make sure pid file is removed if we quit
pid = str(os.getpid())
try:
fp = open(self.pidfile, 'w+')

@@ -150,11 +153,10 @@ class Daemon(object):
sys.stderr.write(msg + "\n")
sys.exit(1)

def start(self):
log.info("Starting daemon")
pid = self.pid()

if pid:
message = "pidfile %s already exists. Is it already running?\n"
log.error(message % self.pidfile)

@@ -162,12 +164,11 @@ class Daemon(object):
sys.exit(1)

log.info("Daemon pidfile: %s" % self.pidfile)
self.daemonize()
self.daemonize()
self.run()

def stop(self):
log.info("Stopping daemon")
log.info("Stopping daemon")
pid = self.pid()

# Clear the pid file

@@ -195,20 +196,18 @@ class Daemon(object):
message = "Pidfile %s does not exist. Not running?\n" % self.pidfile
log.info(message)
sys.stderr.write(message)

# A ValueError might occur if the PID file is empty but does actually exist
if os.path.exists(self.pidfile):
os.remove(self.pidfile)

return # Not an error in a restart

return # Not an error in a restart

def restart(self):
"Restart the daemon"
self.stop()
self.stop()
self.start()

def run(self):
"""
You should override this method when you subclass Daemon. It will be called after the process has been

@@ -216,7 +215,6 @@ class Daemon(object):
"""
raise NotImplementedError

def info(self):
"""
You should override this method when you subclass Daemon. It will be

@@ -224,7 +222,6 @@ class Daemon(object):
"""
raise NotImplementedError

def status(self):
"""
Get the status of the daemon. Exits with 0 if running, 1 if not.

@@ -246,7 +243,8 @@ class Daemon(object):
os.kill(pid, 0)
except OSError, e:
if e.errno != errno.EPERM:
message = '%s pidfile contains pid %s, but no running process could be found' % (self.__class__.__name__, pid)
message = '%s pidfile contains pid %s, but no running process could be found' % (
self.__class__.__name__, pid)
exit_code = 1
else:
message = '%s is running with pid %s' % (self.__class__.__name__, pid)

@@ -256,7 +254,6 @@ class Daemon(object):
sys.stdout.write(message + "\n")
sys.exit(exit_code)

def pid(self):
# Get the pid from the pidfile
try:

@@ -269,7 +266,6 @@ class Daemon(object):
except ValueError:
return None

def delpid(self):
try:
os.remove(self.pidfile)
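The status() hunk above leans on a classic probe: os.kill(pid, 0) delivers no signal but raises OSError when the pid does not exist, while errno.EPERM means the process exists under another user. The idiom in isolation, as a sketch:

import errno
import os

def pid_is_running(pid):
    # Signal 0 performs error checking only; nothing is actually delivered.
    try:
        os.kill(pid, 0)
    except OSError as e:
        # EPERM: the process exists but we may not signal it.
        return e.errno == errno.EPERM
    return True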
@@ -34,8 +34,10 @@ def http_emitter(message, log, url):
headers = post_headers(payload)

try:
proxy_handler = urllib2.ProxyHandler({}) # Make sure no proxy is autodetected for this localhost connection
opener = urllib2.build_opener(proxy_handler) # Should this be installed as the default opener and reused?
# Make sure no proxy is autodetected for this localhost connection
proxy_handler = urllib2.ProxyHandler({})
# Should this be installed as the default opener and reused?
opener = urllib2.build_opener(proxy_handler)
request = urllib2.Request(url, payload, headers)
response = opener.open(request)
try:
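The emitter builds its opener around an empty ProxyHandler so the localhost POST is never routed through an autodetected system proxy. The same idea as a self-contained sketch (urllib2 is the Python 2 module this codebase uses; on Python 3 the equivalents live in urllib.request):

import urllib2

def open_direct(url, payload, headers):
    # An empty mapping disables proxy autodetection for this opener.
    proxy_handler = urllib2.ProxyHandler({})
    opener = urllib2.build_opener(proxy_handler)
    request = urllib2.Request(url, payload, headers)
    return opener.open(request)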
@@ -1,6 +1,7 @@
import json
import requests

class Keystone(object):

password_auth = {

@@ -33,10 +34,11 @@ class Keystone(object):
# Make this a singleton class so we don't get the token every time
# the class is created
_instance = None

def __new__(cls, *args, **kwargs):
if not cls._instance:
cls._instance = super(Keystone, cls).__new__(
cls, *args, **kwargs)
cls, *args, **kwargs)
return cls._instance

def __init__(self, endpoint, user_id, password, project_name):

@@ -57,7 +59,8 @@ class Keystone(object):
self.password_auth['auth']['scope']['project']['name'] = self.project_name
data = json.dumps(self.password_auth)
headers = {'Content-Type': 'application/json'}
response = requests.post(self.endpoint.rstrip('/') + '/auth/tokens', data=data, headers=headers)
response = requests.post(
self.endpoint.rstrip('/') + '/auth/tokens', data=data, headers=headers)
response.raise_for_status()
self.token = response.headers['X-Subject-Token']
return self.token
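Keystone caches its token by making the class a singleton through __new__: the first instantiation is stored on the class and returned for every later call. A minimal sketch of the pattern (unlike the diff it does not forward *args to object.__new__, which newer Pythons reject):

class Singleton(object):
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Create the instance once; reuse it on every later call.
        if not cls._instance:
            cls._instance = super(Singleton, cls).__new__(cls)
        return cls._instance

assert Singleton() is Singleton()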
@@ -12,7 +12,8 @@ log = logging.getLogger(__name__)

# todo it would be best to implement a Measurement group/list container, it could then have methods for converting to json
# in the current setup both the emitter and the mon api are converting to json in for loops
# A Measurement is the standard format used to pass data from the collector and monstatsd to the forwarder
# A Measurement is the standard format used to pass data from the
# collector and monstatsd to the forwarder
Measurement = namedtuple('Measurement', ['name', 'timestamp', 'value', 'dimensions'])

@@ -23,6 +24,7 @@ class MetricTypes(object):

class Metric(object):

"""
A base metric class that accepts points, slices them into time intervals
and performs roll-ups within those intervals.

@@ -38,6 +40,7 @@ class Metric(object):

class Gauge(Metric):

""" A metric that tracks a value at particular points in time. """

def __init__(self, formatter, name, dimensions, hostname, device_name):

@@ -74,6 +77,7 @@ class Gauge(Metric):

class BucketGauge(Gauge):

""" A metric that tracks a value at particular points in time.
The difference beween this class and Gauge is that this class will
report that gauge sample time as the time that Metric is flushed, as

@@ -100,6 +104,7 @@ class BucketGauge(Gauge):

class Counter(Metric):

""" A metric that tracks a counter value. """

def __init__(self, formatter, name, dimensions, hostname, device_name):

@@ -133,6 +138,7 @@ class Counter(Metric):

class Histogram(Metric):

""" A metric to track the distribution of a set of values. """

def __init__(self, formatter, name, dimensions, hostname, device_name):

@@ -159,26 +165,26 @@ class Histogram(Metric):
length = len(self.samples)

max_ = self.samples[-1]
med = self.samples[int(round(length/2 - 1))]
med = self.samples[int(round(length / 2 - 1))]
avg = sum(self.samples) / float(length)

metric_aggrs = [
('max', max_, MetricTypes.GAUGE),
('median', med, MetricTypes.GAUGE),
('avg', avg, MetricTypes.GAUGE),
('count', self.count/interval, MetricTypes.RATE)
('count', self.count / interval, MetricTypes.RATE)
]

metrics = [self.formatter(
hostname=self.hostname,
device_name=self.device_name,
dimensions=self.dimensions,
metric='%s.%s' % (self.name, suffix),
value=value,
timestamp=ts,
metric_type=metric_type,
interval=interval,
) for suffix, value, metric_type in metric_aggrs
hostname=self.hostname,
device_name=self.device_name,
dimensions=self.dimensions,
metric='%s.%s' % (self.name, suffix),
value=value,
timestamp=ts,
metric_type=metric_type,
interval=interval,
) for suffix, value, metric_type in metric_aggrs
]

for p in self.percentiles:

@@ -202,6 +208,7 @@ class Histogram(Metric):

class Set(Metric):

""" A metric to track the number of unique elements in a set. """

def __init__(self, formatter, name, dimensions, hostname, device_name):

@@ -236,6 +243,7 @@ class Set(Metric):

class Rate(Metric):

""" Track the rate of metrics over each flush interval """

def __init__(self, formatter, name, dimensions, hostname, device_name):

@@ -285,4 +293,4 @@ class Rate(Metric):
interval=interval
)]
finally:
self.samples = self.samples[-1:]
self.samples = self.samples[-1:]
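The Histogram.flush hunk computes its aggregates from a sorted sample list: max is the last element, the median is picked at index int(round(length / 2 - 1)), and the sample count becomes a per-second rate by dividing by the flush interval. A worked sketch under those assumptions (the real code divides self.count, which includes sample-rate corrections, rather than len(samples)):

interval = 10.0
samples = sorted([3, 1, 9, 7])               # [1, 3, 7, 9]
length = len(samples)                        # 4
max_ = samples[-1]                           # 9
med = samples[int(round(length / 2 - 1))]    # index 1 -> 3, the lower median
avg = sum(samples) / float(length)           # 5.0
count_rate = length / interval               # 0.4 samples per second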
@@ -17,7 +17,8 @@ try:
except ImportError:
pass # We are likely running the agent without the forwarder and tornado is not installed

VALID_HOSTNAME_RFC_1123_PATTERN = re.compile(r"^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\-]*[A-Za-z0-9])$")
VALID_HOSTNAME_RFC_1123_PATTERN = re.compile(
r"^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\-]*[A-Za-z0-9])$")
MAX_HOSTNAME_LEN = 255

import logging

@@ -94,7 +95,7 @@ def isnan(val):

# for py < 2.6, use a different check
# http://stackoverflow.com/questions/944700/how-to-check-for-nan-in-python
return str(val) == str(1e400*0)
return str(val) == str(1e400 * 0)

def cast_metric_val(val):

@@ -118,7 +119,8 @@ def is_valid_hostname(hostname):
log.warning("Hostname: %s is local" % hostname)
return False
if len(hostname) > MAX_HOSTNAME_LEN:
log.warning("Hostname: %s is too long (max length is %s characters)" % (hostname, MAX_HOSTNAME_LEN))
log.warning("Hostname: %s is too long (max length is %s characters)" %
(hostname, MAX_HOSTNAME_LEN))
return False
if VALID_HOSTNAME_RFC_1123_PATTERN.match(hostname) is None:
log.warning("Hostname: %s is not complying with RFC 1123" % hostname)

@@ -175,23 +177,27 @@ def get_hostname(config=None):
hostname = socket_hostname

if hostname is None:
log.critical('Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
raise Exception('Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
log.critical(
'Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
raise Exception(
'Unable to reliably determine host name. You can define one in agent.conf or in your hosts file')
else:
return hostname

class Watchdog(object):

"""Simple signal-based watchdog that will scuttle the current process
if it has not been reset every N seconds, or if the processes exceeds
a specified memory threshold.
Can only be invoked once per process, so don't use with multiple threads.
If you instantiate more than one, you're also asking for trouble.
"""
def __init__(self, duration, max_mem_mb = None):

def __init__(self, duration, max_mem_mb=None):
import resource

#Set the duration
# Set the duration
self._duration = int(duration)
signal.signal(signal.SIGALRM, Watchdog.self_destruct)

@@ -213,7 +219,6 @@ class Watchdog(object):
finally:
os.kill(os.getpid(), signal.SIGKILL)

def reset(self):
# self destruct if using too much memory, as tornado will swallow MemoryErrors
mem_usage_kb = int(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), 'rss')).read())

@@ -225,6 +230,7 @@ class Watchdog(object):

class PidFile(object):

""" A small helper class for pidfiles. """

PID_DIR = '/var/run/mon-agent'

@@ -283,6 +289,7 @@ class PidFile(object):

class LaconicFilter(logging.Filter):

"""
Filters messages, only print them once while keeping memory under control
"""

@@ -312,6 +319,7 @@ class LaconicFilter(logging.Filter):

class Timer(object):

""" Helper class """

def __init__(self):

@@ -328,7 +336,7 @@ class Timer(object):

def step(self):
now = self._now()
step = now - self.last
step = now - self.last
self.last = now
return step

@@ -337,6 +345,7 @@ class Timer(object):

class Platform(object):

"""
Return information about the given platform.
"""

@@ -373,7 +382,7 @@ class Platform(object):
return (Platform.is_darwin()
or Platform.is_linux()
or Platform.is_freebsd()
)
)

@staticmethod
def is_win32(name=None):

@@ -384,6 +393,7 @@ class Platform(object):
Iterable Recipes
"""

def chunks(iterable, chunk_size):
"""Generate sequences of `chunk_size` elements from `iterable`."""
iterable = iter(iterable)
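The util hunk is cut off just as chunks begins, so the generator body never appears here. A plausible completion of such an iterable chunker, offered as a sketch rather than the project's actual body (itertools.islice is one common way to write it):

import itertools

def chunks(iterable, chunk_size):
    """Generate sequences of `chunk_size` elements from `iterable`."""
    iterable = iter(iterable)
    while True:
        chunk = list(itertools.islice(iterable, chunk_size))
        if not chunk:
            break
        yield chunk

print(list(chunks(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]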
@@ -1 +0,0 @@
@@ -8,10 +8,12 @@ log = logging.getLogger(__name__)

class MonAPI(object):

"""Sends measurements to MonAPI
Any errors should raise an exception so the transaction calling
this is not committed
"""

def __init__(self, config):
"""
Initialize Mon api client connection.
@@ -54,7 +54,7 @@ MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
# Maximum queue size in bytes (when this is reached, old messages are dropped)
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB

THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
THROTTLING_DELAY = timedelta(microseconds=1000000 / 2) # 2 msg/second

class MetricTransaction(Transaction):

@@ -116,7 +116,8 @@ class StatusHandler(tornado.web.RequestHandler):

m = MetricTransaction.get_tr_manager()

self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
self.write(
"<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
transactions = m.get_transactions()
for tr in transactions:
self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %

@@ -135,7 +136,8 @@ class AgentInputHandler(tornado.web.RequestHandler):
The message is expected to follow the format:

"""
# read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
# read the message it should be a list of
# monagent.common.metrics.Measurements expressed as a dict
msg = tornado.escape.json_decode(self.request.body)
try:
log.debug(msg)

@@ -167,14 +169,16 @@ class Forwarder(tornado.web.Application):
MetricTransaction.set_tr_manager(self._tr_manager)

self._watchdog = None
self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
self.skip_ssl_validation = skip_ssl_validation or agent_config.get(
'skip_ssl_validation', False)
self.use_simple_http_client = use_simple_http_client
if self.skip_ssl_validation:
log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

if watchdog:
watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
self._watchdog = Watchdog(
watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))

def _post_metrics(self):

@@ -225,7 +229,8 @@ class Forwarder(tornado.web.Application):
try:
http_server.listen(self._port, address="localhost")
except gaierror:
log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
log.warning(
"localhost seems undefined in your host file, using 127.0.0.1 instead")
http_server.listen(self._port, address="127.0.0.1")
except socket_error, e:
if "Errno 99" in str(e):

@@ -234,7 +239,8 @@ class Forwarder(tornado.web.Application):
else:
raise
except socket_error, e:
log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
log.exception(
"Socket error %s. Is another application listening on the same port ? Exiting", e)
sys.exit(1)
except Exception:
log.exception("Uncaught exception. Forwarder is exiting.")

@@ -253,7 +259,8 @@ class Forwarder(tornado.web.Application):
self._post_metrics()
self._tr_manager.flush()

tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
tr_sched = tornado.ioloop.PeriodicCallback(
flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)

# Start everything
if self._watchdog:

@@ -291,7 +298,8 @@ def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):

def main():
define("sslcheck", default=1, help="Verify SSL hostname, on by default")
define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
define("use_simple_http_client", default=0,
help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
args = parse_command_line()
skip_ssl_validation = False
use_simple_http_client = False
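THROTTLING_DELAY above is plain arithmetic: 1000000 / 2 microseconds is half a second between sends, which is the "2 msg/second" in the comment. A quick check:

from datetime import timedelta

THROTTLING_DELAY = timedelta(microseconds=1000000 / 2)
print(THROTTLING_DELAY.total_seconds())  # 0.5 -> at most 2 messages per second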
@@ -21,7 +21,7 @@ class Transaction(object):

self._id = None
self._error_count = 0
self._next_flush = datetime.now()
self._next_flush = datetime.now()
self._size = None

def get_id(self):

@@ -35,7 +35,7 @@ class Transaction(object):
self._error_count += 1

def get_error_count(self):
return self._error_count
return self._error_count

def get_size(self):
if self._size is None:

@@ -64,6 +64,7 @@ class Transaction(object):

class TransactionManager(object):

"""Holds any transaction derived object list and make sure they
are all commited, without exceeding parameters (throttling, memory consumption) """

@@ -76,7 +77,7 @@ class TransactionManager(object):

self._transactions = [] # List of all non commited transactions
self._total_count = 0 # Maintain size/count not to recompute it everytime
self._total_size = 0
self._total_size = 0
self._flush_count = 0
self._transactions_received = 0
self._transactions_flushed = 0

@@ -96,7 +97,7 @@ class TransactionManager(object):

def print_queue_stats(self):
log.debug("Queue size: at %s, %s transaction(s), %s KB" %
(time.time(), self._total_count, (self._total_size/1024)))
(time.time(), self._total_count, (self._total_size / 1024)))

def get_tr_id(self):
self._counter += 1

@@ -110,8 +111,8 @@ class TransactionManager(object):
# Check the size
tr_size = tr.get_size()

log.debug("New transaction to add, total size of queue would be: %s KB" %
((self._total_size + tr_size)/1024))
log.debug("New transaction to add, total size of queue would be: %s KB" %
((self._total_size + tr_size) / 1024))

if (self._total_size + tr_size) > self._MAX_QUEUE_SIZE:
log.warn("Queue is too big, removing old transactions...")

@@ -147,7 +148,7 @@ class TransactionManager(object):

count = len(to_flush)
should_log = self._flush_count + 1 <= FLUSH_LOGGING_INITIAL or \
(self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
(self._flush_count + 1) % FLUSH_LOGGING_PERIOD == 0
if count > 0:
if should_log:
log.info("Flushing %s transaction%s during flush #%s" %

@@ -165,7 +166,8 @@ class TransactionManager(object):
log.debug("No transaction to flush during flush #%s" % str(self._flush_count + 1))

if self._flush_count + 1 == FLUSH_LOGGING_INITIAL:
log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
log.info("First flushes done, next flushes will be logged every %s flushes." %
FLUSH_LOGGING_PERIOD)

self._flush_count += 1

@@ -185,7 +187,7 @@ class TransactionManager(object):
if hasattr(td, 'total_seconds'):
delay = td.total_seconds()
else:
delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10.0**6
delay = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6

if delay <= 0:
tr = self._trs_to_flush.pop()

@@ -203,7 +205,7 @@ class TransactionManager(object):
if tornado_ioloop._running:
tornado_ioloop.add_timeout(time.time() + delay, lambda: self.flush_next())
elif self._flush_without_ioloop:
# Tornado is no started (ie, unittests), do it manually: BLOCKING
# Tornado is no started (ie, unittests), do it manually: BLOCKING
time.sleep(delay)
self.flush_next()
else:

@@ -213,7 +215,7 @@ class TransactionManager(object):
tr.inc_error_count()
tr.compute_next_flush(self._MAX_WAIT_FOR_REPLAY)
log.warn("Transaction %d in error (%s error%s), it will be replayed after %s" %
(tr.get_id(), tr.get_error_count(), plural(tr.get_error_count()), tr.get_next_flush()))
(tr.get_id(), tr.get_error_count(), plural(tr.get_error_count()), tr.get_next_flush()))

def tr_success(self, tr):
log.debug("Transaction %d completed" % tr.get_id())

@@ -222,5 +224,3 @@ class TransactionManager(object):
self._total_size += - tr.get_size()
self._transactions_flushed += 1
self.print_queue_stats()
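The fallback branch in the flush scheduler reimplements timedelta.total_seconds() for Pythons older than 2.7, where the method does not exist: microseconds plus days-and-seconds scaled to microseconds, divided by 10.0 ** 6. Checking the formula against the builtin:

from datetime import timedelta

td = timedelta(days=1, seconds=5, microseconds=250000)
manual = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6
assert manual == td.total_seconds()  # both 86405.25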
@@ -30,6 +30,7 @@ log = logging.getLogger('monstatsd')

class Monstatsd(Daemon):

""" This class is the monstatsd daemon. """

def __init__(self, pid_file, server, reporter, autorestart):

@@ -107,7 +108,8 @@ def init_monstatsd(config_path=None, use_watchdog=False):
if non_local_traffic:
server_host = ''

server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)
server = Server(aggregator, server_host, port, forward_to_host=forward_to_host,
forward_to_port=forward_to_port)

return reporter, server, c
@@ -19,6 +19,7 @@ EVENT_CHUNK_SIZE = 50

class Reporter(threading.Thread):

"""
The reporter periodically sends the aggregated metrics to the
server.
@@ -10,6 +10,7 @@ UDP_SOCKET_TIMEOUT = 5

class Server(object):

"""
A statsd udp server.
"""

@@ -55,9 +56,9 @@ class Server(object):

event = {
'title': metadata[:title_length],
'text': (metadata[title_length+1:title_length+text_length+1]).replace('\\n', '\n')
'text': (metadata[title_length + 1:title_length + text_length + 1]).replace('\\n', '\n')
}
meta = metadata[title_length+text_length+1:]
meta = metadata[title_length + text_length + 1:]
for m in meta.split('|')[1:]:
if m[0] == u't':
event['alert_type'] = m[2:]

@@ -137,7 +138,8 @@ class Server(object):
# todo it seems like this count should be done in the submit_metric method
self.aggregator.count += 1
name, value, mtype, dimensions, sample_rate = self._parse_metric_packet(packet)
self.aggregator.submit_metric(name, value, mtype, dimensions=dimensions, sample_rate=sample_rate)
self.aggregator.submit_metric(
name, value, mtype, dimensions=dimensions, sample_rate=sample_rate)

def start(self):
""" Run the server. """

@@ -149,7 +151,8 @@ class Server(object):
open_socket.bind(self.address)
except socket.gaierror:
if self.address[0] == 'localhost':
log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
log.warning(
"Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
self.address = ('127.0.0.1', self.address[1])
open_socket.bind(self.address)

@@ -187,4 +190,4 @@ class Server(object):
log.exception('Error receiving datagram')

def stop(self):
self.running = False
self.running = False
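The event hunk slices a single metadata string using two lengths parsed earlier from the packet: the title occupies [:title_length], the text starts one separator byte later and runs for text_length, and the remainder carries |-separated attributes. A sketch with hypothetical lengths (the surrounding packet parsing that produces title_length and text_length is outside this hunk):

metadata = 'deploy,Deploy finished\\nall good|t:success'
title_length, text_length = 6, 25  # hypothetical values a real packet would declare

event = {
    'title': metadata[:title_length],  # 'deploy'
    'text': metadata[title_length + 1:title_length + text_length + 1].replace('\\n', '\n'),
}
meta = metadata[title_length + text_length + 1:]  # '|t:success'
for m in meta.split('|')[1:]:
    if m[0] == u't':
        event['alert_type'] = m[2:]  # 'success'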
@@ -24,6 +24,7 @@ from collector.jmxfetch import JMXFetch

log = logging.getLogger(__name__)
RESTART_INTERVAL = 24 * 60 * 60 # Defaults to 1 day

class AgentSvc(win32serviceutil.ServiceFramework):
_svc_name_ = "DatadogAgent"
_svc_display_name_ = "Datadog Agent"

@@ -50,7 +51,7 @@ class AgentSvc(win32serviceutil.ServiceFramework):
'forwarder': DDForwarder(config),
'collector': DDAgent(agentConfig),
'dogstatsd': DogstatsdProcess(config),
'pup': PupProcess(config),
'pup': PupProcess(config),
}

def SvcStop(self):

@@ -105,6 +106,7 @@ class AgentSvc(win32serviceutil.ServiceFramework):

class DDAgent(multiprocessing.Process):

def __init__(self, agentConfig, start_event=True):
multiprocessing.Process.__init__(self, name='ddagent')
self.config = agentConfig

@@ -137,7 +139,7 @@ class DDAgent(multiprocessing.Process):
def get_emitters(self):
emitters = [http_emitter]
custom = [s.strip() for s in
self.config.get('custom_emitters', '').split(',')]
self.config.get('custom_emitters', '').split(',')]
for emitter_spec in custom:
if not emitter_spec:
continue

@@ -145,7 +147,9 @@ class DDAgent(multiprocessing.Process):

return emitters

class DDForwarder(multiprocessing.Process):

def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='ddforwarder')
self.config = agentConfig

@@ -159,7 +163,7 @@ class DDForwarder(multiprocessing.Process):
port = 17123
else:
port = int(port)
app_config = get_config(parse_args = False)
app_config = get_config(parse_args=False)
self.forwarder = Application(port, app_config, watchdog=False)
self.forwarder.run()

@@ -167,7 +171,9 @@ class DDForwarder(multiprocessing.Process):
log.debug("Windows Service - Stopping forwarder")
self.forwarder.stop()

class DogstatsdProcess(multiprocessing.Process):

def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='dogstatsd')
self.config = agentConfig

@@ -185,7 +191,9 @@ class DogstatsdProcess(multiprocessing.Process):
self.reporter.stop()
self.reporter.join()

class PupProcess(multiprocessing.Process):

def __init__(self, agentConfig):
multiprocessing.Process.__init__(self, name='pup')
self.config = agentConfig
@@ -2,7 +2,7 @@ import ctypes

def handle_exe_click(name):
''' When the executables are clicked directly in the UI, we must let the
''' When the executables are clicked directly in the UI, we must let the
user know that they have to install the program as a service instead of
running it directly. '''
message = """To use %(name)s, you must install it as a service.

@@ -16,4 +16,4 @@ For all available options, including how to install the service for a particular
%(name)s.exe help
""" % ({'name': name})
MessageBox = ctypes.windll.user32.MessageBoxA
MessageBox(None, message, 'Install as a Service', 0)
MessageBox(None, message, 'Install as a Service', 0)
@@ -29,14 +29,14 @@ from spyderlib.widgets.sourcecode.codeeditor import CodeEditor

# Datadog
from common.util import get_os
from config import (get_confd_path, get_config_path, get_config,
_windows_commondata_path)
from config import (get_confd_path, get_config_path, get_config,
_windows_commondata_path)

EXCLUDED_WINDOWS_CHECKS = [
'cacti', 'directory', 'gearmand',
'hdfs', 'kafka_consumer', 'mcache', 'network',
'redis', 'postfix', 'process', 'gunicorn', 'zk',
]
]

MAIN_WINDOW_TITLE = "Datadog Agent Manager"

@@ -81,8 +81,8 @@ def get_checks():
continue

agent_check = AgentCheck(filename, ext, conf_d_directory)
if (agent_check.enabled or agent_check.module_name not in checks or
(not agent_check.is_example and not checks[agent_check.module_name].enabled)):
if (agent_check.enabled or agent_check.module_name not in checks or
(not agent_check.is_example and not checks[agent_check.module_name].enabled)):
checks[agent_check.module_name] = agent_check

checks_list = checks.values()

@@ -92,6 +92,7 @@ def get_checks():

class EditorFile(object):

def __init__(self, file_path, description):
self.file_path = file_path
self.description = description

@@ -101,7 +102,7 @@ class EditorFile(object):

def save(self, content):
try:
f = open(self.file_path,'w')
f = open(self.file_path, 'w')
f.write(content)
self.content = content
info_popup("File saved.")

@@ -111,9 +112,10 @@ class EditorFile(object):

class LogFile(EditorFile):

def __init__(self):
EditorFile.__init__(self, AGENT_LOG_FILE, "Agent log file")

class DatadogConf(EditorFile):

@@ -128,7 +130,7 @@ class DatadogConf(EditorFile):
def check_api_key(self, editor):
if self.api_key is None:
api_key, ok = QInputDialog.getText(None, "Add your API KEY",
"You must first set your api key in this file. You can find it here: https://app.datadoghq.com/account/settings#api")
"You must first set your api key in this file. You can find it here: https://app.datadoghq.com/account/settings#api")
if ok and api_key:
new_content = []
for line in self.content.splitlines():

@@ -149,12 +151,13 @@ class DatadogConf(EditorFile):

class AgentCheck(EditorFile):

def __init__(self, filename, ext, conf_d_directory):
file_path = osp.join(conf_d_directory, filename)
self.module_name = filename.split('.')[0]

EditorFile.__init__(self, file_path, description=self.module_name.replace("_", " ").title())

self.enabled = ext == '.yaml'
self.is_example = ext == '.example'
self.enabled_name = osp.join(conf_d_directory, "%s.yaml" % self.module_name)

@@ -176,10 +179,11 @@ class AgentCheck(EditorFile):

class PropertiesWidget(QWidget):

def __init__(self, parent):
QWidget.__init__(self, parent)
font = QFont(get_family(MONOSPACE), 10, QFont.Normal)

info_icon = QLabel()
icon = get_std_icon('MessageBoxInformation').pixmap(24, 24)
info_icon.setPixmap(icon)

@@ -201,10 +205,10 @@ class PropertiesWidget(QWidget):
layout.addWidget(info_icon)
layout.addWidget(self.desc_label)
layout.addStretch()
layout.addWidget(self.service_status_label )
layout.addWidget(self.service_status_label)

group_desc.setLayout(layout)

self.editor = CodeEditor(self)
self.editor.setup_editor(linenumbers=True, font=font)
self.editor.setReadOnly(False)

@@ -212,27 +216,24 @@ class PropertiesWidget(QWidget):
layout = QVBoxLayout()
layout.addWidget(self.editor)
group_code.setLayout(layout)

self.enable_button = QPushButton(get_icon("apply.png"),
"Enable", self)
"Enable", self)

self.save_button = QPushButton(get_icon("filesave.png"),
"Save", self)
"Save", self)

self.edit_datadog_conf_button = QPushButton(get_icon("edit.png"),
"Edit agent settings", self)
"Edit agent settings", self)

self.disable_button = QPushButton(get_icon("delete.png"),
"Disable", self)
"Disable", self)

self.view_log_button = QPushButton(get_icon("txt.png"),
"View log", self)
self.view_log_button = QPushButton(get_icon("txt.png"),
"View log", self)

self.menu_button = QPushButton(get_icon("settings.png"),
"Manager", self)

"Manager", self)

hlayout = QHBoxLayout()
hlayout.addWidget(self.save_button)

@@ -246,7 +247,7 @@ class PropertiesWidget(QWidget):
hlayout.addWidget(self.view_log_button)
hlayout.addStretch()
hlayout.addWidget(self.menu_button)

vlayout = QVBoxLayout()
vlayout.addWidget(group_desc)
vlayout.addWidget(group_code)

@@ -254,7 +255,7 @@ class PropertiesWidget(QWidget):
self.setLayout(vlayout)

self.current_file = None

def set_item(self, check):
self.current_file = check
self.desc_label.setText(check.get_description())

@@ -288,28 +289,32 @@ class PropertiesWidget(QWidget):

class MainWindow(QSplitter):

def __init__(self, parent=None):

QSplitter.__init__(self, parent)
self.setWindowTitle(MAIN_WINDOW_TITLE)
self.setWindowIcon(get_icon("agent.svg"))

self.sysTray = SystemTray(self)

self.connect(self.sysTray, SIGNAL("activated(QSystemTrayIcon::ActivationReason)"), self.__icon_activated)
self.connect(self.sysTray, SIGNAL(
"activated(QSystemTrayIcon::ActivationReason)"), self.__icon_activated)

checks = get_checks()
datadog_conf = DatadogConf(get_config_path(), description="Agent settings file: datadog.conf")
datadog_conf = DatadogConf(
get_config_path(), description="Agent settings file: datadog.conf")
self.log_file = LogFile()

listwidget = QListWidget(self)
listwidget.addItems([osp.basename(check.module_name).replace("_", " ").title() for check in checks])

listwidget.addItems(
[osp.basename(check.module_name).replace("_", " ").title() for check in checks])

self.properties = PropertiesWidget(self)

self.addWidget(listwidget)
self.addWidget(self.properties)

self.connect(self.properties.enable_button, SIGNAL("clicked()"),
lambda: enable_check(self.properties))

@@ -330,11 +335,10 @@ class MainWindow(QSplitter):

self.manager_menu = Menu(self)
self.connect(self.properties.menu_button, SIGNAL("clicked()"),
lambda: self.manager_menu.popup(self.properties.menu_button.mapToGlobal(QPoint(0,0))))

lambda: self.manager_menu.popup(self.properties.menu_button.mapToGlobal(QPoint(0, 0))))

listwidget.setCurrentRow(0)

self.setSizes([150, 1])
self.setStretchFactor(1, 1)
self.resize(QSize(950, 600))

@@ -355,13 +359,14 @@ class MainWindow(QSplitter):

def closeEvent(self, event):
self.hide()
self.sysTray.show()
self.sysTray.show()
event.ignore()

def __icon_activated(self, reason):
if reason == QSystemTrayIcon.DoubleClick:
self.show()

class Menu(QMenu):

def __init__(self, parent=None, ):

@@ -375,7 +380,6 @@ class Menu(QMenu):

self.connect(self, SIGNAL("aboutToShow()"), lambda: self.update_options())

def update_options(self):
status = get_service_status()
if is_service_running(status):

@@ -403,7 +407,7 @@ class SystemTray(QSystemTrayIcon):
menu = Menu(self.parent())
self.setContextMenu(menu)

def disable_check(properties):
check = properties.current_file
new_content = properties.editor.toPlainText().__str__()

@@ -415,7 +419,8 @@ def disable_check(properties):
properties.enable_button.setEnabled(True)
properties.disable_button.setEnabled(False)
check.disable()

def enable_check(properties):
check = properties.current_file

@@ -427,20 +432,22 @@ def enable_check(properties):
properties.enable_button.setEnabled(False)
properties.disable_button.setEnabled(True)
check.enable()

def save_file(properties):
current_file = properties.current_file
new_content = properties.editor.toPlainText().__str__()
current_file.save(new_content)

def check_yaml_syntax(content):
try:
yaml.load(content, Loader=Loader)
except Exception, e:
warning_popup("Unable to parse yaml: \n %s" % str(e))
raise

def _service_manager(action):
try:
if action == 'stop':

@@ -452,36 +459,43 @@ def _service_manager(action):
except Exception, e:
warning_popup("Couldn't %s service: \n %s" % (action, str(e)))

def service_manager(action, async=True):
if not async:
_service_manager(action)
else:
thread.start_new_thread(_service_manager, (action,))

def get_service_status():
try:
return win32serviceutil.QueryServiceStatus(DATADOG_SERVICE)[1]
except Exception:
return "Unknown"

def is_service_running(status = None):

def is_service_running(status=None):
if status is None:
status = get_service_status()
return status == win32service.SERVICE_RUNNING

def is_service_pending(status = None):

def is_service_pending(status=None):
if status is None:
status = get_service_status()
return status in [win32service.SERVICE_STOP_PENDING, win32service.SERVICE_START_PENDING]

def is_service_stopped(status = None):

def is_service_stopped(status=None):
if status is None:
status = get_service_status()
return status == win32service.SERVICE_STOPPED

def warning_popup(message, parent=None):
QMessageBox.warning(parent, 'Message', message, QMessageBox.Ok)

def info_popup(message, parent=None):
QMessageBox.information(parent, 'Message', message, QMessageBox.Ok)

@@ -491,4 +505,4 @@ if __name__ == '__main__':
app = QApplication([])
win = MainWindow()
win.show()
app.exec_()
app.exec_()
@@ -16,4 +16,4 @@ Datadog Agent v%s - Python Shell
print traceback.format_exc(e)

if __name__ == "__main__":
shell()
shell()
@@ -4,6 +4,7 @@ import collections

class Plugins(collections.defaultdict):

"""A container for the plugin configurations used by the mon-agent.
This is essentially a defaultdict(dict) but put into a class primarily to make the interface clear, also
to add a couple of helper methods.
@@ -4,6 +4,7 @@
from monclient import exc as exc, client

class Plugin(object):

"""Abstract class implemented by the mon-agent plugin detection classes
"""
# todo these should include dependency detection
@@ -8,6 +8,7 @@ log = logging.getLogger(__name__)


class Kafka(Plugin):

    """Detect Kafka daemons and sets up configuration to monitor them.
    This plugin configures the kafka_consumer plugin and does not configure any jmx based checks against kafka.
    Note this plugin will pull the same information from kafka on each node in the cluster it runs on.

@@ -32,7 +33,8 @@ class Kafka(Plugin):
        import kazoo
        from kazoo.client import KazooClient

        logging.getLogger('kazoo').setLevel(logging.WARN)  # kazoo fills up the console without this
        # kazoo fills up the console without this
        logging.getLogger('kazoo').setLevel(logging.WARN)

        zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
        zk.start()

@@ -40,7 +42,8 @@ class Kafka(Plugin):
        for topic in zk.get_children('/brokers/topics'):
            topics[topic] = zk.get_children('/brokers/topics/%s/partitions' % topic)

        consumers = collections.defaultdict(dict)  # {'consumer_group_name': { 'topic1': [ 0, 1, 2] # partitions }}
        # {'consumer_group_name': { 'topic1': [ 0, 1, 2] # partitions }}
        consumers = collections.defaultdict(dict)
        for consumer in zk.get_children('/consumers'):
            try:
                for topic in zk.get_children('/consumers/%s/offsets' % consumer):
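# The two hunks above walk zookeeper to discover topics, partitions and
# consumer groups. A condensed, self-contained sketch of the same traversal
# against a local zookeeper (the final assignment is a guess at the loop
# body this diff elides; paths follow the kafka 0.8-era layout shown above):
import collections

from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
zk.start()
topics = {}
for topic in zk.get_children('/brokers/topics'):
    topics[topic] = zk.get_children('/brokers/topics/%s/partitions' % topic)
consumers = collections.defaultdict(dict)
for consumer in zk.get_children('/consumers'):
    for topic in zk.get_children('/consumers/%s/offsets' % consumer):
        if topic in topics:
            consumers[consumer][topic] = topics[topic]
zk.stop()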
@@ -12,8 +12,10 @@ log = logging.getLogger(__name__)


class MonPersister(Plugin):

    """Detect mon_persister and setup monitoring.
    """

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        if find_process_cmdline('mon-persister') is not None:

@@ -26,16 +28,18 @@ class MonPersister(Plugin):
        return dropwizard_health_check('mon-persister', 'http://localhost:8091/healthcheck')

        # todo
        #log.info("\tEnabling the mon persister metric collection")
        #http://localhost:8091/metrics
        # log.info("\tEnabling the mon persister metric collection")
        # http://localhost:8091/metrics

    def dependencies_installed(self):
        return True


class MonAPI(Plugin):

    """Detect mon_api and setup monitoring.
    """

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        if find_process_cmdline('mon-api') is not None:

@@ -48,15 +52,17 @@ class MonAPI(Plugin):
        return dropwizard_health_check('mon-api', 'http://localhost:8081/healthcheck')

        # todo
        #log.info("\tEnabling the mon api metric collection")
        #http://localhost:8081/metrics
        # log.info("\tEnabling the mon api metric collection")
        # http://localhost:8081/metrics

    def dependencies_installed(self):
        return True


class MonThresh(Plugin):

    """Detect the running mon-thresh and monitor"""

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        if find_process_cmdline('mon-thresh') is not None:
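# Both plugins above defer to dropwizard_health_check(name, url). Its body is
# not part of this diff; a hedged sketch of what such a probe typically looks
# like (requests assumed available, helper name hypothetical):
import requests


def dropwizard_health_check_sketch(name, url):
    # A dropwizard /healthcheck endpoint answers with JSON such as
    # {"deadlocks": {"healthy": true}}; unhealthy services answer non-200
    # but still with parseable JSON.
    try:
        result = requests.get(url, timeout=3).json()
    except Exception:
        return False
    return all(check.get('healthy', False) for check in result.values())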
@@ -7,6 +7,7 @@ log = logging.getLogger(__name__)


class MySQL(Plugin):

    """Detect MySQL daemons and setup configuration to monitor them.
    This plugin needs user/pass infor for mysql setup, this is best placed in /root/.my.cnf in a format such as
    [client]

@@ -27,7 +28,8 @@ class MySQL(Plugin):
        config.merge(watch_process(['mysqld']))
        log.info("\tWatching the mysqld process.")

        # Attempt login, requires either an empty root password from localhost or relying on a configured .my.cnf
        # Attempt login, requires either an empty root password from localhost or
        # relying on a configured .my.cnf
        if self.dependencies_installed():  # ensures MySQLdb is available
            import MySQLdb
            import _mysql_exceptions

@@ -36,7 +38,8 @@ class MySQL(Plugin):
        except _mysql_exceptions.MySQLError:
            pass
        else:
            log.info("\tConfiguring MySQL plugin to connect with auth settings from /root/.my.cnf")
            log.info(
                "\tConfiguring MySQL plugin to connect with auth settings from /root/.my.cnf")
            config['mysql'] = {'init_config': None, 'instances':
                               [{'server': 'localhost', 'user': 'root', 'defaults_file': '/root/.my.cnf'}]}
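# The login attempt described above leans on the [client] section of
# /root/.my.cnf. MySQLdb can read that options file directly via
# read_default_file; a sketch of the probe (function name hypothetical):
import MySQLdb
import _mysql_exceptions


def can_connect_with_my_cnf():
    try:
        MySQLdb.connect(read_default_file='/root/.my.cnf')
    except _mysql_exceptions.MySQLError:
        return False
    return True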
@@ -6,6 +6,7 @@ from monsetup import agent_config


class Network(Plugin):

    """No configuration here, working networking is assumed so this is either on or off.
    """

@@ -16,7 +17,8 @@ class Network(Plugin):
    def build_config(self):
        """Build the config as a Plugins object and return.
        """
        # A bit silly to parse the yaml only for it to be converted back but this plugin is the exception not the rule
        # A bit silly to parse the yaml only for it to be converted back but this
        # plugin is the exception not the rule
        with open(os.path.join(self.template_dir, 'conf.d/network.yaml'), 'r') as network_template:
            default_net_config = yaml.load(network_template.read())
        config = agent_config.Plugins()
@@ -6,9 +6,11 @@ from monsetup import agent_config


class Postfix(Plugin):

    """If postfix is running install the default config
    """
    # todo this is is disabled as postfix requires passwordless sudo for the mon-agent user, a bad practice
    # todo this is is disabled as postfix requires passwordless sudo for the
    # mon-agent user, a bad practice

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""

@@ -18,7 +20,8 @@ class Postfix(Plugin):
    def build_config(self):
        """Build the config as a Plugins object and return.
        """
        # A bit silly to parse the yaml only for it to be converted back but this plugin is the exception not the rule
        # A bit silly to parse the yaml only for it to be converted back but this
        # plugin is the exception not the rule
        with open(os.path.join(self.template_dir, 'conf.d/postfix.yaml.example'), 'r') as postfix_template:
            default_net_config = yaml.load(postfix_template.read())
        config = agent_config.Plugins()
@@ -9,8 +9,10 @@ log = logging.getLogger(__name__)


class Zookeeper(Plugin):

    """Detect Zookeeper daemons and setup configuration to monitor them.
    """

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        if find_process_cmdline('zookeeper') is not None:

@@ -32,4 +34,6 @@ class Zookeeper(Plugin):
        return config

    def dependencies_installed(self):
        return True  # The current plugin just does a simple socket connection to zookeeper and parses the stat command
        # The current plugin just does a simple socket connection to zookeeper and
        # parses the stat command
        return True
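# "a simple socket connection to zookeeper and parses the stat command"
# refers to zookeeper's four-letter admin words. A self-contained sketch of
# that exchange (zookeeper_stat is a hypothetical helper name, not the
# plugin's code):
import socket


def zookeeper_stat(host='localhost', port=2181):
    sock = socket.create_connection((host, port), timeout=3)
    try:
        sock.sendall(b'stat')
        data = b''
        while True:
            chunk = sock.recv(4096)
            if not chunk:
                break
            data += chunk
    finally:
        sock.close()
    return data.decode('utf-8', 'replace')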
@@ -29,15 +29,19 @@ log = logging.getLogger(__name__)

def main(argv=None):
    parser = argparse.ArgumentParser(description='Detect running daemons then configure and start the agent.',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-u', '--username', help="Keystone username used to post metrics", required=True)
    parser.add_argument('-p', '--password', help="Keystone password used to post metrics", required=True)
    parser.add_argument(
        '-u', '--username', help="Keystone username used to post metrics", required=True)
    parser.add_argument(
        '-p', '--password', help="Keystone password used to post metrics", required=True)
    parser.add_argument('--project_name', help="Keystone project/tenant name", required=True)
    parser.add_argument('-s', '--service', help="Service this node is associated with.", required=True)
    parser.add_argument(
        '-s', '--service', help="Service this node is associated with.", required=True)
    parser.add_argument('--keystone_url', help="Keystone url", required=True)
    parser.add_argument('--mon_url', help="Mon API url", required=True)
    parser.add_argument('--config_dir', help="Configuration directory", default='/etc/mon-agent')
    parser.add_argument('--log_dir', help="mon-agent log directory", default='/var/log/mon-agent')
    parser.add_argument('--template_dir', help="Alternative template directory", default='/usr/local/share/mon/agent')
    parser.add_argument(
        '--template_dir', help="Alternative template directory", default='/usr/local/share/mon/agent')
    parser.add_argument('--headless', help="Run in a non-interactive mode", action="store_true")
    parser.add_argument('--overwrite',
                        help="Overwrite existing plugin configuration." +

@@ -58,7 +62,8 @@ def main(argv=None):
    # Detect os
    detected_os = 'linux'  # todo add detection

    # Service enable, includes setup of users/config directories so must be done before configuration
    # Service enable, includes setup of users/config directories so must be
    # done before configuration
    agent_service = OS_SERVICE_MAP[detected_os](os.path.join(args.template_dir, 'mon-agent.init'), args.config_dir,
                                                args.log_dir, username=args.user)
    if not args.skip_enable:

@@ -102,14 +107,15 @@ def main(argv=None):
    if not detect.configure_alarms(args.mon_url, token):
        log.warn('Unable to configure alarms for {0}'.format(detect.name))

    #todo add option to install dependencies
    # todo add option to install dependencies

    # Write out the plugin config
    for key, value in plugin_config.iteritems():
        # todo if overwrite is set I should either warn or just delete any config files not in the new config
        # todo add the ability to show a diff before overwriting or merging config
        config_path = os.path.join(args.config_dir, 'conf.d', key + '.yaml')
        if (not args.overwrite) and os.path.exists(config_path):  # merge old and new config, new has precedence
        # merge old and new config, new has precedence
        if (not args.overwrite) and os.path.exists(config_path):
            with open(config_path, 'r') as config_file:
                old_config = yaml.load(config_file.read())
            if old_config is not None:
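# "merge old and new config, new has precedence" above is the interesting
# branch: an existing conf.d yaml is folded into the freshly detected
# settings. The committed merge body sits outside this hunk; one plausible
# sketch of the stated rule (merge_config is a hypothetical name):
def merge_config(old_config, new_config):
    merged = dict(old_config or {})
    for key, new_value in new_config.items():
        if key == 'instances':
            # keep previously configured instances the detection did not regenerate
            kept = [i for i in merged.get('instances', []) if i not in new_value]
            merged['instances'] = list(new_value) + kept
        else:
            merged[key] = new_value
    return merged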
@@ -1,2 +1 @@
from service import Service
@@ -4,7 +4,9 @@ import psutil


class Service(object):

    """Abstract base class implementing the interface for various service types."""

    def __init__(self, config_dir, log_dir, name='mon-agent'):
        self.config_dir = config_dir
        self.log_dir = log_dir
@@ -12,6 +12,7 @@ log = logging.getLogger(__name__)


class SysV(Service):

    def __init__(self, init_template, config_dir, log_dir, name='mon-agent', username='mon-agent'):
        """Setup this service with the given init template"""
        super(SysV, self).__init__(config_dir, log_dir, name)

@@ -84,4 +85,3 @@ class SysV(Service):
            return True
        else:
            return False
setup.py
@@ -11,7 +11,7 @@ setup_requires = [
]

# Prereqs of the install. Will install when deploying the egg.
install_requires=[
install_requires = [
    'requests',
    'gearman',
    'httplib2',

@@ -78,8 +78,9 @@ if sys.platform == 'win32':
    ]

    class Target(object):

        def __init__(self, **kw):
            self.__dict__.update(kw)
            self.__dict__.update(kw)
            self.version = '1.0.0'
            self.cmdline_style = 'pywin32'

@@ -100,7 +101,8 @@ if sys.platform == 'win32':
        'service': [agent_svc],
        'windows': [{'script': 'win32\gui.py',
                     'dest_base': "agent-manager",
                     'uac_info': "requireAdministrator",  # The manager needs to be administrator to stop/start the service
                     # The manager needs to be administrator to stop/start the service
                     'uac_info': "requireAdministrator",
                     'icon_resources': [(1, r"packaging\mon-agent\win32\install_files\dd_agent_win_256.ico")],
                     }],
        'data_files': [
@@ -26,7 +26,8 @@ def load_check(name, config, agent_config):
    else:
        break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)
        raise Exception(
            "Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', None)
    instances = config.get('instances')

@@ -51,7 +52,7 @@ def kill_subprocess(process_obj):
        import ctypes
        PROCESS_TERMINATE = 1
        handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False,
                                                  process_obj.pid)
                                                    process_obj.pid)
        ctypes.windll.kernel32.TerminateProcess(handle, -1)
        ctypes.windll.kernel32.CloseHandle(handle)
    else:

@@ -70,7 +71,8 @@ def get_check(name, config_str):
            check_class = clsmember
            break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)
        raise Exception(
            "Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    agent_config = {
        'version': '0.1',
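# Both load_check and get_check above raise when the check module exposes no
# class inheriting AgentCheck. The lookup they wrap is plain class
# introspection; a sketch under that assumption (find_check_class is a
# hypothetical name):
import inspect


def find_check_class(check_module, base_class):
    for _, member in inspect.getmembers(check_module, inspect.isclass):
        if issubclass(member, base_class) and member is not base_class:
            return member
    return None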
@@ -1,13 +1,3 @@
"""
Functional tests for dogstatsd.
"""
@@ -6,8 +6,6 @@ Performance tests for the agent/dogstatsd metrics aggregator.
from monagent.common.aggregator import MetricsAggregator


class TestAggregatorPerf(object):

    FLUSH_COUNT = 10

@@ -58,4 +56,4 @@ class TestAggregatorPerf(object):
if __name__ == '__main__':
    t = TestAggregatorPerf()
    t.test_dogstatsd_aggregation_perf()
    #t.test_checksd_aggregation_perf()
    # t.test_checksd_aggregation_perf()
@@ -17,7 +17,9 @@ instances:
    rrd_whitelist: %s
""" % (os.path.join(os.path.dirname(__file__), "cacti", "whitelist.txt"))


class TestCacti(unittest.TestCase):

    def setUp(self):
        self.tmp_dir = '/tmp/cacti_test'
        self.rrd_dir = os.path.join(os.path.dirname(__file__), "cacti")

@@ -55,7 +57,7 @@ class TestCacti(unittest.TestCase):

        # Bump the last timestamps back 20 minutes so we have some actual data
        twenty_min = 20 * 60
        for k,v in check.last_ts.items():
        for k, v in check.last_ts.items():
            check.last_ts[k] = v - twenty_min

        # Do a first check
@@ -8,20 +8,25 @@ from collector.dogstream.cassandra import parse_cassandra

logger = logging.getLogger(__name__)


class TestCassandraDogstream(unittest.TestCase):

    @attr('cassandra')
    def testStart(self):
        events = parse_cassandra(logger, " INFO [main] 2012-12-11 21:46:26,995 StorageService.java (line 687) Bootstrap/Replace/Move completed! Now serving reads.")
        events = parse_cassandra(
            logger, " INFO [main] 2012-12-11 21:46:26,995 StorageService.java (line 687) Bootstrap/Replace/Move completed! Now serving reads.")
        self.assertTrue(events is None)

    @attr('cassandra')
    def testInfo(self):
        events = parse_cassandra(logger, " INFO [CompactionExecutor:35] 2012-12-02 21:15:03,738 AutoSavingCache.java (line 268) Saved KeyCache (5 items) in 3 ms")
        events = parse_cassandra(
            logger, " INFO [CompactionExecutor:35] 2012-12-02 21:15:03,738 AutoSavingCache.java (line 268) Saved KeyCache (5 items) in 3 ms")
        self.assertTrue(events is None)

    @attr('cassandra')
    def testWarn(self):
        events = parse_cassandra(logger, " WARN [MemoryMeter:1] 2012-12-03 20:07:47,158 Memtable.java (line 197) setting live ratio to minimum of 1.0 instead of 0.9416553595658074")
        events = parse_cassandra(
            logger, " WARN [MemoryMeter:1] 2012-12-03 20:07:47,158 Memtable.java (line 197) setting live ratio to minimum of 1.0 instead of 0.9416553595658074")
        self.assertTrue(events is None)

    @attr('cassandra')

@@ -55,13 +60,17 @@ java.util.concurrent.RejectedExecutionException

    @attr('cassandra')
    def testCompactionStart(self):
        events = parse_cassandra(logger, " INFO [CompactionExecutor:2] 2012-12-11 21:46:27,012 CompactionTask.java (line 109) Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]")
        self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-1", 'msg_text': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]", 'auto_priority': 0}])
        events = parse_cassandra(
            logger, " INFO [CompactionExecutor:2] 2012-12-11 21:46:27,012 CompactionTask.java (line 109) Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]")
        self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-1", 'msg_text':
                                    "Compacting [SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-11-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-9-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-12-Data.db'), SSTableReader(path='/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-10-Data.db')]", 'auto_priority': 0}])

    @attr('cassandra')
    def testCompactionEnd(self):
        events = parse_cassandra(logger, "INFO [CompactionExecutor:2] 2012-12-11 21:46:27,095 CompactionTask.java (line 221) Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.")
        self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 ', 'msg_text': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.', 'auto_priority': 0}])
        events = parse_cassandra(
            logger, "INFO [CompactionExecutor:2] 2012-12-11 21:46:27,095 CompactionTask.java (line 221) Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.")
        self.assertEquals(events, [{'alert_type': 'info', 'event_type': 'cassandra.compaction', 'timestamp': 1355262387, 'msg_title': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 ',
                                    'msg_text': 'Compacted to [/var/lib/cassandra/data/system/LocationInfo/system-LocationInfo-he-13-Data.db,]. 880 to 583 (~66% of original) bytes for 4 keys at 0.007831MB/s. Time: 71ms.', 'auto_priority': 0}])

if __name__ == '__main__':
    unittest.main()
@@ -13,7 +13,10 @@ from monagent.collector.jmxfetch import JMXFetch


STATSD_PORT = 8121


class DummyReporter(threading.Thread):

    def __init__(self, metrics_aggregator):
        threading.Thread.__init__(self)
        self.finished = threading.Event()

@@ -23,7 +26,6 @@ class DummyReporter(threading.Thread):
        self.finished = False
        self.start()

    def run(self):
        while not self.finished:
            time.sleep(self.interval)

@@ -34,26 +36,26 @@ class DummyReporter(threading.Thread):
        if metrics:
            self.metrics = metrics


class JMXTestCase(unittest.TestCase):

    def setUp(self):
        aggregator = MetricsAggregator("test_host")
        self.server = Server(aggregator, "localhost", STATSD_PORT)
        pid_file = PidFile('dogstatsd')
        self.reporter = DummyReporter(aggregator)

        self.t1 = threading.Thread(target=self.server.start)
        self.t1.start()

        confd_path = os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "jmx_yamls"))
        JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15)
        JMXFetch.init(confd_path, {'dogstatsd_port': STATSD_PORT}, get_logging_config(), 15)

    def tearDown(self):
        self.server.stop()
        self.reporter.finished = True
        JMXFetch.stop()

    def testCustomJMXMetric(self):
        raise SkipTest('Requires JMX be setup')
        count = 0

@@ -67,7 +69,8 @@ class JMXTestCase(unittest.TestCase):

        self.assertTrue(type(metrics) == type([]))
        self.assertTrue(len(metrics) > 0)
        self.assertTrue(len([t for t in metrics if "cassandra.db." in t['metric'] and "instance:cassandra_instance" in t['dimensions']]) > 40, metrics)
        self.assertTrue(len([t for t in metrics if "cassandra.db." in t[
                        'metric'] and "instance:cassandra_instance" in t['dimensions']]) > 40, metrics)

if __name__ == "__main__":
    unittest.main()
@@ -12,34 +12,34 @@ class DummyAgentCheck(AgentCheck):
        raise Exception("failure")


def test_check_status_fail():
    instances = [
        {'pass':True},
        {'pass':False},
        {'pass':True}
        {'pass': True},
        {'pass': False},
        {'pass': True}
    ]

    check = DummyAgentCheck('dummy_agent_check', {}, {}, instances)
    instance_statuses = check.run()
    assert len(instance_statuses) == 3
    assert instance_statuses[0].status == STATUS_OK
    assert instance_statuses[1].status == STATUS_ERROR
    assert instance_statuses[2].status == STATUS_OK


def test_check_status_pass():
    instances = [
        {'pass':True},
        {'pass':True},
        {'pass': True},
        {'pass': True},
    ]

    check = DummyAgentCheck('dummy_agent_check', {}, {}, instances)
    instances_status = check.run()
    assert len(instances_status) == 2
    for i in instances_status:
        assert i.status == STATUS_OK


def test_persistence():
    i1 = InstanceStatus(1, STATUS_OK)
    chk1 = CheckStatus("dummy", [i1], 1, 2)

@@ -54,6 +54,7 @@ def test_persistence():
    assert chk2.metric_count == 1
    assert chk2.event_count == 2


def test_persistence_fail():

    # Assert remove doesn't crap out if a file doesn't exist.
@@ -6,9 +6,11 @@ from monagent.common.exceptions import UnknownValue, CheckException, Infinity
from monagent.collector.checks import Check
from monagent.common.aggregator import MetricsAggregator


class TestCore(unittest.TestCase):

    "Tests to validate the core check logic"

    def setUp(self):
        self.c = Check(logger)
        self.c.gauge("test-metric")

@@ -28,7 +30,8 @@ class TestCore(unittest.TestCase):
        self.assertEquals(len(self.c._sample_store["test-metric"]), 1)
        # with explicit timestamp
        self.c.save_sample("test-metric", 3.0, 1298066183.607717)
        self.assertEquals(self.c.get_sample_with_timestamp("test-metric"), (1298066183.607717, 3.0, None, None))
        self.assertEquals(self.c.get_sample_with_timestamp(
            "test-metric"), (1298066183.607717, 3.0, None, None))
        # get_samples()
        self.assertEquals(self.c.get_samples(), {"test-metric": 3.0})

@@ -44,7 +47,8 @@ class TestCore(unittest.TestCase):
        self.assertRaises(UnknownValue, self.c.get_sample, "test-counter", expire=False)
        self.c.save_sample("test-counter", 2.0, 2.0)
        self.assertEquals(self.c.get_sample("test-counter", expire=False), 1.0)
        self.assertEquals(self.c.get_sample_with_timestamp("test-counter", expire=False), (2.0, 1.0, None, None))
        self.assertEquals(self.c.get_sample_with_timestamp(
            "test-counter", expire=False), (2.0, 1.0, None, None))
        self.assertEquals(self.c.get_samples(expire=False), {"test-counter": 1.0})
        self.c.save_sample("test-counter", -2.0, 3.0)
        self.assertRaises(UnknownValue, self.c.get_sample_with_timestamp, "test-counter")

@@ -53,21 +57,27 @@ class TestCore(unittest.TestCase):
        # Test metric dimensions
        now = int(time.time())
        # dimensions metrics
        self.c.save_sample("test-counter", 1.0, 1.0, dimensions={"dim1": "value1", "dim2": "value2"})
        self.c.save_sample("test-counter", 2.0, 2.0, dimensions={"dim1": "value1", "dim2": "value2"})
        self.c.save_sample(
            "test-counter", 1.0, 1.0, dimensions={"dim1": "value1", "dim2": "value2"})
        self.c.save_sample(
            "test-counter", 2.0, 2.0, dimensions={"dim1": "value1", "dim2": "value2"})
        # Only 1 point recording for this combination of dimensions, won't be sent
        self.c.save_sample("test-counter", 3.0, 3.0, dimensions={"dim1": "value1", "dim3": "value3"})
        self.c.save_sample(
            "test-counter", 3.0, 3.0, dimensions={"dim1": "value1", "dim3": "value3"})
        self.c.save_sample("test-metric", 3.0, now, dimensions={"dim3": "value3", "dim4": "value4"})
        # Arg checks
        self.assertRaises(CheckException, self.c.save_sample, "test-metric", 4.0, now + 5, dimensions="abc")
        self.assertRaises(
            CheckException, self.c.save_sample, "test-metric", 4.0, now + 5, dimensions="abc")
        # This is a different combination of dimensions
        self.c.save_sample("test-metric", 3.0, now, dimensions={"dim5": "value5", "dim3": "value3"})
        results = self.c.get_metrics()
        results.sort()
        self.assertEquals(results,
                          [("test-counter", 2.0, 1.0, {"dimensions": {"dim1": "value1", "dim2": "value2"}}),
                           ("test-metric", now, 3.0, {"dimensions": {"dim3": "value3", "dim4": "value4"}}),
                           ("test-metric", now, 3.0, {"dimensions": {"dim3": "value3", "dim5": "value5"}})
                           ("test-metric", now, 3.0,
                            {"dimensions": {"dim3": "value3", "dim4": "value4"}}),
                           ("test-metric", now, 3.0,
                            {"dimensions": {"dim3": "value3", "dim5": "value5"}})
                           ])
        # dimensions metrics are not available through get_samples anymore
        self.assertEquals(self.c.get_samples(), {})

@@ -77,26 +87,34 @@ class TestCore(unittest.TestCase):
        self.c.save_sample("test-metric", 1.0, 0.0)  # value, ts
        self.c.save_sample("test-counter", 1.0, 1.0)  # value, ts
        self.c.save_sample("test-counter", 4.0, 2.0)  # value, ts
        assert "test-metric" in self.c.get_samples_with_timestamps(expire=False), self.c.get_samples_with_timestamps(expire=False)
        self.assertEquals(self.c.get_samples_with_timestamps(expire=False)["test-metric"], (0.0, 1.0, None, None))
        assert "test-counter" in self.c.get_samples_with_timestamps(expire=False), self.c.get_samples_with_timestamps(expire=False)
        self.assertEquals(self.c.get_samples_with_timestamps(expire=False)["test-counter"], (2.0, 3.0, None, None))
        assert "test-metric" in self.c.get_samples_with_timestamps(
            expire=False), self.c.get_samples_with_timestamps(expire=False)
        self.assertEquals(self.c.get_samples_with_timestamps(
            expire=False)["test-metric"], (0.0, 1.0, None, None))
        assert "test-counter" in self.c.get_samples_with_timestamps(
            expire=False), self.c.get_samples_with_timestamps(expire=False)
        self.assertEquals(self.c.get_samples_with_timestamps(
            expire=False)["test-counter"], (2.0, 3.0, None, None))

    def test_name(self):
        self.assertEquals(self.c.normalize("metric"), "metric")
        self.assertEquals(self.c.normalize("metric", "prefix"), "prefix.metric")
        self.assertEquals(self.c.normalize("__metric__", "prefix"), "prefix.metric")
        self.assertEquals(self.c.normalize("abc.metric(a+b+c{}/5)", "prefix"), "prefix.abc.metric_a_b_c_5")
        self.assertEquals(self.c.normalize("VBE.default(127.0.0.1,,8080).happy", "varnish"), "varnish.VBE.default_127.0.0.1_8080.happy")
        self.assertEquals(
            self.c.normalize("abc.metric(a+b+c{}/5)", "prefix"), "prefix.abc.metric_a_b_c_5")
        self.assertEquals(self.c.normalize(
            "VBE.default(127.0.0.1,,8080).happy", "varnish"), "varnish.VBE.default_127.0.0.1_8080.happy")


class TestAggregator(unittest.TestCase):

    def setUp(self):
        self.aggr = MetricsAggregator('test-aggr')

    def test_dupe_tags(self):
        self.aggr.increment('test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue'})
        self.aggr.increment('test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue', 'b': 'bvalue'})
        self.aggr.increment(
            'test-counter', 1, dimensions={'a': 'avalue', 'b': 'bvalue', 'b': 'bvalue'})
        self.assertEquals(len(self.aggr.metrics), 1, self.aggr.metrics)
        metric = self.aggr.metrics.values()[0]
        self.assertEquals(metric.value, 2)
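# test_name above pins normalize's behaviour: illegal characters collapse to
# underscores, dots survive, and an optional prefix is joined with a dot. A
# regex sketch that reproduces all four assertions (normalize_sketch is not
# the shipped implementation):
import re


def normalize_sketch(metric, prefix=None):
    name = re.sub(r'[^a-zA-Z0-9_.]+', '_', metric)  # illegal runs -> _
    name = re.sub(r'_*\._*', '.', name)             # tidy underscores around dots
    name = name.strip('_')
    return '%s.%s' % (prefix, name) if prefix else name


assert normalize_sketch("abc.metric(a+b+c{}/5)", "prefix") == "prefix.abc.metric_a_b_c_5"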
@@ -1,4 +1,4 @@
## -*- coding: latin-1 -*-
# -*- coding: latin-1 -*-
import unittest
import os.path
import tempfile

@@ -8,10 +8,12 @@ from monagent.common.util import PidFile, is_valid_hostname


class TestConfig(unittest.TestCase):

    def testWhiteSpaceConfig(self):
        """Leading whitespace confuse ConfigParser
        """
        agent_config = get_config(cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf"))
        agent_config = get_config(
            cfg_path=os.path.join(os.path.dirname(os.path.realpath(__file__)), "badconfig.conf"))
        self.assertEquals(agent_config["api_key"], "1234")

    def testGoodPidFie(self):

@@ -38,10 +40,10 @@ class TestConfig(unittest.TestCase):
            u'i-123445',
            u'5dfsdfsdrrfsv',
            u'432498234234A'
            u'234234235235235235', # Couldn't find anything in the RFC saying it's not valid
            u'234234235235235235',  # Couldn't find anything in the RFC saying it's not valid
            u'A45fsdff045-dsflk4dfsdc.ret43tjssfd',
            u'4354sfsdkfj4TEfdlv56gdgdfRET.dsf-dg',
            u'r'*255,
            u'r' * 255,
        ]

        not_valid_hostnames = [

@@ -60,4 +62,3 @@ class TestConfig(unittest.TestCase):

if __name__ == '__main__':
    unittest.main()
@@ -24,4 +24,5 @@ class CouchDBTestCase(unittest.TestCase):
        metrics = self.check.get_metrics()
        self.assertTrue(type(metrics) == type([]), metrics)
        self.assertTrue(len(metrics) > 3)
        self.assertTrue(len([k for k in metrics if "instance:http://localhost:5984" in k[3]['dimensions']]) > 3)
        self.assertTrue(
            len([k for k in metrics if "instance:http://localhost:5984" in k[3]['dimensions']]) > 3)
@@ -3,6 +3,8 @@ from tests.common import load_check

from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest


class CouchbaseTestCase(unittest.TestCase):

    def setUp(self):

@@ -22,22 +24,22 @@ class CouchbaseTestCase(unittest.TestCase):
    @attr('couchbase')
    def test_camel_case_to_joined_lower(self):
        test_pairs = {
            'camelCase' : 'camel_case',
            'FirstCapital' : 'first_capital',
            'joined_lower' : 'joined_lower',
            'joined_Upper1' : 'joined_upper1',
            'Joined_upper2' : 'joined_upper2',
            'Joined_Upper3' : 'joined_upper3',
            '_leading_Underscore' : 'leading_underscore',
            'Trailing_Underscore_' : 'trailing_underscore',
            'DOubleCAps' : 'd_ouble_c_aps',
            '@@@super--$$-Funky__$__$$%' : 'super_funky',
            'camelCase': 'camel_case',
            'FirstCapital': 'first_capital',
            'joined_lower': 'joined_lower',
            'joined_Upper1': 'joined_upper1',
            'Joined_upper2': 'joined_upper2',
            'Joined_Upper3': 'joined_upper3',
            '_leading_Underscore': 'leading_underscore',
            'Trailing_Underscore_': 'trailing_underscore',
            'DOubleCAps': 'd_ouble_c_aps',
            '@@@super--$$-Funky__$__$$%': 'super_funky',
        }

        for test_input, expected_output in test_pairs.items():
            test_output = self.check.camel_case_to_joined_lower(test_input)
            self.assertEqual(test_output, expected_output,
                'Input was %s, expected output was %s, actual output was %s' % (test_input, expected_output, test_output))
            self.assertEqual(test_output, expected_output,
                             'Input was %s, expected output was %s, actual output was %s' % (test_input, expected_output, test_output))

    @attr('couchbase')
    def test_metrics_casing(self):

@@ -46,25 +48,28 @@ class CouchbaseTestCase(unittest.TestCase):

        metrics = self.check.get_metrics()

        camel_cased_metrics = [u'couchbase.hdd.used_by_data',
                u'couchbase.ram.used_by_data',
                u'couchbase.ram.quota_total',
                u'couchbase.ram.quota_used',
                ]
        camel_cased_metrics = [u'couchbase.hdd.used_by_data',
                               u'couchbase.ram.used_by_data',
                               u'couchbase.ram.quota_total',
                               u'couchbase.ram.quota_used',
                               ]

        found_metrics = [k[0] for k in metrics if k[0] in camel_cased_metrics]
        self.assertEqual(found_metrics.sort(), camel_cased_metrics.sort())

    @attr('couchbase')
    def test_metrics(self):
        raise SkipTest("Skipped for now as it's hard to configure couchbase on travis")
        self.check.check(self.config['instances'][0])

        metrics = self.check.get_metrics()

        self.assertTrue(type(metrics) == type([]), metrics)
        self.assertTrue(len(metrics) > 3)
        self.assertTrue(len([k for k in metrics if "instance:http://localhost:8091" in k[3]['dimensions']]) > 3)
        self.assertTrue(
            len([k for k in metrics if "instance:http://localhost:8091" in k[3]['dimensions']]) > 3)

        self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_node')]) > 1, 'Unable to fund any per node metrics')
        self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_bucket')]) > 1, 'Unable to fund any per node metrics')
        self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_node')])
                        > 1, 'Unable to fund any per node metrics')
        self.assertTrue(len([k for k in metrics if -1 != k[0].find('by_bucket')])
                        > 1, 'Unable to fund any per node metrics')
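# The test_pairs table above fully specifies camel_case_to_joined_lower,
# including the tricky 'DOubleCAps' -> 'd_ouble_c_aps' case. A compact sketch
# that satisfies every pair (not the shipped implementation):
import re


def camel_case_to_joined_lower_sketch(name):
    s = re.sub(r'[^a-zA-Z0-9]+', '_', name)   # squash symbol runs to _
    s = re.sub(r'([A-Z])', r'_\1', s)         # break before every capital
    s = re.sub(r'_+', '_', s).strip('_')      # collapse and trim underscores
    return s.lower()


assert camel_case_to_joined_lower_sketch('DOubleCAps') == 'd_ouble_c_aps'
assert camel_case_to_joined_lower_sketch('@@@super--$$-Funky__$__$$%') == 'super_funky'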
@@ -11,14 +11,16 @@ from collector.dogstream import cassandra, supervisord_log, common
log = logging.getLogger('datadog.test')
NAGIOS_TEST_HOST = os.path.join(os.path.dirname(__file__), "host-perfdata")
NAGIOS_TEST_SVC = os.path.join(os.path.dirname(__file__), "service-perfdata")
NAGIOS_TEST_HOST_TEMPLATE="[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$"
NAGIOS_TEST_SVC_TEMPLATE="[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"
NAGIOS_TEST_HOST_TEMPLATE = "[HOSTPERFDATA]\t$TIMET$\t$HOSTNAME$\t$HOSTEXECUTIONTIME$\t$HOSTOUTPUT$\t$HOSTPERFDATA$"
NAGIOS_TEST_SVC_TEMPLATE = "[SERVICEPERFDATA]\t$TIMET$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$"


def parse_ancient_function_plugin(logger, line):
    """Ancient stateless parser"""
    res = line.split()
    res[3] = {'metric_type': 'gauge'}


def parse_function_plugin(logger, line, state):
    """Simple stateful parser"""
    try:
@@ -31,13 +33,16 @@ def parse_function_plugin(logger, line, state):
        res[3] = {'metric_type': 'counter'}
    return tuple(res)


class ParseClassPlugin(object):

    """Class-based stateful parser"""

    def __init__(self, logger=None, user_args=(), **kwargs):
        self.logger = logger
        self.args = '.'.join(user_args)
        self.acc = 0
        self.logger.info('Completed initialization')

    def parse_line(self, line):
        self.logger.info('Parsing line %r; counter is %r', line, self.acc)
        self.acc += 1

@@ -47,21 +52,24 @@ class ParseClassPlugin(object):
        res[3] = {'metric_type': 'counter'}
        return tuple(res)


import time
from datetime import datetime
import calendar

log_event_pattern = re.compile("".join([
    r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ", # iso timestamp
    r"\[(?P<alert_type>(ERROR)|(RECOVERY))\] - ", # alert type
    r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ",  # iso timestamp
    r"\[(?P<alert_type>(ERROR)|(RECOVERY))\] - ",  # alert type
    r"(?P<msg_title>(?P<host>[^ ]*).*)"
]))
alert_types = {
    "ERROR": "error",
    "RECOVERY": "success"
}


def parse_events(logger, line):
    """ Expecting lines like this:
    """ Expecting lines like this:
    2012-05-14 12:46:01 [ERROR] - host0 is down (broke its collarbone)
    """
    match = log_event_pattern.match(line)
@@ -69,22 +77,25 @@ def parse_events(logger, line):
        groups = match.groupdict()
        groups.update({
            'alert_type': alert_types.get(groups['alert_type'], ''),
            'timestamp': calendar.timegm(datetime.strptime(groups['timestamp'], '%Y-%m-%d %H:%M:%S').timetuple()),
            'timestamp': calendar.timegm(
                datetime.strptime(groups['timestamp'], '%Y-%m-%d %H:%M:%S').timetuple()),
            'msg_text': line
            })
        })

        return groups
    else:
        return None


def repr_event_parser(logger, line):
    return eval(line)


class TailTestCase(unittest.TestCase):
    def setUp(self):
        self.log_file = NamedTemporaryFile()
        self.logger = logging.getLogger('test.dogstream')

    def _write_log(self, log_data):
        for data in log_data:
            print >> self.log_file, data
@@ -93,6 +104,7 @@ class TailTestCase(unittest.TestCase):
    def tearDown(self):
        self.log_file.close()


class TestDogstream(TailTestCase):
    gauge = {'metric_type': 'gauge'}
    counter = {'metric_type': 'counter'}

@@ -107,7 +119,7 @@ class TestDogstream(TailTestCase):
        log.info("Test config: %s" % self.config)
        self.dogstream = Dogstreams.init(self.logger, self.config)
        self.maxDiff = None

    def test_dogstream_gauge(self):
        log_data = [
            # bucket 0
@@ -122,21 +134,21 @@ class TestDogstream(TailTestCase):
            ('test.metric.a', '1000000006', '7', 'metric_type=gauge'),
            ('test.metric.a', '1000000007', '8', 'metric_type=gauge'),
        ]

        expected_output = {
            "dogstream": [
                ('test.metric.a', 1000000000, 5.0, self.gauge),
                ('test.metric.a', 1000000005, 8.0, self.gauge),
            ]
        }

        self._write_log((' '.join(data) for data in log_data))

        actual_output = self.dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
        for metric, timestamp, val, attr in expected_output['dogstream']:
            assert isinstance(val, float)

    def test_dogstream_counter(self):
        log_data = [
            # bucket 0
@@ -151,14 +163,14 @@ class TestDogstream(TailTestCase):
            ('test.metric.a', '1000000006', '7', 'metric_type=counter'),
            ('test.metric.a', '1000000007', '8', 'metric_type=counter'),
        ]

        expected_output = {
            "dogstream": [
                ('test.metric.a', 1000000000, 42, self.counter),
                ('test.metric.a', 1000000005, 27, self.counter),
            ]
        }

        self._write_log((' '.join(data) for data in log_data))

        actual_output = self.dogstream.check(self.config, move_end=False)
@@ -173,12 +185,10 @@ class TestDogstream(TailTestCase):
            ('test_metric.e 1 1000000002 metric_type=gauge'),
            ('test_metric.e 1000000002 10 metric_type=gauge'),
        ]
        expected_output = {"dogstream":
            [('test_metric.e', 1000000000, 10, self.gauge)]
        }

        expected_output = {"dogstream": [('test_metric.e', 1000000000, 10, self.gauge)]}

        self._write_log(log_data)

        actual_output = self.dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
@@ -194,7 +204,9 @@ class TestDogstream(TailTestCase):
                ('test.metric.simple', 1100000000, 1, self.gauge)]
        }
        self._write_log(log_data)
        plugdog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_ancient_function_plugin' % self.log_file.name})
        plugdog = Dogstreams.init(
            self.logger, {
                'dogstreams': '%s:tests.test_datadog:parse_ancient_function_plugin' % self.log_file.name})
        actual_output = plugdog.check(self.config, move_end=False)

    def test_dogstream_function_plugin(self):
@@ -210,7 +222,9 @@ class TestDogstream(TailTestCase):
        }
        self._write_log(log_data)

        statedog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_function_plugin' % self.log_file.name})
        statedog = Dogstreams.init(
            self.logger,
            {'dogstreams': '%s:tests.test_datadog:parse_function_plugin' % self.log_file.name})
        actual_output = statedog.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
@@ -227,7 +241,9 @@ class TestDogstream(TailTestCase):
        }
        self._write_log(log_data)

        statedog = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:ParseClassPlugin:foo:bar' % self.log_file.name})
        statedog = Dogstreams.init(
            self.logger,
            {'dogstreams': '%s:tests.test_datadog:ParseClassPlugin:foo:bar' % self.log_file.name})
        actual_output = statedog.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
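# The dogstreams values above all share one spec format:
#   <log path>:<python module>:<parser callable>[:<arg>:<arg>...]
# ('foo' and 'bar' become the user_args of ParseClassPlugin). A sketch of
# unpacking and resolving such a spec (resolve_dogstream_spec is a
# hypothetical helper, not the agent's loader; windows paths with drive
# letters would need extra care):
import importlib


def resolve_dogstream_spec(spec):
    parts = spec.split(':')
    log_path, module_name, parser_name = parts[0], parts[1], parts[2]
    user_args = tuple(parts[3:])
    module = importlib.import_module(module_name)
    return log_path, getattr(module, parser_name), user_args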
@@ -289,7 +305,8 @@ class TestDogstream(TailTestCase):

        self._write_log(log_data)

        dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:parse_events' % self.log_file.name})
        dogstream = Dogstreams.init(
            self.logger, {'dogstreams': '%s:tests.test_datadog:parse_events' % self.log_file.name})
        actual_output = dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
@@ -323,7 +340,9 @@ class TestDogstream(TailTestCase):

        self._write_log([repr(d) for d in log_data])

        dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:tests.test_datadog:repr_event_parser' % self.log_file.name})
        dogstream = Dogstreams.init(
            self.logger,
            {'dogstreams': '%s:tests.test_datadog:repr_event_parser' % self.log_file.name})
        actual_output = dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
@@ -347,55 +366,60 @@ class TestDogstream(TailTestCase):
        event_object = EventDefaults.EVENT_OBJECT

        expected_output = {
            "dogstreamEvents":[
            {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            }, {
                "timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
                "msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
                "alert_type": alert_type,
                "auto_priority": 0,
                "event_type": event_type,
                "aggregation_key": event_object,
                "event_object": event_object,
            }, {
                "timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
                "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[0:common.MAX_TITLE_LEN],
                "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            }, {
                "timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            }, {
                "timestamp": cassandra.parse_date(datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
                "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                "alert_type": alert_type,
                "event_type": event_type,
                "auto_priority": 0,
                "aggregation_key": event_object,
                "event_object": event_object,
            },
        ]}
            "dogstreamEvents": [
                {
                    "timestamp": cassandra.parse_date("2012-05-12 21:10:48,058"),
                    "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]"[
                        0:common.MAX_TITLE_LEN],
                    "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6528-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6531-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6529-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6530-Data.db')]",
                    "alert_type": alert_type,
                    "auto_priority": 0,
                    "event_type": event_type,
                    "aggregation_key": event_object,
                    "event_object": event_object,
                }, {
                    "timestamp": cassandra.parse_date("2012-05-12 21:10:54,851"),
                    "msg_title": "Compacted to [/var/cassandra/a-hc-65-Data.db,]. 102,079,134 to 101,546,397",
                    "alert_type": alert_type,
                    "auto_priority": 0,
                    "event_type": event_type,
                    "aggregation_key": event_object,
                    "event_object": event_object,
                }, {
                    "timestamp": cassandra.parse_date("2012-05-13 13:15:01,927"),
                    "msg_title": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]"[
                        0:common.MAX_TITLE_LEN],
                    "msg_text": "Compacting [SSTableReader(path='/var/cassandra/data/test_data/series-hc-6527-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6522-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6532-Data.db'), SSTableReader(path='/var/cassandra/data/test_data/series-hc-6517-Data.db')]",
                    "alert_type": alert_type,
                    "event_type": event_type,
                    "auto_priority": 0,
                    "aggregation_key": event_object,
                    "event_object": event_object,
                }, {
                    "timestamp": cassandra.parse_date("2012-05-13 13:27:17,685"),
                    "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                    "alert_type": alert_type,
                    "event_type": event_type,
                    "auto_priority": 0,
                    "aggregation_key": event_object,
                    "event_object": event_object,
                }, {
                    "timestamp": cassandra.parse_date(
                        datetime.utcnow().strftime("%Y-%m-%d") + " 13:27:17,685"),
                    "msg_title": "Compacting large row test_data/series:6c6f677c32 (782001077 bytes) incrementally",
                    "alert_type": alert_type,
                    "event_type": event_type,
                    "auto_priority": 0,
                    "aggregation_key": event_object,
                    "event_object": event_object,
                },
            ]}

        self._write_log(log_data.split("\n"))

        dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name})
        dogstream = Dogstreams.init(
            self.logger,
            {'dogstreams': '%s:dogstream.cassandra:parse_cassandra' % self.log_file.name})
        actual_output = dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
@@ -408,7 +432,7 @@ class TestDogstream(TailTestCase):
        event_type = supervisord_log.EVENT_TYPE

        expected_output = {
            "dogstreamEvents":[
            "dogstreamEvents": [
                {
                    "alert_type": "info", "event_type": event_type,
                    "aggregation_key": "monitor",

@@ -420,7 +444,7 @@ class TestDogstream(TailTestCase):
                    "aggregation_key": "foo_bar",
                    "event_object": "foo_bar",
                    "msg_title": "success: foo_bar entered RUNNING state, "
                        "process has stayed up for > than 2 seconds (startsecs)",
                                 "process has stayed up for > than 2 seconds (startsecs)",
                    "timestamp": int(time.mktime(datetime(2012, 7, 14, 3, 2, 47).timetuple())),
                }, {
                    "alert_type": "error", "event_type": event_type,

@@ -438,10 +462,13 @@ class TestDogstream(TailTestCase):
        ]}
        self._write_log(log_data.split("\n"))

        dogstream = Dogstreams.init(self.logger, {'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
        dogstream = Dogstreams.init(
            self.logger,
            {'dogstreams': '%s:dogstream.supervisord_log:parse_supervisord' % self.log_file.name})
        actual_output = dogstream.check(self.config, move_end=False)
        self.assertEquals(expected_output, actual_output)
class TestNagiosPerfData(TailTestCase):
    def setUp(self):
        TailTestCase.setUp(self)

@@ -457,7 +484,7 @@ class TestNagiosPerfData(TailTestCase):
        for data in config_data:
            print >> self.nagios_config, data
        self.nagios_config.flush()

    def tearDown(self):
        TailTestCase.tearDown(self)
        self.nagios_config.close()
@@ -474,72 +501,73 @@ class TestNagiosPerfData(TailTestCase):
        self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])

        log_data = [
            ("DATATYPE::SERVICEPERFDATA",
             "TIMET::1000000000",
             "HOSTNAME::myhost0",
             "SERVICEDESC::Pgsql Backends",
             "SERVICEPERFDATA::" + " ".join([
                 "time=0.06",
                 "db0=33;180;190;0;200",
                 "db1=1;150;190;0;200",
                 "db2=0;120;290;1;200",
                 "db3=0;110;195;5;100"
             ]),
             "SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
             "HOSTSTATE::UP",
             "HOSTSTATETYPE::HARD",
             "SERVICESTATE::OK",
             "SERVICESTATETYPE::HARD",
            (
                "DATATYPE::SERVICEPERFDATA",
                "TIMET::1000000000",
                "HOSTNAME::myhost0",
                "SERVICEDESC::Pgsql Backends",
                "SERVICEPERFDATA::" + " ".join([
                    "time=0.06",
                    "db0=33;180;190;0;200",
                    "db1=1;150;190;0;200",
                    "db2=0;120;290;1;200",
                    "db3=0;110;195;5;100"
                ]),
                "SERVICECHECKCOMMAND::check_nrpe_1arg!check_postgres_backends",
                "HOSTSTATE::UP",
                "HOSTSTATETYPE::HARD",
                "SERVICESTATE::OK",
                "SERVICESTATETYPE::HARD",
            ),
        ]

        expected_output = [
            ('nagios.pgsql_backends.time', 1000000000, 0.06, {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
            }),
            ('nagios.pgsql_backends.db0', 1000000000, 33., {
            ('nagios.pgsql_backends.db0', 1000000000, 33., {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
                'warn': '180',
                'crit': '190',
                'min': '0',
                'max': '200',
                'min': '0',
                'max': '200',
            }),
            ('nagios.pgsql_backends.db1', 1000000000, 1., {
            ('nagios.pgsql_backends.db1', 1000000000, 1., {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
                'warn': '150',
                'crit': '190',
                'min': '0',
                'max': '200',
                'min': '0',
                'max': '200',
            }),
            ('nagios.pgsql_backends.db2', 1000000000, 0., {
            ('nagios.pgsql_backends.db2', 1000000000, 0., {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
                'warn': '120',
                'crit': '290',
                'min': '1',
                'max': '200',
                'min': '1',
                'max': '200',
            }),
            ('nagios.pgsql_backends.db3', 1000000000, 0., {
            ('nagios.pgsql_backends.db3', 1000000000, 0., {
                'metric_type': 'gauge',
                'host_name': 'myhost0',
                'warn': '110',
                'crit': '195',
                'min': '5',
                'max': '100',
                'min': '5',
                'max': '100',
            }),
        ]
        expected_output.sort(key=point_sorter)

        self._write_log(('\t'.join(data) for data in log_data))
        self._write_log(('\t'.join(data) for data in log_data))

        actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
        actual_output.sort(key=point_sorter)

        self.assertEquals(expected_output, actual_output)

    def test_service_perfdata_special_cases(self):
        from collector.checks.datadog import NagiosServicePerfData
@@ -552,7 +580,8 @@ class TestNagiosPerfData(TailTestCase):
        self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])

        log_data = [
            ( "DATATYPE::SERVICEPERFDATA",
            (
                "DATATYPE::SERVICEPERFDATA",
                "TIMET::1000000000",
                "HOSTNAME::myhost2",
                "SERVICEDESC::Disk Space",

@@ -573,7 +602,7 @@ class TestNagiosPerfData(TailTestCase):
                "SERVICESTATETYPE::HARD",
            )
        ]

        expected_output = [
            ('nagios.disk_space', 1000000000, 5477., {
                'metric_type': 'gauge',

@@ -658,12 +687,13 @@ class TestNagiosPerfData(TailTestCase):
        ]
        expected_output.sort(key=point_sorter)

        self._write_log(('\t'.join(data) for data in log_data))
        self._write_log(('\t'.join(data) for data in log_data))

        actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
        actual_output.sort(key=point_sorter)

        self.assertEquals(expected_output, actual_output)
def test_host_perfdata(self):
|
||||
from collector.checks.datadog import NagiosHostPerfData
|
||||
|
||||
|
@@ -676,19 +706,20 @@ class TestNagiosPerfData(TailTestCase):
        self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])

        log_data = [
            ("DATATYPE::HOSTPERFDATA",
            (
                "DATATYPE::HOSTPERFDATA",
                "TIMET::1000000010",
                "HOSTNAME::myhost1",
                "HOSTPERFDATA::" + " ".join([
                    "rta=0.978000ms;5000.000000;5000.000000;0.000000",
                    "pl=0%;100;100;0",
                ]),
                "HOSTCHECKCOMMAND::check-host-alive",
                "HOSTSTATE::UP",
                "HOSTSTATETYPE::HARD",
            ),
        ]

        expected_output = [
            ('nagios.host.rta', 1000000010, 0.978, {
                'metric_type': 'gauge',

@@ -698,7 +729,7 @@ class TestNagiosPerfData(TailTestCase):
                'crit': '5000.000000',
                'min': '0.000000'
            }),
            ('nagios.host.pl', 1000000010, 0., {
                'metric_type': 'gauge',
                'host_name': 'myhost1',
                'unit': '%',

@@ -709,7 +740,7 @@ class TestNagiosPerfData(TailTestCase):
        ]
        expected_output.sort(key=point_sorter)

        self._write_log(('\t'.join(data) for data in log_data))

        actual_output = dogstream.check(self.agent_config, move_end=False)['dogstream']
        actual_output.sort(key=point_sorter)

@@ -728,7 +759,23 @@ class TestNagiosPerfData(TailTestCase):
        self.assertEquals([NagiosServicePerfData], [d.__class__ for d in dogstream.dogstreams])
        actual_output = dogstream.check(self.agent_config, move_end=False)

        expected_output = {'dogstream': [('nagios.current_users.users', 1339511440, 1.0, {'metric_type': 'gauge', 'warn': '20', 'host_name': 'localhost', 'crit': '50', 'min': '0'}), ('nagios.ping.pl', 1339511500, 0.0, {'warn': '20', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '60', 'unit': '%'}), ('nagios.ping.rta', 1339511500, 0.065, {'warn': '100.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '500.000000', 'unit': 'ms'}), ('nagios.root_partition', 1339511560, 2470.0, {'min': '0', 'max': '7315', 'device_name': '/', 'warn': '5852', 'metric_type': 'gauge', 'host_name': 'localhost', 'crit': '6583', 'unit': 'MB'})]}
        expected_output = {'dogstream': [('nagios.current_users.users', 1339511440, 1.0,
                                          {'metric_type': 'gauge', 'warn': '20',
                                           'host_name': 'localhost', 'crit': '50', 'min': '0'}),
                                         ('nagios.ping.pl', 1339511500, 0.0,
                                          {'warn': '20', 'metric_type': 'gauge',
                                           'host_name': 'localhost', 'min': '0', 'crit': '60',
                                           'unit': '%'}),
                                         ('nagios.ping.rta', 1339511500, 0.065,
                                          {'warn': '100.000000', 'metric_type': 'gauge',
                                           'host_name': 'localhost',
                                           'min': '0.000000', 'crit': '500.000000',
                                           'unit': 'ms'}),
                                         ('nagios.root_partition', 1339511560, 2470.0,
                                          {'min': '0', 'max': '7315', 'device_name': '/',
                                           'warn': '5852', 'metric_type': 'gauge',
                                           'host_name': 'localhost', 'crit': '6583',
                                           'unit': 'MB'})]}
        self.assertEquals(expected_output, actual_output)

    def test_alt_host_perfdata(self):

@@ -743,9 +790,17 @@ class TestNagiosPerfData(TailTestCase):
        self.assertEquals([NagiosHostPerfData], [d.__class__ for d in dogstream.dogstreams])
        actual_output = dogstream.check(self.agent_config, move_end=False)

        expected_output = {'dogstream': [('nagios.host.pl', 1339511440, 0.0, {'warn': '80', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0', 'crit': '100', 'unit': '%'}), ('nagios.host.rta', 1339511440, 0.048, {'warn': '3000.000000', 'metric_type': 'gauge', 'host_name': 'localhost', 'min': '0.000000', 'crit': '5000.000000', 'unit': 'ms'})]}
        expected_output = {'dogstream': [('nagios.host.pl', 1339511440, 0.0,
                                          {'warn': '80', 'metric_type': 'gauge',
                                           'host_name': 'localhost', 'min': '0', 'crit': '100',
                                           'unit': '%'}),
                                         ('nagios.host.rta', 1339511440, 0.048,
                                          {'warn': '3000.000000', 'metric_type': 'gauge',
                                           'host_name': 'localhost', 'min': '0.000000',
                                           'crit': '5000.000000', 'unit': 'ms'})]}
        self.assertEquals(expected_output, actual_output)


if __name__ == '__main__':
    logging.basicConfig(format="%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")
    unittest.main()

@@ -11,6 +11,7 @@ from tests.common import load_check
PORT = 9200
MAX_WAIT = 150


class TestElastic(unittest.TestCase):

    def _wait(self, url):

@@ -24,17 +25,16 @@ class TestElastic(unittest.TestCase):
                time.sleep(0.5)
                loop += 1
                if loop >= MAX_WAIT:
                    break

    def setUp(self):
        self.process = None
        try:
            # Start elasticsearch
            self.process = subprocess.Popen(["elasticsearch","-f","elasticsearch"],
                                            executable="elasticsearch",
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)
            self.process = subprocess.Popen(["elasticsearch", "-f", "elasticsearch"],
                                            executable="elasticsearch",
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)

            # Wait for it to really start
            self._wait("http://localhost:%s" % PORT)

@@ -44,16 +44,17 @@ class TestElastic(unittest.TestCase):
    def tearDown(self):
        if self.process is not None:
            self.process.terminate()

    def testElasticChecksD(self):
        raise SkipTest("See https://github.com/DataDog/dd-agent/issues/825")
        agent_config = {'elasticsearch': 'http://localhost:%s' % PORT, 'version': '0.1', 'api_key': 'toto'}
        agent_config = {'elasticsearch': 'http://localhost:%s' %
                        PORT, 'version': '0.1', 'api_key': 'toto'}

        # Initialize the check from checks_d
        c = load_check('elastic', {'init_config': {}, 'instances': {}}, agent_config)
        conf = c.parse_agent_config(agent_config)
        self.check = load_check('elastic', conf, agent_config)

        self.check.check(conf['instances'][0])
        r = self.check.get_metrics()

@@ -68,16 +69,17 @@ class TestElastic(unittest.TestCase):
        self.assertEquals(len([t for t in r if t[0] == "jvm.threads.peak_count"]), 1, r)
        self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.rx_count"]), 1, r)
        self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.tx_size"]), 1, r)
        self.assertEquals(len([t for t in r if t[0] == "elasticsearch.transport.server_open"]), 1, r)
        self.assertEquals(len([t for t in r if t[0] == "elasticsearch.thread_pool.snapshot.queue"]), 1, r)
        self.assertEquals(
            len([t for t in r if t[0] == "elasticsearch.transport.server_open"]), 1, r)
        self.assertEquals(
            len([t for t in r if t[0] == "elasticsearch.thread_pool.snapshot.queue"]), 1, r)
        self.assertEquals(len([t for t in r if t[0] == "elasticsearch.active_shards"]), 1, r)

        self.check.cluster_status[conf['instances'][0].get('url')] = "red"
        self.check.check(conf['instances'][0])
        events = self.check.get_events()
        self.assertEquals(len(events),1,events)
        self.assertEquals(len(events), 1, events)


if __name__ == "__main__":
    unittest.main()

@@ -2,6 +2,7 @@ import unittest
from tests.common import load_check
from nose.plugins.skip import SkipTest


class GearmanTestCase(unittest.TestCase):

    def testMetrics(self):

@@ -18,6 +18,7 @@ HAPROXY_OPEN_CFG = os.path.realpath(os.path.join(os.path.dirname(__file__), "hap


class HaproxyTestCase(unittest.TestCase):

    def _wait(self, url):
        loop = 0
        while True:

@@ -28,13 +29,13 @@ class HaproxyTestCase(unittest.TestCase):
                authhandler = urllib2.HTTPBasicAuthHandler(passman)
                opener = urllib2.build_opener(authhandler)
                urllib2.install_opener(opener)
                url = "%s%s" % (url,STATS_URL)
                url = "%s%s" % (url, STATS_URL)
                req = urllib2.Request(url)
                request = urllib2.urlopen(req)
                break
            except Exception:
                time.sleep(0.5)
                loop+=1
                loop += 1
            if loop >= MAX_WAIT:
                break

@@ -56,10 +57,10 @@ class HaproxyTestCase(unittest.TestCase):
            self.cfg.write(open(config_fn).read())
            self.cfg.flush()
            # Start haproxy
            self.process = subprocess.Popen(["haproxy","-d", "-f", self.cfg.name],
                                            executable="haproxy",
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)
            self.process = subprocess.Popen(["haproxy", "-d", "-f", self.cfg.name],
                                            executable="haproxy",
                                            stdout=subprocess.PIPE,
                                            stderr=subprocess.PIPE)

            # Wait for it to really start
            self._wait("http://localhost:3834/stats")

@@ -95,9 +96,9 @@ class HaproxyTestCase(unittest.TestCase):
        self.assertTrue(len(metrics) > 0)

        self.assertEquals(len([t for t in metrics
                               if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
        self.assertEquals(len([t for t in metrics
                               if t[0] == "haproxy.frontend.session.current"]), 1, metrics)

        inst = config['instances'][0]
        data = self.check._fetch_data(inst['url'], inst['username'], inst['password'])

@@ -158,9 +159,9 @@ class HaproxyTestCase(unittest.TestCase):
        self.assertTrue(len(metrics) > 0)

        self.assertEquals(len([t for t in metrics
                               if t[0] == "haproxy.backend.bytes.in_rate"]), 4, metrics)
        self.assertEquals(len([t for t in metrics
                               if t[0] == "haproxy.frontend.session.current"]), 1, metrics)

    def tearDown(self):
        if self.process is not None:

@@ -169,4 +170,3 @@ class HaproxyTestCase(unittest.TestCase):

if __name__ == "__main__":
    unittest.main()

@@ -19,6 +19,7 @@ instances:


class IISTestCase(unittest.TestCase):

    @attr('windows')
    def testIIS(self):
        raise SkipTest('Requires IIS and wmi')

@@ -13,7 +13,10 @@ from monagent.collector.jmxfetch import JMXFetch


STATSD_PORT = 8129


class DummyReporter(threading.Thread):

    def __init__(self, metrics_aggregator):
        threading.Thread.__init__(self)
        self.finished = threading.Event()

@@ -23,7 +26,6 @@ class DummyReporter(threading.Thread):
        self.finished = False
        self.start()

    def run(self):
        while not self.finished:
            time.sleep(self.interval)

@@ -34,26 +36,26 @@ class DummyReporter(threading.Thread):
        if metrics:
            self.metrics = metrics


class JMXTestCase(unittest.TestCase):

    def setUp(self):
        aggregator = MetricsAggregator("test_host")
        self.server = Server(aggregator, "localhost", STATSD_PORT)
        pid_file = PidFile('dogstatsd')
        self.reporter = DummyReporter(aggregator)

        self.t1 = threading.Thread(target=self.server.start)
        self.t1.start()

        confd_path = os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "jmx_yamls"))
        JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15)
        JMXFetch.init(confd_path, {'dogstatsd_port': STATSD_PORT}, get_logging_config(), 15)

    def tearDown(self):
        self.server.stop()
        self.reporter.finished = True
        JMXFetch.stop()

    def testCustomJMXMetric(self):
        raise SkipTest('Requires running JMX')
        count = 0

@@ -67,11 +69,13 @@ class JMXTestCase(unittest.TestCase):

        self.assertTrue(type(metrics) == type([]))
        self.assertTrue(len(metrics) > 0)
        self.assertEquals(len([t for t in metrics if t['metric'] == "my.metric.buf" and "instance:jmx_instance1" in t['dimensions']]), 2, metrics)
        self.assertTrue(len([t for t in metrics if 'type:ThreadPool' in t['dimensions'] and "instance:jmx_instance1" in t['dimensions'] and "jmx.catalina" in t['metric']]) > 8, metrics)
        self.assertTrue(len([t for t in metrics if "jvm." in t['metric'] and "instance:jmx_instance1" in t['dimensions']]) == 7, metrics)
        self.assertEquals(len([t for t in metrics if t[
            'metric'] == "my.metric.buf" and "instance:jmx_instance1" in t['dimensions']]), 2, metrics)
        self.assertTrue(len([t for t in metrics if 'type:ThreadPool' in t[
            'dimensions'] and "instance:jmx_instance1" in t['dimensions'] and "jmx.catalina" in t['metric']]) > 8, metrics)
        self.assertTrue(len([t for t in metrics if "jvm." in t['metric']
                             and "instance:jmx_instance1" in t['dimensions']]) == 7, metrics)


if __name__ == "__main__":
    unittest.main()

@@ -6,8 +6,10 @@ from collector.checks import LaconicFilter


class TestLaconic(unittest.TestCase):

    """Verify that we only output messages once
    """

    def setUp(self):
        self.l = logging.getLogger("test_laconic")
        self.sio = StringIO()

@@ -35,11 +37,13 @@ class TestLaconic(unittest.TestCase):
    def testRepeatingErrors(self):
        for i in range(10):
            self.l.error("Cannot find nagios.log")
        self.assertEquals(self.sio.getvalue().count("Cannot find nagios.log"), 1, self.sio.getvalue())
        self.assertEquals(
            self.sio.getvalue().count("Cannot find nagios.log"), 1, self.sio.getvalue())

        for i in range(10):
            self.l.warn("Cannot find ganglia.log")
        self.assertEquals(self.sio.getvalue().count("Cannot find ganglia.log"), 1, self.sio.getvalue())
        self.assertEquals(
            self.sio.getvalue().count("Cannot find ganglia.log"), 1, self.sio.getvalue())

        for i in range(10):
            try:

@@ -47,7 +51,8 @@ class TestLaconic(unittest.TestCase):
            except Exception:
                self.l.exception("Caught!")

        self.assertEquals(self.sio.getvalue().count("Ka-boom"), 2)  # once for the traceback, once for the message
        # once for the traceback, once for the message
        self.assertEquals(self.sio.getvalue().count("Ka-boom"), 2)

    def testNonRepeat(self):
        for i in range(10):

@@ -55,7 +60,6 @@ class TestLaconic(unittest.TestCase):
        self.assertEquals(self.sio.getvalue().count(" nagios.log"), 10)
        self.assertEquals(self.sio.getvalue().count(" 7"), 1)

    def testBlowUp(self):
        """Try to use a lot of memory"""
        for i in range(2 * self.laconic.LACONIC_MEM_LIMIT + 7):

@@ -8,6 +8,7 @@ from nose.plugins.skip import SkipTest


class TestMemCache(unittest.TestCase):

    def setUp(self):
        self.agent_config = {
            "memcache_server": "localhost",

@@ -20,7 +21,7 @@ class TestMemCache(unittest.TestCase):
    def _countConnections(self, port):
        pid = os.getpid()
        p1 = Popen(['lsof', '-a', '-p%s' %
                    pid, '-i4'], stdout=PIPE)
        p2 = Popen(["grep", ":%s" % port], stdin=p1.stdout, stdout=PIPE)
        p3 = Popen(["wc", "-l"], stdin=p2.stdout, stdout=PIPE)
        output = p3.communicate()[0]

@@ -50,7 +51,8 @@ class TestMemCache(unittest.TestCase):
        self.assertEquals(len([t for t in r if t[0] == "memcache.total_items"]), 3, r)

        # Check that we got 21 metrics for a specific host
        self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"instance": mythirdtag}]), 21, r)
        self.assertEquals(
            len([t for t in r if t[3].get('dimensions') == {"instance": mythirdtag}]), 21, r)

    def testDimensions(self):
        raise SkipTest('Requires mcache')

@@ -68,7 +70,8 @@ class TestMemCache(unittest.TestCase):
        r = self.c.get_metrics()

        # Check the dimensions
        self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"regular_old": "dimensions"}]), 21, r)
        self.assertEquals(
            len([t for t in r if t[3].get('dimensions') == {"regular_old": "dimensions"}]), 21, r)

        conf = {
            'memcache_server': 'localhost',

@@ -84,7 +87,8 @@ class TestMemCache(unittest.TestCase):
        r = self.c.get_metrics()

        # Check the dimensions
        self.assertEquals(len([t for t in r if t[3].get('dimensions') == {"instance": "localhost_11211"}]), 21, r)
        self.assertEquals(
            len([t for t in r if t[3].get('dimensions') == {"instance": "localhost_11211"}]), 21, r)

    def testDummyHost(self):
        new_conf = self.c.parse_agent_config({"memcache_instance_1": "dummy:11211:myothertag"})

@@ -12,21 +12,27 @@ default_target = 'DEFAULT'
specified_target = 'SPECIFIED'
has_been_mutated = False


class TestModuleLoad(unittest.TestCase):

    def setUp(self):
        sys.modules[__name__].has_been_mutated = True
        if 'tests.target_module' in sys.modules:
            del sys.modules['tests.target_module']

    def tearDown(self):
        sys.modules[__name__].has_been_mutated = False

    def test_cached_module(self):
        """Modules already in the cache should be reused"""
        self.assertTrue(modules.load('%s:has_been_mutated' % __name__))

    def test_cache_population(self):
        """Python module cache should be populated"""
        self.assertTrue(not 'tests.target_module' in sys.modules)
        modules.load('tests.target_module')
        self.assertTrue('tests.target_module' in sys.modules)

    def test_modname_load_default(self):
        """When the specifier contains no module name, any provided default
        should be used"""

@@ -36,6 +42,7 @@ class TestModuleLoad(unittest.TestCase):
                'default_target'),
            'DEFAULT'
        )

    def test_modname_load_specified(self):
        """When the specifier contains a module name, any provided default
        should be overridden"""

@@ -45,6 +52,7 @@ class TestModuleLoad(unittest.TestCase):
                'default_target'),
            'SPECIFIED'
        )

    def test_pathname_load_finds_package(self):
        """"Loading modules by absolute path should correctly set the name of
        the loaded module to include any package containing it."""

@@ -14,7 +14,9 @@ PORT1 = 37017
PORT2 = 37018
MAX_WAIT = 150


class TestMongo(unittest.TestCase):

    def wait4mongo(self, process, port):
        # Somehow process.communicate() hangs
        out = process.stdout

@@ -68,8 +70,10 @@ class TestMongo(unittest.TestCase):

    def tearDown(self):
        try:
            if "p1" in dir(self): self.p1.terminate()
            if "p2" in dir(self): self.p2.terminate()
            if "p1" in dir(self):
                self.p1.terminate()
            if "p2" in dir(self):
                self.p2.terminate()
        except Exception:
            logging.getLogger().exception("Cannot terminate mongod instances")

@@ -79,7 +83,7 @@ class TestMongo(unittest.TestCase):
            'instances': [{
                'server': "mongodb://localhost:%s/test" % PORT1
            },
                {
                'server': "mongodb://localhost:%s/test" % PORT2
            }]
        }

@@ -111,7 +115,7 @@ class TestMongo(unittest.TestCase):
        for m in metrics:
            metric_name = m[0]
            if metric_name in metric_val_checks:
                self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
                self.assertTrue(metric_val_checks[metric_name](m[2]))

        # Run the check against our running server
        self.check.check(self.config['instances'][1])

@@ -129,7 +133,7 @@ class TestMongo(unittest.TestCase):
        for m in metrics:
            metric_name = m[0]
            if metric_name in metric_val_checks:
                self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
                self.assertTrue(metric_val_checks[metric_name](m[2]))

    def testMongoOldConfig(self):
        raise SkipTest('Requires MongoDB')

@@ -173,7 +177,7 @@ class TestMongo(unittest.TestCase):
        for m in metrics:
            metric_name = m[0]
            if metric_name in metric_val_checks:
                self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
                self.assertTrue(metric_val_checks[metric_name](m[2]))

        # Test the second mongodb instance
        self.check = load_check('mongo', conf2, self.agent_config2)

@@ -194,7 +198,7 @@ class TestMongo(unittest.TestCase):
        for m in metrics:
            metric_name = m[0]
            if metric_name in metric_val_checks:
                self.assertTrue( metric_val_checks[metric_name]( m[2] ) )
                self.assertTrue(metric_val_checks[metric_name](m[2]))

if __name__ == '__main__':
    unittest.main()