commit
52ed313d0c
|
@ -1,578 +1 @@
|
|||
"""Base class for Checks.
|
||||
|
||||
If you are writing your own checks you should subclass the AgentCheck class.
|
||||
The Check class is being deprecated so don't write new checks with it.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
import traceback
|
||||
from pprint import pprint
|
||||
|
||||
from monagent.common import check_status
|
||||
from monagent.common.exceptions import NaN, CheckException, Infinity, UnknownValue
|
||||
from monagent.common.util import LaconicFilter, get_os, get_hostname
|
||||
from monagent.common.config import get_confd_path
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Constants
|
||||
|
||||
|
||||
# todo convert all checks to the new interface then remove this. Is the LaconicFilter on logs used elsewhere?
|
||||
#==============================================================================
|
||||
# DEPRECATED
|
||||
# ------------------------------
|
||||
# If you are writing your own check, you should inherit from AgentCheck
|
||||
# and not this class. This class will be removed in a future version
|
||||
# of the agent.
|
||||
#==============================================================================
|
||||
class Check(object):

    """(Abstract) base class for all old-style checks.

    Provides the ability to:
      * store 1 (and only 1) sample for gauges per metric/dimensions combination
      * compute rates for counters
      * only log error messages once (instead of each time they occur)

    DEPRECATED: inherit from AgentCheck for new checks.
    """

    def __init__(self, logger, agent_config=None):
        # where to store samples, indexed by metric_name
        # metric_name: {("sorted", "dimensions"): [(ts, value), (ts, value)],
        #               tuple(dimensions) are stored as a key since lists are not hashable
        #               None: [(ts, value), (ts, value)]}
        #               untagged values are indexed by None
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name: bool
        self.logger = logger
        try:
            self.logger.addFilter(LaconicFilter())
        except Exception:
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name: prefix.b.c"""
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def normalize_device_name(device_name):
        """Lowercase a device name and replace spaces with underscores."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """
        Treats the metric as a counter, i.e. computes its per second derivative
        ACHTUNG: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        "Is this metric a counter?"
        return metric in self._counters

    def gauge(self, metric):
        """
        Treats the metric as a gauge, i.e. keep the data as is
        ACHTUNG: Resets previous values associated with this metric.
        """
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        "Has this metric been declared (as a gauge or a counter)?"
        return metric in self._sample_store

    def is_gauge(self, metric):
        "Is this metric a gauge (declared, and not a counter)?"
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        "Get all metric names"
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """ Save a gauge value, declaring the metric as a gauge if needed. """
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evict old values if needed

        :raises CheckException: if the metric is undeclared or dimensions is not a dict
        :raises NaN: if the value cannot be cast to a numeric value
        """
        if dimensions is None:
            dimensions = {}
        from common.util import cast_metric_val

        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise CheckException("Saving a sample for an undefined metric: %s" % metric)
        try:
            value = cast_metric_val(value)
        except ValueError as ve:
            raise NaN(ve)

        # Sort and validate dimensions
        if dimensions is not None and not isinstance(dimensions, dict):
            raise CheckException("Dimensions must be a dictionary")

        # Data eviction rules
        key = (tuple(sorted(dimensions.items())), device_name)
        if self.is_gauge(metric):
            self._sample_store[metric][key] = ((timestamp, value, hostname, device_name), )
        elif self.is_counter(metric):
            if self._sample_store[metric].get(key) is None:
                self._sample_store[metric][key] = [(timestamp, value, hostname, device_name)]
            else:
                # Keep at most the previous sample plus the new one (enough for a rate)
                self._sample_store[metric][key] = self._sample_store[metric][key][-1:] + \
                    [(timestamp, value, hostname, device_name)]
        else:
            raise CheckException("%s must be either gauge or counter, skipping sample at %s" %
                                 (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(self._sample_store[metric][key]) == 1, self._sample_store[metric]
        elif self.is_counter(metric):
            assert len(self._sample_store[metric][key]) in (1, 2), self._sample_store[metric]

    @classmethod
    def _rate(cls, sample1, sample2):
        "Simple rate between two (ts, value, hostname, device_name) samples"
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise Infinity()

            delta = sample2[1] - sample1[1]
            if delta < 0:
                # Counter went backwards (e.g. reset): no meaningful rate
                raise UnknownValue()

            return (sample2[0], delta / interval, sample2[2], sample2[3])
        except Infinity:
            raise
        except UnknownValue:
            raise
        except Exception as e:
            raise NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name)

        :raises UnknownValue: if the metric is unknown or has too few samples
        """
        if dimensions is None:
            dimensions = {}

        # Get the proper dimensions
        key = (tuple(sorted(dimensions.items())), device_name)

        # Never seen this metric
        if metric not in self._sample_store:
            raise UnknownValue()

        # Not enough value to compute rate
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) < 2:
            raise UnknownValue()

        elif self.is_counter(metric) and len(self._sample_store[metric][key]) >= 2:
            res = self._rate(self._sample_store[metric][key][-2], self._sample_store[metric][key][-1])
            if expire:
                # Only keep the latest sample around for the next rate computation
                del self._sample_store[metric][key][:-1]
            return res

        elif self.is_gauge(metric) and len(self._sample_store[metric][key]) >= 1:
            return self._sample_store[metric][key][-1]

        else:
            raise UnknownValue()

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        "Return the last value for that metric"
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        "Return all values {metric: (ts, value)} for non-tagged metrics"
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                # best-effort: skip metrics that cannot be sampled yet
                pass
        return values

    def get_samples(self, expire=True):
        "Return all values {metric: value} for non-tagged metrics"
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                # best-effort: skip metrics that cannot be sampled yet
                pass
        return values

    def get_metrics(self, expire=True):
        """Get all metrics, including the ones that are tagged.
        This is the preferred method to retrieve metrics

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"dimensions": {"name1": "key1", "name2": "key2"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            try:
                for key in self._sample_store[m]:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    try:
                        ts, val, hostname, device_name = self.get_sample_with_timestamp(m, dimensions, device_name, expire)
                    except UnknownValue:
                        continue
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
            except Exception:
                # best-effort: a bad sample store entry must not break the flush
                pass
        return metrics
|
||||
|
||||
|
||||
class AgentCheck(object):

    """Base class for all modern checks; subclass this and override check()."""

    def __init__(self, name, init_config, agent_config, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agent_config: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from monagent.common.aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config
        self.agent_config = agent_config
        self.hostname = get_hostname(agent_config)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(self.hostname,
                                            recent_point_threshold=agent_config.get('recent_point_threshold', None))

        self.events = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None  # lazily filled by get_library_info()

    def instance_count(self):
        """ Return the number of instances that are configured for this check. """
        return len(self.instances)

    def gauge(self, metric, value, dimensions=None, hostname=None, device_name=None, timestamp=None):
        """
        Record the value of a gauge, with optional dimensions, hostname and device
        name.

        :param metric: The name of the metric
        :param value: The value of the gauge
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        :param timestamp: (optional) The timestamp for this metric value
        """
        self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)

    def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
        """
        Increment a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to increment by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.increment(metric, value, dimensions, hostname, device_name)

    def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
        """
        Decrement a counter with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to decrement by
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.decrement(metric, value, dimensions, hostname, device_name)

    def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """
        Submit a point for a metric that will be calculated as a rate on flush.
        Values will persist across each call to `check` if there is not enough
        point to generate a rate on the flush.

        :param metric: The name of the metric
        :param value: The value of the rate
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.rate(metric, value, dimensions, hostname, device_name)

    def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """
        Sample a histogram value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value to sample for the histogram
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.histogram(metric, value, dimensions, hostname, device_name)

    def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
        """
        Sample a set value, with optional dimensions, hostname and device name.

        :param metric: The name of the metric
        :param value: The value for the set
        :param dimensions: (optional) A dictionary of dimensions for this metric
        :param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
        :param device_name: (optional) The device name for this metric
        """
        self.aggregator.set(metric, value, dimensions, hostname, device_name)

    def event(self, event):
        """
        Save an event.

        :param event: The event payload as a dictionary. Has the following
        structure:

            {
                "timestamp": int, the epoch timestamp for the event,
                "event_type": string, the event time name,
                "api_key": string, the api key of the account to associate the event with,
                "msg_title": string, the title of the event,
                "msg_text": string, the text body of the event,
                "alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
                    Defaults to 'info'.
                "source_type_name": (optional) string, the source type name,
                "host": (optional) string, the name of the host,
                "dimensions": (optional) a dictionary of dimensions to associate with this event
            }
        """
        # Fill in the api key from the agent config when the caller omits it
        if event.get('api_key') is None:
            event['api_key'] = self.agent_config['api_key']
        self.events.append(event)

    def has_events(self):
        """
        Check whether the check has saved any events

        @return whether or not the check has saved any events
        @rtype boolean
        """
        return len(self.events) > 0

    def get_metrics(self):
        """
        Get all metrics, including the ones that are tagged.

        @return the list of samples
        @rtype list of Measurement objects from monagent.common.metrics
        """
        return self.aggregator.flush()

    def get_events(self):
        """
        Return a list of the events saved by the check, if any

        @return the list of events saved by this check
        @rtype list of event dictionaries
        """
        events = self.events
        self.events = []
        return events

    def has_warnings(self):
        """
        Check whether the instance run created any warnings
        """
        return len(self.warnings) > 0

    def warning(self, warning_message):
        """ Add a warning message that will be printed in the info page
        :param warning_message: String. Warning message to be displayed
        """
        self.warnings.append(warning_message)

    def get_library_info(self):
        """Return the (cached) library version info, or None if unsupported."""
        if self.library_versions is not None:
            return self.library_versions
        try:
            self.library_versions = self.get_library_versions()
        except NotImplementedError:
            pass
        # Previously the freshly-computed value was never returned on the
        # first call; return it (None when the check does not implement it).
        return self.library_versions

    def get_library_versions(self):
        """ Should return a string that shows which version
        of the needed libraries are used """
        raise NotImplementedError

    def get_warnings(self):
        """
        Return the list of warnings messages to be displayed in the info page
        """
        warnings = self.warnings
        self.warnings = []
        return warnings

    def run(self):
        """ Run all instances and return a list of InstanceStatus objects. """
        instance_statuses = []
        for i, instance in enumerate(self.instances):
            try:
                self.check(instance)
                if self.has_warnings():
                    instance_status = check_status.InstanceStatus(i,
                                                                  check_status.STATUS_WARNING,
                                                                  warnings=self.get_warnings())
                else:
                    instance_status = check_status.InstanceStatus(i, check_status.STATUS_OK)
            except Exception as e:
                self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
                instance_status = check_status.InstanceStatus(i,
                                                              check_status.STATUS_ERROR,
                                                              error=e,
                                                              tb=traceback.format_exc())
            instance_statuses.append(instance_status)
        return instance_statuses

    def check(self, instance):
        """
        Overriden by the check class. This will be called to run the check.

        :param instance: A dict with the instance information. This will vary
        depending on your config structure.
        """
        raise NotImplementedError()

    @staticmethod
    def stop():
        """
        To be executed when the agent is being stopped to clean ressources
        """
        pass

    @classmethod
    def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
        """
        A method used for testing your check without running the agent.
        """
        import yaml
        try:
            from yaml import CLoader as Loader
        except ImportError:
            from yaml import Loader
        if path_to_yaml:
            check_name = os.path.basename(path_to_yaml).split('.')[0]
            try:
                f = open(path_to_yaml)
            except IOError:
                raise Exception('Unable to open yaml config: %s' % path_to_yaml)
            yaml_text = f.read()
            f.close()

        # NOTE: yaml.load with a full Loader executes arbitrary tags; only use
        # this on trusted, local config files (yaml.safe_load for anything else).
        config = yaml.load(yaml_text, Loader=Loader)
        check = cls(check_name, config.get('init_config') or {}, agentConfig or {})

        return check, config.get('instances', [])

    @staticmethod
    def normalize(metric, prefix=None):
        """
        Turn a metric into a well-formed metric name
        prefix.b.c

        :param metric The metric name to normalize
        :param prefix A prefix to to add to the normalized name, default None
        """
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def read_config(instance, key, message=None, cast=None):
        """Read `key` from an instance config dict, raising when it is absent.

        :param instance: the instance configuration dictionary
        :param key: the key to look up
        :param message: (optional) custom error message when the key is missing
        :param cast: (optional) callable applied to the value before returning
        """
        val = instance.get(key)
        if val is None:
            message = message or 'Must provide `%s` value in instance config' % key
            raise Exception(message)

        if cast is None:
            return val
        else:
            return cast(val)
|
||||
|
||||
|
||||
def run_check(name, path=None):
    """Run every configured instance of a check once and pretty-print the results.

    Testing helper: loads '<name>.yaml' from the conf.d directory (or from
    `path` when given), builds the check, runs each instance, and prints any
    events and metrics produced.

    :param name: the name of the check to run
    :param path: (optional) explicit path to the yaml config file
    :raises Exception: when the config cannot be opened or defines no instances
    """
    from tests.common import get_check

    # Read the config file
    confd_path = path or os.path.join(get_confd_path(get_os()), '%s.yaml' % name)

    try:
        f = open(confd_path)
    except IOError:
        raise Exception('Unable to open configuration at %s' % confd_path)

    config_str = f.read()
    f.close()

    # Run the check
    check, instances = get_check(name, config_str)
    if not instances:
        raise Exception('YAML configuration returned no instances.')
    for instance in instances:
        check.check(instance)
        if check.has_events():
            print("Events:\n")
            pprint(check.get_events(), indent=4)
        print("Metrics:\n")
        pprint(check.get_metrics(), indent=4)
|
||||
from check import AgentCheck
|
||||
|
|
|
@ -0,0 +1,573 @@
|
|||
"""Base class for Checks.
|
||||
|
||||
If you are writing your own checks you should subclass the AgentCheck class.
|
||||
The Check class is being deprecated so don't write new checks with it.
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
from pprint import pprint
|
||||
import re
|
||||
import time
|
||||
import traceback
|
||||
|
||||
from monagent.common import check_status
|
||||
from monagent.common.config import get_confd_path
|
||||
from monagent.common.exceptions import CheckException, NaN, Infinity, UnknownValue
|
||||
from monagent.common.util import LaconicFilter, get_hostname, get_os
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# todo convert all checks to the new interface then remove this. Is the LaconicFilter on logs used elsewhere?
|
||||
#==============================================================================
|
||||
# DEPRECATED
|
||||
# ------------------------------
|
||||
# If you are writing your own check, you should inherit from AgentCheck
|
||||
# and not this class. This class will be removed in a future version
|
||||
# of the agent.
|
||||
#==============================================================================
|
||||
class Check(object):

    """(Abstract) base class for all old-style checks.

    Provides the ability to:
      * store 1 (and only 1) sample for gauges per metric/dimensions combination
      * compute rates for counters
      * only log error messages once (instead of each time they occur)

    DEPRECATED: inherit from AgentCheck for new checks.
    """

    def __init__(self, logger, agent_config=None):
        # where to store samples, indexed by metric_name
        # metric_name: {("sorted", "dimensions"): [(ts, value), (ts, value)],
        #               tuple(dimensions) are stored as a key since lists are not hashable
        #               None: [(ts, value), (ts, value)]}
        #               untagged values are indexed by None
        self.agent_config = agent_config
        self._sample_store = {}
        self._counters = {}  # metric_name: bool
        self.logger = logger
        try:
            self.logger.addFilter(LaconicFilter())
        except Exception:
            self.logger.exception("Trying to install laconic log filter and failed")

    @staticmethod
    def normalize(metric, prefix=None):
        """Turn a metric into a well-formed metric name: prefix.b.c"""
        name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
        # Eliminate multiple _
        name = re.sub(r"__+", "_", name)
        # Don't start/end with _
        name = re.sub(r"^_", "", name)
        name = re.sub(r"_$", "", name)
        # Drop ._ and _.
        name = re.sub(r"\._", ".", name)
        name = re.sub(r"_\.", ".", name)

        if prefix is not None:
            return prefix + "." + name
        else:
            return name

    @staticmethod
    def normalize_device_name(device_name):
        """Lowercase a device name and replace spaces with underscores."""
        return device_name.strip().lower().replace(' ', '_')

    def counter(self, metric):
        """
        Treats the metric as a counter, i.e. computes its per second derivative
        ACHTUNG: Resets previous values associated with this metric.
        """
        self._counters[metric] = True
        self._sample_store[metric] = {}

    def is_counter(self, metric):
        "Is this metric a counter?"
        return metric in self._counters

    def gauge(self, metric):
        """
        Treats the metric as a gauge, i.e. keep the data as is
        ACHTUNG: Resets previous values associated with this metric.
        """
        self._sample_store[metric] = {}

    def is_metric(self, metric):
        "Has this metric been declared (as a gauge or a counter)?"
        return metric in self._sample_store

    def is_gauge(self, metric):
        "Is this metric a gauge (declared, and not a counter)?"
        return self.is_metric(metric) and not self.is_counter(metric)

    def get_metric_names(self):
        "Get all metric names"
        return self._sample_store.keys()

    def save_gauge(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """ Save a gauge value, declaring the metric as a gauge if needed. """
        if not self.is_gauge(metric):
            self.gauge(metric)
        self.save_sample(metric, value, timestamp, dimensions, hostname, device_name)

    def save_sample(self, metric, value, timestamp=None, dimensions=None, hostname=None, device_name=None):
        """Save a simple sample, evict old values if needed

        :raises CheckException: if the metric is undeclared or dimensions is not a dict
        :raises NaN: if the value cannot be cast to a numeric value
        """
        if dimensions is None:
            dimensions = {}
        from common.util import cast_metric_val

        if timestamp is None:
            timestamp = time.time()
        if metric not in self._sample_store:
            raise CheckException("Saving a sample for an undefined metric: %s" % metric)
        try:
            value = cast_metric_val(value)
        except ValueError as ve:
            raise NaN(ve)

        # Sort and validate dimensions
        if dimensions is not None and not isinstance(dimensions, dict):
            raise CheckException("Dimensions must be a dictionary")

        # Data eviction rules
        key = (tuple(sorted(dimensions.items())), device_name)
        if self.is_gauge(metric):
            self._sample_store[metric][key] = ((timestamp, value, hostname, device_name), )
        elif self.is_counter(metric):
            if self._sample_store[metric].get(key) is None:
                self._sample_store[metric][key] = [(timestamp, value, hostname, device_name)]
            else:
                # Keep at most the previous sample plus the new one (enough for a rate)
                self._sample_store[metric][key] = self._sample_store[metric][key][-1:] + \
                    [(timestamp, value, hostname, device_name)]
        else:
            raise CheckException("%s must be either gauge or counter, skipping sample at %s" %
                                 (metric, time.ctime(timestamp)))

        if self.is_gauge(metric):
            # store[metric][dimensions] = (ts, val) - only 1 value allowed
            assert len(self._sample_store[metric][key]) == 1, self._sample_store[metric]
        elif self.is_counter(metric):
            assert len(self._sample_store[metric][key]) in (1, 2), self._sample_store[metric]

    @classmethod
    def _rate(cls, sample1, sample2):
        "Simple rate between two (ts, value, hostname, device_name) samples"
        try:
            interval = sample2[0] - sample1[0]
            if interval == 0:
                raise Infinity()

            delta = sample2[1] - sample1[1]
            if delta < 0:
                # Counter went backwards (e.g. reset): no meaningful rate
                raise UnknownValue()

            return (sample2[0], delta / interval, sample2[2], sample2[3])
        except Infinity:
            raise
        except UnknownValue:
            raise
        except Exception as e:
            raise NaN(e)

    def get_sample_with_timestamp(self, metric, dimensions=None, device_name=None, expire=True):
        """Get (timestamp-epoch-style, value, hostname, device_name)

        :raises UnknownValue: if the metric is unknown or has too few samples
        """
        if dimensions is None:
            dimensions = {}

        # Get the proper dimensions
        key = (tuple(sorted(dimensions.items())), device_name)

        # Never seen this metric
        if metric not in self._sample_store:
            raise UnknownValue()

        # Not enough value to compute rate
        elif self.is_counter(metric) and len(self._sample_store[metric][key]) < 2:
            raise UnknownValue()

        elif self.is_counter(metric) and len(self._sample_store[metric][key]) >= 2:
            res = self._rate(self._sample_store[metric][key][-2], self._sample_store[metric][key][-1])
            if expire:
                # Only keep the latest sample around for the next rate computation
                del self._sample_store[metric][key][:-1]
            return res

        elif self.is_gauge(metric) and len(self._sample_store[metric][key]) >= 1:
            return self._sample_store[metric][key][-1]

        else:
            raise UnknownValue()

    def get_sample(self, metric, dimensions=None, device_name=None, expire=True):
        "Return the last value for that metric"
        x = self.get_sample_with_timestamp(metric, dimensions, device_name, expire)
        assert isinstance(x, tuple) and len(x) == 4, x
        return x[1]

    def get_samples_with_timestamps(self, expire=True):
        "Return all values {metric: (ts, value)} for non-tagged metrics"
        values = {}
        for m in self._sample_store:
            try:
                values[m] = self.get_sample_with_timestamp(m, expire=expire)
            except Exception:
                # best-effort: skip metrics that cannot be sampled yet
                pass
        return values

    def get_samples(self, expire=True):
        "Return all values {metric: value} for non-tagged metrics"
        values = {}
        for m in self._sample_store:
            try:
                # Discard the timestamp
                values[m] = self.get_sample_with_timestamp(m, expire=expire)[1]
            except Exception:
                # best-effort: skip metrics that cannot be sampled yet
                pass
        return values

    def get_metrics(self, expire=True):
        """Get all metrics, including the ones that are tagged.
        This is the preferred method to retrieve metrics

        @return the list of samples
        @rtype [(metric_name, timestamp, value, {"dimensions": {"name1": "key1", "name2": "key2"}}), ...]
        """
        metrics = []
        for m in self._sample_store:
            try:
                for key in self._sample_store[m]:
                    dimensions_list, device_name = key
                    dimensions = dict(dimensions_list)
                    try:
                        ts, val, hostname, device_name = self.get_sample_with_timestamp(m, dimensions, device_name, expire)
                    except UnknownValue:
                        continue
                    attributes = {}
                    if dimensions_list:
                        attributes['dimensions'] = dimensions
                    if hostname:
                        attributes['host_name'] = hostname
                    if device_name:
                        attributes['device_name'] = device_name
                    metrics.append((m, int(ts), val, attributes))
            except Exception:
                # best-effort: a bad sample store entry must not break the flush
                pass
        return metrics
|
||||
|
||||
|
||||
class AgentCheck(object):
|
||||
|
||||
def __init__(self, name, init_config, agent_config, instances=None):
|
||||
"""
|
||||
Initialize a new check.
|
||||
|
||||
:param name: The name of the check
|
||||
:param init_config: The config for initializing the check
|
||||
:param agent_config: The global configuration for the agent
|
||||
:param instances: A list of configuration objects for each instance.
|
||||
"""
|
||||
from monagent.common.aggregator import MetricsAggregator
|
||||
|
||||
self.name = name
|
||||
self.init_config = init_config
|
||||
self.agent_config = agent_config
|
||||
self.hostname = get_hostname(agent_config)
|
||||
self.log = logging.getLogger('%s.%s' % (__name__, name))
|
||||
|
||||
self.aggregator = MetricsAggregator(self.hostname,
|
||||
recent_point_threshold=agent_config.get('recent_point_threshold', None))
|
||||
|
||||
self.events = []
|
||||
self.instances = instances or []
|
||||
self.warnings = []
|
||||
self.library_versions = None
|
||||
|
||||
def instance_count(self):
|
||||
""" Return the number of instances that are configured for this check. """
|
||||
return len(self.instances)
|
||||
|
||||
def gauge(self, metric, value, dimensions=None, hostname=None, device_name=None, timestamp=None):
|
||||
"""
|
||||
Record the value of a gauge, with optional dimensions, hostname and device
|
||||
name.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value of the gauge
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
:param timestamp: (optional) The timestamp for this metric value
|
||||
"""
|
||||
self.aggregator.gauge(metric, value, dimensions, hostname, device_name, timestamp)
|
||||
|
||||
def increment(self, metric, value=1, dimensions=None, hostname=None, device_name=None):
|
||||
"""
|
||||
Increment a counter with optional dimensions, hostname and device name.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value to increment by
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
"""
|
||||
self.aggregator.increment(metric, value, dimensions, hostname, device_name)
|
||||
|
||||
def decrement(self, metric, value=-1, dimensions=None, hostname=None, device_name=None):
|
||||
"""
|
||||
Increment a counter with optional dimensions, hostname and device name.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value to decrement by
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
"""
|
||||
self.aggregator.decrement(metric, value, dimensions, hostname, device_name)
|
||||
|
||||
def rate(self, metric, value, dimensions=None, hostname=None, device_name=None):
|
||||
"""
|
||||
Submit a point for a metric that will be calculated as a rate on flush.
|
||||
Values will persist across each call to `check` if there is not enough
|
||||
point to generate a rate on the flush.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value of the rate
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
"""
|
||||
self.aggregator.rate(metric, value, dimensions, hostname, device_name)
|
||||
|
||||
def histogram(self, metric, value, dimensions=None, hostname=None, device_name=None):
|
||||
"""
|
||||
Sample a histogram value, with optional dimensions, hostname and device name.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value to sample for the histogram
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
"""
|
||||
self.aggregator.histogram(metric, value, dimensions, hostname, device_name)
|
||||
|
||||
def set(self, metric, value, dimensions=None, hostname=None, device_name=None):
|
||||
"""
|
||||
Sample a set value, with optional dimensions, hostname and device name.
|
||||
|
||||
:param metric: The name of the metric
|
||||
:param value: The value for the set
|
||||
:param dimensions: (optional) A dictionary of dimensions for this metric
|
||||
:param hostname: (optional) A hostname for this metric. Defaults to the current hostname.
|
||||
:param device_name: (optional) The device name for this metric
|
||||
"""
|
||||
self.aggregator.set(metric, value, dimensions, hostname, device_name)
|
||||
|
||||
def event(self, event):
|
||||
"""
|
||||
Save an event.
|
||||
|
||||
:param event: The event payload as a dictionary. Has the following
|
||||
structure:
|
||||
|
||||
{
|
||||
"timestamp": int, the epoch timestamp for the event,
|
||||
"event_type": string, the event time name,
|
||||
"api_key": string, the api key of the account to associate the event with,
|
||||
"msg_title": string, the title of the event,
|
||||
"msg_text": string, the text body of the event,
|
||||
"alert_type": (optional) string, one of ('error', 'warning', 'success', 'info').
|
||||
Defaults to 'info'.
|
||||
"source_type_name": (optional) string, the source type name,
|
||||
"host": (optional) string, the name of the host,
|
||||
"dimensions": (optional) a dictionary of dimensions to associate with this event
|
||||
}
|
||||
"""
|
||||
if event.get('api_key') is None:
|
||||
event['api_key'] = self.agent_config['api_key']
|
||||
self.events.append(event)
|
||||
|
||||
def has_events(self):
|
||||
"""
|
||||
Check whether the check has saved any events
|
||||
|
||||
@return whether or not the check has saved any events
|
||||
@rtype boolean
|
||||
"""
|
||||
return len(self.events) > 0
|
||||
|
||||
def get_metrics(self):
|
||||
"""
|
||||
Get all metrics, including the ones that are tagged.
|
||||
|
||||
@return the list of samples
|
||||
@rtype list of Measurement objects from monagent.common.metrics
|
||||
"""
|
||||
return self.aggregator.flush()
|
||||
|
||||
def get_events(self):
|
||||
"""
|
||||
Return a list of the events saved by the check, if any
|
||||
|
||||
@return the list of events saved by this check
|
||||
@rtype list of event dictionaries
|
||||
"""
|
||||
events = self.events
|
||||
self.events = []
|
||||
return events
|
||||
|
||||
def has_warnings(self):
|
||||
"""
|
||||
Check whether the instance run created any warnings
|
||||
"""
|
||||
return len(self.warnings) > 0
|
||||
|
||||
def warning(self, warning_message):
|
||||
""" Add a warning message that will be printed in the info page
|
||||
:param warning_message: String. Warning message to be displayed
|
||||
"""
|
||||
self.warnings.append(warning_message)
|
||||
|
||||
def get_library_info(self):
|
||||
if self.library_versions is not None:
|
||||
return self.library_versions
|
||||
try:
|
||||
self.library_versions = self.get_library_versions()
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
def get_library_versions(self):
|
||||
""" Should return a string that shows which version
|
||||
of the needed libraries are used """
|
||||
raise NotImplementedError
|
||||
|
||||
def get_warnings(self):
|
||||
"""
|
||||
Return the list of warnings messages to be displayed in the info page
|
||||
"""
|
||||
warnings = self.warnings
|
||||
self.warnings = []
|
||||
return warnings
|
||||
|
||||
def run(self):
|
||||
""" Run all instances. """
|
||||
instance_statuses = []
|
||||
for i, instance in enumerate(self.instances):
|
||||
try:
|
||||
self.check(instance)
|
||||
if self.has_warnings():
|
||||
instance_status = check_status.InstanceStatus(i,
|
||||
check_status.STATUS_WARNING,
|
||||
warnings=self.get_warnings())
|
||||
else:
|
||||
instance_status = check_status.InstanceStatus(i, check_status.STATUS_OK)
|
||||
except Exception, e:
|
||||
self.log.exception("Check '%s' instance #%s failed" % (self.name, i))
|
||||
instance_status = check_status.InstanceStatus(i,
|
||||
check_status.STATUS_ERROR,
|
||||
error=e,
|
||||
tb=traceback.format_exc())
|
||||
instance_statuses.append(instance_status)
|
||||
return instance_statuses
|
||||
|
||||
def check(self, instance):
|
||||
"""
|
||||
Overriden by the check class. This will be called to run the check.
|
||||
|
||||
:param instance: A dict with the instance information. This will vary
|
||||
depending on your config structure.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
@staticmethod
|
||||
def stop():
|
||||
"""
|
||||
To be executed when the agent is being stopped to clean ressources
|
||||
"""
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def from_yaml(cls, path_to_yaml=None, agentConfig=None, yaml_text=None, check_name=None):
|
||||
"""
|
||||
A method used for testing your check without running the agent.
|
||||
"""
|
||||
import yaml
|
||||
try:
|
||||
from yaml import CLoader as Loader
|
||||
except ImportError:
|
||||
from yaml import Loader
|
||||
if path_to_yaml:
|
||||
check_name = os.path.basename(path_to_yaml).split('.')[0]
|
||||
try:
|
||||
f = open(path_to_yaml)
|
||||
except IOError:
|
||||
raise Exception('Unable to open yaml config: %s' % path_to_yaml)
|
||||
yaml_text = f.read()
|
||||
f.close()
|
||||
|
||||
config = yaml.load(yaml_text, Loader=Loader)
|
||||
check = cls(check_name, config.get('init_config') or {}, agentConfig or {})
|
||||
|
||||
return check, config.get('instances', [])
|
||||
|
||||
@staticmethod
|
||||
def normalize(metric, prefix=None):
|
||||
"""
|
||||
Turn a metric into a well-formed metric name
|
||||
prefix.b.c
|
||||
|
||||
:param metric The metric name to normalize
|
||||
:param prefix A prefix to to add to the normalized name, default None
|
||||
"""
|
||||
name = re.sub(r"[,\+\*\-/()\[\]{}]", "_", metric)
|
||||
# Eliminate multiple _
|
||||
name = re.sub(r"__+", "_", name)
|
||||
# Don't start/end with _
|
||||
name = re.sub(r"^_", "", name)
|
||||
name = re.sub(r"_$", "", name)
|
||||
# Drop ._ and _.
|
||||
name = re.sub(r"\._", ".", name)
|
||||
name = re.sub(r"_\.", ".", name)
|
||||
|
||||
if prefix is not None:
|
||||
return prefix + "." + name
|
||||
else:
|
||||
return name
|
||||
|
||||
@staticmethod
|
||||
def read_config(instance, key, message=None, cast=None):
|
||||
val = instance.get(key)
|
||||
if val is None:
|
||||
message = message or 'Must provide `%s` value in instance config' % key
|
||||
raise Exception(message)
|
||||
|
||||
if cast is None:
|
||||
return val
|
||||
else:
|
||||
return cast(val)
|
||||
|
||||
|
||||
def run_check(name, path=None):
|
||||
from tests.common import get_check
|
||||
|
||||
# Read the config file
|
||||
confd_path = path or os.path.join(get_confd_path(get_os()), '%s.yaml' % name)
|
||||
|
||||
try:
|
||||
f = open(confd_path)
|
||||
except IOError:
|
||||
raise Exception('Unable to open configuration at %s' % confd_path)
|
||||
|
||||
config_str = f.read()
|
||||
f.close()
|
||||
|
||||
# Run the check
|
||||
check, instances = get_check(name, config_str)
|
||||
if not instances:
|
||||
raise Exception('YAML configuration returned no instances.')
|
||||
for instance in instances:
|
||||
check.check(instance)
|
||||
if check.has_events():
|
||||
print "Events:\n"
|
||||
pprint(check.get_events(), indent=4)
|
||||
print "Metrics:\n"
|
||||
pprint(check.get_metrics(), indent=4)
|
|
@ -6,7 +6,7 @@ from datetime import datetime
|
|||
from itertools import groupby # >= python 2.4
|
||||
|
||||
from utils import TailFile
|
||||
from . import LaconicFilter
|
||||
from monagent.common.util import LaconicFilter
|
||||
from monagent.collector import modules
|
||||
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ import sys
|
|||
import time
|
||||
|
||||
# project
|
||||
from .. import Check
|
||||
from monagent.collector.checks.check import Check
|
||||
from monagent.common.metrics import Measurement
|
||||
from monagent.common.util import Platform
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .. import Check
|
||||
from monagent.collector.checks.check import Check
|
||||
|
||||
try:
|
||||
import wmi
|
||||
|
|
|
@ -1,327 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Datadog
|
||||
www.datadoghq.com
|
||||
----
|
||||
Make sense of your IT Data
|
||||
|
||||
Licensed under Simplified BSD License (see LICENSE)
|
||||
(C) Boxed Ice 2010 all rights reserved
|
||||
(C) Datadog, Inc. 2010-2013 all rights reserved
|
||||
"""
|
||||
|
||||
# set up logging before importing any other components
|
||||
from monagent.common.config import initialize_logging
|
||||
initialize_logging('forwarder')
|
||||
from monagent.common.config import get_logging_config
|
||||
|
||||
import os
|
||||
os.umask(022)
|
||||
|
||||
# Standard imports
|
||||
import logging
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
import signal
|
||||
from socket import gaierror, error as socket_error
|
||||
|
||||
# Tornado
|
||||
import tornado.httpclient
|
||||
import tornado.httpserver
|
||||
import tornado.ioloop
|
||||
import tornado.web
|
||||
from tornado.escape import json_decode
|
||||
from tornado.options import define, parse_command_line, options
|
||||
|
||||
# agent import
|
||||
from api import MonAPI
|
||||
from monagent.common.check_status import ForwarderStatus
|
||||
from monagent.common.config import get_config
|
||||
from monagent.common.metrics import Measurement
|
||||
from monagent.common.util import Watchdog, get_tornado_ioloop
|
||||
from transaction import Transaction, TransactionManager
|
||||
|
||||
log = logging.getLogger('forwarder')
|
||||
log.setLevel(get_logging_config()['log_level'] or logging.INFO)
|
||||
|
||||
TRANSACTION_FLUSH_INTERVAL = 5000 # Every 5 seconds
|
||||
WATCHDOG_INTERVAL_MULTIPLIER = 10 # 10x flush interval
|
||||
|
||||
# Maximum delay before replaying a transaction
|
||||
MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
|
||||
|
||||
# Maximum queue size in bytes (when this is reached, old messages are dropped)
|
||||
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
|
||||
|
||||
THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
|
||||
|
||||
|
||||
class MetricTransaction(Transaction):
|
||||
|
||||
_application = None
|
||||
_trManager = None
|
||||
_endpoints = []
|
||||
|
||||
@classmethod
|
||||
def set_application(cls, app):
|
||||
cls._application = app
|
||||
|
||||
@classmethod
|
||||
def set_tr_manager(cls, manager):
|
||||
cls._trManager = manager
|
||||
|
||||
@classmethod
|
||||
def get_tr_manager(cls):
|
||||
return cls._trManager
|
||||
|
||||
@classmethod
|
||||
def set_endpoints(cls, endpoint):
|
||||
# todo we only have one endpoint option, generalize it better
|
||||
# the immediate use case for two endpoints could be our own monitoring boxes, they could send to
|
||||
# the main api and mini-mon api
|
||||
cls._endpoints.append(endpoint)
|
||||
|
||||
def __init__(self, data, headers):
|
||||
self._data = data
|
||||
self._headers = headers
|
||||
|
||||
# Call after data has been set (size is computed in Transaction's init)
|
||||
Transaction.__init__(self)
|
||||
|
||||
# Insert the transaction in the Manager
|
||||
self._trManager.append(self)
|
||||
log.debug("Created transaction %d" % self.get_id())
|
||||
self._trManager.flush()
|
||||
|
||||
def __sizeof__(self):
|
||||
return sys.getsizeof(self._data)
|
||||
|
||||
def flush(self):
|
||||
try:
|
||||
for endpoint in self._endpoints:
|
||||
endpoint.post_metrics(self._data)
|
||||
except Exception:
|
||||
log.exception('Error flushing metrics to remote endpoints')
|
||||
self._trManager.tr_error(self)
|
||||
else:
|
||||
self._trManager.tr_success(self)
|
||||
self._trManager.flush_next()
|
||||
|
||||
|
||||
class StatusHandler(tornado.web.RequestHandler):
|
||||
|
||||
def get(self):
|
||||
threshold = int(self.get_argument('threshold', -1))
|
||||
|
||||
m = MetricTransaction.get_tr_manager()
|
||||
|
||||
self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
|
||||
transactions = m.get_transactions()
|
||||
for tr in transactions:
|
||||
self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %
|
||||
(tr.get_id(), tr.get_size(), tr.get_error_count(), tr.get_next_flush()))
|
||||
self.write("</table>")
|
||||
|
||||
if threshold >= 0:
|
||||
if len(transactions) > threshold:
|
||||
self.set_status(503)
|
||||
|
||||
|
||||
class AgentInputHandler(tornado.web.RequestHandler):
|
||||
|
||||
def post(self):
|
||||
"""Read the message and forward it to the intake
|
||||
The message is expected to follow the format:
|
||||
|
||||
"""
|
||||
# read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
|
||||
msg = tornado.escape.json_decode(self.request.body)
|
||||
try:
|
||||
log.debug(msg)
|
||||
measurements = [Measurement(**m) for m in msg]
|
||||
except Exception:
|
||||
log.exception('Error parsing body of Agent Input')
|
||||
raise tornado.web.HTTPError(500)
|
||||
|
||||
headers = self.request.headers
|
||||
|
||||
if len(measurements) > 0:
|
||||
# Setup a transaction for this message
|
||||
tr = MetricTransaction(measurements, headers)
|
||||
else:
|
||||
raise tornado.web.HTTPError(500)
|
||||
|
||||
self.write("Transaction: %s" % tr.get_id())
|
||||
|
||||
|
||||
class Forwarder(tornado.web.Application):
|
||||
|
||||
def __init__(self, port, agent_config, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
|
||||
self._port = int(port)
|
||||
self._agentConfig = agent_config
|
||||
self._metrics = {}
|
||||
MetricTransaction.set_application(self)
|
||||
MetricTransaction.set_endpoints(MonAPI(agent_config['Api']))
|
||||
self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
|
||||
MetricTransaction.set_tr_manager(self._tr_manager)
|
||||
|
||||
self._watchdog = None
|
||||
self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
|
||||
self.use_simple_http_client = use_simple_http_client
|
||||
if self.skip_ssl_validation:
|
||||
log.info("Skipping SSL hostname validation, useful when using a transparent proxy")
|
||||
|
||||
if watchdog:
|
||||
watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
|
||||
self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))
|
||||
|
||||
def _post_metrics(self):
|
||||
|
||||
if len(self._metrics) > 0:
|
||||
MetricTransaction(self._metrics, headers={'Content-Type': 'application/json'})
|
||||
self._metrics = {}
|
||||
|
||||
# todo why is the tornado logging method overridden? Perhaps ditch this.
|
||||
def log_request(self, handler):
|
||||
""" Override the tornado logging method.
|
||||
If everything goes well, log level is DEBUG.
|
||||
Otherwise it's WARNING or ERROR depending on the response code. """
|
||||
if handler.get_status() < 400:
|
||||
log_method = log.debug
|
||||
elif handler.get_status() < 500:
|
||||
log_method = log.warning
|
||||
else:
|
||||
log_method = log.error
|
||||
request_time = 1000.0 * handler.request.request_time()
|
||||
log_method("%d %s %.2fms", handler.get_status(),
|
||||
handler._request_summary(), request_time)
|
||||
|
||||
def run(self):
|
||||
handlers = [
|
||||
(r"/intake/?", AgentInputHandler),
|
||||
(r"/api/v1/series/?", AgentInputHandler),
|
||||
(r"/status/?", StatusHandler),
|
||||
]
|
||||
|
||||
settings = dict(
|
||||
cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
|
||||
xsrf_cookies=False,
|
||||
debug=False,
|
||||
log_function=self.log_request
|
||||
)
|
||||
|
||||
non_local_traffic = self._agentConfig.get("non_local_traffic", False)
|
||||
|
||||
tornado.web.Application.__init__(self, handlers, **settings)
|
||||
http_server = tornado.httpserver.HTTPServer(self)
|
||||
|
||||
try:
|
||||
# non_local_traffic must be == True to match, not just some non-false value
|
||||
if non_local_traffic is True:
|
||||
http_server.listen(self._port)
|
||||
else:
|
||||
# localhost in lieu of 127.0.0.1 to support IPv6
|
||||
try:
|
||||
http_server.listen(self._port, address="localhost")
|
||||
except gaierror:
|
||||
log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
|
||||
http_server.listen(self._port, address="127.0.0.1")
|
||||
except socket_error, e:
|
||||
if "Errno 99" in str(e):
|
||||
log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
|
||||
http_server.listen(self._port, address="127.0.0.1")
|
||||
else:
|
||||
raise
|
||||
except socket_error, e:
|
||||
log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
|
||||
sys.exit(1)
|
||||
except Exception:
|
||||
log.exception("Uncaught exception. Forwarder is exiting.")
|
||||
sys.exit(1)
|
||||
|
||||
log.info("Listening on port %d" % self._port)
|
||||
|
||||
# Register callbacks
|
||||
self.mloop = get_tornado_ioloop()
|
||||
|
||||
logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)
|
||||
|
||||
def flush_trs():
|
||||
if self._watchdog:
|
||||
self._watchdog.reset()
|
||||
self._post_metrics()
|
||||
self._tr_manager.flush()
|
||||
|
||||
tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)
|
||||
|
||||
# Start everything
|
||||
if self._watchdog:
|
||||
self._watchdog.reset()
|
||||
tr_sched.start()
|
||||
|
||||
self.mloop.start()
|
||||
log.info("Stopped")
|
||||
|
||||
def stop(self):
|
||||
self.mloop.stop()
|
||||
|
||||
|
||||
def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):
|
||||
agent_config = get_config(parse_args=False)
|
||||
|
||||
port = agent_config.get('listen_port', 17123)
|
||||
if port is None:
|
||||
port = 17123
|
||||
else:
|
||||
port = int(port)
|
||||
|
||||
app = Forwarder(port, agent_config, skip_ssl_validation=skip_ssl_validation,
|
||||
use_simple_http_client=use_simple_http_client)
|
||||
|
||||
def sigterm_handler(signum, frame):
|
||||
log.info("caught sigterm. stopping")
|
||||
app.stop()
|
||||
|
||||
signal.signal(signal.SIGTERM, sigterm_handler)
|
||||
signal.signal(signal.SIGINT, sigterm_handler)
|
||||
|
||||
return app
|
||||
|
||||
|
||||
def main():
|
||||
define("sslcheck", default=1, help="Verify SSL hostname, on by default")
|
||||
define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
|
||||
args = parse_command_line()
|
||||
skip_ssl_validation = False
|
||||
use_simple_http_client = False
|
||||
|
||||
if unicode(options.sslcheck) == u"0":
|
||||
skip_ssl_validation = True
|
||||
|
||||
if unicode(options.use_simple_http_client) == u"1":
|
||||
use_simple_http_client = True
|
||||
|
||||
# If we don't have any arguments, run the server.
|
||||
if not args:
|
||||
app = init_forwarder(skip_ssl_validation, use_simple_http_client=use_simple_http_client)
|
||||
try:
|
||||
app.run()
|
||||
finally:
|
||||
ForwarderStatus.remove_latest_status()
|
||||
|
||||
else:
|
||||
usage = "%s [help|info]. Run with no commands to start the server" % (sys.argv[0])
|
||||
command = args[0]
|
||||
if command == 'info':
|
||||
logging.getLogger().setLevel(logging.ERROR)
|
||||
return ForwarderStatus.print_latest_status()
|
||||
elif command == 'help':
|
||||
print usage
|
||||
else:
|
||||
print "Unknown command: %s" % command
|
||||
print usage
|
||||
return -1
|
||||
return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
|
@ -1,108 +1 @@
|
|||
import json
|
||||
import logging
|
||||
|
||||
from threading import Timer
|
||||
from monagent.common.keystone import Keystone
|
||||
from monagent.common.util import get_hostname
|
||||
from monclient import client
|
||||
import monclient.exc as exc
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MonAPI(object):
|
||||
"""Sends measurements to MonAPI
|
||||
Any errors should raise an exception so the transaction calling
|
||||
this is not committed
|
||||
"""
|
||||
def __init__(self, config):
|
||||
"""
|
||||
Initialize Mon api client connection.
|
||||
"""
|
||||
self.config = config
|
||||
self.url = config['url']
|
||||
self.api_version = '2_0'
|
||||
self.default_dimensions = config['dimensions']
|
||||
self.token_expiration = 1438
|
||||
# Verify the hostname is set as a dimension
|
||||
if not 'hostname' in self.default_dimensions:
|
||||
self.default_dimensions['hostname'] = get_hostname()
|
||||
|
||||
log.debug("Getting token from Keystone")
|
||||
self.keystone_url = config['keystone_url']
|
||||
self.username = config['username']
|
||||
self.password = config['password']
|
||||
self.project_name = config['project_name']
|
||||
|
||||
self.keystone = Keystone(self.keystone_url,
|
||||
self.username,
|
||||
self.password,
|
||||
self.project_name)
|
||||
self.mon_client = None
|
||||
|
||||
def _post(self, measurements):
|
||||
"""Does the actual http post
|
||||
measurements is a list of Measurement
|
||||
"""
|
||||
data = [m.__dict__ for m in measurements]
|
||||
kwargs = {
|
||||
'jsonbody': data
|
||||
}
|
||||
try:
|
||||
if not self.mon_client:
|
||||
# construct the mon client
|
||||
self.mon_client = self.get_client()
|
||||
|
||||
done = False
|
||||
while not done:
|
||||
response = self.mon_client.metrics.create(**kwargs)
|
||||
if 200 <= response.status_code <= 299:
|
||||
# Good status from web service
|
||||
log.debug("Message sent successfully: {0}"
|
||||
.format(str(data)))
|
||||
elif 400 <= response.status_code <= 499:
|
||||
# Good status from web service but some type of issue
|
||||
# with the data
|
||||
if response.status_code == 401:
|
||||
# Get a new token/client and retry
|
||||
self.mon_client = self.get_client()
|
||||
continue
|
||||
else:
|
||||
error_msg = "Successful web service call but there" + \
|
||||
" were issues (Status: {0}, Status Message: " + \
|
||||
"{1}, Message Content: {1})"
|
||||
log.error(error_msg.format(response.status_code,
|
||||
response.reason, response.text))
|
||||
response.raise_for_status()
|
||||
else: # Not a good status
|
||||
response.raise_for_status()
|
||||
done = True
|
||||
except exc.HTTPException as he:
|
||||
log.error("Error sending message to mon-api: {0}"
|
||||
.format(str(he.message)))
|
||||
|
||||
def post_metrics(self, measurements):
|
||||
"""post_metrics
|
||||
given [Measurement, ...], format the request and post to
|
||||
the monitoring api
|
||||
"""
|
||||
# Add default dimensions
|
||||
for measurement in measurements:
|
||||
for dimension in self.default_dimensions.keys():
|
||||
if not measurement.dimensions.has_key(dimension):
|
||||
measurement.dimensions.update({dimension: self.default_dimensions[dimension]})
|
||||
|
||||
self._post(measurements)
|
||||
|
||||
def get_client(self):
|
||||
"""get_client
|
||||
get a new mon-client object
|
||||
"""
|
||||
token = self.keystone.refresh_token()
|
||||
# Re-create the client. This is temporary until
|
||||
# the client is updated to be able to reset the
|
||||
# token.
|
||||
kwargs = {
|
||||
'token': token
|
||||
}
|
||||
return client.Client(self.api_version, self.url, **kwargs)
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
import logging
|
||||
|
||||
from monclient import exc as exc, client
|
||||
from monagent.common.keystone import Keystone
|
||||
from monagent.common.util import get_hostname
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MonAPI(object):
|
||||
"""Sends measurements to MonAPI
|
||||
Any errors should raise an exception so the transaction calling
|
||||
this is not committed
|
||||
"""
|
||||
def __init__(self, config):
|
||||
"""
|
||||
Initialize Mon api client connection.
|
||||
"""
|
||||
self.config = config
|
||||
self.url = config['url']
|
||||
self.api_version = '2_0'
|
||||
self.default_dimensions = config['dimensions']
|
||||
self.token_expiration = 1438
|
||||
# Verify the hostname is set as a dimension
|
||||
if not 'hostname' in self.default_dimensions:
|
||||
self.default_dimensions['hostname'] = get_hostname()
|
||||
|
||||
log.debug("Getting token from Keystone")
|
||||
self.keystone_url = config['keystone_url']
|
||||
self.username = config['username']
|
||||
self.password = config['password']
|
||||
self.project_name = config['project_name']
|
||||
|
||||
self.keystone = Keystone(self.keystone_url,
|
||||
self.username,
|
||||
self.password,
|
||||
self.project_name)
|
||||
self.mon_client = None
|
||||
|
||||
def _post(self, measurements):
|
||||
"""Does the actual http post
|
||||
measurements is a list of Measurement
|
||||
"""
|
||||
data = [m.__dict__ for m in measurements]
|
||||
kwargs = {
|
||||
'jsonbody': data
|
||||
}
|
||||
try:
|
||||
if not self.mon_client:
|
||||
# construct the mon client
|
||||
self.mon_client = self.get_client()
|
||||
|
||||
done = False
|
||||
while not done:
|
||||
response = self.mon_client.metrics.create(**kwargs)
|
||||
if 200 <= response.status_code <= 299:
|
||||
# Good status from web service
|
||||
log.debug("Message sent successfully: {0}"
|
||||
.format(str(data)))
|
||||
elif 400 <= response.status_code <= 499:
|
||||
# Good status from web service but some type of issue
|
||||
# with the data
|
||||
if response.status_code == 401:
|
||||
# Get a new token/client and retry
|
||||
self.mon_client = self.get_client()
|
||||
continue
|
||||
else:
|
||||
error_msg = "Successful web service call but there" + \
|
||||
" were issues (Status: {0}, Status Message: " + \
|
||||
"{1}, Message Content: {1})"
|
||||
log.error(error_msg.format(response.status_code,
|
||||
response.reason, response.text))
|
||||
response.raise_for_status()
|
||||
else: # Not a good status
|
||||
response.raise_for_status()
|
||||
done = True
|
||||
except exc.HTTPException as he:
|
||||
log.error("Error sending message to mon-api: {0}"
|
||||
.format(str(he.message)))
|
||||
|
||||
def post_metrics(self, measurements):
|
||||
"""post_metrics
|
||||
given [Measurement, ...], format the request and post to
|
||||
the monitoring api
|
||||
"""
|
||||
# Add default dimensions
|
||||
for measurement in measurements:
|
||||
for dimension in self.default_dimensions.keys():
|
||||
if not measurement.dimensions.has_key(dimension):
|
||||
measurement.dimensions.update({dimension: self.default_dimensions[dimension]})
|
||||
|
||||
self._post(measurements)
|
||||
|
||||
def get_client(self):
|
||||
"""get_client
|
||||
get a new mon-client object
|
||||
"""
|
||||
token = self.keystone.refresh_token()
|
||||
# Re-create the client. This is temporary until
|
||||
# the client is updated to be able to reset the
|
||||
# token.
|
||||
kwargs = {
|
||||
'token': token
|
||||
}
|
||||
return client.Client(self.api_version, self.url, **kwargs)
|
|
@ -0,0 +1,328 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Datadog
|
||||
www.datadoghq.com
|
||||
----
|
||||
Make sense of your IT Data
|
||||
|
||||
Licensed under Simplified BSD License (see LICENSE)
|
||||
(C) Boxed Ice 2010 all rights reserved
|
||||
(C) Datadog, Inc. 2010-2013 all rights reserved
|
||||
"""
|
||||
|
||||
# set up logging before importing any other components
|
||||
from monagent.common.config import initialize_logging
|
||||
from monagent.forwarder.api.mon import MonAPI
|
||||
|
||||
initialize_logging('forwarder')
|
||||
from monagent.common.config import get_logging_config
|
||||
|
||||
import os
|
||||
os.umask(022)
|
||||
|
||||
# Standard imports
|
||||
import logging
|
||||
import sys
|
||||
from datetime import timedelta
|
||||
import signal
|
||||
from socket import gaierror, error as socket_error
|
||||
|
||||
# Tornado
|
||||
import tornado.httpclient
|
||||
import tornado.httpserver
|
||||
import tornado.ioloop
|
||||
import tornado.web
|
||||
from tornado.escape import json_decode
|
||||
from tornado.options import define, parse_command_line, options
|
||||
|
||||
# agent import
|
||||
from monagent.common.check_status import ForwarderStatus
|
||||
from monagent.common.config import get_config
|
||||
from monagent.common.metrics import Measurement
|
||||
from monagent.common.util import Watchdog, get_tornado_ioloop
|
||||
from transaction import Transaction, TransactionManager
|
||||
|
||||
log = logging.getLogger('forwarder')
|
||||
log.setLevel(get_logging_config()['log_level'] or logging.INFO)
|
||||
|
||||
TRANSACTION_FLUSH_INTERVAL = 5000 # Every 5 seconds
|
||||
WATCHDOG_INTERVAL_MULTIPLIER = 10 # 10x flush interval
|
||||
|
||||
# Maximum delay before replaying a transaction
|
||||
MAX_WAIT_FOR_REPLAY = timedelta(seconds=90)
|
||||
|
||||
# Maximum queue size in bytes (when this is reached, old messages are dropped)
|
||||
MAX_QUEUE_SIZE = 30 * 1024 * 1024 # 30MB
|
||||
|
||||
THROTTLING_DELAY = timedelta(microseconds=1000000/2) # 2 msg/second
|
||||
|
||||
|
||||
class MetricTransaction(Transaction):
    """One batch of measurements queued for delivery to the API endpoint(s).

    Class-level state (application, transaction manager, endpoints) is shared
    by every transaction and must be configured through the set_* classmethods
    before any instance is created.
    """

    # Shared across all transactions; populated via the classmethods below.
    _application = None
    _trManager = None
    _endpoints = []

    @classmethod
    def set_application(cls, app):
        # The tornado application this transaction belongs to.
        cls._application = app

    @classmethod
    def set_tr_manager(cls, manager):
        # Manager that queues, retries and throttles transactions.
        cls._trManager = manager

    @classmethod
    def get_tr_manager(cls):
        return cls._trManager

    @classmethod
    def set_endpoints(cls, endpoint):
        # todo we only have one endpoint option, generalize it better
        # the immediate use case for two endpoints could be our own monitoring boxes, they could send to
        # the main api and mini-mon api
        cls._endpoints.append(endpoint)

    def __init__(self, data, headers):
        # data: the measurement payload; headers: HTTP headers from the poster.
        self._data = data
        self._headers = headers

        # Call after data has been set (size is computed in Transaction's init)
        Transaction.__init__(self)

        # Insert the transaction in the Manager
        self._trManager.append(self)
        log.debug("Created transaction %d" % self.get_id())
        self._trManager.flush()

    def __sizeof__(self):
        # Size accounting is based on the payload only, not the headers.
        return sys.getsizeof(self._data)

    def flush(self):
        """Post the payload to every endpoint and report the outcome
        (success or error) to the transaction manager."""
        try:
            for endpoint in self._endpoints:
                endpoint.post_metrics(self._data)
        except Exception:
            log.exception('Error flushing metrics to remote endpoints')
            self._trManager.tr_error(self)
        else:
            self._trManager.tr_success(self)
        # Always give the next queued transaction a chance to go out.
        self._trManager.flush_next()
|
||||
|
||||
|
||||
class StatusHandler(tornado.web.RequestHandler):
    """Render an HTML table of the pending transactions.

    With ?threshold=N the handler answers 503 when more than N transactions
    are queued, which makes it usable as a health-check endpoint.
    """

    def get(self):
        threshold = int(self.get_argument('threshold', -1))

        manager = MetricTransaction.get_tr_manager()
        transactions = manager.get_transactions()

        self.write("<table><tr><td>Id</td><td>Size</td><td>Error count</td><td>Next flush</td></tr>")
        for txn in transactions:
            self.write("<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" %
                       (txn.get_id(), txn.get_size(), txn.get_error_count(), txn.get_next_flush()))
        self.write("</table>")

        # Negative threshold (the default) disables the overload signal.
        if 0 <= threshold < len(transactions):
            self.set_status(503)
|
||||
|
||||
|
||||
class AgentInputHandler(tornado.web.RequestHandler):
    """Accept measurement batches posted by local agents."""

    def post(self):
        """Read the message and forward it to the intake

        The message is expected to follow the format:

        """
        # read the message it should be a list of monagent.common.metrics.Measurements expressed as a dict
        msg = tornado.escape.json_decode(self.request.body)
        try:
            log.debug(msg)
            measurements = [Measurement(**m) for m in msg]
        except Exception:
            log.exception('Error parsing body of Agent Input')
            raise tornado.web.HTTPError(500)

        # An empty batch is treated as a client error.
        if not measurements:
            raise tornado.web.HTTPError(500)

        # Queue the batch as a transaction for delivery.
        tr = MetricTransaction(measurements, self.request.headers)
        self.write("Transaction: %s" % tr.get_id())
|
||||
|
||||
|
||||
class Forwarder(tornado.web.Application):
    """Tornado application that accepts measurements from local agents and
    forwards them to the configured API endpoint(s) via transactions."""

    def __init__(self, port, agent_config, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agent_config
        self._metrics = {}
        # Wire up the class-level state shared by all MetricTransactions.
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints(MonAPI(agent_config['Api']))
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        # A config-file setting can also disable SSL validation.
        self.skip_ssl_validation = skip_ssl_validation or agent_config.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout, max_mem_mb=agent_config.get('limit_memory_consumption', None))

    def _post_metrics(self):
        # Flush internally accumulated metrics as one transaction, then reset.
        if len(self._metrics) > 0:
            MetricTransaction(self._metrics, headers={'Content-Type': 'application/json'})
            self._metrics = {}

    # todo why is the tornado logging method overridden? Perhaps ditch this.
    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def run(self):
        """Bind the HTTP server, schedule the periodic flush callback and
        start the IO loop. Blocks until stop() is called; exits the process
        on bind failure."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", AgentInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address="localhost")
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error, e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error, e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = get_tornado_ioloop()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            # Periodic heartbeat: keep the watchdog alive and push queued data.
            if self._watchdog:
                self._watchdog.reset()
            self._post_metrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        # Stop the IO loop; run() then returns.
        self.mloop.stop()
|
||||
|
||||
|
||||
def init_forwarder(skip_ssl_validation=False, use_simple_http_client=False):
    """Build the Forwarder application and install SIGTERM/SIGINT handlers.

    Returns the (not yet running) Forwarder app.
    """
    agent_config = get_config(parse_args=False)

    # An explicit None in the config still falls back to the default port.
    port = agent_config.get('listen_port', 17123)
    port = 17123 if port is None else int(port)

    app = Forwarder(port, agent_config, skip_ssl_validation=skip_ssl_validation,
                    use_simple_http_client=use_simple_http_client)

    def sigterm_handler(signum, frame):
        # Both SIGTERM and SIGINT shut the IO loop down cleanly.
        log.info("caught sigterm. stopping")
        app.stop()

    signal.signal(signal.SIGTERM, sigterm_handler)
    signal.signal(signal.SIGINT, sigterm_handler)

    return app
|
||||
|
||||
|
||||
def main():
|
||||
define("sslcheck", default=1, help="Verify SSL hostname, on by default")
|
||||
define("use_simple_http_client", default=0, help="Use Tornado SimpleHTTPClient instead of CurlAsyncHTTPClient")
|
||||
args = parse_command_line()
|
||||
skip_ssl_validation = False
|
||||
use_simple_http_client = False
|
||||
|
||||
if unicode(options.sslcheck) == u"0":
|
||||
skip_ssl_validation = True
|
||||
|
||||
if unicode(options.use_simple_http_client) == u"1":
|
||||
use_simple_http_client = True
|
||||
|
||||
# If we don't have any arguments, run the server.
|
||||
if not args:
|
||||
app = init_forwarder(skip_ssl_validation, use_simple_http_client=use_simple_http_client)
|
||||
try:
|
||||
app.run()
|
||||
finally:
|
||||
ForwarderStatus.remove_latest_status()
|
||||
|
||||
else:
|
||||
usage = "%s [help|info]. Run with no commands to start the server" % (sys.argv[0])
|
||||
command = args[0]
|
||||
if command == 'info':
|
||||
logging.getLogger().setLevel(logging.ERROR)
|
||||
return ForwarderStatus.print_latest_status()
|
||||
elif command == 'help':
|
||||
print usage
|
||||
else:
|
||||
print "Unknown command: %s" % command
|
||||
print usage
|
||||
return -1
|
||||
return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
|
@ -1,154 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
A Python Statsd implementation with dimensions added
|
||||
"""
|
||||
|
||||
# set up logging before importing any other components
|
||||
from monagent.common.config import initialize_logging
|
||||
from monagent.monstatsd.reporter import Reporter
|
||||
from monagent.monstatsd.udp import Server
|
||||
|
||||
initialize_logging('monstatsd')
|
||||
|
||||
import os
|
||||
os.umask(022)
|
||||
|
||||
# stdlib
|
||||
import logging
|
||||
import optparse
|
||||
import signal
|
||||
import sys
|
||||
|
||||
# project
|
||||
from monagent.common.aggregator import MetricsBucketAggregator
|
||||
from monagent.common.check_status import MonstatsdStatus
|
||||
from monagent.common.config import get_config
|
||||
from monagent.common.daemon import Daemon, AgentSupervisor
|
||||
from monagent.common.util import PidFile, get_hostname
|
||||
|
||||
log = logging.getLogger('monstatsd')
|
||||
|
||||
|
||||
class Monstatsd(Daemon):
    """ This class is the monstatsd daemon. """

    def __init__(self, pid_file, server, reporter, autorestart):
        # pid_file: path to the daemon pid file.
        # server: UDP statsd server; its start() blocks in run().
        # reporter: thread that periodically ships aggregated metrics.
        Daemon.__init__(self, pid_file, autorestart=autorestart)
        self.server = server
        self.reporter = reporter

    def _handle_sigterm(self, signum, frame):
        # Stopping the server unblocks run(), which then shuts down the reporter.
        log.debug("Caught sigterm. Stopping run loop.")
        self.server.stop()

    def run(self):
        """Run the server until it stops, then shut down the reporter.

        May call sys.exit() when autorestart is enabled.
        """
        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Start the reporting thread before accepting data
        self.reporter.start()

        try:
            try:
                self.server.start()
            except Exception, e:
                log.exception('Error starting server')
                raise e
        finally:
            # The server will block until it's done. Once we're here, shutdown
            # the reporting thread.
            self.reporter.stop()
            self.reporter.join()
            log.info("Monstatsd is stopped")
            # Restart if asked to restart
            if self.autorestart:
                sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)

    def info(self):
        # Quiet the root logger so only the status report is printed.
        logging.getLogger().setLevel(logging.ERROR)
        return MonstatsdStatus.print_latest_status()
|
||||
|
||||
|
||||
def init_monstatsd(config_path=None, use_watchdog=False):
    """Configure the server and the reporting thread.
    """
    cfg = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration monstatsd")

    port = cfg['monstatsd_port']
    interval = int(cfg['monstatsd_interval'])
    aggregator_interval = int(cfg['monstatsd_agregator_bucket_size'])
    non_local_traffic = cfg['non_local_traffic']
    forward_to_host = cfg.get('statsd_forward_host')
    forward_to_port = cfg.get('statsd_forward_port')
    event_chunk_size = cfg.get('event_chunk_size')

    target = cfg['forwarder_url']
    hostname = get_hostname(cfg)

    # The aggregator is the shared state between the server thread (which
    # feeds it samples) and the reporter thread (which drains it).
    assert interval > 0
    aggregator = MetricsBucketAggregator(
        hostname, aggregator_interval,
        recent_point_threshold=cfg.get('recent_point_threshold', None))

    # Reporting thread (started by the caller).
    reporter = Reporter(interval, aggregator, target, use_watchdog, event_chunk_size)

    # IPv4 server: bind to loopback unless non_local_traffic asks for all
    # addresses.
    server_host = '' if non_local_traffic else 'localhost'
    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, cfg
|
||||
|
||||
|
||||
def main(config_path=None):
    """ The main entry point for the unix version of monstatsd. """
    parser = optparse.OptionParser("%prog [start|stop|restart|status]")
    opts, args = parser.parse_args()

    reporter, server, cnf = init_monstatsd(config_path, use_watchdog=True)
    pid_file = PidFile('monstatsd')
    daemon = Monstatsd(pid_file.get_path(), server, reporter,
                       cnf.get('autorestart', False))

    # No arguments: run the server in the foreground.
    # todo does this need to be a daemon even when it basically always runs in the foreground, if not
    # restructure and get rid of the silly init_function
    if not args:
        daemon.run()
        return 0

    # Otherwise dispatch the daemon command.
    command = args[0]
    actions = {
        'start': daemon.start,
        'stop': daemon.stop,
        'restart': daemon.restart,
        'status': daemon.status,
    }
    if command in actions:
        actions[command]()
    elif command == 'info':
        return daemon.info()
    else:
        sys.stderr.write("Unknown command: %s\n\n" % command)
        parser.print_help()
        return 1
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
|
@ -0,0 +1,154 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
A Python Statsd implementation with dimensions added
|
||||
"""
|
||||
|
||||
# set up logging before importing any other components
|
||||
from monagent.common.config import initialize_logging
|
||||
from monagent.monstatsd.reporter import Reporter
|
||||
from monagent.monstatsd.udp import Server
|
||||
|
||||
initialize_logging('monstatsd')
|
||||
|
||||
import os
|
||||
os.umask(022)
|
||||
|
||||
# stdlib
|
||||
import logging
|
||||
import optparse
|
||||
import signal
|
||||
import sys
|
||||
|
||||
# project
|
||||
from monagent.common.aggregator import MetricsBucketAggregator
|
||||
from monagent.common.check_status import MonstatsdStatus
|
||||
from monagent.common.config import get_config
|
||||
from monagent.common.daemon import Daemon, AgentSupervisor
|
||||
from monagent.common.util import PidFile, get_hostname
|
||||
|
||||
log = logging.getLogger('monstatsd')
|
||||
|
||||
|
||||
class Monstatsd(Daemon):
    """ This class is the monstatsd daemon. """

    def __init__(self, pid_file, server, reporter, autorestart):
        # pid_file: path to the daemon pid file.
        # server: UDP statsd server; its start() blocks in run().
        # reporter: thread that periodically ships aggregated metrics.
        Daemon.__init__(self, pid_file, autorestart=autorestart)
        self.server = server
        self.reporter = reporter

    def _handle_sigterm(self, signum, frame):
        # Stopping the server unblocks run(), which then shuts down the reporter.
        log.debug("Caught sigterm. Stopping run loop.")
        self.server.stop()

    def run(self):
        """Run the server until it stops, then shut down the reporter.

        May call sys.exit() when autorestart is enabled.
        """
        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Start the reporting thread before accepting data
        self.reporter.start()

        try:
            try:
                self.server.start()
            except Exception, e:
                log.exception('Error starting server')
                raise e
        finally:
            # The server will block until it's done. Once we're here, shutdown
            # the reporting thread.
            self.reporter.stop()
            self.reporter.join()
            log.info("Monstatsd is stopped")
            # Restart if asked to restart
            if self.autorestart:
                sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)

    def info(self):
        # Quiet the root logger so only the status report is printed.
        logging.getLogger().setLevel(logging.ERROR)
        return MonstatsdStatus.print_latest_status()
|
||||
|
||||
|
||||
def init_monstatsd(config_path=None, use_watchdog=False):
    """Configure the server and the reporting thread.
    """
    cfg = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration monstatsd")

    port = cfg['monstatsd_port']
    interval = int(cfg['monstatsd_interval'])
    aggregator_interval = int(cfg['monstatsd_agregator_bucket_size'])
    non_local_traffic = cfg['non_local_traffic']
    forward_to_host = cfg.get('statsd_forward_host')
    forward_to_port = cfg.get('statsd_forward_port')
    event_chunk_size = cfg.get('event_chunk_size')

    target = cfg['forwarder_url']
    hostname = get_hostname(cfg)

    # The aggregator is the shared state between the server thread (which
    # feeds it samples) and the reporter thread (which drains it).
    assert interval > 0
    aggregator = MetricsBucketAggregator(
        hostname, aggregator_interval,
        recent_point_threshold=cfg.get('recent_point_threshold', None))

    # Reporting thread (started by the caller).
    reporter = Reporter(interval, aggregator, target, use_watchdog, event_chunk_size)

    # IPv4 server: bind to loopback unless non_local_traffic asks for all
    # addresses.
    server_host = '' if non_local_traffic else 'localhost'
    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, cfg
|
||||
|
||||
|
||||
def main(config_path=None):
    """ The main entry point for the unix version of monstatsd. """
    parser = optparse.OptionParser("%prog [start|stop|restart|status]")
    opts, args = parser.parse_args()

    reporter, server, cnf = init_monstatsd(config_path, use_watchdog=True)
    pid_file = PidFile('monstatsd')
    daemon = Monstatsd(pid_file.get_path(), server, reporter,
                       cnf.get('autorestart', False))

    # No arguments: run the server in the foreground.
    # todo does this need to be a daemon even when it basically always runs in the foreground, if not
    # restructure and get rid of the silly init_function
    if not args:
        daemon.run()
        return 0

    # Otherwise dispatch the daemon command.
    command = args[0]
    actions = {
        'start': daemon.start,
        'stop': daemon.stop,
        'restart': daemon.restart,
        'status': daemon.status,
    }
    if command in actions:
        actions[command]()
    elif command == 'info':
        return daemon.info()
    else:
        sys.stderr.write("Unknown command: %s\n\n" % command)
        parser.print_help()
        return 1
    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
|
@ -17,7 +17,7 @@ class Plugins(collections.defaultdict):
|
|||
# todo Possibly enforce the key being a string without .yaml in it.
|
||||
|
||||
def diff(self, other_plugins):
    """Compute the difference between this Plugins object and another.

    Not yet implemented; always raises NotImplementedError.
    """
    # Fixed: a duplicated, unreachable second `raise NotImplementedError`
    # (a leftover diff artifact) was removed.
    raise NotImplementedError
|
||||
|
||||
def merge(self, other):
|
||||
"""Do a deep merge with precedence going to other (as is the case with update)
|
||||
|
|
|
@ -1,99 +1,3 @@
|
|||
"""Classes and utilities for detection of running resources to be monitored.
|
||||
Detection classes should be platform independent
|
||||
"""
|
||||
import psutil
|
||||
from plugin import Plugin
|
||||
from utils import find_process_cmdline, find_process_name, watch_process, service_api_check
|
||||
|
||||
from monsetup import agent_config
|
||||
|
||||
|
||||
class Plugin(object):
    """Abstract class implemented by the mon-agent plugin detection classes
    """
    # todo these should include dependency detection

    def __init__(self, template_dir, overwrite=True):
        # Detection runs eagerly: subclasses set self.available in _detect().
        self.available = False
        self.template_dir = template_dir
        self.dependencies = ()
        self.overwrite = overwrite
        self._detect()

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        raise NotImplementedError

    def build_config(self):
        """Build the config as a Plugins object and return.
        """
        raise NotImplementedError

    def dependencies_installed(self):
        """return True if dependencies are installed
        """
        raise NotImplementedError

    @property
    def name(self):
        """Return _name if set otherwise the class name"""
        # Only an instance-level _name counts as an override, mirroring the
        # original explicit __dict__ membership test.
        return self.__dict__.get('_name', self.__class__.__name__)
|
||||
|
||||
|
||||
def find_process_cmdline(search_string):
    """Return the first running process whose command line contains
    search_string, or None when nothing matches.
    """
    for proc in psutil.process_iter():
        if any(search_string in arg for arg in proc.cmdline()):
            return proc
    return None
|
||||
|
||||
|
||||
def find_process_name(pname):
    """Return the first running process whose name equals pname, else None.
    """
    for proc in psutil.process_iter():
        if proc.name() == pname:
            return proc
    return None
|
||||
|
||||
|
||||
def watch_process(search_strings, service=None):
    """Takes a list of process search strings and returns a Plugins object with the config set.
    This was built as a helper as many plugins setup process watching
    """
    parameters = {'name': search_strings[0],
                  'search_string': search_strings}

    # If service parameter is set in the plugin config, add the service dimension which
    # will override the service in the agent config
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['process'] = {'init_config': None,
                         'instances': [parameters]}
    return config
|
||||
|
||||
def service_api_check(name, url, pattern, service=None):
    """Setup a service api to be watched by the http_check plugin."""
    parameters = {'name': name,
                  'url': url,
                  'match_pattern': pattern,
                  'timeout': 10,
                  'use_keystone': True}

    # If service parameter is set in the plugin config, add the service dimension which
    # will override the service in the agent config
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['http_check'] = {'init_config': None,
                            'instances': [parameters]}
    return config
|
||||
|
|
|
@ -4,6 +4,7 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Cinder(Plugin):
|
||||
"""Detect cinder daemons and setup configuration to monitor them."""
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CinderAPI(Plugin):
|
||||
"""Detect the Cinder-API daemon and setup configuration to monitor it."""
|
||||
|
||||
|
@ -21,11 +22,12 @@ class CinderAPI(Plugin):
|
|||
config = agent_config.Plugins()
|
||||
# First watch the Nova-API process
|
||||
log.info("\tMonitoring the cinder API process.")
|
||||
config.merge(watch_process([self.process_name], self.service_name))
|
||||
config.merge(watch_process([self.process_name], self.service_name))
|
||||
|
||||
# Next setup an active http_status check on the API
|
||||
log.info("\tConfiguring an http_check for the cinder API.")
|
||||
config.merge(service_api_check(self.process_name, 'http://localhost:8776/v2.0', '.*version=1.*', self.service_name))
|
||||
config.merge(
|
||||
service_api_check(self.process_name, 'http://localhost:8776/v2.0', '.*version=1.*', self.service_name))
|
||||
|
||||
return config
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@ class Kafka(Plugin):
|
|||
|
||||
import kazoo
|
||||
from kazoo.client import KazooClient
|
||||
|
||||
logging.getLogger('kazoo').setLevel(logging.WARN) # kazoo fills up the console without this
|
||||
|
||||
zk = KazooClient(hosts='127.0.0.1:2181', read_only=True)
|
||||
|
@ -48,7 +49,6 @@ class Kafka(Plugin):
|
|||
except kazoo.exceptions.NoNodeError:
|
||||
continue
|
||||
|
||||
|
||||
log.info("\tInstalling kafka_consumer plugin.")
|
||||
config['kafka_consumer'] = {'init_config': None,
|
||||
'instances': [{'kafka_connect_str': 'localhost:9092',
|
||||
|
|
|
@ -4,6 +4,7 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Nova(Plugin):
|
||||
"""Detect Nova daemons and setup configuration to monitor them."""
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NovaAPI(Plugin):
|
||||
"""Detect the Nova-API daemon and setup configuration to monitor it."""
|
||||
|
||||
|
@ -25,7 +26,8 @@ class NovaAPI(Plugin):
|
|||
|
||||
# Next setup an active http_status check on the API
|
||||
log.info("\tConfiguring an http_check for the nova API.")
|
||||
config.merge(service_api_check(self.process_name, 'http://localhost:8774/v2.0', '.*version=2.*', self.service_name))
|
||||
config.merge(
|
||||
service_api_check(self.process_name, 'http://localhost:8774/v2.0', '.*version=2.*', self.service_name))
|
||||
|
||||
return config
|
||||
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
"""Classes for detection of running resources to be monitored.
|
||||
Detection classes should be platform independent
|
||||
"""
|
||||
|
||||
|
||||
class Plugin(object):
    """Abstract class implemented by the mon-agent plugin detection classes
    """
    # todo these should include dependency detection

    def __init__(self, template_dir, overwrite=True):
        # Detection runs eagerly: subclasses set self.available in _detect().
        self.available = False
        self.template_dir = template_dir
        self.dependencies = ()
        self.overwrite = overwrite
        self._detect()

    def _detect(self):
        """Run detection, set self.available True if the service is detected."""
        raise NotImplementedError

    def build_config(self):
        """Build the config as a Plugins object and return.
        """
        raise NotImplementedError

    def dependencies_installed(self):
        """return True if dependencies are installed
        """
        raise NotImplementedError

    @property
    def name(self):
        """Return _name if set otherwise the class name"""
        # Only an instance-level _name counts as an override, mirroring the
        # original explicit __dict__ membership test.
        return self.__dict__.get('_name', self.__class__.__name__)
|
||||
|
||||
|
|
@ -4,18 +4,19 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Swift(Plugin):
|
||||
"""Detect Swift daemons and setup configuration to monitor them."""
|
||||
|
||||
def _detect(self):
|
||||
"""Run detection"""
|
||||
self.swift_processes = ['swift-container-updater', 'swift-account-auditor',
|
||||
'swift-object-replicator', 'swift-container-replicator',
|
||||
'swift-object-auditor', 'swift-container-auditor',
|
||||
'swift-account-reaper', 'swift-container-sync',
|
||||
'swift-account-replicator', 'swift-object-updater',
|
||||
'swift-object-server', 'swift-account-server',
|
||||
'swift-container-server']
|
||||
'swift-object-replicator', 'swift-container-replicator',
|
||||
'swift-object-auditor', 'swift-container-auditor',
|
||||
'swift-account-reaper', 'swift-container-sync',
|
||||
'swift-account-replicator', 'swift-object-updater',
|
||||
'swift-object-server', 'swift-account-server',
|
||||
'swift-container-server']
|
||||
self.found_processes = []
|
||||
|
||||
for process in self.swift_processes:
|
||||
|
|
|
@ -4,6 +4,7 @@ from monsetup import agent_config
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SwiftAPI(Plugin):
|
||||
"""Detect the Swift-API daemon and setup configuration to monitor it."""
|
||||
|
||||
|
@ -25,7 +26,8 @@ class SwiftAPI(Plugin):
|
|||
|
||||
# Next setup an active http_status check on the API
|
||||
log.info("\tConfiguring an http_check for the swift API.")
|
||||
config.merge(service_api_check(self.process_name, 'http://localhost:8080/healthcheck', '.*OK.*', self.service_name))
|
||||
config.merge(
|
||||
service_api_check(self.process_name, 'http://localhost:8080/healthcheck', '.*OK.*', self.service_name))
|
||||
|
||||
return config
|
||||
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
""" Util functions to assist in detection.
|
||||
"""
|
||||
import psutil
|
||||
|
||||
from monsetup import agent_config
|
||||
|
||||
|
||||
def find_process_cmdline(search_string):
    """Return the first running process whose command line contains
    search_string, or None when nothing matches.
    """
    for proc in psutil.process_iter():
        if any(search_string in arg for arg in proc.cmdline()):
            return proc
    return None
|
||||
|
||||
|
||||
def find_process_name(pname):
    """Return the first running process whose name equals pname, else None.
    """
    for proc in psutil.process_iter():
        if proc.name() == pname:
            return proc
    return None
|
||||
|
||||
|
||||
def watch_process(search_strings, service=None):
    """Takes a list of process search strings and returns a Plugins object with the config set.
    This was built as a helper as many plugins setup process watching
    """
    parameters = {'name': search_strings[0],
                  'search_string': search_strings}

    # If service parameter is set in the plugin config, add the service dimension which
    # will override the service in the agent config
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['process'] = {'init_config': None,
                         'instances': [parameters]}
    return config
|
||||
|
||||
|
||||
def service_api_check(name, url, pattern, service=None):
    """Setup a service api to be watched by the http_check plugin."""
    parameters = {'name': name,
                  'url': url,
                  'match_pattern': pattern,
                  'timeout': 10,
                  'use_keystone': True}

    # If service parameter is set in the plugin config, add the service dimension which
    # will override the service in the agent config
    if service:
        parameters['dimensions'] = {'service': service}

    config = agent_config.Plugins()
    config['http_check'] = {'init_config': None,
                            'instances': [parameters]}
    return config
|
|
@ -1,44 +1,2 @@
|
|||
"""Classes implementing different methods for running mon-agent on startup as well as starting the process immediately
|
||||
"""
|
||||
import psutil
|
||||
from service import Service
|
||||
|
||||
|
||||
class Service(object):
    """Abstract interface that concrete mon-agent service managers implement."""

    def __init__(self, config_dir, log_dir, name='mon-agent'):
        # Directories the agent reads configuration from and logs to,
        # plus the service name used with the init system.
        self.config_dir = config_dir
        self.log_dir = log_dir
        self.name = name

    def enable(self):
        """Configure mon-agent to start at boot.

        Generally requires running as super user.
        """
        raise NotImplementedError

    def start(self, restart=True):
        """Start mon-agent.

        If the agent is already running and restart is True, restart it.
        """
        raise NotImplementedError

    def stop(self):
        """Stop mon-agent."""
        raise NotImplementedError

    def is_enabled(self):
        """Return True when mon-agent is configured to start at boot."""
        raise NotImplementedError

    @staticmethod
    def is_running():
        """Return True when mon-agent is running, False otherwise."""
        # Only the supervisor process is checked, not the individual
        # agent components it manages.
        return any('/etc/mon-agent/supervisor.conf' in proc.cmdline()
                   for proc in psutil.process_iter())
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
"""Classes implementing different methods for running mon-agent on startup as well as starting the process immediately
|
||||
"""
|
||||
import psutil
|
||||
|
||||
|
||||
class Service(object):
    """Abstract base class implementing the interface for various service types.

    Concrete subclasses (one per init system) must implement enable,
    start, stop and is_enabled.
    """
    def __init__(self, config_dir, log_dir, name='mon-agent'):
        # config_dir: directory the agent reads its configuration from.
        # log_dir: directory the agent writes logs to.
        # name: service name registered with the init system.
        self.config_dir = config_dir
        self.log_dir = log_dir
        self.name = name

    def enable(self):
        """Sets mon-agent to start on boot.

        Generally this requires running as super user
        """
        raise NotImplementedError

    def start(self, restart=True):
        """Starts mon-agent

        If the agent is running and restart is True, restart
        """
        raise NotImplementedError

    def stop(self):
        """Stops mon-agent
        """
        raise NotImplementedError

    def is_enabled(self):
        """Returns True if mon-agent is setup to start on boot, false otherwise
        """
        raise NotImplementedError

    @staticmethod
    def is_running():
        """Returns True if mon-agent is running, false otherwise
        """
        # Looking for the supervisor process not the individual components
        for process in psutil.process_iter():
            if '/etc/mon-agent/supervisor.conf' in process.cmdline():
                return True

        return False
|
4
setup.py
4
setup.py
|
@ -131,9 +131,9 @@ setup(
|
|||
packages=find_packages(exclude=['tests', 'build*', 'packaging*']),
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'mon-forwarder = monagent.forwarder:main',
|
||||
'mon-forwarder = monagent.forwarder.daemon:main',
|
||||
'mon-collector = monagent.collector.daemon:main',
|
||||
'monstatsd = monagent.monstatsd:main',
|
||||
'monstatsd = monagent.monstatsd.daemon:main',
|
||||
'mon-setup = monsetup.main:main'
|
||||
],
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue