Remove 'detail' dimension, fix a couple of bugs
The 'detail' dimension in http_check is not actually an appropriate dimension, since it does not contain data which _identifies_ a metric. This data is valuable, though, and will be captured and made available in a future enhancement.

Bug fixes:
- Fix unix.py crash on EFI mountpoints which return 0 available inodes
- Rename 'service_name' to 'name' in nagios_wrapper for ServicesCheck compatibility
- Update nagios_wrapper YAML example for naming convention

Change-Id: I44877a856fac84f54ca764a79e75ace9f90d9302
This commit is contained in:
@@ -13,21 +13,10 @@ instances:
|
||||
# The (optional) match_pattern parameter will instruct the check
|
||||
# to match the HTTP response body against a regular-expression-
|
||||
# compatible pattern. If the pattern matches the check will
|
||||
# return 0 for OK. Otherwise, it will return 1 for an error and
|
||||
# include the response body as a 'detail' tag, as long as
|
||||
# 'include_content' below is set to 'true'
|
||||
# return 0 for OK. Otherwise, it will return 1 for an error
|
||||
|
||||
# match_pattern: '.*OK.*OK.*OK.*OK.*OK'
|
||||
|
||||
# The (optional) include_content parameter will instruct the check
|
||||
# to include the first 200 characters of the HTTP response body
|
||||
# in notifications sent by this plugin. This is best used with
|
||||
# "healthcheck"-type URLs, where the body contains a brief, human-
|
||||
# readable summary of failure reasons in the case of errors. This
|
||||
# defaults to false.
|
||||
|
||||
# include_content: true
|
||||
|
||||
# The (optional) collect_response_time parameter will instruct the
|
||||
# check to create a metric 'network.http.response_time', tagged with
|
||||
# the url, reporting the response time in seconds.
|
||||
|
||||
@@ -5,19 +5,19 @@ init_config:
|
||||
# Where to store last-run timestamps for each check (required)
|
||||
# temp_file_path: /dev/shm/
|
||||
|
||||
# List a check name under 'service_name' and the full command-line
|
||||
# List a check name under 'name' and the full command-line
|
||||
# under 'check_command'. If the command exists in 'check_path' above,
|
||||
# it is not necessary to specify the full path.
|
||||
|
||||
instances:
|
||||
# - service_name: load
|
||||
# - name: nagios.load
|
||||
# check_command: check_load -r -w 2,1.5,1 -c 10,5,4
|
||||
|
||||
# - service_name: disk
|
||||
# - name: nagios.disk
|
||||
# check_command: check_disk -w 15\% -c 5\% -A -i /srv/node
|
||||
# check_interval: 300
|
||||
|
||||
# - service_name: swap
|
||||
# - name: nagios.swap
|
||||
# check_command: /usr/lib/nagios/plugins/check_swap -w 50\% -c 10\%
|
||||
# check_interval: 120
|
||||
# dimensions: { 'group': 'memory' }
|
||||
|
||||
@@ -114,6 +114,10 @@ class Disk(monagent.collector.checks.check.Check):
|
||||
except IndexError:
|
||||
self.logger.exception("Cannot parse %s" % (parts,))
|
||||
|
||||
# Some partitions (EFI boot) may appear to have 0 available inodes
|
||||
if parts[1] == 0:
|
||||
continue
|
||||
|
||||
if inodes:
|
||||
usage_data['%s.disk_inode_utilization_perc' % parts[0]] = float(parts[2]) / parts[1] * 100
|
||||
else:
|
||||
|
||||
@@ -36,11 +36,10 @@ class HTTPCheck(ServicesCheck):
|
||||
pattern = instance.get('match_pattern', None)
|
||||
if url is None:
|
||||
raise Exception("Bad configuration. You must specify a url")
|
||||
include_content = instance.get('include_content', False)
|
||||
ssl = instance.get('disable_ssl_validation', True)
|
||||
token = AgentCheck.keystone.get_token()
|
||||
|
||||
return url, username, password, timeout, include_content, headers, response_time, dimensions, ssl, pattern, use_keystone, token
|
||||
return url, username, password, timeout, headers, response_time, dimensions, ssl, pattern, use_keystone, token
|
||||
|
||||
def _create_status_event(self, status, msg, instance):
|
||||
"""Does nothing: status events are not yet supported by Mon API.
|
||||
@@ -49,7 +48,7 @@ class HTTPCheck(ServicesCheck):
|
||||
return
|
||||
|
||||
def _check(self, instance):
|
||||
addr, username, password, timeout, include_content, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
|
||||
addr, username, password, timeout, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
|
||||
instance)
|
||||
|
||||
content = ''
|
||||
@@ -125,9 +124,7 @@ class HTTPCheck(ServicesCheck):
|
||||
running_time = time.time() - start
|
||||
self.gauge('http_response_time', running_time, dimensions=new_dimensions)
|
||||
|
||||
# Add a 'detail' tag if requested
|
||||
if include_content:
|
||||
new_dimensions['detail'] = json.dumps(content)
|
||||
# TODO(dschroeder): Save/send content data when supported by API
|
||||
|
||||
if int(resp.status) >= 400:
|
||||
if use_keystone and int(resp.status) == 401:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/bin/env python
|
||||
"""Monitoring Agent wrapper for Nagios checks.
|
||||
"""Monasca Agent wrapper for Nagios checks.
|
||||
|
||||
"""
|
||||
|
||||
@@ -26,12 +26,11 @@ class WrapNagios(ServicesCheck):
|
||||
@staticmethod
|
||||
def _do_skip_check(instance, last_run_data):
|
||||
"""Determine whether or not to skip a check depending on
|
||||
|
||||
the check's check_interval, if specified, and the last
|
||||
time the check was run
|
||||
"""
|
||||
if instance['service_name'] in last_run_data and 'check_interval' in instance:
|
||||
if time.time() < last_run_data[instance['service_name']] + instance['check_interval']:
|
||||
if instance['name'] in last_run_data and 'check_interval' in instance:
|
||||
if time.time() < last_run_data[instance['name']] + instance['check_interval']:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
@@ -70,7 +69,7 @@ class WrapNagios(ServicesCheck):
|
||||
last_run_path +
|
||||
'nagios_wrapper_' +
|
||||
hashlib.md5(
|
||||
instance['service_name']).hexdigest() +
|
||||
instance['name']).hexdigest() +
|
||||
'.pck')
|
||||
|
||||
# Load last-run data from shared memory file
|
||||
@@ -92,22 +91,20 @@ class WrapNagios(ServicesCheck):
|
||||
output = proc.communicate()
|
||||
# The check detail is all the text before the pipe
|
||||
detail = output[0].split('|')[0]
|
||||
if detail != '':
|
||||
# Serialize the output for JSON-friendliness and add to the dimensions
|
||||
dimensions['detail'] = json.dumps(detail)
|
||||
# TODO(dschroeder): Save/send 'detail' when supported by the API
|
||||
except OSError:
|
||||
# Return an UNKNOWN code (3) if I have landed here
|
||||
self.gauge(instance['service_name'], 3, dimensions=dimensions)
|
||||
self.gauge(instance['name'], 3, dimensions=dimensions)
|
||||
self.log.info(instance['check_command'].split(" ")[0] + " is missing or unreadable")
|
||||
return
|
||||
|
||||
status_code = proc.poll()
|
||||
last_run_data[instance['service_name']] = time.time()
|
||||
self.gauge(instance['service_name'], status_code, dimensions=dimensions)
|
||||
last_run_data[instance['name']] = time.time()
|
||||
self.gauge(instance['name'], status_code, dimensions=dimensions)
|
||||
# Return DOWN on critical, UP otherwise
|
||||
if status_code == "2":
|
||||
return Status.DOWN, "DOWN: " + dimensions['detail']
|
||||
return Status.UP, "UP: " + dimensions['detail']
|
||||
return Status.DOWN, "DOWN: {}".format(detail)
|
||||
return Status.UP, "UP: {}".format(detail)
|
||||
|
||||
# Save last-run data
|
||||
file_w = open(last_run_file, "w")
|
||||
|
||||
Reference in New Issue
Block a user