Remove 'detail' dimension, fix a couple of bugs
The 'detail' dimension in http_check is not actually an appropriate dimension, since it does not contain data that _identifies_ a metric. This data is valuable, though, and will be captured and made available in a future enhancement.

Bug fixes:
- Fix unix.py crash on EFI mountpoints which return 0 available inodes
- Rename 'service_name' to 'name' in nagios_wrapper for ServicesCheck compatibility
- Update nagios_wrapper YAML example for naming convention

Change-Id: I44877a856fac84f54ca764a79e75ace9f90d9302
This commit is contained in:
@@ -13,21 +13,10 @@ instances:
|
|||||||
# The (optional) match_pattern parameter will instruct the check
|
# The (optional) match_pattern parameter will instruct the check
|
||||||
# to match the HTTP response body against a regular-expression-
|
# to match the HTTP response body against a regular-expression-
|
||||||
# compatible pattern. If the pattern matches the check will
|
# compatible pattern. If the pattern matches the check will
|
||||||
# return 0 for OK. Otherwise, it will return 1 for an error and
|
# return 0 for OK. Otherwise, it will return 1 for an error
|
||||||
# include the response body as a 'detail' tag, as long as
|
|
||||||
# 'include_content' below is set to 'true'
|
|
||||||
|
|
||||||
# match_pattern: '.*OK.*OK.*OK.*OK.*OK'
|
# match_pattern: '.*OK.*OK.*OK.*OK.*OK'
|
||||||
|
|
||||||
# The (optional) include_content parameter will instruct the check
|
|
||||||
# to include the first 200 characters of the HTTP response body
|
|
||||||
# in notifications sent by this plugin. This is best used with
|
|
||||||
# "healthcheck"-type URLs, where the body contains a brief, human-
|
|
||||||
# readable summary of failure reasons in the case of errors. This
|
|
||||||
# defaults to false.
|
|
||||||
|
|
||||||
# include_content: true
|
|
||||||
|
|
||||||
# The (optional) collect_response_time parameter will instruct the
|
# The (optional) collect_response_time parameter will instruct the
|
||||||
# check to create a metric 'network.http.response_time', tagged with
|
# check to create a metric 'network.http.response_time', tagged with
|
||||||
# the url, reporting the response time in seconds.
|
# the url, reporting the response time in seconds.
|
||||||
|
|||||||
@@ -5,19 +5,19 @@ init_config:
|
|||||||
# Where to store last-run timestamps for each check (required)
|
# Where to store last-run timestamps for each check (required)
|
||||||
# temp_file_path: /dev/shm/
|
# temp_file_path: /dev/shm/
|
||||||
|
|
||||||
# List a check name under 'service_name' and the full command-line
|
# List a check name under 'name' and the full command-line
|
||||||
# under 'check_command'. If the command exists in 'check_path' above,
|
# under 'check_command'. If the command exists in 'check_path' above,
|
||||||
# it is not necessary to specify the full path.
|
# it is not necessary to specify the full path.
|
||||||
|
|
||||||
instances:
|
instances:
|
||||||
# - service_name: load
|
# - name: nagios.load
|
||||||
# check_command: check_load -r -w 2,1.5,1 -c 10,5,4
|
# check_command: check_load -r -w 2,1.5,1 -c 10,5,4
|
||||||
|
|
||||||
# - service_name: disk
|
# - name: nagios.disk
|
||||||
# check_command: check_disk -w 15\% -c 5\% -A -i /srv/node
|
# check_command: check_disk -w 15\% -c 5\% -A -i /srv/node
|
||||||
# check_interval: 300
|
# check_interval: 300
|
||||||
|
|
||||||
# - service_name: swap
|
# - name: nagios.swap
|
||||||
# check_command: /usr/lib/nagios/plugins/check_swap -w 50\% -c 10\%
|
# check_command: /usr/lib/nagios/plugins/check_swap -w 50\% -c 10\%
|
||||||
# check_interval: 120
|
# check_interval: 120
|
||||||
# dimensions: { 'group': 'memory' }
|
# dimensions: { 'group': 'memory' }
|
||||||
|
|||||||
@@ -114,6 +114,10 @@ class Disk(monagent.collector.checks.check.Check):
|
|||||||
except IndexError:
|
except IndexError:
|
||||||
self.logger.exception("Cannot parse %s" % (parts,))
|
self.logger.exception("Cannot parse %s" % (parts,))
|
||||||
|
|
||||||
|
# Some partitions (EFI boot) may appear to have 0 available inodes
|
||||||
|
if parts[1] == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
if inodes:
|
if inodes:
|
||||||
usage_data['%s.disk_inode_utilization_perc' % parts[0]] = float(parts[2]) / parts[1] * 100
|
usage_data['%s.disk_inode_utilization_perc' % parts[0]] = float(parts[2]) / parts[1] * 100
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -36,11 +36,10 @@ class HTTPCheck(ServicesCheck):
|
|||||||
pattern = instance.get('match_pattern', None)
|
pattern = instance.get('match_pattern', None)
|
||||||
if url is None:
|
if url is None:
|
||||||
raise Exception("Bad configuration. You must specify a url")
|
raise Exception("Bad configuration. You must specify a url")
|
||||||
include_content = instance.get('include_content', False)
|
|
||||||
ssl = instance.get('disable_ssl_validation', True)
|
ssl = instance.get('disable_ssl_validation', True)
|
||||||
token = AgentCheck.keystone.get_token()
|
token = AgentCheck.keystone.get_token()
|
||||||
|
|
||||||
return url, username, password, timeout, include_content, headers, response_time, dimensions, ssl, pattern, use_keystone, token
|
return url, username, password, timeout, headers, response_time, dimensions, ssl, pattern, use_keystone, token
|
||||||
|
|
||||||
def _create_status_event(self, status, msg, instance):
|
def _create_status_event(self, status, msg, instance):
|
||||||
"""Does nothing: status events are not yet supported by Mon API.
|
"""Does nothing: status events are not yet supported by Mon API.
|
||||||
@@ -49,7 +48,7 @@ class HTTPCheck(ServicesCheck):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _check(self, instance):
|
def _check(self, instance):
|
||||||
addr, username, password, timeout, include_content, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
|
addr, username, password, timeout, headers, response_time, dimensions, disable_ssl_validation, pattern, use_keystone, token = self._load_conf(
|
||||||
instance)
|
instance)
|
||||||
|
|
||||||
content = ''
|
content = ''
|
||||||
@@ -125,9 +124,7 @@ class HTTPCheck(ServicesCheck):
|
|||||||
running_time = time.time() - start
|
running_time = time.time() - start
|
||||||
self.gauge('http_response_time', running_time, dimensions=new_dimensions)
|
self.gauge('http_response_time', running_time, dimensions=new_dimensions)
|
||||||
|
|
||||||
# Add a 'detail' tag if requested
|
# TODO(dschroeder): Save/send content data when supported by API
|
||||||
if include_content:
|
|
||||||
new_dimensions['detail'] = json.dumps(content)
|
|
||||||
|
|
||||||
if int(resp.status) >= 400:
|
if int(resp.status) >= 400:
|
||||||
if use_keystone and int(resp.status) == 401:
|
if use_keystone and int(resp.status) == 401:
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#!/bin/env python
|
#!/bin/env python
|
||||||
"""Monitoring Agent wrapper for Nagios checks.
|
"""Monasca Agent wrapper for Nagios checks.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -26,12 +26,11 @@ class WrapNagios(ServicesCheck):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def _do_skip_check(instance, last_run_data):
|
def _do_skip_check(instance, last_run_data):
|
||||||
"""Determine whether or not to skip a check depending on
|
"""Determine whether or not to skip a check depending on
|
||||||
|
|
||||||
the checks's check_interval, if specified, and the last
|
the checks's check_interval, if specified, and the last
|
||||||
time the check was run
|
time the check was run
|
||||||
"""
|
"""
|
||||||
if instance['service_name'] in last_run_data and 'check_interval' in instance:
|
if instance['name'] in last_run_data and 'check_interval' in instance:
|
||||||
if time.time() < last_run_data[instance['service_name']] + instance['check_interval']:
|
if time.time() < last_run_data[instance['name']] + instance['check_interval']:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
@@ -70,7 +69,7 @@ class WrapNagios(ServicesCheck):
|
|||||||
last_run_path +
|
last_run_path +
|
||||||
'nagios_wrapper_' +
|
'nagios_wrapper_' +
|
||||||
hashlib.md5(
|
hashlib.md5(
|
||||||
instance['service_name']).hexdigest() +
|
instance['name']).hexdigest() +
|
||||||
'.pck')
|
'.pck')
|
||||||
|
|
||||||
# Load last-run data from shared memory file
|
# Load last-run data from shared memory file
|
||||||
@@ -92,22 +91,20 @@ class WrapNagios(ServicesCheck):
|
|||||||
output = proc.communicate()
|
output = proc.communicate()
|
||||||
# The check detail is all the text before the pipe
|
# The check detail is all the text before the pipe
|
||||||
detail = output[0].split('|')[0]
|
detail = output[0].split('|')[0]
|
||||||
if detail != '':
|
# TODO(dschroeder): Save/send 'detail' when supported by the API
|
||||||
# Serialize the output for JSON-friendliness and add to the dimensions
|
|
||||||
dimensions['detail'] = json.dumps(detail)
|
|
||||||
except OSError:
|
except OSError:
|
||||||
# Return an UNKNOWN code (3) if I have landed here
|
# Return an UNKNOWN code (3) if I have landed here
|
||||||
self.gauge(instance['service_name'], 3, dimensions=dimensions)
|
self.gauge(instance['name'], 3, dimensions=dimensions)
|
||||||
self.log.info(instance['check_command'].split(" ")[0] + " is missing or unreadable")
|
self.log.info(instance['check_command'].split(" ")[0] + " is missing or unreadable")
|
||||||
return
|
return
|
||||||
|
|
||||||
status_code = proc.poll()
|
status_code = proc.poll()
|
||||||
last_run_data[instance['service_name']] = time.time()
|
last_run_data[instance['name']] = time.time()
|
||||||
self.gauge(instance['service_name'], status_code, dimensions=dimensions)
|
self.gauge(instance['name'], status_code, dimensions=dimensions)
|
||||||
# Return DOWN on critical, UP otherwise
|
# Return DOWN on critical, UP otherwise
|
||||||
if status_code == "2":
|
if status_code == "2":
|
||||||
return Status.DOWN, "DOWN: " + dimensions['detail']
|
return Status.DOWN, "DOWN: {}".format(detail)
|
||||||
return Status.UP, "UP: " + dimensions['detail']
|
return Status.UP, "UP: {}".format(detail)
|
||||||
|
|
||||||
# Save last-run data
|
# Save last-run data
|
||||||
file_w = open(last_run_file, "w")
|
file_w = open(last_run_file, "w")
|
||||||
|
|||||||
Reference in New Issue
Block a user