Merge pull request #2 from mon/feature/agent_checks

First commit of agent checks for Nagios & ping
This commit is contained in:
hochmuth
2014-03-06 20:20:15 +00:00
2 changed files with 191 additions and 0 deletions

View File

@@ -0,0 +1,96 @@
#!/bin/env python
"""DataDog remote host aliveness checker"""
import socket
import subprocess
import sys
from checks import AgentCheck
class HostAlive(AgentCheck):
    """Agent check that tests whether a remote host is alive.

    Each instance must provide ``host_name`` (the host to probe) and
    ``alive_test`` (``ssh`` or ``ping``).  ``init_config`` may provide
    ``ssh_port``, ``ssh_timeout`` and ``ping_timeout``.

    The result is submitted as the ``host_alive`` gauge: 0 when the test
    succeeded, 1 otherwise.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _test_ssh(self, host, port, timeout=None):
        """Connect to the SSH port (typically 22) and look for a banner.

        :param host: host name or address to probe
        :param port: TCP port to connect to; 22 is used when None
        :param timeout: socket timeout in seconds; None leaves the socket
            in its default (blocking) mode
        :returns: True if the first data received starts with ``SSH``,
            False on any resolution, socket or connection error
        """
        if port is None:
            port = 22

        # Resolve before creating the socket so that a resolution failure
        # does not leave an open socket behind.
        try:
            host_ip = socket.gethostbyname(host)
        except socket.gaierror:
            self.log.error("Unable to resolve host %s", host)
            return False

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except socket.error as err:
            self.log.error("Error creating socket: %s", err)
            return False

        try:
            if timeout is not None:
                sock.settimeout(timeout)
            sock.connect((host_ip, port))
            banner = sock.recv(1024)
        except socket.error:
            return False
        finally:
            # Always release the socket, including on connect/recv errors.
            sock.close()

        # recv() returns bytes; b'SSH' is a plain str literal on Python 2.
        return banner.startswith(b'SSH')

    def _test_ping(self, host, timeout=None):
        """Send one ping to the host and report whether it answered.

        :param host: host name or address to ping
        :param timeout: seconds to wait for a reply; None uses ping's default
        :returns: False if ping exits non-zero or its output contains
            ``100%``, True otherwise
        """
        # Build the argument list directly so that the host is always passed
        # as exactly one argument, whatever characters it contains.
        if sys.platform.startswith('win'):
            ping_command = ['ping', '-n', '1']
            if timeout is not None:
                # On Windows, timeout is in milliseconds
                ping_command += ['-w', str(timeout * 1000)]
        else:
            ping_command = ['ping', '-c', '1', '-q']
            if timeout is not None:
                ping_command += ['-W', str(timeout)]
        ping_command.append(host)

        try:
            output = subprocess.check_output(ping_command,
                                             stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            return False

        # Look at the output for a packet loss percentage
        return b'100%' not in output

    def check(self, instance):
        """Run the desired host-alive check against this host.

        :param instance: instance configuration; reads ``host_name`` and
            ``alive_test``
        :raises KeyError: if ``host_name`` or ``alive_test`` is missing

        An unrecognized ``alive_test`` is logged and reported as a failure.
        """
        host_name = instance['host_name']
        tags = [
            'target_host:' + host_name,
            'observer_host:' + socket.getfqdn(),
        ]
        alive_test = instance['alive_test']

        if alive_test == 'ssh':
            success = self._test_ssh(host_name,
                                     self.init_config.get('ssh_port'),
                                     self.init_config.get('ssh_timeout'))
        elif alive_test == 'ping':
            success = self._test_ping(host_name,
                                      self.init_config.get('ping_timeout'))
        else:
            success = False
            self.log.info("Unrecognized alive_test %s", alive_test)

        # The gauge carries a status code: 0 on success, 1 on failure.
        self.gauge('host_alive', 0 if success else 1, tags=tags)

View File

@@ -0,0 +1,95 @@
#!/bin/env python
"""DataDog wrapper for Nagios checks"""
import hashlib
import json
import os
import pickle
import socket
import subprocess
import time
from checks import AgentCheck
class WrapNagios(AgentCheck):
    """Agent check that runs a Nagios check command and reports its result.

    Each instance must provide ``service_name`` (used as the gauge name) and
    ``check_command``; ``host_name`` and ``check_interval`` are optional.
    ``init_config`` may provide ``check_path`` (the PATH given to the
    command) and ``temp_file_path`` (directory for last-run state files).

    The command's exit status is submitted as the gauge value; 3 (UNKNOWN)
    is submitted when the command cannot be started.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _do_skip_check(self, instance, last_run_data):
        """Decide whether this check should be skipped for now.

        :param instance: instance configuration; reads ``service_name`` and
            the optional ``check_interval``
        :param last_run_data: mapping of service name to the time.time()
            value recorded when the check last ran
        :returns: True if a ``check_interval`` is configured, a last run is
            recorded, and the interval has not yet elapsed; False otherwise
        """
        service_name = instance['service_name']
        if (service_name not in last_run_data
                or 'check_interval' not in instance):
            return False
        next_run = last_run_data[service_name] + instance['check_interval']
        return time.time() < next_run

    def _last_run_file(self, service_name):
        """Return the path of the last-run state file for a service."""
        last_run_path = self.init_config.get('temp_file_path')
        # Use a default last_run_file if no temp_file is specified in the YAML
        if last_run_path is None:
            last_run_path = '/dev/shm/'
        if not last_run_path.endswith('/'):
            last_run_path += '/'
        # md5 needs a byte string; on Python 2 a plain str already is one,
        # so only text (unicode) names are encoded here.
        if not isinstance(service_name, bytes):
            service_name = service_name.encode('utf-8')
        return (last_run_path + 'nagios_wrapper_'
                + hashlib.md5(service_name).hexdigest() + '.pck')

    def _load_last_run_data(self, last_run_file):
        """Load last-run data from the state file.

        :returns: the stored dict, or an empty dict when the file does not
            exist, cannot be read, or does not contain a dict
        """
        if not os.path.isfile(last_run_file):
            return {}
        try:
            # NOTE(review): pickle.load can execute code embedded in a
            # crafted file. If this directory is writable by other users
            # (the default is /dev/shm/), consider a safer format such as
            # JSON for this state.
            with open(last_run_file, "rb") as file_r:
                last_run_data = pickle.load(file_r)
        except Exception as err:
            # Unpickling damaged data can raise many unrelated exception
            # types; a bad state file must not stop the check from running,
            # so it is logged and ignored (it is rewritten after the run).
            self.log.warning("Ignoring unreadable last-run file %s: %s",
                             last_run_file, err)
            return {}
        if not isinstance(last_run_data, dict):
            return {}
        return last_run_data

    def _save_last_run_data(self, last_run_file, last_run_data):
        """Write last-run data to the state file."""
        with open(last_run_file, "wb") as file_w:
            pickle.dump(last_run_data, file_w)

    def check(self, instance):
        """Run the command specified by check_command and capture the result.

        :param instance: instance configuration; reads ``service_name``,
            ``check_command`` and the optional ``host_name`` and
            ``check_interval``
        :raises KeyError: if ``service_name`` or ``check_command`` is missing

        Nothing is reported when the check is skipped because its
        ``check_interval`` has not elapsed since the last run.
        """
        service_name = instance['service_name']
        observer = socket.getfqdn()
        tags = [
            'observer_host:' + observer,
            # Without an explicit host_name the check targets this host.
            'target_host:' + instance.get('host_name', observer),
        ]

        last_run_file = self._last_run_file(service_name)
        last_run_data = self._load_last_run_data(last_run_file)

        # Exit here if it is not yet time to re-run this check
        if self._do_skip_check(instance, last_run_data):
            self.log.debug("Too soon since last check.")
            return

        # split() without an argument drops the empty strings that repeated
        # spaces would otherwise turn into empty command-line arguments.
        command = instance['check_command'].split()
        if not command:
            # Return an UNKNOWN code (3): there is nothing to execute.
            self.gauge(service_name, 3, tags=tags)
            self.log.info("check_command for %s is empty", service_name)
            return

        # Only override the environment when check_path is configured;
        # a None value inside env would make Popen raise a TypeError.
        extra_path = self.init_config.get('check_path')
        env = None if extra_path is None else {"PATH": extra_path}

        try:
            proc = subprocess.Popen(command,
                                    env=env,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            output = proc.communicate()
        except OSError:
            # Return an UNKNOWN code (3) if I have landed here
            self.gauge(service_name, 3, tags=tags)
            self.log.info("%s is missing or unreadable", command[0])
            return

        stdout = output[0]
        # No-op on Python 2, where the captured output is already a str.
        if not isinstance(stdout, str):
            stdout = stdout.decode('utf-8', 'replace')

        # The check detail is all the text before the pipe
        detail = stdout.split('|')[0]
        self.log.debug("detail is '%s'", detail)
        if detail != '':
            # Serialize the output for JSON-friendliness and add to the tags
            tags.append(json.dumps(detail))

        last_run_data[service_name] = time.time()
        # communicate() has waited for the process, so returncode is set.
        self.gauge(service_name, proc.returncode, tags=tags)

        # Save last-run data
        self._save_last_run_data(last_run_file, last_run_data)