Merge pull request #2 from mon/feature/agent_checks
First commit of agent checks for Nagios & ping
This commit is contained in:
96
monagent/checks.d/host_alive.py
Normal file
96
monagent/checks.d/host_alive.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/bin/env python
|
||||||
|
"""DataDog remote host aliveness checker"""
|
||||||
|
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from checks import AgentCheck
|
||||||
|
|
||||||
|
|
||||||
|
class HostAlive(AgentCheck):
    """Agent check that tests whether a remote host is alive.

    Each instance supplies ``host_name`` (the target) and ``alive_test``
    (``'ssh'`` or ``'ping'``).  The result is published as the
    ``host_alive`` gauge: 0 when the test succeeded, 1 when it did not.
    Optional ``ssh_port``, ``ssh_timeout`` and ``ping_timeout`` settings are
    read from ``init_config``.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _test_ssh(self, host, port, timeout=None):
        """Connect to the SSH port (typically 22) and look for a banner.

        :param host: host name or IPv4 address of the target.
        :param port: TCP port to connect to; ``None`` selects port 22.
        :param timeout: socket timeout passed to ``settimeout``; ``None``
            leaves the socket's timeout unchanged.
        :returns: ``True`` if the data received starts with ``SSH``;
            ``False`` on resolution, socket or connection failure, or when
            the banner does not match.
        """
        if port is None:
            port = 22

        # Resolve before creating the socket so that a DNS failure cannot
        # leave an open socket behind.
        try:
            host_ip = socket.gethostbyname(host)
        except socket.gaierror:
            self.log.error("Unable to resolve host %s", host)
            return False

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except socket.error as msg:
            self.log.error("Error creating socket: %s", msg)
            return False

        try:
            if timeout is not None:
                sock.settimeout(timeout)
            sock.connect((host_ip, port))
            banner = sock.recv(1024)
        except socket.error:
            return False
        finally:
            # Always release the descriptor, including on connect/recv errors.
            sock.close()

        # recv() yields bytes, so compare against a bytes literal.
        return banner.startswith(b'SSH')

    def _test_ping(self, host, timeout=None):
        """Attempt to ping the host once using the system ``ping`` command.

        :param host: host name or address of the target.
        :param timeout: reply timeout in seconds, or ``None`` to use the
            ping command's own default.
        :returns: ``False`` if ping could not be run, exited with a non-zero
            status, or its output contains ``100%`` (treated as total packet
            loss); ``True`` otherwise.
        """
        # Build the argument list directly rather than splitting a string,
        # so that no empty or mis-split arguments can be produced.
        if sys.platform.startswith('win'):
            ping_command = ['ping', '-n', '1']
            if timeout is not None:
                # On Windows, timeout is in milliseconds
                ping_command += ['-w', str(int(timeout * 1000))]
        else:
            ping_command = ['ping', '-c', '1', '-q']
            if timeout is not None:
                ping_command += ['-W', str(timeout)]
        ping_command.append(host)

        try:
            ping = subprocess.check_output(ping_command,
                                           stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            return False
        except OSError as err:
            # Raised when the ping executable cannot be started at all.
            self.log.error("Unable to run ping: %s", err)
            return False

        # Look at the output for a packet loss percentage
        return b'100%' not in ping

    def check(self, instance):
        """Run the desired host-alive check against this host.

        :param instance: mapping that must contain ``host_name`` and
            ``alive_test``.  An unrecognized ``alive_test`` is logged and
            reported as a failure.
        """
        host_name = instance['host_name']
        tags = [
            'target_host:' + host_name,
            'observer_host:' + socket.getfqdn(),
        ]

        alive_test = instance['alive_test']
        success = False

        if alive_test == 'ssh':
            success = self._test_ssh(host_name,
                                     self.init_config.get('ssh_port'),
                                     self.init_config.get('ssh_timeout'))
        elif alive_test == 'ping':
            success = self._test_ping(host_name,
                                      self.init_config.get('ping_timeout'))
        else:
            self.log.warning("Unrecognized alive_test %s", alive_test)

        # The gauge value is 0 on success and 1 on failure.
        self.gauge('host_alive', 0 if success is True else 1, tags=tags)
|
||||||
|
|
||||||
95
monagent/checks.d/nagios_wrapper.py
Normal file
95
monagent/checks.d/nagios_wrapper.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
#!/bin/env python
|
||||||
|
"""DataDog wrapper for Nagios checks"""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
|
||||||
|
from checks import AgentCheck
|
||||||
|
|
||||||
|
|
||||||
|
class WrapNagios(AgentCheck):
    """Agent check that runs Nagios plugin commands and reports the result.

    Each instance supplies ``service_name`` (used as the gauge name) and
    ``check_command``; ``host_name`` and ``check_interval`` are optional.
    The command's exit status is published as the gauge value, or 3 when
    the command cannot be started.  ``check_path`` and ``temp_file_path``
    are read from ``init_config``.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _do_skip_check(self, instance, last_run_data):
        """Determine whether to skip a check that ran too recently.

        :param instance: check instance; ``check_interval`` is optional.
        :param last_run_data: mapping of service name to the ``time.time()``
            value recorded when that service was last checked.
        :returns: ``True`` only when the instance has a ``check_interval``,
            a last-run time is recorded for it, and that interval has not
            yet elapsed; ``False`` otherwise.
        """
        service_name = instance['service_name']
        if (service_name not in last_run_data
                or 'check_interval' not in instance):
            return False
        next_run = last_run_data[service_name] + instance['check_interval']
        return time.time() < next_run

    def _last_run_file(self, service_name):
        """Return the path of the last-run file for ``service_name``."""
        last_run_path = self.init_config.get('temp_file_path')
        # Use a default directory if no temp_file_path is specified
        if not last_run_path:
            last_run_path = '/dev/shm/'
        # md5 needs bytes; encode text names so non-ASCII names also work.
        if not isinstance(service_name, bytes):
            service_name = service_name.encode('utf-8')
        digest = hashlib.md5(service_name).hexdigest()
        return os.path.join(last_run_path, 'nagios_wrapper_' + digest + '.pck')

    def _load_last_run(self, last_run_file):
        """Load last-run data, returning ``{}`` if it is missing or unusable."""
        if not os.path.isfile(last_run_file):
            return {}
        # NOTE(review): unpickling can execute arbitrary code, and this file
        # lives in a directory that may be writable by other users (the
        # default is /dev/shm).  Consider a safer format such as JSON.
        try:
            with open(last_run_file, "rb") as file_r:
                last_run_data = pickle.load(file_r)
        except Exception as err:
            # pickle can raise many exception types on a truncated or
            # corrupt file; none of them should stop the check from running.
            self.log.warning("Ignoring unreadable last-run file %s: %s",
                             last_run_file, err)
            return {}
        if not isinstance(last_run_data, dict):
            return {}
        return last_run_data

    def _save_last_run(self, last_run_file, last_run_data):
        """Write last-run data, logging (not raising) on I/O failure."""
        try:
            with open(last_run_file, "wb") as file_w:
                pickle.dump(last_run_data, file_w)
        except (IOError, OSError) as err:
            self.log.warning("Unable to save last-run data to %s: %s",
                             last_run_file, err)

    def check(self, instance):
        """Run the command specified by check_command and capture the result.

        The text the command prints before the first ``|`` is added to the
        tags (JSON-encoded) when it is not empty.  The run is skipped when
        ``_do_skip_check`` reports that it is too soon since the last one.

        :param instance: mapping that must contain ``service_name`` and
            ``check_command``.
        """
        # Imported here because shlex is not among this file's imports.
        import shlex

        service_name = instance['service_name']
        observer = socket.getfqdn()
        tags = [
            'observer_host:' + observer,
            # Without an explicit host_name the check targets this host.
            'target_host:' + instance.get('host_name', observer),
        ]

        last_run_file = self._last_run_file(service_name)
        last_run_data = self._load_last_run(last_run_file)

        # Exit here if it is not yet time to re-run this check
        if self._do_skip_check(instance, last_run_data):
            self.log.debug("Too soon since last check.")
            return

        # shlex honours quoting and collapses repeated whitespace, unlike a
        # plain split on single spaces.
        command = shlex.split(instance['check_command'])

        # When check_path is configured the command runs with only that PATH
        # in its environment; otherwise inherit the agent's environment
        # instead of passing PATH=None, which Popen rejects.
        extra_path = self.init_config.get('check_path')
        env = {"PATH": extra_path} if extra_path is not None else None

        try:
            if not command:
                raise OSError("check_command is empty")
            proc = subprocess.Popen(command,
                                    env=env,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            stdout = proc.communicate()[0]
        except OSError:
            # Return an UNKNOWN code (3) if I have landed here
            self.gauge(service_name, 3, tags=tags)
            self.log.info("%s is missing or unreadable",
                          command[0] if command else "check_command")
            return

        if not isinstance(stdout, str):
            stdout = stdout.decode('utf-8', 'replace')
        # The check detail is all the text before the pipe
        detail = stdout.split('|')[0].strip()
        self.log.debug("detail is '%s'", detail)
        if detail:
            # Serialize the output for JSON-friendliness and add to the tags
            tags.append(json.dumps(detail))

        last_run_data[service_name] = time.time()
        self.gauge(service_name, proc.returncode, tags=tags)

        # Save last-run data
        self._save_last_run(last_run_file, last_run_data)
|
||||||
|
|
||||||
Reference in New Issue
Block a user