Merge pull request #2 from mon/feature/agent_checks
First commit of agent checks for Nagios & ping
This commit is contained in:
96
monagent/checks.d/host_alive.py
Normal file
96
monagent/checks.d/host_alive.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/bin/env python
|
||||
"""DataDog remote host aliveness checker"""
|
||||
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from checks import AgentCheck
|
||||
|
||||
|
||||
class HostAlive(AgentCheck):
    """Agent check that tests whether a remote host is alive.

    Each instance supplies the target in ``host_name`` and selects the probe
    with ``alive_test`` (``'ssh'`` or ``'ping'``).  The optional ``ssh_port``,
    ``ssh_timeout`` and ``ping_timeout`` settings are read from
    ``init_config``.  The result is published as the ``host_alive`` gauge.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _test_ssh(self, host, port, timeout=None):
        """Connect to the SSH port and look for an SSH banner.

        :param host: host name to resolve and connect to.
        :param port: TCP port to connect to; ``None`` means 22.
        :param timeout: optional socket timeout passed to ``settimeout``.
        :returns: True when the first bytes received start with ``SSH``,
            False on any resolution, socket or banner failure.
        """
        if port is None:
            port = 22

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except socket.error as err:
            self.log.error("Error creating socket: %s", err)
            return False

        # Everything past this point owns an open socket, so a single
        # ``finally`` guarantees it is closed on every exit path.
        try:
            if timeout is not None:
                sock.settimeout(timeout)
            try:
                host_ip = socket.gethostbyname(host)
            except socket.gaierror:
                self.log.error("Unable to resolve host %s", host)
                return False
            sock.connect((host_ip, port))
            banner = sock.recv(1024)
        except socket.error:
            return False
        finally:
            sock.close()

        # ``recv`` yields a byte string; the ``b`` prefix keeps the
        # comparison valid on both Python 2 and Python 3.
        return banner.startswith(b'SSH')

    def _test_ping(self, host, timeout=None):
        """Attempt to ping the host with a single echo request.

        :param host: host name or address handed to ``ping``.
        :param timeout: optional timeout in seconds.
        :returns: False when ``ping`` exits non-zero, cannot be executed, or
            reports ``100%`` (packet loss) in its output; True otherwise.
        """
        # Build argv as a list so that the host is always exactly one
        # argument, whatever characters it contains.
        if sys.platform.startswith('win'):
            command = ['ping', '-n', '1']
            if timeout is not None:
                # On Windows, timeout is in (whole) milliseconds
                command += ['-w', str(int(float(timeout) * 1000))]
        else:
            command = ['ping', '-c', '1', '-q']
            if timeout is not None:
                command += ['-W', str(timeout)]
        command.append(host)

        try:
            output = subprocess.check_output(command,
                                             stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            return False
        except OSError as err:
            # Raised when the ping executable is missing or not runnable.
            self.log.error("Unable to run ping: %s", err)
            return False

        # Look at the output for a packet loss percentage
        return b'100%' not in output

    def check(self, instance):
        """Run the desired host-alive check against this host.

        Publishes the ``host_alive`` gauge tagged with the target and
        observer host names: 0 when the probe succeeded, 1 otherwise.  An
        unrecognized ``alive_test`` is logged and reported as 1.
        """
        host_name = instance['host_name']
        alive_test = instance['alive_test']

        tags = [
            'target_host:' + host_name,
            'observer_host:' + socket.getfqdn(),
        ]

        success = False

        if alive_test == 'ssh':
            success = self._test_ssh(host_name,
                                     self.init_config.get('ssh_port'),
                                     self.init_config.get('ssh_timeout'))
        elif alive_test == 'ping':
            success = self._test_ping(host_name,
                                      self.init_config.get('ping_timeout'))
        else:
            self.log.info("Unrecognized alive_test %s", alive_test)

        # NOTE(review): 0 means "alive" and 1 means "not alive"; presumably
        # this mirrors Nagios exit codes -- confirm before changing.
        self.gauge('host_alive', 0 if success else 1, tags=tags)
|
||||
|
95
monagent/checks.d/nagios_wrapper.py
Normal file
95
monagent/checks.d/nagios_wrapper.py
Normal file
@@ -0,0 +1,95 @@
|
||||
#!/bin/env python
|
||||
"""DataDog wrapper for Nagios checks"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from checks import AgentCheck
|
||||
|
||||
|
||||
class WrapNagios(AgentCheck):
    """Agent check that runs a Nagios-style command and reports its result.

    Each instance supplies ``service_name`` and ``check_command`` and may
    supply ``host_name`` and ``check_interval``.  ``init_config`` may supply
    ``check_path`` (used as the command's ``PATH``) and ``temp_file_path``
    (directory for the per-service last-run file).  The command's exit code
    is published as a gauge named after the service.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    def _do_skip_check(self, instance, last_run_data):
        """Decide whether this check ran too recently to run again.

        :param instance: instance configuration; ``check_interval`` is
            optional and is added to the last-run timestamp.
        :param last_run_data: mapping of service name to the ``time.time()``
            value recorded at its last run.
        :returns: True when the service has a recorded last run, an interval
            is configured, and that interval has not yet elapsed; otherwise
            False.
        """
        service = instance['service_name']
        if service not in last_run_data or 'check_interval' not in instance:
            return False
        return time.time() < last_run_data[service] + instance['check_interval']

    def _load_last_run(self, path):
        """Return the last-run mapping stored at *path*, or ``{}`` if unusable."""
        if not os.path.isfile(path):
            return {}
        # SECURITY NOTE(review): unpickling executes whatever the file
        # encodes, and the default directory is shared; confirm that only
        # the agent can write here or move to a data-only format.
        try:
            with open(path, "rb") as handle:
                data = pickle.load(handle)
        except (IOError, OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError, ValueError) as err:
            # A truncated or corrupt state file must not stop the check;
            # behave as if the check had never run.
            self.log.warning("Ignoring unreadable last-run file %s: %s",
                             path, err)
            return {}
        return data if isinstance(data, dict) else {}

    def _save_last_run(self, path, data):
        """Persist the last-run mapping to *path*, logging on failure."""
        try:
            with open(path, "wb") as handle:
                pickle.dump(data, handle)
        except (IOError, OSError) as err:
            self.log.warning("Unable to write last-run file %s: %s",
                             path, err)

    def check(self, instance):
        """Run the command specified by check_command and capture the result.

        The gauge named ``service_name`` receives the command's exit code, or
        3 (UNKNOWN) when the command cannot be started.  The text of stdout
        before the first ``|`` is JSON-encoded and appended to the tags.
        Nothing is reported when ``check_interval`` has not yet elapsed.
        """
        service = instance['service_name']
        observer = socket.getfqdn()

        tags = ['observer_host:' + observer]
        if 'host_name' in instance:
            tags.append('target_host:' + instance['host_name'])
        else:
            tags.append('target_host:' + observer)

        extra_path = self.init_config.get('check_path')

        # Use a default last-run directory if none is specified in the YAML
        last_run_path = self.init_config.get('temp_file_path')
        if last_run_path is None:
            last_run_path = '/dev/shm/'

        # md5 needs bytes; encode text names so non-ASCII ones work too.
        name_bytes = service
        if not isinstance(name_bytes, bytes):
            name_bytes = name_bytes.encode('utf-8')
        last_run_file = os.path.join(
            last_run_path,
            'nagios_wrapper_' + hashlib.md5(name_bytes).hexdigest() + '.pck')

        # Load last-run data from the shared memory file
        last_run_data = self._load_last_run(last_run_file)

        # Exit here if it is not yet time to re-run this check
        if self._do_skip_check(instance, last_run_data):
            self.log.debug("Too soon since last check.")
            return

        # Whitespace split: repeated spaces no longer yield empty arguments.
        # Quoted arguments are not interpreted.
        command = instance['check_command'].split()
        # Only override the environment when a PATH was configured; a None
        # value inside ``env`` would make Popen raise.
        env = {"PATH": extra_path} if extra_path is not None else None

        try:
            if not command:
                raise OSError("empty check_command")
            proc = subprocess.Popen(command,
                                    env=env,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            stdout = proc.communicate()[0]
        except OSError:
            # Return an UNKNOWN code (3) if I have landed here
            self.gauge(service, 3, tags=tags)
            self.log.info(instance['check_command'].split(" ")[0]
                          + " is missing or unreadable")
            return

        if not isinstance(stdout, str):
            # Python 3 pipes yield bytes; Python 2 output is already str.
            stdout = stdout.decode('utf-8', 'replace')

        # The check detail is all the text before the pipe
        detail = stdout.split('|')[0]
        self.log.debug("detail is '%s'", detail)
        if detail != '':
            # Serialize the output for JSON-friendliness and add to the tags
            tags.append(json.dumps(detail))

        last_run_data[service] = time.time()
        self.gauge(service, proc.returncode, tags=tags)

        # Save last-run data
        self._save_last_run(last_run_file, last_run_data)
|
||||
|
Reference in New Issue
Block a user