diff --git a/monagent/checks.d/host_alive.py b/monagent/checks.d/host_alive.py
new file mode 100644
index 00000000..c21549b9
--- /dev/null
+++ b/monagent/checks.d/host_alive.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+"""DataDog remote host aliveness checker"""
+
+import socket
+import subprocess
+import sys
+
+from checks import AgentCheck
+
+
+class HostAlive(AgentCheck):
+    """Inherit the AgentCheck class to test whether a host is alive or not"""
+
+
+    def __init__(self, name, init_config, agentConfig, instances=None):
+        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
+
+    def _test_ssh(self, host, port, timeout=None):
+        """ Connect to the SSH port (typically 22) and look for a banner """
+        if port is None:
+            port = 22
+        try:
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            if timeout is not None:
+                sock.settimeout(timeout)
+        except socket.error, msg:
+            self.log.error("Error creating socket: " + str(msg[0]) + " " + msg[1])
+            return False
+
+        try:
+            host_ip = socket.gethostbyname(host)
+        except socket.gaierror:
+            self.log.error("Unable to resolve host " + host)
+            return False
+
+        try:
+            sock.connect((host_ip, port))
+            banner = sock.recv(1024)
+            sock.close()
+        except socket.error:
+            return False
+        if banner.startswith('SSH'):
+            return True
+        else:
+            return False
+
+    def _test_ping(self, host, timeout=None):
+        """ Attempt to ping the host once """
+        ping_prefix = "ping -c 1 -q "
+        if timeout is not None:
+            ping_prefix += "-W " + str(timeout) + " "
+        if sys.platform.startswith('win'):
+            ping_prefix = "ping -n 1 "
+            if timeout is not None:
+                # On Windows, the ping timeout is given in milliseconds
+                timeout *= 1000
+                ping_prefix += "-w " + str(timeout) + " "
+        ping_command = ping_prefix + host
+
+        try:
+            ping = subprocess.check_output(ping_command.split(" "),
+                                           stderr=subprocess.STDOUT)
+        except subprocess.CalledProcessError:
+            return False
+
+        # Look at the output for a packet loss percentage
+        if ping.find('100%') > 0:
+            return False
+        else:
+            return True
+
+    def check(self, instance):
+        """Run the desired host-alive check against this host"""
+
+        tags = [
+            'target_host:' + instance['host_name'],
+            'observer_host:' + socket.getfqdn(),
+        ]
+
+        success = False
+
+        if instance['alive_test'] == 'ssh':
+            success = self._test_ssh(instance['host_name'],
+                                     self.init_config.get('ssh_port'),
+                                     self.init_config.get('ssh_timeout'))
+        elif instance['alive_test'] == 'ping':
+            success = self._test_ping(instance['host_name'],
+                                      self.init_config.get('ping_timeout'))
+        else:
+            self.log.info("Unrecognized alive_test " + instance['alive_test'])
+
+        if success is True:
+            self.gauge('host_alive', 0, tags=tags)
+        else:
+            self.gauge('host_alive', 1, tags=tags)
+
diff --git a/monagent/checks.d/nagios_wrapper.py b/monagent/checks.d/nagios_wrapper.py
new file mode 100644
index 00000000..44f7dae8
--- /dev/null
+++ b/monagent/checks.d/nagios_wrapper.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+"""DataDog wrapper for Nagios checks"""
+
+import hashlib
+import json
+import os
+import pickle
+import socket
+import subprocess
+import time
+
+from checks import AgentCheck
+
+
+class WrapNagios(AgentCheck):
+    """Inherit the AgentCheck class to process Nagios checks"""
+
+
+    def __init__(self, name, init_config, agentConfig, instances=None):
+        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
+
+    def _do_skip_check(self, instance, last_run_data):
+        """ Determine whether or not to skip a check depending on
+            the check's check_interval, if specified, and the last
+            time the check was run """
+        if (instance['service_name'] in last_run_data
+                and 'check_interval' in instance):
+            if (time.time() < last_run_data[instance['service_name']]
+                    + instance['check_interval']):
+                return True
+            else:
+                return False
+
+    def check(self, instance):
+        """Run the command specified by check_command and capture the result"""
+
+        tags = [
+            'observer_host:' + socket.getfqdn(),
+        ]
+        if 'host_name' in instance:
+            tags.extend(['target_host:' + instance['host_name']])
+        else:
+            tags.extend(['target_host:' + socket.getfqdn()])
+
+        extra_path = self.init_config.get('check_path')
+
+        last_run_path = self.init_config.get('temp_file_path')
+        # Use a default path if no temp_file_path is specified in the YAML
+        if last_run_path is None:
+            last_run_path = '/dev/shm/'
+
+        if not last_run_path.endswith('/'):
+            last_run_path += '/'
+        last_run_file = (last_run_path + 'nagios_wrapper_' +
+                         hashlib.md5(instance['service_name']).hexdigest() + '.pck')
+
+        # Load last-run data from the shared-memory file
+        last_run_data = {}
+        if os.path.isfile(last_run_file):
+            file_r = open(last_run_file, "r")
+            last_run_data = pickle.load(file_r)
+            file_r.close()
+
+        # Exit here if it is not yet time to re-run this check
+        if self._do_skip_check(instance, last_run_data) is True:
+            self.log.debug("Too soon since last check.")
+            return
+
+        try:
+            proc = subprocess.Popen(instance['check_command'].split(" "),
+                                    env={"PATH": extra_path},
+                                    stdout=subprocess.PIPE,
+                                    stderr=subprocess.PIPE)
+            output = proc.communicate()
+            # The check detail is all the text before the pipe
+            detail = output[0].split('|')[0]
+            self.log.debug("detail is '" + detail + "'")
+            if detail != '':
+                # Serialize the output for JSON-friendliness and add it to the tags
+                tags.extend([json.dumps(detail)])
+        except OSError:
+            # Report an UNKNOWN status (3) if the check command could not be run
+            self.gauge(instance['service_name'], 3, tags=tags)
+            self.log.info(instance['check_command'].split(" ")[0] +
+                          " is missing or unreadable")
+            return
+
+        last_run_data[instance['service_name']] = time.time()
+        self.gauge(instance['service_name'], proc.poll(), tags=tags)
+
+        # Save last-run data
+        file_w = open(last_run_file, "w")
+        pickle.dump(last_run_data, file_w)
+        file_w.close()
+
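For reference, a minimal, hypothetical sketch of how the HostAlive check could be exercised by hand, assuming the agent's checks.d directory and the checks module are importable and that AgentCheck tolerates an empty agent-config dict; the hostnames and timeout values are illustrative only, not part of this change.

# Hypothetical standalone driver for the HostAlive check (illustration only).
from host_alive import HostAlive

init_config = {'ssh_port': 22, 'ssh_timeout': 2, 'ping_timeout': 1}
instances = [
    {'host_name': 'web01.example.net', 'alive_test': 'ssh'},   # illustrative host
    {'host_name': 'db01.example.net', 'alive_test': 'ping'},   # illustrative host
]

checker = HostAlive('host_alive', init_config, {}, instances)
for instance in instances:
    # check() emits the 'host_alive' gauge: 0 if the host responds, 1 otherwise,
    # tagged with target_host and observer_host.
    checker.check(instance)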
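Likewise, a sketch of the configuration the WrapNagios check expects, shown as the Python structures it receives after the agent parses its YAML; the plugin path, command, and interval below are assumptions for illustration, not defaults introduced by this change.

# Hypothetical configuration for the WrapNagios check (illustration only).
init_config = {
    'check_path': '/usr/lib/nagios/plugins',  # assumed Nagios plugin location
    'temp_file_path': '/dev/shm/',            # matches the default used above
}

instance = {
    'service_name': 'disk_usage',             # gauge name that will be reported
    'check_command': '/usr/lib/nagios/plugins/check_disk -w 15% -c 5% -p /',
    'check_interval': 300,                    # seconds; omit to run on every agent cycle
    'host_name': 'web01.example.net',         # optional; defaults to the observer host
}

# WrapNagios('nagios_wrapper', init_config, {}, [instance]).check(instance) runs
# check_command and reports its Nagios exit status (0 OK, 1 WARNING, 2 CRITICAL,
# 3 UNKNOWN) as the 'disk_usage' gauge.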