c376ee8655
To handle the case where the checked system has multiple IP addresses and the network name to be used for liveness checking is not the same as the usual name used to identify the server in Monasca, an additional target_hostname parameter can be configured. It is the network hostname or IP address to check instead of hostname. Add unit tests as well. Change-Id: I33721764e64ef5079b26f78df84c94ed7a1009e7
137 lines
5.0 KiB
Python
137 lines
5.0 KiB
Python
#!/bin/env python
|
|
# (C) Copyright 2015,2016 Hewlett Packard Enterprise Development LP
|
|
"""Monitoring Agent remote host aliveness checker.
|
|
|
|
"""
|
|
|
|
import socket
|
|
import subprocess
|
|
import sys
|
|
|
|
import monasca_agent.collector.checks.services_checks as services_checks
|
|
import monasca_agent.common.util as util
|
|
|
|
|
|
class HostAlive(services_checks.ServicesCheck):
    """Inherit ServicesCheck class to test if a host is alive or not.

    Each configured instance names a host and an ``alive_test`` of either
    ``ssh`` (connect and look for an SSH banner) or ``ping`` (run the
    system ``ping`` command once).  The outcome is reported through the
    ``host_alive_status`` gauge: 0 when the host answered, 1 otherwise.
    """

    def __init__(self, name, init_config, agent_config, instances=None):
        super(HostAlive, self).__init__(name, init_config, agent_config, instances)

    def _test_ssh(self, host, port, timeout=None):
        """Connect to the SSH port (typically 22) and look for a banner.

        :param host: network name or IP address to connect to.
        :param port: TCP port to connect to; ``None`` means 22.
        :param timeout: optional socket timeout, applied with
            ``settimeout`` when not ``None``.
        :returns: ``(True, None)`` when the peer's first bytes start with
            ``SSH``, otherwise ``(False, error_message)``.
        """
        if port is None:
            port = 22

        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except socket.error as err:
            # Format the exception itself: indexing it (err[0]) only works
            # on Python 2.
            error_message = 'Error creating socket: {0}'.format(err)
            self.log.warning(error_message)
            return False, error_message

        # From here on the socket exists, so every exit path must close it.
        try:
            if timeout is not None:
                sock.settimeout(timeout)

            try:
                host_ip = socket.gethostbyname(host)
            except socket.gaierror:
                error_message = 'Unable to resolve host {0}'.format(host)
                self.log.warning(error_message)
                return False, error_message

            try:
                sock.connect((host_ip, port))
                banner = sock.recv(1024)
            except socket.error:
                error_message = 'Unable to open socket to host {0}'.format(host)
                self.log.warning(error_message)
                return False, error_message
        finally:
            sock.close()

        # recv() returns bytes on Python 3; a bytes literal compares
        # correctly on both Python 2 and Python 3.
        if banner.startswith(b'SSH'):
            return True, None

        error_message = 'Unexpected response "{0}" from host {1}'.format(banner, host)
        self.log.warning(error_message)
        return False, error_message

    def _test_ping(self, host, timeout=None):
        """Attempt to ping the host.

        Runs the system ``ping`` command for a single echo request.

        :param host: network name or IP address to ping.
        :param timeout: optional timeout; passed as ``-W <timeout>`` on
            non-Windows platforms and as ``-w <timeout * 1000>`` on
            Windows.
        :returns: ``(True, None)`` when ``ping`` exits successfully,
            otherwise ``(False, error_message)``.
        """
        if sys.platform.startswith('win'):
            ping_args = ['ping', '-n', '1']
            if timeout is not None:
                # On Windows, timeout is in milliseconds
                ping_args += ['-w', str(timeout * 1000)]
        else:
            ping_args = ['ping', '-c', '1', '-q']
            if timeout is not None:
                ping_args += ['-W', str(timeout)]
        # The host is always passed as exactly one argument, so whitespace
        # in a configured value cannot turn into additional ping options.
        ping_args.append(host)

        # Human-readable form of the command, used only in messages.
        ping_command = ' '.join(ping_args)

        try:
            subprocess.check_output(ping_args, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            error_message = 'Host not accessible, ping test failed ("{0}")'.format(ping_command)
            self.log.info(error_message)
            return False, error_message
        except OSError as err:
            error_message = 'ping command "{0}" failed to run: {1}'.format(ping_command, err)
            self.log.warning(error_message)
            return False, error_message
        return True, None

    def _check(self, instance):
        """Run the desired host-alive check against this host.

        :param instance: instance configuration mapping.  ``host_name``
            and ``alive_test`` are required; ``target_hostname`` is
            optional.
        :returns: ``(Status.UP, "UP")`` or ``(Status.DOWN, "DOWN")``.
        :raises ValueError: if ``host_name`` is missing or empty.
        :raises KeyError: if ``alive_test`` is missing.
        """
        host_name = instance.get('host_name', None)
        if not host_name:
            raise ValueError('host_name not specified!')

        # Allow a different network name to be used for the check
        # to handle multi-homed systems
        target_hostname = instance.get('target_hostname') or host_name

        host_dimensions = {'hostname': host_name, 'observer_host': util.get_hostname()}
        # If the check is against a different network name than host_name, add it to
        # the dimensions
        if target_hostname != host_name:
            host_dimensions['target_hostname'] = target_hostname

        dimensions = self._set_dimensions(host_dimensions, instance)

        success = False
        test_type = instance['alive_test']
        if test_type == 'ssh':
            success, error_message = self._test_ssh(target_hostname,
                                                    self.init_config.get('ssh_port'),
                                                    self.init_config.get('ssh_timeout'))
        elif test_type == 'ping':
            success, error_message = self._test_ping(target_hostname,
                                                     self.init_config.get('ping_timeout'))
        else:
            # Unknown test types are reported as DOWN with an explanation.
            error_message = 'Unrecognized alive_test: {0}'.format(test_type)

        dimensions.update({'test_type': test_type})
        if success is True:
            self.gauge('host_alive_status',
                       0,
                       dimensions=dimensions)
            return services_checks.Status.UP, "UP"

        self.gauge('host_alive_status',
                   1,
                   dimensions=dimensions,
                   value_meta={'error': error_message})
        return services_checks.Status.DOWN, "DOWN"
|