Merge "Repeated check to determine host status"
This commit is contained in:
commit
22059afc26
@ -20,6 +20,14 @@ monitor_host_opts = [
|
||||
cfg.IntOpt('monitoring_interval',
|
||||
default=60,
|
||||
help='Monitoring interval(in seconds) of node status.'),
|
||||
cfg.IntOpt('monitoring_samples',
|
||||
default=1,
|
||||
help='''
|
||||
Monitoring probes to collect before making the decision to send Masakari
|
||||
notification about the node status. If and only if ``monitoring_samples``
|
||||
consecutive reports have the same status, will the Masakari notification
|
||||
be sent.
|
||||
'''),
|
||||
cfg.IntOpt('api_retry_max',
|
||||
default=12,
|
||||
help='Number of retries for send a notification in'
|
||||
|
@ -14,6 +14,7 @@
|
||||
|
||||
import socket
|
||||
|
||||
from collections import deque
|
||||
import eventlet
|
||||
from oslo_log import log as oslo_logging
|
||||
from oslo_utils import timeutils
|
||||
@ -56,6 +57,27 @@ class HandleHost(driver.DriverBase):
|
||||
self.crmmon_xml_parser = parse_crmmon_xml.ParseCrmMonXml()
|
||||
self.status_holder = hold_host_status.HostHoldStatus()
|
||||
self.notifier = masakari.SendNotification()
|
||||
self.monitoring_data = {}
|
||||
|
||||
def _update_monitoring_data(self, hostname, status):
    """Append ``status`` to the status history kept for ``hostname``.

    The history lives in ``self.monitoring_data`` as a ``deque`` keyed by
    hostname. A host seen for the first time gets a new, empty deque whose
    ``maxlen`` is ``CONF.host.monitoring_samples``; the bound is fixed at
    creation time, so older entries fall off once the deque is full.

    :param hostname: key under which the history is stored.
    :param status: status value to record for this host.
    """
    sample_window = CONF.host.monitoring_samples
    if hostname not in self.monitoring_data:
        self.monitoring_data[hostname] = deque([], maxlen=sample_window)
    self.monitoring_data[hostname].append(status)
|
||||
|
||||
def get_stabilised_host_status(self, hostname):
    """Return the status of ``hostname`` once it has been seen consistently.

    Reads the status history stored for ``hostname`` in
    ``self.monitoring_data`` and compares its length against
    ``CONF.host.monitoring_samples``.

    :param hostname: key of the host in ``self.monitoring_data``.
    :returns: ``'_being_collected'`` when fewer than
        ``CONF.host.monitoring_samples`` statuses are recorded (including
        when nothing has been recorded for the host at all); the common
        status when every recorded status is the same; ``'_uncertain'``
        otherwise.
    """
    health_history = self.monitoring_data.get(hostname)

    # A host without any recorded sample (unknown hostname, or an empty
    # history) has nothing to stabilise on. Report it as still being
    # collected instead of failing on len(None) or health_history[0].
    if (not health_history or
            len(health_history) < CONF.host.monitoring_samples):
        LOG.debug("Not enough monitoring data for host %s.", hostname)
        return '_being_collected'

    stabilised_status = health_history[0]

    # If and only if the sequence of host status is consistently the same,
    # will it return that status.
    if len(health_history) == health_history.count(stabilised_status):
        return stabilised_status
    return '_uncertain'
|
||||
|
||||
def _check_pacemaker_services(self, target_service):
|
||||
try:
|
||||
@ -295,8 +317,8 @@ class HandleHost(driver.DriverBase):
|
||||
if hostname == self.my_hostname:
|
||||
continue
|
||||
|
||||
# Get current status and old status.
|
||||
current_status = node_state_tag.get('crmd')
|
||||
self._update_monitoring_data(hostname, current_status)
|
||||
old_status = self.status_holder.get_host_status(hostname)
|
||||
|
||||
# If old_status is None, This is first get of host status.
|
||||
@ -308,21 +330,28 @@ class HandleHost(driver.DriverBase):
|
||||
self.status_holder.set_host_status(node_state_tag)
|
||||
continue
|
||||
|
||||
stabilised_status = self.get_stabilised_host_status(hostname)
|
||||
|
||||
# Output host status.
|
||||
msg = ("'%s' is '%s'.") % (hostname, current_status)
|
||||
msg = ("'%s' is '%s' (current: '%s').") % (hostname,
|
||||
stabilised_status,
|
||||
current_status)
|
||||
LOG.info("%s", msg)
|
||||
|
||||
# If host status changed, send a notification.
|
||||
if current_status != old_status:
|
||||
if current_status != 'online' and current_status != 'offline':
|
||||
# If current_status is not 'online' or 'offline',
|
||||
if stabilised_status == '_being_collected':
|
||||
continue
|
||||
|
||||
# If host stabilised status changed, send a notification.
|
||||
if stabilised_status != old_status:
|
||||
if stabilised_status not in ['online', 'offline']:
|
||||
# If stabilised_status is not 'online' or 'offline',
|
||||
# hostmonitor doesn't send a notification.
|
||||
msg = ("Since host status is '%s',"
|
||||
" hostmonitor doesn't send a notification.") \
|
||||
% current_status
|
||||
% stabilised_status
|
||||
LOG.info("%s", msg)
|
||||
else:
|
||||
event = self._make_event(hostname, current_status)
|
||||
event = self._make_event(hostname, stabilised_status)
|
||||
|
||||
# Send a notification.
|
||||
self.notifier.send_notification(
|
||||
@ -330,8 +359,9 @@ class HandleHost(driver.DriverBase):
|
||||
CONF.host.api_retry_interval,
|
||||
event)
|
||||
|
||||
# Update host status.
|
||||
self.status_holder.set_host_status(node_state_tag)
|
||||
if stabilised_status != '_uncertain':
|
||||
# Update host status.
|
||||
self.status_holder.set_host_status(node_state_tag)
|
||||
|
||||
def _check_host_status_by_crm_mon(self):
|
||||
crmmon_xml = self._get_crmmon_xml()
|
||||
|
@ -17,6 +17,7 @@ import testtools
|
||||
from unittest import mock
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from collections import deque
|
||||
import eventlet
|
||||
from oslo_utils import timeutils
|
||||
|
||||
@ -629,9 +630,7 @@ class TestHandleHost(testtools.TestCase):
|
||||
mock.call(node3),
|
||||
mock.call(node4),
|
||||
mock.call(node5)]
|
||||
calls_set_host_status = [mock.call(node_state_node2),
|
||||
mock.call(node_state_node3),
|
||||
mock.call(node_state_node4),
|
||||
calls_set_host_status = [mock.call(node_state_node4),
|
||||
mock.call(node_state_node5)]
|
||||
mock_get_host_status.assert_has_calls(calls_get_host_status)
|
||||
mock_set_host_status.assert_has_calls(calls_set_host_status)
|
||||
@ -639,6 +638,64 @@ class TestHandleHost(testtools.TestCase):
|
||||
mock_send_notification.assert_called_once_with(
|
||||
CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
|
||||
|
||||
@mock.patch.object(masakari.SendNotification, 'send_notification')
@mock.patch.object(handle_host.HandleHost, '_make_event')
@mock.patch.object(hold_host_status.HostHoldStatus, 'set_host_status')
@mock.patch.object(hold_host_status.HostHoldStatus, 'get_host_status')
@mock.patch.object(socket, 'gethostname')
def test_check_if_status_changed_with_3_samples(
        self, mock_gethostname, mock_get_host_status, mock_set_host_status,
        mock_make_event, mock_send_notification):
    """Check _check_if_status_changed with ``monitoring_samples`` set to 3.

    Each host starts with a pre-seeded status history. After one call to
    ``_check_if_status_changed`` the test verifies, per host, the resulting
    history, the stabilised status, and whether ``set_host_status`` was
    called with that host's node state tag. Exactly one event is expected
    to be built and sent, for node4 going 'offline'.
    """
    expected_event = {'notification': 'test'}

    mock_gethostname.return_value = 'node1'
    mock_get_host_status.side_effect = [None, 'online', 'online', 'online']
    mock_set_host_status.return_value = None
    mock_make_event.return_value = expected_event

    node_tags = list(ElementTree.fromstring(STATUS_TAG_XML))
    CONF.host.monitoring_samples = 3

    host_handler = handle_host.HandleHost()
    host_handler.monitoring_data = {
        "node1": deque(['online', 'online', 'online'], maxlen=3),
        "node2": deque(['offline', 'online', 'online'], maxlen=3),
        "node3": deque(['offline'], maxlen=3),
        "node4": deque(['online', 'offline', 'offline'], maxlen=3),
        "node5": deque(['online', 'online', 'online'], maxlen=3),
    }
    host_handler._check_if_status_changed(node_tags)

    # (hostname, index of its tag in node_tags, expected history,
    #  expected stabilised status, whether set_host_status is expected)
    expectations = [
        ('node2', 1, ['online', 'online', 'online'], 'online', True),
        ('node3', 2, ['offline', 'online'], '_being_collected', False),
        ('node4', 3, ['offline', 'offline', 'offline'], 'offline', True),
        ('node5', 4, ['online', 'online', 'other'], '_uncertain', False),
    ]
    for host, tag_index, history, stabilised, status_saved in expectations:
        self.assertEqual(deque(history, maxlen=3),
                         host_handler.monitoring_data.get(host))
        self.assertEqual(stabilised,
                         host_handler.get_stabilised_host_status(host))
        saved_call = mock.call(node_tags[tag_index])
        if status_saved:
            self.assertIn(saved_call, mock_set_host_status.mock_calls)
        else:
            self.assertNotIn(saved_call, mock_set_host_status.mock_calls)

    mock_make_event.assert_called_once_with("node4", 'offline')
    mock_send_notification.assert_called_once_with(
        CONF.host.api_retry_max, CONF.host.api_retry_interval,
        expected_event)
|
||||
|
||||
@mock.patch.object(handle_host.HandleHost, '_check_if_status_changed')
|
||||
@mock.patch.object(parse_crmmon_xml.ParseCrmMonXml,
|
||||
'get_node_state_tag_list')
|
||||
|
@ -0,0 +1,17 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Support for repeated check of node status in hostmonitor.
|
||||
|
||||
Repeated check is more reliable than single check to determine host
|
||||
status, especially when there is network instability in play.
|
||||
|
||||
With this feature, the following config option can be set.
|
||||
|
||||
[host]
|
||||
monitoring_samples = 3
|
||||
|
||||
The above means that 3 consecutive checks must report the same status before the node status is decided.
|
||||
The default value is 1 which is backwards compatible.
|
||||
|
||||
`Blueprint retry-check-when-host-failure <https://blueprints.launchpad.net/masakari-monitors/+spec/retry-check-when-host-failure>`__
|
Loading…
x
Reference in New Issue
Block a user