Fix hostmonitor hanging forever after certain exceptions
The hostmonitor, like other Masakari monitors, starts as an Oslo service (based on eventlet). The main thread is supposed to run a loop that has an internal wait mechanism (instead of reusing periodic_tasks from oslo_service). However, the loop could be broken, if an unexpected exception appeared, and it never ran again but the process was still alive (due to oslo_service not stopping). The example mentioned in the bug report is about unavailability of the Masakari API (and/or Keystone API) before notification sending. This exception is not caught early because SendNotification._make_client is called outside of the try block (unlike the actual notification sending). The exception bubbles up and stops the main loop, leaving a useless hostmonitor process. The user is unaware unless they notice the logs are no longer growing. While the general design begs for a revamp (we might get away with that by using Consul in the first place), the easy fix is to prevent exceptions breaking the loop completely so that the hostmonitor can continue to work and try to regain health. At the very least it will keep posting ERROR messages in the log which is more likely to be spotted in comparison to lack of logs (which is, unfortunately, less commonly considered an alerting situation). This change also fixes, adapts and robustifies the two relevant unit tests. Closes-Bug: #1930361 Co-Authored-By: Radosław Piliszek <radoslaw.piliszek@gmail.com> Change-Id: I7e3447dcddc7998e3e3c30f4f0019d91a99c79ce
This commit is contained in:
parent
6a496e7871
commit
e7154f3d77
@ -424,10 +424,9 @@ class HandleHost(driver.DriverBase):
|
||||
|
||||
This method monitors hosts.
|
||||
"""
|
||||
try:
|
||||
self.running = True
|
||||
while self.running:
|
||||
|
||||
self.running = True
|
||||
while self.running:
|
||||
try:
|
||||
# Check whether corosync communication between hosts
|
||||
# is normal.
|
||||
ret = self._check_hb_line()
|
||||
@ -466,9 +465,7 @@ class HandleHost(driver.DriverBase):
|
||||
if status_func() != 0:
|
||||
LOG.warning("hostmonitor skips monitoring hosts.")
|
||||
|
||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||
except Exception as e:
|
||||
LOG.exception("Exception caught: %s", e)
|
||||
|
||||
except Exception as e:
|
||||
LOG.exception("Exception caught: %s", e)
|
||||
|
||||
return
|
||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||
|
@ -859,25 +859,25 @@ class TestHandleHost(testtools.TestCase):
|
||||
def test_monitor_hosts(self,
|
||||
mock_check_hb_line,
|
||||
mock_check_pacemaker_services,
|
||||
mock_check_host_status_by_cibadmin,
|
||||
mock_check_host_status_by_crmadmin,
|
||||
mock_check_host_status_by_cibadmin,
|
||||
mock_sleep):
|
||||
|
||||
mock_check_hb_line.side_effect = \
|
||||
[0, 1, 2, 0, Exception("Test exception.")]
|
||||
mock_check_pacemaker_services.side_effect = [True, False, False]
|
||||
mock_check_host_status_by_cibadmin.side_effect = [0, 1]
|
||||
[0, 1, 2, 0, Exception("Test exception."), 0, KeyboardInterrupt()]
|
||||
mock_check_pacemaker_services.side_effect = [True, False, False, True]
|
||||
mock_check_host_status_by_crmadmin.side_effect = [0, 1]
|
||||
mock_check_host_status_by_cibadmin.side_effect = [0, 1, 0]
|
||||
mock_sleep.return_value = None
|
||||
|
||||
obj = handle_host.HandleHost()
|
||||
obj.monitor_hosts()
|
||||
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
|
||||
|
||||
self.assertEqual(5, mock_check_hb_line.call_count)
|
||||
self.assertEqual(3, mock_check_pacemaker_services.call_count)
|
||||
self.assertEqual(7, mock_check_hb_line.call_count)
|
||||
self.assertEqual(4, mock_check_pacemaker_services.call_count)
|
||||
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
|
||||
self.assertEqual(2, mock_check_host_status_by_cibadmin.call_count)
|
||||
self.assertEqual(2, mock_check_host_status_by_crmadmin.call_count)
|
||||
self.assertEqual(3, mock_check_host_status_by_cibadmin.call_count)
|
||||
|
||||
@mock.patch.object(eventlet.greenthread, 'sleep')
|
||||
@mock.patch.object(handle_host.HandleHost,
|
||||
@ -892,16 +892,15 @@ class TestHandleHost(testtools.TestCase):
|
||||
|
||||
CONF.host.restrict_to_remotes = True
|
||||
mock_check_hb_line.side_effect = \
|
||||
[0, Exception("Test exception.")]
|
||||
[0, Exception("Test exception."), 0, KeyboardInterrupt()]
|
||||
mock_check_pacemaker_services.return_value = True
|
||||
mock_check_host_status_by_crm_mon.side_effect = 0
|
||||
mock_check_host_status_by_crm_mon.return_value = 0
|
||||
mock_sleep.return_value = None
|
||||
|
||||
obj = handle_host.HandleHost()
|
||||
obj.monitor_hosts()
|
||||
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
|
||||
|
||||
self.assertEqual(1, mock_check_hb_line.call_count)
|
||||
self.assertEqual(1, mock_check_pacemaker_services.call_count)
|
||||
self.assertEqual(4, mock_check_hb_line.call_count)
|
||||
self.assertEqual(2, mock_check_pacemaker_services.call_count)
|
||||
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
|
||||
self.assertEqual(1, mock_check_host_status_by_crm_mon.call_count)
|
||||
mock_check_host_status_by_crm_mon.assert_called_once_with()
|
||||
self.assertEqual(2, mock_check_host_status_by_crm_mon.call_count)
|
||||
|
5
releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml
Normal file
5
releasenotes/notes/bug-1930361-fa8ce8e9781ea967.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes hostmonitor hanging forever after certain exceptions.
|
||||
`LP#1930361 <https://launchpad.net/bugs/1930361>`__
|
Loading…
x
Reference in New Issue
Block a user