Fix hostmonitor hanging forever after certain exceptions
The hostmonitor, like other Masakari monitors, starts as an
Oslo service (based on eventlet). The main thread is supposed
to run a loop that has an internal wait mechanism (instead of
reusing periodic_tasks from oslo_service). However, the loop
could be broken, if an unexpected exception appeared, and it
never ran again but the process was still alive (due to
oslo_service not stopping). The example mentioned in the bug
report is about unavailability of the Masakari API (and/or
Keystone API) before notification sending. This exception is
not caught early because SendNotification._make_client is
called outside of the try block (unlike the actual notification
sending). The exception bubbles up and stops the main loop,
leaving a useless hostmonitor process. The user is unaware
unless they notice the logs are no longer growing.
While the general design begs for a revamp (we might get away
with that by using Consul in the first place), the easy fix is
to prevent exceptions breaking the loop completely so that the
hostmonitor can continue to work and try to regain health.
At the very least it will keep posting ERROR messages in the log
which is more likely to be spotted in comparison to lack of logs
(which is, unfortunately, less commonly considered an alerting
situation).
This change also fixes, adapts and robustifies the two relevant
unit tests.
Closes-Bug: #1930361
Co-Authored-By: Radosław Piliszek <radoslaw.piliszek@gmail.com>
Change-Id: I7e3447dcddc7998e3e3c30f4f0019d91a99c79ce
(cherry picked from commit e7154f3d77
)
This commit is contained in:
parent
966b3f8b3a
commit
ad90db8811
|
@ -389,10 +389,9 @@ class HandleHost(driver.DriverBase):
|
|||
|
||||
This method monitors hosts.
|
||||
"""
|
||||
try:
|
||||
self.running = True
|
||||
while self.running:
|
||||
|
||||
self.running = True
|
||||
while self.running:
|
||||
try:
|
||||
# Check whether corosync communication between hosts
|
||||
# is normal.
|
||||
ret = self._check_hb_line()
|
||||
|
@ -426,9 +425,7 @@ class HandleHost(driver.DriverBase):
|
|||
if status_func() != 0:
|
||||
LOG.warning("hostmonitor skips monitoring hosts.")
|
||||
|
||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||
except Exception as e:
|
||||
LOG.exception("Exception caught: %s", e)
|
||||
|
||||
except Exception as e:
|
||||
LOG.exception("Exception caught: %s", e)
|
||||
|
||||
return
|
||||
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
|
||||
|
|
|
@ -802,25 +802,25 @@ class TestHandleHost(testtools.TestCase):
|
|||
def test_monitor_hosts(self,
|
||||
mock_check_hb_line,
|
||||
mock_check_pacemaker_services,
|
||||
mock_check_host_status_by_cibadmin,
|
||||
mock_check_host_status_by_crmadmin,
|
||||
mock_check_host_status_by_cibadmin,
|
||||
mock_sleep):
|
||||
|
||||
mock_check_hb_line.side_effect = \
|
||||
[0, 1, 2, 0, Exception("Test exception.")]
|
||||
mock_check_pacemaker_services.side_effect = [True, False, False]
|
||||
mock_check_host_status_by_cibadmin.side_effect = [0, 1]
|
||||
[0, 1, 2, 0, Exception("Test exception."), 0, KeyboardInterrupt()]
|
||||
mock_check_pacemaker_services.side_effect = [True, False, False, True]
|
||||
mock_check_host_status_by_crmadmin.side_effect = [0, 1]
|
||||
mock_check_host_status_by_cibadmin.side_effect = [0, 1, 0]
|
||||
mock_sleep.return_value = None
|
||||
|
||||
obj = handle_host.HandleHost()
|
||||
obj.monitor_hosts()
|
||||
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
|
||||
|
||||
self.assertEqual(5, mock_check_hb_line.call_count)
|
||||
self.assertEqual(3, mock_check_pacemaker_services.call_count)
|
||||
self.assertEqual(7, mock_check_hb_line.call_count)
|
||||
self.assertEqual(4, mock_check_pacemaker_services.call_count)
|
||||
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
|
||||
self.assertEqual(2, mock_check_host_status_by_cibadmin.call_count)
|
||||
self.assertEqual(2, mock_check_host_status_by_crmadmin.call_count)
|
||||
self.assertEqual(3, mock_check_host_status_by_cibadmin.call_count)
|
||||
|
||||
@mock.patch.object(eventlet.greenthread, 'sleep')
|
||||
@mock.patch.object(handle_host.HandleHost,
|
||||
|
@ -835,16 +835,15 @@ class TestHandleHost(testtools.TestCase):
|
|||
|
||||
CONF.host.restrict_to_remotes = True
|
||||
mock_check_hb_line.side_effect = \
|
||||
[0, Exception("Test exception.")]
|
||||
[0, Exception("Test exception."), 0, KeyboardInterrupt()]
|
||||
mock_check_pacemaker_services.return_value = True
|
||||
mock_check_host_status_by_crm_mon.side_effect = 0
|
||||
mock_check_host_status_by_crm_mon.return_value = 0
|
||||
mock_sleep.return_value = None
|
||||
|
||||
obj = handle_host.HandleHost()
|
||||
obj.monitor_hosts()
|
||||
self.assertRaises(KeyboardInterrupt, obj.monitor_hosts)
|
||||
|
||||
self.assertEqual(1, mock_check_hb_line.call_count)
|
||||
self.assertEqual(1, mock_check_pacemaker_services.call_count)
|
||||
self.assertEqual(4, mock_check_hb_line.call_count)
|
||||
self.assertEqual(2, mock_check_pacemaker_services.call_count)
|
||||
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
|
||||
self.assertEqual(1, mock_check_host_status_by_crm_mon.call_count)
|
||||
mock_check_host_status_by_crm_mon.assert_called_once_with()
|
||||
self.assertEqual(2, mock_check_host_status_by_crm_mon.call_count)
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes hostmonitor hanging forever after certain exceptions.
|
||||
`LP#1930361 <https://launchpad.net/bugs/1930361>`__
|
Loading…
Reference in New Issue