From 56b8eae918fe525b16979eed5fbba1c525356030 Mon Sep 17 00:00:00 2001 From: D G Lee Date: Fri, 16 Jun 2017 10:55:48 +0800 Subject: [PATCH] Adds more exception handling for ironic-conductor heartbeat When heartbeat thread of ironic-conductor server is reporting heartbeat, it will be interrupted by database exceptions except 'DBConnectionError'. So add 'Exception' in _conductor_service_record_keepalive to catch all possible exceptions raised from database to ensure the heartbeat thread not to exit. And also log the exception information. When the database recovers from an exception, heartbeat thread will continue to report heartbeat. Change-Id: I0dc3ada945275811ef7272d500823e0a57011e8f Closes-Bug: #1696296 --- ironic/conductor/base_manager.py | 3 +++ ironic/tests/unit/conductor/test_base_manager.py | 13 +++++++++++++ .../notes/bug-1696296-a972c8d879b98940.yaml | 9 +++++++++ 3 files changed, 25 insertions(+) create mode 100644 releasenotes/notes/bug-1696296-a972c8d879b98940.yaml diff --git a/ironic/conductor/base_manager.py b/ironic/conductor/base_manager.py index c95a6e8eda..34319babf3 100644 --- a/ironic/conductor/base_manager.py +++ b/ironic/conductor/base_manager.py @@ -393,6 +393,9 @@ class BaseConductorManager(object): except db_exception.DBConnectionError: LOG.warning('Conductor could not connect to database ' 'while heartbeating.') + except Exception as e: + LOG.exception('Error while heartbeating. Error: %(err)s', + {'err': e}) self._keepalive_evt.wait(CONF.conductor.heartbeat_interval) def _mapped_to_this_conductor(self, node_uuid, driver): diff --git a/ironic/tests/unit/conductor/test_base_manager.py b/ironic/tests/unit/conductor/test_base_manager.py index 060da59d1b..ea2ff86aa3 100644 --- a/ironic/tests/unit/conductor/test_base_manager.py +++ b/ironic/tests/unit/conductor/test_base_manager.py @@ -320,6 +320,19 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.service._conductor_service_record_keepalive() self.assertEqual(3, mock_touch.call_count) + def test__conductor_service_record_keepalive_failed_error(self): + self._start_service() + # avoid wasting time at the event.wait() + CONF.set_override('heartbeat_interval', 0, 'conductor') + with mock.patch.object(self.dbapi, 'touch_conductor') as mock_touch: + mock_touch.side_effect = [None, Exception(), + None] + with mock.patch.object(self.service._keepalive_evt, + 'is_set') as mock_is_set: + mock_is_set.side_effect = [False, False, False, True] + self.service._conductor_service_record_keepalive() + self.assertEqual(3, mock_touch.call_count) + class ManagerSpawnWorkerTestCase(tests_base.TestCase): def setUp(self): diff --git a/releasenotes/notes/bug-1696296-a972c8d879b98940.yaml b/releasenotes/notes/bug-1696296-a972c8d879b98940.yaml new file mode 100644 index 0000000000..eb9f19212d --- /dev/null +++ b/releasenotes/notes/bug-1696296-a972c8d879b98940.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixes an issue where an ironic-conductor service was deemed dead because + the service could not report its heartbeat due to the database connection + experiencing an unexpected failure. Full tracebacks of these exceptions are + now logged, and if the database connection recovers in a reasonable amount + of time the service will still be available. + See https://bugs.launchpad.net/ironic/+bug/1696296. \ No newline at end of file