Adds more exception handling for ironic-conductor heartbeat

When heartbeat thread of ironic-conductor server is reporting heartbeat,
it will be interrupted by database exceptions except 'DBConnectionError'.
So add 'Exception' in _conductor_service_record_keepalive to catch all
possible exceptions raised from database to ensure the heartbeat thread
not to exit. And also log the exception information. When the database
recovers from an exception, heartbeat thread will continue to report
heartbeat.

Change-Id: I0dc3ada945275811ef7272d500823e0a57011e8f
Closes-Bug: #1696296
This commit is contained in:
D G Lee 2017-06-16 10:55:48 +08:00 committed by Sam Betts
parent 5ea8d9f354
commit 56b8eae918
3 changed files with 25 additions and 0 deletions

View File

@ -393,6 +393,9 @@ class BaseConductorManager(object):
except db_exception.DBConnectionError:
LOG.warning('Conductor could not connect to database '
'while heartbeating.')
except Exception as e:
LOG.exception('Error while heartbeating. Error: %(err)s',
{'err': e})
self._keepalive_evt.wait(CONF.conductor.heartbeat_interval)
def _mapped_to_this_conductor(self, node_uuid, driver):

View File

@ -320,6 +320,19 @@ class KeepAliveTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.service._conductor_service_record_keepalive()
self.assertEqual(3, mock_touch.call_count)
def test__conductor_service_record_keepalive_failed_error(self):
self._start_service()
# avoid wasting time at the event.wait()
CONF.set_override('heartbeat_interval', 0, 'conductor')
with mock.patch.object(self.dbapi, 'touch_conductor') as mock_touch:
mock_touch.side_effect = [None, Exception(),
None]
with mock.patch.object(self.service._keepalive_evt,
'is_set') as mock_is_set:
mock_is_set.side_effect = [False, False, False, True]
self.service._conductor_service_record_keepalive()
self.assertEqual(3, mock_touch.call_count)
class ManagerSpawnWorkerTestCase(tests_base.TestCase):
def setUp(self):

View File

@ -0,0 +1,9 @@
---
fixes:
- |
Fixes an issue where an ironic-conductor service was deemed dead because
the service could not report its heartbeat due to the database connection
experiencing an unexpected failure. Full tracebacks of these exceptions are
now logged, and if the database connection recovers in a reasonable amount
of time the service will still be available.
See https://bugs.launchpad.net/ironic/+bug/1696296.