diff --git a/octavia/controller/healthmanager/health_manager.py b/octavia/controller/healthmanager/health_manager.py index b1e95b08a2..31182e26b1 100644 --- a/octavia/controller/healthmanager/health_manager.py +++ b/octavia/controller/healthmanager/health_manager.py @@ -15,6 +15,7 @@ from concurrent import futures import functools +import time from oslo_config import cfg from oslo_db import exception as db_exc @@ -83,9 +84,10 @@ class HealthManager(object): } futs = [] while not self.dead.is_set(): - lock_session = db_api.get_session(autocommit=False) - amp = None + amp_health = None try: + lock_session = db_api.get_session(autocommit=False) + amp = None amp_health = self.amp_health_repo.get_stale_amphora( lock_session) if amp_health: @@ -108,6 +110,17 @@ class HealthManager(object): LOG.debug('Database is requesting a retry. Skipping.') lock_session.rollback() amp_health = None + except db_exc.DBConnectionError: + db_api.wait_for_connection(self.dead) + lock_session.rollback() + amp_health = None + if not self.dead.is_set(): + # amphora heartbeat timestamps should also be outdated + # while DB is unavailable and soon after DB comes back + # online. Sleeping off the full "heartbeat_timeout" + # interval to give the amps a chance to check in before + # we start failovers. + time.sleep(CONF.health_manager.heartbeat_timeout) except Exception: with excutils.save_and_reraise_exception(): lock_session.rollback() diff --git a/octavia/db/api.py b/octavia/db/api.py index d2f0b62b4e..52223f0d96 100644 --- a/octavia/db/api.py +++ b/octavia/db/api.py @@ -13,11 +13,16 @@ # under the License. import contextlib +import time + +from sqlalchemy.sql.expression import select from oslo_config import cfg from oslo_db.sqlalchemy import session as db_session +from oslo_log import log as logging from oslo_utils import excutils +LOG = logging.getLogger(__name__) _FACADE = None @@ -50,3 +55,19 @@ def get_lock_session(): except Exception: with excutils.save_and_reraise_exception(): lock_session.rollback() + + +def wait_for_connection(exit_event): + """Helper method to wait for DB connection""" + down = True + while down and not exit_event.is_set(): + try: + LOG.debug('Trying to re-establish connection to database.') + get_engine().scalar(select([1])) + down = False + LOG.debug('Connection to database re-established.') + except Exception: + retry_interval = cfg.CONF.database.retry_interval + LOG.exception('Connection to database failed. Retrying in %s ' + 'seconds.', retry_interval) + time.sleep(retry_interval) diff --git a/octavia/tests/unit/controller/healthmanager/test_health_manager.py b/octavia/tests/unit/controller/healthmanager/test_health_manager.py index bfd3ef1dbf..12f0551c00 100644 --- a/octavia/tests/unit/controller/healthmanager/test_health_manager.py +++ b/octavia/tests/unit/controller/healthmanager/test_health_manager.py @@ -43,13 +43,14 @@ class TestHealthManager(base.TestCase): def setUp(self): super(TestHealthManager, self).setUp() + @mock.patch('octavia.db.api.wait_for_connection') @mock.patch('octavia.controller.worker.controller_worker.' 'ControllerWorker.failover_amphora') @mock.patch('octavia.db.repositories.AmphoraHealthRepository.' 'get_stale_amphora') @mock.patch('octavia.db.api.get_session') def test_health_check_stale_amphora(self, session_mock, get_stale_amp_mock, - failover_mock): + failover_mock, db_wait_mock): amphora_health = mock.MagicMock() amphora_health.amphora_id = AMPHORA_ID @@ -68,14 +69,19 @@ class TestHealthManager(base.TestCase): get_stale_amp_mock.side_effect = [ db_exc.DBDeadlock, db_exc.RetryRequest(Exception('retry_test')), + db_exc.DBConnectionError, TestException('test')] # Test that a DBDeadlock does not raise an exception self.assertIsNone(hm.health_check()) # Test that a RetryRequest does not raise an exception self.assertIsNone(hm.health_check()) + # Test that a DBConnectionError does not raise an exception + self.assertIsNone(hm.health_check()) + # ... and that it waits for DB reconnection + db_wait_mock.assert_called_once() # Other exceptions should raise self.assertRaises(TestException, hm.health_check) - self.assertEqual(3, mock_session.rollback.call_count) + self.assertEqual(4, mock_session.rollback.call_count) @mock.patch('octavia.controller.worker.controller_worker.' 'ControllerWorker.failover_amphora') diff --git a/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml b/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml new file mode 100644 index 0000000000..f816c0129c --- /dev/null +++ b/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixed an issue when Octavia cannot reach the database (all database + instances are down) bringing down all running loadbalancers. The Health + Manager is more resilient to DB outages now.