From 92473ce21097bd04041d251aafc53baee45906d5 Mon Sep 17 00:00:00 2001
From: Carlos Goncalves
Date: Fri, 7 Sep 2018 21:52:51 +0000
Subject: [PATCH] Make health checks resilient to DB outages

Octavia does not handle DB connectivity issues gracefully, which can
bring down all running load balancers. Octavia tries to fail over
amphorae and can fail in one of the following ways:

1. Octavia can't create a new amphora because Nova isn't ready yet
   after the DB outage. Nova-API throws a 500, Octavia nukes the
   amphora instance and won't try to recreate it again.
2. Octavia tries to recreate the amphora instance but it gets stuck
   in PENDING_CREATE forever.
3. Octavia fails completely, reporting DB connection issues and
   leaving some amphorae in ERROR and some in PENDING_DELETE.

This also affects HA deployments.

This patch fixes that by catching DB connection errors around the
health check, waiting for the connection to be re-established, and
then sleeping off the full "heartbeat_timeout" interval so amphorae
get a chance to report in again before failovers are started.

Story: 2003575
Task: 24871
Change-Id: I7b30cd31e1ce0cf9dab61484f4404f1c6ccddd5e
---
 .../healthmanager/health_manager.py           | 17 +++++++++++++--
 octavia/db/api.py                             | 21 +++++++++++++++++++
 .../healthmanager/test_health_manager.py      | 10 +++++++--
 ...alth-check-db-outage-279b0bc1d0039312.yaml |  6 ++++++
 4 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml

diff --git a/octavia/controller/healthmanager/health_manager.py b/octavia/controller/healthmanager/health_manager.py
index b1e95b08a2..31182e26b1 100644
--- a/octavia/controller/healthmanager/health_manager.py
+++ b/octavia/controller/healthmanager/health_manager.py
@@ -15,6 +15,7 @@
 
 from concurrent import futures
 import functools
+import time
 
 from oslo_config import cfg
 from oslo_db import exception as db_exc
@@ -83,9 +84,10 @@ class HealthManager(object):
         }
         futs = []
         while not self.dead.is_set():
-            lock_session = db_api.get_session(autocommit=False)
-            amp = None
+            amp_health = None
             try:
+                lock_session = db_api.get_session(autocommit=False)
+                amp = None
                 amp_health = self.amp_health_repo.get_stale_amphora(
                     lock_session)
                 if amp_health:
@@ -108,6 +110,17 @@
                 LOG.debug('Database is requesting a retry. Skipping.')
                 lock_session.rollback()
                 amp_health = None
+            except db_exc.DBConnectionError:
+                db_api.wait_for_connection(self.dead)
+                lock_session.rollback()
+                amp_health = None
+                if not self.dead.is_set():
+                    # Amphora heartbeat timestamps should also be outdated
+                    # while DB is unavailable and soon after DB comes back
+                    # online. Sleeping off the full "heartbeat_timeout"
+                    # interval to give the amps a chance to check in before
+                    # we start failovers.
+                    time.sleep(CONF.health_manager.heartbeat_timeout)
             except Exception:
                 with excutils.save_and_reraise_exception():
                     lock_session.rollback()
diff --git a/octavia/db/api.py b/octavia/db/api.py
index d2f0b62b4e..52223f0d96 100644
--- a/octavia/db/api.py
+++ b/octavia/db/api.py
@@ -13,11 +13,16 @@
 #    under the License.
 
 import contextlib
+import time
+
+from sqlalchemy.sql.expression import select
 
 from oslo_config import cfg
 from oslo_db.sqlalchemy import session as db_session
+from oslo_log import log as logging
 from oslo_utils import excutils
 
+LOG = logging.getLogger(__name__)
 _FACADE = None
 
 
@@ -50,3 +55,19 @@ def get_lock_session():
     except Exception:
         with excutils.save_and_reraise_exception():
             lock_session.rollback()
+
+
+def wait_for_connection(exit_event):
+    """Helper method to wait for DB connection"""
+    down = True
+    while down and not exit_event.is_set():
+        try:
+            LOG.debug('Trying to re-establish connection to database.')
+            get_engine().scalar(select([1]))
+            down = False
+            LOG.debug('Connection to database re-established.')
+        except Exception:
+            retry_interval = cfg.CONF.database.retry_interval
+            LOG.exception('Connection to database failed. Retrying in %s '
+                          'seconds.', retry_interval)
+            time.sleep(retry_interval)
diff --git a/octavia/tests/unit/controller/healthmanager/test_health_manager.py b/octavia/tests/unit/controller/healthmanager/test_health_manager.py
index bfd3ef1dbf..12f0551c00 100644
--- a/octavia/tests/unit/controller/healthmanager/test_health_manager.py
+++ b/octavia/tests/unit/controller/healthmanager/test_health_manager.py
@@ -43,13 +43,14 @@ class TestHealthManager(base.TestCase):
     def setUp(self):
         super(TestHealthManager, self).setUp()
 
+    @mock.patch('octavia.db.api.wait_for_connection')
     @mock.patch('octavia.controller.worker.controller_worker.'
                 'ControllerWorker.failover_amphora')
     @mock.patch('octavia.db.repositories.AmphoraHealthRepository.'
                 'get_stale_amphora')
     @mock.patch('octavia.db.api.get_session')
     def test_health_check_stale_amphora(self, session_mock,
                                         get_stale_amp_mock,
-                                        failover_mock):
+                                        failover_mock, db_wait_mock):
         amphora_health = mock.MagicMock()
         amphora_health.amphora_id = AMPHORA_ID
@@ -68,14 +69,19 @@
         get_stale_amp_mock.side_effect = [
             db_exc.DBDeadlock,
             db_exc.RetryRequest(Exception('retry_test')),
+            db_exc.DBConnectionError,
             TestException('test')]
         # Test that a DBDeadlock does not raise an exception
         self.assertIsNone(hm.health_check())
         # Test that a RetryRequest does not raise an exception
         self.assertIsNone(hm.health_check())
+        # Test that a DBConnectionError does not raise an exception
+        self.assertIsNone(hm.health_check())
+        # ... and that it waits for DB reconnection
+        db_wait_mock.assert_called_once()
         # Other exceptions should raise
         self.assertRaises(TestException, hm.health_check)
-        self.assertEqual(3, mock_session.rollback.call_count)
+        self.assertEqual(4, mock_session.rollback.call_count)
 
     @mock.patch('octavia.controller.worker.controller_worker.'
                 'ControllerWorker.failover_amphora')
diff --git a/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml b/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml
new file mode 100644
index 0000000000..f816c0129c
--- /dev/null
+++ b/releasenotes/notes/fix-health-check-db-outage-279b0bc1d0039312.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixed an issue where Octavia could bring down all running load balancers
+    when it could not reach the database (e.g. all database instances were
+    down). The Health Manager is now more resilient to DB outages.
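
For reference, a minimal, self-contained sketch of the reconnect-wait pattern
that wait_for_connection() above implements, runnable outside Octavia. The
wait_for_db() name, the SQLite URL, and the retry_interval default are
illustrative assumptions (the patch itself uses Octavia's get_engine() and
CONF.database.retry_interval); the select([1]) probe assumes SQLAlchemy
1.x-style select.

import threading
import time

import sqlalchemy
from sqlalchemy.sql.expression import select


def wait_for_db(engine, exit_event, retry_interval=5):
    """Block until a trivial query succeeds or exit_event is set."""
    while not exit_event.is_set():
        try:
            with engine.connect() as conn:
                conn.scalar(select([1]))  # cheap "is the DB up?" probe
            return True
        except Exception:
            # DB still unreachable; wait and probe again.
            time.sleep(retry_interval)
    return False


if __name__ == '__main__':
    # An in-memory SQLite engine always connects, so this returns True
    # immediately; point the URL at a real DB to exercise the retry loop.
    engine = sqlalchemy.create_engine('sqlite://')
    print(wait_for_db(engine, threading.Event(), retry_interval=1))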