Make health checks resilient to DB outages

Octavia does not handle DB connectivity issues properly, which can bring
down all running loadbalancers. Octavia tries to fail over amphorae and
can fail at one of the following stages:

1. Octavia can't create new amphora because Nova isn't ready yet after
   DB outage. Nova-API throws 500, Octavia nukes amphora instance and
   won't try to recreate it again.
2. Octavia tries to recreate amphora instance but it gets stuck in
   PENDING_CREATE forever.
3. Octavia fails completely, reporting DB connection issues and leaving
   some amphorae in ERROR and others in PENDING_DELETE. This also affects
   HA deployments.

This patch fixes that by wrapping the DB check for health, waiting for
the connection to be re-established and sleeping off the full
"heartbeat_timeout" interval.

Story: 2003575
Task: 24871

Change-Id: I7b30cd31e1ce0cf9dab61484f4404f1c6ccddd5e
This commit is contained in:
Carlos Goncalves 2018-09-07 21:52:51 +00:00
parent c7395e0488
commit 92473ce210
4 changed files with 50 additions and 4 deletions

View File

@ -15,6 +15,7 @@
from concurrent import futures from concurrent import futures
import functools import functools
import time
from oslo_config import cfg from oslo_config import cfg
from oslo_db import exception as db_exc from oslo_db import exception as db_exc
@ -83,9 +84,10 @@ class HealthManager(object):
} }
futs = [] futs = []
while not self.dead.is_set(): while not self.dead.is_set():
lock_session = db_api.get_session(autocommit=False) amp_health = None
amp = None
try: try:
lock_session = db_api.get_session(autocommit=False)
amp = None
amp_health = self.amp_health_repo.get_stale_amphora( amp_health = self.amp_health_repo.get_stale_amphora(
lock_session) lock_session)
if amp_health: if amp_health:
@ -108,6 +110,17 @@ class HealthManager(object):
LOG.debug('Database is requesting a retry. Skipping.') LOG.debug('Database is requesting a retry. Skipping.')
lock_session.rollback() lock_session.rollback()
amp_health = None amp_health = None
except db_exc.DBConnectionError:
db_api.wait_for_connection(self.dead)
lock_session.rollback()
amp_health = None
if not self.dead.is_set():
# amphora heartbeat timestamps should also be outdated
# while DB is unavailable and soon after DB comes back
# online. Sleeping off the full "heartbeat_timeout"
# interval to give the amps a chance to check in before
# we start failovers.
time.sleep(CONF.health_manager.heartbeat_timeout)
except Exception: except Exception:
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
lock_session.rollback() lock_session.rollback()

View File

@ -13,11 +13,16 @@
# under the License. # under the License.
import contextlib import contextlib
import time
from sqlalchemy.sql.expression import select
from oslo_config import cfg from oslo_config import cfg
from oslo_db.sqlalchemy import session as db_session from oslo_db.sqlalchemy import session as db_session
from oslo_log import log as logging
from oslo_utils import excutils from oslo_utils import excutils
LOG = logging.getLogger(__name__)
_FACADE = None _FACADE = None
@ -50,3 +55,19 @@ def get_lock_session():
except Exception: except Exception:
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
lock_session.rollback() lock_session.rollback()
def wait_for_connection(exit_event):
    """Wait until the database answers a query or shutdown is requested.

    Repeatedly issues a trivial ``SELECT 1`` through the engine returned by
    ``get_engine()``. Any exception raised by an attempt is logged and the
    attempt is retried after ``cfg.CONF.database.retry_interval`` seconds.

    :param exit_event: Event-like object used to signal shutdown. It is
                       checked with ``is_set()`` before every attempt and
                       waited on between attempts, so it must provide
                       ``is_set()`` and ``wait(timeout)`` as
                       ``threading.Event`` does.
    :returns: None, either once a query succeeded or once ``exit_event``
              was set.
    """
    while not exit_event.is_set():
        try:
            LOG.debug('Trying to re-establish connection to database.')
            get_engine().scalar(select([1]))
        except Exception:
            retry_interval = cfg.CONF.database.retry_interval
            LOG.exception('Connection to database failed. Retrying in %s '
                          'seconds.', retry_interval)
            # Back off by waiting on the exit event instead of sleeping
            # unconditionally: the delay is the same while running, but a
            # shutdown request interrupts it right away.
            exit_event.wait(retry_interval)
        else:
            LOG.debug('Connection to database re-established.')
            return

View File

@ -43,13 +43,14 @@ class TestHealthManager(base.TestCase):
def setUp(self): def setUp(self):
super(TestHealthManager, self).setUp() super(TestHealthManager, self).setUp()
@mock.patch('octavia.db.api.wait_for_connection')
@mock.patch('octavia.controller.worker.controller_worker.' @mock.patch('octavia.controller.worker.controller_worker.'
'ControllerWorker.failover_amphora') 'ControllerWorker.failover_amphora')
@mock.patch('octavia.db.repositories.AmphoraHealthRepository.' @mock.patch('octavia.db.repositories.AmphoraHealthRepository.'
'get_stale_amphora') 'get_stale_amphora')
@mock.patch('octavia.db.api.get_session') @mock.patch('octavia.db.api.get_session')
def test_health_check_stale_amphora(self, session_mock, get_stale_amp_mock, def test_health_check_stale_amphora(self, session_mock, get_stale_amp_mock,
failover_mock): failover_mock, db_wait_mock):
amphora_health = mock.MagicMock() amphora_health = mock.MagicMock()
amphora_health.amphora_id = AMPHORA_ID amphora_health.amphora_id = AMPHORA_ID
@ -68,14 +69,19 @@ class TestHealthManager(base.TestCase):
get_stale_amp_mock.side_effect = [ get_stale_amp_mock.side_effect = [
db_exc.DBDeadlock, db_exc.DBDeadlock,
db_exc.RetryRequest(Exception('retry_test')), db_exc.RetryRequest(Exception('retry_test')),
db_exc.DBConnectionError,
TestException('test')] TestException('test')]
# Test that a DBDeadlock does not raise an exception # Test that a DBDeadlock does not raise an exception
self.assertIsNone(hm.health_check()) self.assertIsNone(hm.health_check())
# Test that a RetryRequest does not raise an exception # Test that a RetryRequest does not raise an exception
self.assertIsNone(hm.health_check()) self.assertIsNone(hm.health_check())
# Test that a DBConnectionError does not raise an exception
self.assertIsNone(hm.health_check())
# ... and that it waits for DB reconnection
db_wait_mock.assert_called_once()
# Other exceptions should raise # Other exceptions should raise
self.assertRaises(TestException, hm.health_check) self.assertRaises(TestException, hm.health_check)
self.assertEqual(3, mock_session.rollback.call_count) self.assertEqual(4, mock_session.rollback.call_count)
@mock.patch('octavia.controller.worker.controller_worker.' @mock.patch('octavia.controller.worker.controller_worker.'
'ControllerWorker.failover_amphora') 'ControllerWorker.failover_amphora')

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixed an issue where Octavia, when it could not reach the database (all
database instances are down), would bring down all running load balancers.
The Health Manager is now more resilient to DB outages.