Make health checks resilient to DB outages
Octavia does not cope well with database connectivity problems: a DB outage can end up taking down all running load balancers. Octavia tries to fail over amphorae and can fail at one of the following stages:

1. Octavia cannot create a new amphora because Nova is not ready yet after the DB outage. Nova-API returns a 500, Octavia deletes the amphora instance and never tries to recreate it.
2. Octavia tries to recreate the amphora instance, but it gets stuck in PENDING_CREATE forever.
3. Octavia fails completely, reporting DB connection issues and leaving some amphorae in ERROR and some in PENDING_DELETE.

This also affects HA deployments.

This patch fixes the problem by wrapping the health check's DB access, waiting for the connection to be re-established, and then sleeping off the full "heartbeat_timeout" interval so amphorae get a chance to check in before failovers start.

Story: 2003575
Task: 24871

Change-Id: I7b30cd31e1ce0cf9dab61484f4404f1c6ccddd5e
commit 92473ce210
parent c7395e0488
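The shape of the fix, as a rough standalone sketch rather than Octavia code (probe_db, check_stale_amphorae and the two constants are illustrative stand-ins): catch the DB connection error, block until the database answers again, then sit out one full heartbeat interval before resuming failovers.

import time

HEARTBEAT_TIMEOUT = 60  # stand-in for CONF.health_manager.heartbeat_timeout
RETRY_INTERVAL = 10     # stand-in for CONF.database.retry_interval


class DBConnectionError(Exception):
    """Stand-in for oslo_db.exception.DBConnectionError."""


def probe_db():
    """Pretend liveness probe; the real one runs SELECT 1 on the engine."""
    return True


def wait_for_db(exit_event):
    # Block until the probe succeeds or the service is told to shut down.
    while not exit_event.is_set():
        try:
            probe_db()
            return
        except Exception:
            time.sleep(RETRY_INTERVAL)


def health_check_loop(exit_event, check_stale_amphorae):
    while not exit_event.is_set():
        try:
            check_stale_amphorae()
        except DBConnectionError:
            # The DB went away: wait for it to come back instead of failing
            # over every amphora whose heartbeat merely looks stale.
            wait_for_db(exit_event)
            if not exit_event.is_set():
                # Give amphorae one full heartbeat interval to report in.
                time.sleep(HEARTBEAT_TIMEOUT)

One detail worth noting: the reconnect wait honours the same shutdown event as the main loop, so stopping the service during an outage does not hang.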
@@ -15,6 +15,7 @@
 
 from concurrent import futures
 import functools
+import time
 
 from oslo_config import cfg
 from oslo_db import exception as db_exc
@@ -83,9 +84,10 @@ class HealthManager(object):
         }
         futs = []
         while not self.dead.is_set():
+            amp_health = None
+            try:
                 lock_session = db_api.get_session(autocommit=False)
                 amp = None
-            try:
                 amp_health = self.amp_health_repo.get_stale_amphora(
                     lock_session)
                 if amp_health:
@@ -108,6 +110,17 @@ class HealthManager(object):
                 LOG.debug('Database is requesting a retry. Skipping.')
                 lock_session.rollback()
                 amp_health = None
+            except db_exc.DBConnectionError:
+                db_api.wait_for_connection(self.dead)
+                lock_session.rollback()
+                amp_health = None
+                if not self.dead.is_set():
+                    # amphora heartbeat timestamps should also be outdated
+                    # while DB is unavailable and soon after DB comes back
+                    # online. Sleeping off the full "heartbeat_timeout"
+                    # interval to give the amps a chance to check in before
+                    # we start failovers.
+                    time.sleep(CONF.health_manager.heartbeat_timeout)
             except Exception:
                 with excutils.save_and_reraise_exception():
                     lock_session.rollback()
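The sleep added at the end of the new handler is what actually prevents mass failovers. Staleness is judged from heartbeat timestamps, and while the database is unreachable no heartbeats get recorded, so immediately after reconnecting every amphora would look expired. The check behaves roughly like the predicate below (an illustration only; the real query lives in AmphoraHealthRepository.get_stale_amphora):

import datetime


def is_stale(last_update, heartbeat_timeout_seconds, now=None):
    # An amphora looks dead if its last recorded heartbeat is older than one
    # heartbeat_timeout interval.
    now = now or datetime.datetime.utcnow()
    expiry = now - datetime.timedelta(seconds=heartbeat_timeout_seconds)
    return last_update < expiry

Sleeping off one full "heartbeat_timeout" after the connection returns lets fresh heartbeats land before that comparison runs again, so healthy amphorae are not failed over just because the database was briefly away.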
@@ -13,11 +13,16 @@
 # under the License.
 
 import contextlib
+import time
 
+from sqlalchemy.sql.expression import select
+
 from oslo_config import cfg
 from oslo_db.sqlalchemy import session as db_session
+from oslo_log import log as logging
 from oslo_utils import excutils
 
+LOG = logging.getLogger(__name__)
 _FACADE = None
 
 
@@ -50,3 +55,19 @@ def get_lock_session():
     except Exception:
         with excutils.save_and_reraise_exception():
             lock_session.rollback()
+
+
+def wait_for_connection(exit_event):
+    """Helper method to wait for DB connection"""
+    down = True
+    while down and not exit_event.is_set():
+        try:
+            LOG.debug('Trying to re-establish connection to database.')
+            get_engine().scalar(select([1]))
+            down = False
+            LOG.debug('Connection to database re-established.')
+        except Exception:
+            retry_interval = cfg.CONF.database.retry_interval
+            LOG.exception('Connection to database failed. Retrying in %s '
+                          'seconds.', retry_interval)
+            time.sleep(retry_interval)
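wait_for_connection keeps issuing a trivial liveness query until the database answers or the caller's exit event is set, pausing CONF.database.retry_interval between attempts. Below is a minimal way to exercise the same probe outside Octavia, using the same SQLAlchemy 1.x style calls as the patch (Engine.scalar and select([1])); the in-memory SQLite engine and the surrounding harness are illustrative assumptions, not Octavia code.

import threading
import time

from sqlalchemy import create_engine
from sqlalchemy.sql.expression import select


def wait_for_connection(engine, exit_event, retry_interval=1):
    # Loop until the liveness probe succeeds or we are asked to stop.
    while not exit_event.is_set():
        try:
            engine.scalar(select([1]))
            return True
        except Exception:
            time.sleep(retry_interval)
    return False


engine = create_engine('sqlite://')
stop = threading.Event()
assert wait_for_connection(engine, stop)  # healthy DB: succeeds immediately

# Setting the event aborts the wait, which is how the health manager can be
# shut down cleanly while the database is still unreachable.
stop.set()

Catching the broad Exception rather than only DBConnectionError mirrors the patch: any probe failure is treated as "still down" and retried.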
@@ -43,13 +43,14 @@ class TestHealthManager(base.TestCase):
     def setUp(self):
         super(TestHealthManager, self).setUp()
 
+    @mock.patch('octavia.db.api.wait_for_connection')
     @mock.patch('octavia.controller.worker.controller_worker.'
                 'ControllerWorker.failover_amphora')
     @mock.patch('octavia.db.repositories.AmphoraHealthRepository.'
                 'get_stale_amphora')
     @mock.patch('octavia.db.api.get_session')
     def test_health_check_stale_amphora(self, session_mock, get_stale_amp_mock,
-                                        failover_mock):
+                                        failover_mock, db_wait_mock):
         amphora_health = mock.MagicMock()
         amphora_health.amphora_id = AMPHORA_ID
 
@@ -68,14 +69,19 @@ class TestHealthManager(base.TestCase):
         get_stale_amp_mock.side_effect = [
             db_exc.DBDeadlock,
             db_exc.RetryRequest(Exception('retry_test')),
+            db_exc.DBConnectionError,
             TestException('test')]
         # Test that a DBDeadlock does not raise an exception
         self.assertIsNone(hm.health_check())
         # Test that a RetryRequest does not raise an exception
         self.assertIsNone(hm.health_check())
+        # Test that a DBConnectionError does not raise an exception
+        self.assertIsNone(hm.health_check())
+        # ... and that it waits for DB reconnection
+        db_wait_mock.assert_called_once()
         # Other exceptions should raise
         self.assertRaises(TestException, hm.health_check)
-        self.assertEqual(3, mock_session.rollback.call_count)
+        self.assertEqual(4, mock_session.rollback.call_count)
 
     @mock.patch('octavia.controller.worker.controller_worker.'
                 'ControllerWorker.failover_amphora')
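For readers less familiar with the idiom: side_effect hands out one outcome per call, so the four values queued above drive four consecutive health_check() runs through the DBDeadlock, RetryRequest, DBConnectionError and TestException paths, each of which rolls the session back, which is why the expected rollback count rises from 3 to 4. A tiny self-contained illustration of the mechanism (not Octavia code):

from unittest import mock

probe = mock.MagicMock(side_effect=[ValueError('boom'), 42])

try:
    probe()            # the first call raises the queued exception
except ValueError:
    pass

assert probe() == 42   # the second call returns the next queued value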
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixed an issue when Octavia cannot reach the database (all database
+    instances are down) bringing down all running loadbalancers. The Health
+    Manager is more resilient to DB outages now.