92473ce210
Octavia is struggling with proper handling of DB connectivity issues bringing down all running loadbalancers. Octavia tries to failover amphorae and can fail in one of the following stages: 1. Octavia can't create new amphora because Nova isn't ready yet after DB outage. Nova-API throws 500, Octavia nukes amphora instance and won't try to recreate it again. 2. Octavia tries to recreate amphora instance but it gets stuck in PENDING_CREATE forever. 3. Octavia fails completely reporting DB connection issues, leaving some amphoras in error, some in pending_delete as below: It also affects HA deployments. This patch fixes that by wrapping the DB check for health, waiting for the connection to be re-established and sleeping off the full "heartbeat_timeout" interval. Story: 2003575 Task: 24871 Change-Id: I7b30cd31e1ce0cf9dab61484f4404f1c6ccddd5e
74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
# Copyright 2014 Rackspace
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import contextlib
|
|
import time
|
|
|
|
from sqlalchemy.sql.expression import select
|
|
|
|
from oslo_config import cfg
|
|
from oslo_db.sqlalchemy import session as db_session
|
|
from oslo_log import log as logging
|
|
from oslo_utils import excutils
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
_FACADE = None
|
|
|
|
|
|
def _create_facade_lazily():
    """Return the module-wide EngineFacade, building it on first use.

    The facade is created from ``cfg.CONF`` with ``sqlite_fk=True`` and
    stored in the module-level ``_FACADE`` so that later calls reuse it.
    """
    global _FACADE
    # Fast path: the facade has already been built by an earlier call.
    if _FACADE is not None:
        return _FACADE
    _FACADE = db_session.EngineFacade.from_config(cfg.CONF, sqlite_fk=True)
    return _FACADE
|
|
|
|
|
|
def get_engine():
    """Return the engine exposed by the shared, lazily-created facade."""
    return _create_facade_lazily().get_engine()
|
|
|
|
|
|
def get_session(expire_on_commit=True, autocommit=True):
    """Obtain a session from the shared, lazily-created facade.

    Both keyword arguments are forwarded unchanged to the facade's
    ``get_session`` call.
    """
    session_options = {
        'expire_on_commit': expire_on_commit,
        'autocommit': autocommit,
    }
    return _create_facade_lazily().get_session(**session_options)
|
|
|
|
|
|
@contextlib.contextmanager
def get_lock_session():
    """Yield a non-autocommit session and commit it when the block succeeds.

    If the managed block, or the commit itself, raises an ``Exception``,
    the session is rolled back inside
    ``excutils.save_and_reraise_exception()`` so the error still propagates
    to the caller.
    """
    session = get_session(autocommit=False)
    try:
        yield session
        session.commit()
    except Exception:
        # Roll back first; the context manager then re-raises the exception
        # that brought us here.
        with excutils.save_and_reraise_exception():
            session.rollback()
|
|
|
|
|
|
def wait_for_connection(exit_event):
    """Block until the database answers a trivial query or exit is requested.

    Repeatedly runs ``SELECT 1`` through the engine returned by
    ``get_engine()``. On failure the exception is logged and the loop backs
    off for ``cfg.CONF.database.retry_interval`` seconds before retrying.

    :param exit_event: event-like object providing ``is_set()`` and
                       ``wait(timeout)`` (e.g. ``threading.Event``). Once it
                       is set the function returns, even if the database is
                       still unreachable.
    """
    while not exit_event.is_set():
        try:
            LOG.debug('Trying to re-establish connection to database.')
            get_engine().scalar(select([1]))
        except Exception:
            retry_interval = cfg.CONF.database.retry_interval
            LOG.exception('Connection to database failed. Retrying in %s '
                          'seconds.', retry_interval)
            # Wait on the event instead of time.sleep() so that a shutdown
            # request interrupts the back-off rather than being delayed by
            # up to a full retry interval.
            exit_event.wait(retry_interval)
        else:
            LOG.debug('Connection to database re-established.')
            return
|