Fix health monitor DB locking.

Change-Id: Ida0d9e1d7a808706c69808dc78e16bc8292a39c0
This commit is contained in:
Michael Johnson 2017-08-11 15:35:36 -07:00
parent 690ccfd43f
commit 5744872c94
7 changed files with 161 additions and 59 deletions

View File

@ -30,6 +30,13 @@ CONF = cfg.CONF
LOG = logging.getLogger(__name__)
# Used for while true loops to allow testing
# TODO(johnsom) This will be removed with
# https://review.openstack.org/#/c/456420/
def true_func():
return True
def hm_listener():
# TODO(german): steved'or load those drivers
udp_getter = heartbeat_udp.UDPStatusGetter(
@ -41,8 +48,12 @@ def hm_listener():
def hm_health_check():
hm = health_manager.HealthManager()
while True:
hm.health_check()
while true_func():
try:
hm.health_check()
except Exception as e:
LOG.warning('Health Manager caught the following exception and '
'is restarting: {}'.format(e))
def main():

View File

@ -16,7 +16,9 @@ import time
from concurrent import futures
from oslo_config import cfg
from oslo_db import exception as db_exc
from oslo_log import log as logging
from oslo_utils import excutils
from octavia.controller.worker import controller_worker as cw
from octavia.db import api as db_api
@ -40,11 +42,34 @@ class HealthManager(object):
LOG.debug("Pausing before starting health check")
time.sleep(CONF.health_manager.heartbeat_timeout)
while True:
session = db_api.get_session()
LOG.debug("Starting amphora health check")
failover_count = 0
while True:
amp = amp_health_repo.get_stale_amphora(session)
lock_session = db_api.get_session(autocommit=False)
amp = None
try:
amp = amp_health_repo.get_stale_amphora(lock_session)
lock_session.commit()
except db_exc.DBDeadlock:
LOG.debug('Database reports deadlock. Skipping.')
try:
lock_session.rollback()
except Exception:
pass
except db_exc.RetryRequest:
LOG.debug('Database is requesting a retry. Skipping.')
try:
lock_session.rollback()
except Exception:
pass
except Exception:
with excutils.save_and_reraise_exception():
try:
lock_session.rollback()
except Exception:
pass
if amp is None:
break
failover_count += 1

View File

@ -16,6 +16,7 @@ import datetime
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import excutils
import sqlalchemy
from stevedore import driver as stevedore_driver
@ -110,10 +111,16 @@ class UpdateHealthDb(object):
# does not match the expected listener count
if len(listeners) == expected_listener_count:
lock_session = db_api.get_session(autocommit=False)
# if the input amphora is healthy, we update its db info
self.amphora_health_repo.replace(session, health['id'],
last_update=(datetime.
datetime.utcnow()))
try:
self.amphora_health_repo.replace(
lock_session, health['id'],
last_update=(datetime.datetime.utcnow()))
lock_session.commit()
except Exception:
with excutils.save_and_reraise_exception():
lock_session.rollback()
else:
LOG.warning('Amphora %(id)s health message reports %(found)i '
'listeners when %(expected)i expected',

View File

@ -1069,15 +1069,14 @@ class AmphoraHealthRepository(BaseRepository):
expired_time = datetime.datetime.utcnow() - datetime.timedelta(
seconds=timeout)
with session.begin(subtransactions=True):
amp = session.query(self.model_class).with_for_update().filter_by(
busy=False).filter(
self.model_class.last_update < expired_time).first()
amp = session.query(self.model_class).with_for_update().filter_by(
busy=False).filter(
self.model_class.last_update < expired_time).first()
if amp is None:
return None
if amp is None:
return None
amp.busy = True
amp.busy = True
return amp.to_data_model()

View File

@ -40,19 +40,24 @@ class TestHealthManagerCMD(base.TestCase):
mock_getter.assert_called_once_with(mock_health(), mock_stats())
self.assertEqual(2, getter_mock.check.call_count)
@mock.patch('octavia.cmd.health_manager.true_func')
@mock.patch('octavia.controller.healthmanager.'
'health_manager.HealthManager')
def test_hm_health_check(self, mock_health):
def test_hm_health_check(self, mock_health, mock_true_func):
hm_mock = mock.MagicMock()
health_check_mock = mock.MagicMock()
hm_mock.health_check = health_check_mock
hm_mock.health_check.side_effect = [None, Exception('break')]
mock_true_func.side_effect = [True, True, Exception('break')]
mock_health.return_value = hm_mock
self.assertRaisesRegex(Exception, 'break',
health_manager.hm_health_check)
mock_health.assert_called_once_with()
self.assertEqual(2, hm_mock.health_check.call_count)
def test_hm_true_func(self):
self.assertTrue(health_manager.true_func())
@mock.patch('multiprocessing.Process')
@mock.patch('octavia.common.service.prepare_service')
def test_main(self, mock_service, mock_process):

View File

@ -14,6 +14,7 @@
import mock
from oslo_config import cfg
from oslo_db import exception as db_exc
from oslo_utils import uuidutils
from octavia.controller.healthmanager import health_manager as healthmanager
@ -51,14 +52,27 @@ class TestHealthManager(base.TestCase):
amphora_health = mock.MagicMock()
amphora_health.amphora_id = AMPHORA_ID
session_mock.side_effect = [None, TestException('test')]
get_stale_amp_mock.side_effect = [amphora_health, None]
get_stale_amp_mock.side_effect = [amphora_health,
None,
TestException('test')]
hm = healthmanager.HealthManager()
self.assertRaises(TestException, hm.health_check)
failover_mock.assert_called_once_with(AMPHORA_ID)
# Test DBDeadlock and RetryRequest exceptions
session_mock.reset_mock()
get_stale_amp_mock.reset_mock()
mock_session = mock.MagicMock()
session_mock.return_value = mock_session
get_stale_amp_mock.side_effect = [
db_exc.DBDeadlock,
db_exc.RetryRequest(Exception('retry_test')),
TestException('test')]
self.assertRaises(TestException, hm.health_check)
self.assertEqual(3, mock_session.rollback.call_count)
@mock.patch('octavia.controller.worker.controller_worker.'
'ControllerWorker.failover_amphora')
@mock.patch('octavia.db.repositories.AmphoraHealthRepository.'
@ -68,8 +82,7 @@ class TestHealthManager(base.TestCase):
def test_health_check_nonestale_amphora(self, session_mock,
sleep_mock, get_stale_amp_mock,
failover_mock):
session_mock.side_effect = [None, TestException('test')]
get_stale_amp_mock.return_value = None
get_stale_amp_mock.side_effect = [None, TestException('test')]
hm = healthmanager.HealthManager()
self.assertRaises(TestException, hm.health_check)

View File

@ -28,6 +28,15 @@ from octavia.db import models as db_models
from octavia.tests.unit import base
class TestException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class TestUpdateHealthDb(base.TestCase):
FAKE_UUID_1 = uuidutils.generate_uuid()
@ -104,6 +113,28 @@ class TestUpdateHealthDb(base.TestCase):
self.assertTrue(lb.operating_status.lower.called)
self.assertTrue(self.loadbalancer_repo.update.called)
def test_update_health_replace_error(self):
health = {
"id": self.FAKE_UUID_1,
"listeners": {
"listener-id-1": {"status": constants.OPEN, "pools": {
"pool-id-1": {"status": constants.UP,
"members": {"member-id-1": constants.UP}
}
}
}
}
}
session_mock = mock.MagicMock()
session_mock.commit.side_effect = TestException('boom')
self.mock_session.return_value = session_mock
self.assertRaises(TestException, self.hm.update_health, health)
self.assertTrue(self.amphora_health_repo.replace.called)
session_mock.rollback.assert_called_once()
def test_update_health_online(self):
health = {
@ -118,7 +149,8 @@ class TestUpdateHealthDb(base.TestCase):
}
}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -128,17 +160,17 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.ONLINE)
# If the listener count is wrong, make sure we don't update
@ -160,7 +192,8 @@ class TestUpdateHealthDb(base.TestCase):
"status": constants.UP,
"members": {"member-id-1": constants.DRAIN}}}}}}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -170,18 +203,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.DRAINING)
def test_update_health_member_maint(self):
@ -196,7 +229,8 @@ class TestUpdateHealthDb(base.TestCase):
"status": constants.UP,
"members": {"member-id-1": constants.MAINT}}}}}}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -206,18 +240,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.OFFLINE)
def test_update_health_member_unknown(self):
@ -232,7 +266,8 @@ class TestUpdateHealthDb(base.TestCase):
"status": constants.UP,
"members": {"member-id-1": "blah"}}}}}}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -242,12 +277,12 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
self.assertTrue(not self.member_repo.update.called)
def test_update_health_member_down(self):
@ -264,7 +299,8 @@ class TestUpdateHealthDb(base.TestCase):
}
}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -274,18 +310,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.DEGRADED)
session_mock, pool_id, operating_status=constants.DEGRADED)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.ERROR)
def test_update_health_member_no_check(self):
@ -303,7 +339,8 @@ class TestUpdateHealthDb(base.TestCase):
}
}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -313,18 +350,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.NO_MONITOR)
def test_update_health_member_admin_down(self):
@ -340,7 +377,8 @@ class TestUpdateHealthDb(base.TestCase):
"members": {
"member-id-1": constants.UP}}}}}}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
mock_pool = mock.Mock()
mock_member1 = mock.Mock()
@ -357,21 +395,21 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.ONLINE)
self.member_repo.update.assert_any_call(
'blah', mock_member2.id, operating_status=constants.OFFLINE)
session_mock, mock_member2.id, operating_status=constants.OFFLINE)
def test_update_health_list_full_member_down(self):
@ -387,7 +425,8 @@ class TestUpdateHealthDb(base.TestCase):
}
}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -397,18 +436,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.DEGRADED)
session_mock, listener_id, operating_status=constants.DEGRADED)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.DEGRADED)
session_mock, pool_id, operating_status=constants.DEGRADED)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.ERROR)
self.hm.listener_repo.count.return_value = 2
@ -431,7 +470,8 @@ class TestUpdateHealthDb(base.TestCase):
}
}
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -441,18 +481,19 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ERROR)
session_mock, pool_id, operating_status=constants.ERROR)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id, operating_status=constants.ERROR)
session_mock, member_id,
operating_status=constants.ERROR)
# Test the logic code paths
def test_update_health_full(self):
@ -542,7 +583,8 @@ class TestUpdateHealthDb(base.TestCase):
self.hm.loadbalancer_repo.update.side_effect = (
[sqlalchemy.orm.exc.NoResultFound])
self.mock_session.return_value = 'blah'
session_mock = mock.MagicMock()
self.mock_session.return_value = session_mock
self.hm.update_health(health)
self.assertTrue(self.amphora_health_repo.replace.called)
@ -552,18 +594,18 @@ class TestUpdateHealthDb(base.TestCase):
health.get('listeners', {})):
self.listener_repo.update.assert_any_call(
'blah', listener_id, operating_status=constants.ONLINE)
session_mock, listener_id, operating_status=constants.ONLINE)
for pool_id, pool in six.iteritems(listener.get('pools', {})):
self.hm.pool_repo.update.assert_any_call(
'blah', pool_id, operating_status=constants.ONLINE)
session_mock, pool_id, operating_status=constants.ONLINE)
for member_id, member in six.iteritems(
pool.get('members', {})):
self.member_repo.update.assert_any_call(
'blah', member_id,
session_mock, member_id,
operating_status=constants.ONLINE)
def test_update_health_no_status_change(self):