From be552d99a434057c44f784f08f3ebd8ad2844928 Mon Sep 17 00:00:00 2001 From: Carlos Goncalves Date: Sat, 22 Dec 2018 22:28:26 +0100 Subject: [PATCH] Fix performance of housekeeping DB clean up The Housekeeping service grows in utilization of CPU as more amphorae are created and/or marked as DELETED. The problem lays on the SELECT statement constructed in get_all_deleted_expiring_amphora via the ORM -- it is joined eager loading all relationships. The task does not need such amount of information, only the amphora ID. The statement could be simplified by not loading any relationship or, at most, lazy loading them. This patch also fixes performance of cleaning up deleted and expired load balancers. The code was doing multiple round-trips to the database unecessarily: 1. retrieving all deleted LBs 2. for each LB, retrieving it again from DB to check expired date 3. delete LB Step 1 and 2 are now condensed in get_all_deleted_expiring(), making it a single round-trip. Story: 2004665 Task: 28643 Change-Id: Iffc960c7c3a986328cfded1b4e408931ab0a7877 --- .../controller/housekeeping/house_keeping.py | 31 ++++---- octavia/db/repositories.py | 76 ++++++------------- .../tests/functional/db/test_repositories.py | 32 +------- .../housekeeping/test_house_keeping.py | 23 +++--- ...eping-db-performance-b0d0fcfcce696314.yaml | 6 ++ 5 files changed, 58 insertions(+), 110 deletions(-) create mode 100644 releasenotes/notes/fix-housekeeping-db-performance-b0d0fcfcce696314.yaml diff --git a/octavia/controller/housekeeping/house_keeping.py b/octavia/controller/housekeeping/house_keeping.py index 48bc475b65..991b535953 100644 --- a/octavia/controller/housekeeping/house_keeping.py +++ b/octavia/controller/housekeeping/house_keeping.py @@ -19,7 +19,6 @@ from oslo_config import cfg from oslo_log import log as logging from sqlalchemy.orm import exc as sqlalchemy_exceptions -from octavia.common import constants from octavia.controller.worker import controller_worker as cw from octavia.db import api as db_api from octavia.db import repositories as repo @@ -72,24 +71,24 @@ class DatabaseCleanup(object): seconds=CONF.house_keeping.amphora_expiry_age) session = db_api.get_session() - expiring_amphora = self.amp_repo.get_all_deleted_expiring_amphora( - session, exp_age=exp_age) + amp_ids = self.amp_repo.get_all_deleted_expiring(session, + exp_age=exp_age) - for amp in expiring_amphora: + for amp_id in amp_ids: # If we're here, we already think the amp is expiring according to # the amphora table. Now check it is expired in the health table. # In this way, we ensure that amps aren't deleted unless they are # both expired AND no longer receiving zombie heartbeats. if self.amp_health_repo.check_amphora_health_expired( - session, amp.id, exp_age): + session, amp_id, exp_age): LOG.debug('Attempting to purge db record for Amphora ID: %s', - amp.id) - self.amp_repo.delete(session, id=amp.id) + amp_id) + self.amp_repo.delete(session, id=amp_id) try: - self.amp_health_repo.delete(session, amphora_id=amp.id) + self.amp_health_repo.delete(session, amphora_id=amp_id) except sqlalchemy_exceptions.NoResultFound: pass # Best effort delete, this record might not exist - LOG.info('Purged db record for Amphora ID: %s', amp.id) + LOG.info('Purged db record for Amphora ID: %s', amp_id) def cleanup_load_balancers(self): """Checks the DB for old load balancers and triggers their removal.""" @@ -97,15 +96,13 @@ class DatabaseCleanup(object): seconds=CONF.house_keeping.load_balancer_expiry_age) session = db_api.get_session() - load_balancers, _ = self.lb_repo.get_all( - session, provisioning_status=constants.DELETED) + lb_ids = self.lb_repo.get_all_deleted_expiring(session, + exp_age=exp_age) - for lb in load_balancers: - if self.lb_repo.check_load_balancer_expired(session, lb.id, - exp_age): - LOG.info('Attempting to delete load balancer id : %s', lb.id) - self.lb_repo.delete(session, id=lb.id) - LOG.info('Deleted load balancer id : %s', lb.id) + for lb_id in lb_ids: + LOG.info('Attempting to delete load balancer id : %s', lb_id) + self.lb_repo.delete(session, id=lb_id) + LOG.info('Deleted load balancer id : %s', lb_id) class CertRotation(object): diff --git a/octavia/db/repositories.py b/octavia/db/repositories.py index 46dfcf3d05..dcd8a61af4 100644 --- a/octavia/db/repositories.py +++ b/octavia/db/repositories.py @@ -179,6 +179,31 @@ class BaseRepository(object): """ return bool(session.query(self.model_class).filter_by(id=id).first()) + def get_all_deleted_expiring(self, session, exp_age): + """Get all previously deleted resources that are now expiring. + + :param session: A Sql Alchemy database session. + :param exp_age: A standard datetime delta which is used to see for how + long can a resource live without updates before + it is considered expired + :returns: A list of resource IDs + """ + + expiry_time = datetime.datetime.utcnow() - exp_age + + query = session.query(self.model_class).filter( + self.model_class.updated_at < expiry_time) + if hasattr(self.model_class, 'status'): + query = query.filter_by(status=consts.DELETED) + else: + query = query.filter_by(operating_status=consts.DELETED) + # Do not load any relationship + query = query.options(noload('*')) + model_list = query.all() + + id_list = [model.id for model in model_list] + return id_list + class Repositories(object): def __init__(self): @@ -809,30 +834,6 @@ class LoadBalancerRepository(BaseRepository): session.add(lb) return True - def check_load_balancer_expired(self, session, lb_id, exp_age=None): - """Checks if a given load balancer is expired. - - :param session: A Sql Alchemy database session. - :param lb_id: id of an load balancer object - :param exp_age: A standard datetime delta which is used to see for how - long can a load balancer live without updates before - it is considered expired (default: - CONF.house_keeping.load_balancer_expiry_age) - :returns: boolean - """ - if not exp_age: - exp_age = datetime.timedelta( - seconds=CONF.house_keeping.load_balancer_expiry_age) - - timestamp = datetime.datetime.utcnow() - exp_age - lb = self.get(session, id=lb_id) - if lb: - # If a load balancer was never updated use its creation timestamp - last_update = lb.updated_at or lb.created_at - return last_update < timestamp - # Load balancer was just deleted. - return True - class VipRepository(BaseRepository): model_class = models.Vip @@ -1187,33 +1188,6 @@ class AmphoraRepository(BaseRepository): return db_lb.to_data_model() return None - def get_all_deleted_expiring_amphora(self, session, exp_age=None): - - """Get all previously deleted amphora that are now expiring. - - :param session: A Sql Alchemy database session. - :param exp_age: A standard datetime delta which is used to see for how - long can an amphora live without updates before it is - considered expired (default: - CONF.house_keeping.amphora_expiry_age) - :returns: [octavia.common.data_model] - """ - if not exp_age: - exp_age = datetime.timedelta( - seconds=CONF.house_keeping.amphora_expiry_age) - - expiry_time = datetime.datetime.utcnow() - exp_age - - query = session.query(self.model_class).filter_by( - status=consts.DELETED).filter( - self.model_class.updated_at < expiry_time) - # Only make one trip to the database - query = query.options(joinedload('*')) - model_list = query.all() - - data_model_list = [model.to_data_model() for model in model_list] - return data_model_list - def get_spare_amphora_count(self, session): """Get the count of the spare amphora. diff --git a/octavia/tests/functional/db/test_repositories.py b/octavia/tests/functional/db/test_repositories.py index 2a5ed91221..8ee25833a9 100644 --- a/octavia/tests/functional/db/test_repositories.py +++ b/octavia/tests/functional/db/test_repositories.py @@ -2881,36 +2881,6 @@ class LoadBalancerRepositoryTest(BaseRepositoryTest): lb = self.lb_repo.get(self.session, id=lb_id) self.assertEqual(constants.PENDING_UPDATE, lb.provisioning_status) - def test_check_load_balancer_expired_default_exp_age(self): - """When exp_age defaults to load_balancer_expiry_age.""" - newdate = datetime.datetime.utcnow() - datetime.timedelta(minutes=10) - self.lb_repo.create(self.session, id=self.FAKE_UUID_1, - project_id=self.FAKE_UUID_2, - provisioning_status=constants.ACTIVE, - operating_status=constants.ONLINE, - enabled=True, - updated_at=newdate) - check_res = self.lb_repo.check_load_balancer_expired( - self.session, self.FAKE_UUID_1) - # Default load_balancer_expiry_age value is 1 week so load balancer - # shouldn't be considered expired. - self.assertFalse(check_res) - - def test_check_load_balancer_expired_with_exp_age(self): - """When exp_age is passed as an argument.""" - exp_age = datetime.timedelta( - seconds=self.FAKE_EXP_AGE) - newdate = datetime.datetime.utcnow() - datetime.timedelta(minutes=10) - self.lb_repo.create(self.session, id=self.FAKE_UUID_1, - project_id=self.FAKE_UUID_2, - provisioning_status=constants.ACTIVE, - operating_status=constants.ONLINE, - enabled=True, - updated_at=newdate) - check_res = self.lb_repo.check_load_balancer_expired( - self.session, self.FAKE_UUID_1, exp_age) - self.assertTrue(check_res) - class VipRepositoryTest(BaseRepositoryTest): @@ -3149,7 +3119,7 @@ class AmphoraRepositoryTest(BaseRepositoryTest): amphora2 = self.create_amphora( self.FAKE_UUID_2, status=constants.DELETED) - expiring_list = self.amphora_repo.get_all_deleted_expiring_amphora( + expiring_list = self.amphora_repo.get_all_deleted_expiring( self.session, exp_age=exp_age) expiring_ids = [amp.id for amp in expiring_list] self.assertIn(amphora1.id, expiring_ids) diff --git a/octavia/tests/unit/controller/housekeeping/test_house_keeping.py b/octavia/tests/unit/controller/housekeeping/test_house_keeping.py index 402e5037f1..4022d485a1 100644 --- a/octavia/tests/unit/controller/housekeeping/test_house_keeping.py +++ b/octavia/tests/unit/controller/housekeeping/test_house_keeping.py @@ -117,10 +117,10 @@ class TestDatabaseCleanup(base.TestCase): vrrp_ip=self.FAKE_IP, ha_ip=self.FAKE_IP, updated_at=expired_time) - self.amp_repo.get_all_deleted_expiring_amphora.return_value = [amphora] + self.amp_repo.get_all_deleted_expiring.return_value = [amphora.id] self.amp_health_repo.check_amphora_health_expired.return_value = True self.dbclean.delete_old_amphorae() - self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called) + self.assertTrue(self.amp_repo.get_all_deleted_expiring.called) self.assertTrue( self.amp_health_repo.check_amphora_health_expired.called) self.assertTrue(self.amp_repo.delete.called) @@ -138,9 +138,9 @@ class TestDatabaseCleanup(base.TestCase): vrrp_ip=self.FAKE_IP, ha_ip=self.FAKE_IP, updated_at=datetime.datetime.now()) - self.amp_repo.get_all_deleted_expiring_amphora.return_value = [] + self.amp_repo.get_all_deleted_expiring.return_value = [] self.dbclean.delete_old_amphorae() - self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called) + self.assertTrue(self.amp_repo.get_all_deleted_expiring.called) self.assertFalse( self.amp_health_repo.check_amphora_health_expired.called) self.assertFalse(self.amp_repo.delete.called) @@ -165,10 +165,10 @@ class TestDatabaseCleanup(base.TestCase): vrrp_ip=self.FAKE_IP, ha_ip=self.FAKE_IP, updated_at=expired_time) - self.amp_repo.get_all_deleted_expiring_amphora.return_value = [amphora] + self.amp_repo.get_all_deleted_expiring.return_value = [amphora.id] self.amp_health_repo.check_amphora_health_expired.return_value = False self.dbclean.delete_old_amphorae() - self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called) + self.assertTrue(self.amp_repo.get_all_deleted_expiring.called) self.assertTrue( self.amp_health_repo.check_amphora_health_expired.called) self.assertFalse(self.amp_repo.delete.called) @@ -187,12 +187,13 @@ class TestDatabaseCleanup(base.TestCase): for expired_status in [True, False]: lb_repo = mock.MagicMock() self.dbclean.lb_repo = lb_repo - lb_repo.get_all.return_value = ([load_balancer], None) - lb_repo.check_load_balancer_expired.return_value = ( - expired_status) + if expired_status: + expiring_lbs = [load_balancer.id] + else: + expiring_lbs = [] + lb_repo.get_all_deleted_expiring.return_value = expiring_lbs self.dbclean.cleanup_load_balancers() - self.assertTrue(lb_repo.get_all.called) - self.assertTrue(lb_repo.check_load_balancer_expired.called) + self.assertTrue(lb_repo.get_all_deleted_expiring.called) if expired_status: self.assertTrue(lb_repo.delete.called) else: diff --git a/releasenotes/notes/fix-housekeeping-db-performance-b0d0fcfcce696314.yaml b/releasenotes/notes/fix-housekeeping-db-performance-b0d0fcfcce696314.yaml new file mode 100644 index 0000000000..1e09933afb --- /dev/null +++ b/releasenotes/notes/fix-housekeeping-db-performance-b0d0fcfcce696314.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Fixed a performance issue where the Housekeeping service could + significantly and incrementally utilize CPU as more amphorae and load + balancers are created and/or marked as DELETED.