Fix performance of housekeeping DB clean up

The Housekeeping service grows in utilization of CPU as more amphorae
are created and/or marked as DELETED. The problem lays on the SELECT
statement constructed in get_all_deleted_expiring_amphora via the ORM --
it is joined eager loading all relationships.

The task does not need such amount of information, only the amphora ID.
The statement could be simplified by not loading any relationship or, at
most, lazy loading them.

This patch also fixes performance of cleaning up deleted and expired
load balancers. The code was doing multiple round-trips to the database
unecessarily:

1. retrieving all deleted LBs
2. for each LB, retrieving it again from DB to check expired date
3. delete LB

Step 1 and 2 are now condensed in get_all_deleted_expiring(), making it
a single round-trip.

Story: 2004665
Task: 28643

Change-Id: Iffc960c7c3a986328cfded1b4e408931ab0a7877
This commit is contained in:
Carlos Goncalves 2018-12-22 22:28:26 +01:00
parent 9ce614ad84
commit be552d99a4
5 changed files with 58 additions and 110 deletions

View File

@ -19,7 +19,6 @@ from oslo_config import cfg
from oslo_log import log as logging
from sqlalchemy.orm import exc as sqlalchemy_exceptions
from octavia.common import constants
from octavia.controller.worker import controller_worker as cw
from octavia.db import api as db_api
from octavia.db import repositories as repo
@ -72,24 +71,24 @@ class DatabaseCleanup(object):
seconds=CONF.house_keeping.amphora_expiry_age)
session = db_api.get_session()
expiring_amphora = self.amp_repo.get_all_deleted_expiring_amphora(
session, exp_age=exp_age)
amp_ids = self.amp_repo.get_all_deleted_expiring(session,
exp_age=exp_age)
for amp in expiring_amphora:
for amp_id in amp_ids:
# If we're here, we already think the amp is expiring according to
# the amphora table. Now check it is expired in the health table.
# In this way, we ensure that amps aren't deleted unless they are
# both expired AND no longer receiving zombie heartbeats.
if self.amp_health_repo.check_amphora_health_expired(
session, amp.id, exp_age):
session, amp_id, exp_age):
LOG.debug('Attempting to purge db record for Amphora ID: %s',
amp.id)
self.amp_repo.delete(session, id=amp.id)
amp_id)
self.amp_repo.delete(session, id=amp_id)
try:
self.amp_health_repo.delete(session, amphora_id=amp.id)
self.amp_health_repo.delete(session, amphora_id=amp_id)
except sqlalchemy_exceptions.NoResultFound:
pass # Best effort delete, this record might not exist
LOG.info('Purged db record for Amphora ID: %s', amp.id)
LOG.info('Purged db record for Amphora ID: %s', amp_id)
def cleanup_load_balancers(self):
"""Checks the DB for old load balancers and triggers their removal."""
@ -97,15 +96,13 @@ class DatabaseCleanup(object):
seconds=CONF.house_keeping.load_balancer_expiry_age)
session = db_api.get_session()
load_balancers, _ = self.lb_repo.get_all(
session, provisioning_status=constants.DELETED)
lb_ids = self.lb_repo.get_all_deleted_expiring(session,
exp_age=exp_age)
for lb in load_balancers:
if self.lb_repo.check_load_balancer_expired(session, lb.id,
exp_age):
LOG.info('Attempting to delete load balancer id : %s', lb.id)
self.lb_repo.delete(session, id=lb.id)
LOG.info('Deleted load balancer id : %s', lb.id)
for lb_id in lb_ids:
LOG.info('Attempting to delete load balancer id : %s', lb_id)
self.lb_repo.delete(session, id=lb_id)
LOG.info('Deleted load balancer id : %s', lb_id)
class CertRotation(object):

View File

@ -179,6 +179,31 @@ class BaseRepository(object):
"""
return bool(session.query(self.model_class).filter_by(id=id).first())
def get_all_deleted_expiring(self, session, exp_age):
"""Get all previously deleted resources that are now expiring.
:param session: A Sql Alchemy database session.
:param exp_age: A standard datetime delta which is used to see for how
long can a resource live without updates before
it is considered expired
:returns: A list of resource IDs
"""
expiry_time = datetime.datetime.utcnow() - exp_age
query = session.query(self.model_class).filter(
self.model_class.updated_at < expiry_time)
if hasattr(self.model_class, 'status'):
query = query.filter_by(status=consts.DELETED)
else:
query = query.filter_by(operating_status=consts.DELETED)
# Do not load any relationship
query = query.options(noload('*'))
model_list = query.all()
id_list = [model.id for model in model_list]
return id_list
class Repositories(object):
def __init__(self):
@ -809,30 +834,6 @@ class LoadBalancerRepository(BaseRepository):
session.add(lb)
return True
def check_load_balancer_expired(self, session, lb_id, exp_age=None):
"""Checks if a given load balancer is expired.
:param session: A Sql Alchemy database session.
:param lb_id: id of an load balancer object
:param exp_age: A standard datetime delta which is used to see for how
long can a load balancer live without updates before
it is considered expired (default:
CONF.house_keeping.load_balancer_expiry_age)
:returns: boolean
"""
if not exp_age:
exp_age = datetime.timedelta(
seconds=CONF.house_keeping.load_balancer_expiry_age)
timestamp = datetime.datetime.utcnow() - exp_age
lb = self.get(session, id=lb_id)
if lb:
# If a load balancer was never updated use its creation timestamp
last_update = lb.updated_at or lb.created_at
return last_update < timestamp
# Load balancer was just deleted.
return True
class VipRepository(BaseRepository):
model_class = models.Vip
@ -1187,33 +1188,6 @@ class AmphoraRepository(BaseRepository):
return db_lb.to_data_model()
return None
def get_all_deleted_expiring_amphora(self, session, exp_age=None):
"""Get all previously deleted amphora that are now expiring.
:param session: A Sql Alchemy database session.
:param exp_age: A standard datetime delta which is used to see for how
long can an amphora live without updates before it is
considered expired (default:
CONF.house_keeping.amphora_expiry_age)
:returns: [octavia.common.data_model]
"""
if not exp_age:
exp_age = datetime.timedelta(
seconds=CONF.house_keeping.amphora_expiry_age)
expiry_time = datetime.datetime.utcnow() - exp_age
query = session.query(self.model_class).filter_by(
status=consts.DELETED).filter(
self.model_class.updated_at < expiry_time)
# Only make one trip to the database
query = query.options(joinedload('*'))
model_list = query.all()
data_model_list = [model.to_data_model() for model in model_list]
return data_model_list
def get_spare_amphora_count(self, session):
"""Get the count of the spare amphora.

View File

@ -2881,36 +2881,6 @@ class LoadBalancerRepositoryTest(BaseRepositoryTest):
lb = self.lb_repo.get(self.session, id=lb_id)
self.assertEqual(constants.PENDING_UPDATE, lb.provisioning_status)
def test_check_load_balancer_expired_default_exp_age(self):
"""When exp_age defaults to load_balancer_expiry_age."""
newdate = datetime.datetime.utcnow() - datetime.timedelta(minutes=10)
self.lb_repo.create(self.session, id=self.FAKE_UUID_1,
project_id=self.FAKE_UUID_2,
provisioning_status=constants.ACTIVE,
operating_status=constants.ONLINE,
enabled=True,
updated_at=newdate)
check_res = self.lb_repo.check_load_balancer_expired(
self.session, self.FAKE_UUID_1)
# Default load_balancer_expiry_age value is 1 week so load balancer
# shouldn't be considered expired.
self.assertFalse(check_res)
def test_check_load_balancer_expired_with_exp_age(self):
"""When exp_age is passed as an argument."""
exp_age = datetime.timedelta(
seconds=self.FAKE_EXP_AGE)
newdate = datetime.datetime.utcnow() - datetime.timedelta(minutes=10)
self.lb_repo.create(self.session, id=self.FAKE_UUID_1,
project_id=self.FAKE_UUID_2,
provisioning_status=constants.ACTIVE,
operating_status=constants.ONLINE,
enabled=True,
updated_at=newdate)
check_res = self.lb_repo.check_load_balancer_expired(
self.session, self.FAKE_UUID_1, exp_age)
self.assertTrue(check_res)
class VipRepositoryTest(BaseRepositoryTest):
@ -3149,7 +3119,7 @@ class AmphoraRepositoryTest(BaseRepositoryTest):
amphora2 = self.create_amphora(
self.FAKE_UUID_2, status=constants.DELETED)
expiring_list = self.amphora_repo.get_all_deleted_expiring_amphora(
expiring_list = self.amphora_repo.get_all_deleted_expiring(
self.session, exp_age=exp_age)
expiring_ids = [amp.id for amp in expiring_list]
self.assertIn(amphora1.id, expiring_ids)

View File

@ -117,10 +117,10 @@ class TestDatabaseCleanup(base.TestCase):
vrrp_ip=self.FAKE_IP,
ha_ip=self.FAKE_IP,
updated_at=expired_time)
self.amp_repo.get_all_deleted_expiring_amphora.return_value = [amphora]
self.amp_repo.get_all_deleted_expiring.return_value = [amphora.id]
self.amp_health_repo.check_amphora_health_expired.return_value = True
self.dbclean.delete_old_amphorae()
self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called)
self.assertTrue(self.amp_repo.get_all_deleted_expiring.called)
self.assertTrue(
self.amp_health_repo.check_amphora_health_expired.called)
self.assertTrue(self.amp_repo.delete.called)
@ -138,9 +138,9 @@ class TestDatabaseCleanup(base.TestCase):
vrrp_ip=self.FAKE_IP,
ha_ip=self.FAKE_IP,
updated_at=datetime.datetime.now())
self.amp_repo.get_all_deleted_expiring_amphora.return_value = []
self.amp_repo.get_all_deleted_expiring.return_value = []
self.dbclean.delete_old_amphorae()
self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called)
self.assertTrue(self.amp_repo.get_all_deleted_expiring.called)
self.assertFalse(
self.amp_health_repo.check_amphora_health_expired.called)
self.assertFalse(self.amp_repo.delete.called)
@ -165,10 +165,10 @@ class TestDatabaseCleanup(base.TestCase):
vrrp_ip=self.FAKE_IP,
ha_ip=self.FAKE_IP,
updated_at=expired_time)
self.amp_repo.get_all_deleted_expiring_amphora.return_value = [amphora]
self.amp_repo.get_all_deleted_expiring.return_value = [amphora.id]
self.amp_health_repo.check_amphora_health_expired.return_value = False
self.dbclean.delete_old_amphorae()
self.assertTrue(self.amp_repo.get_all_deleted_expiring_amphora.called)
self.assertTrue(self.amp_repo.get_all_deleted_expiring.called)
self.assertTrue(
self.amp_health_repo.check_amphora_health_expired.called)
self.assertFalse(self.amp_repo.delete.called)
@ -187,12 +187,13 @@ class TestDatabaseCleanup(base.TestCase):
for expired_status in [True, False]:
lb_repo = mock.MagicMock()
self.dbclean.lb_repo = lb_repo
lb_repo.get_all.return_value = ([load_balancer], None)
lb_repo.check_load_balancer_expired.return_value = (
expired_status)
if expired_status:
expiring_lbs = [load_balancer.id]
else:
expiring_lbs = []
lb_repo.get_all_deleted_expiring.return_value = expiring_lbs
self.dbclean.cleanup_load_balancers()
self.assertTrue(lb_repo.get_all.called)
self.assertTrue(lb_repo.check_load_balancer_expired.called)
self.assertTrue(lb_repo.get_all_deleted_expiring.called)
if expired_status:
self.assertTrue(lb_repo.delete.called)
else:

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixed a performance issue where the Housekeeping service could
significantly and incrementally utilize CPU as more amphorae and load
balancers are created and/or marked as DELETED.