Move subcloud filtering out of audit worker

The following changes reduce the number of database requests
made by dcmanager:
  - Move the logic that filters out non-qualified subclouds
    from the audit worker to the audit manager.
  - Add a new db api to set the end audit timestamp in bulk.
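
For reference, a minimal sketch of the new bulk helper (simplified
from the sqlalchemy hunk below; model_query, write_session and models
are dcmanager's existing db helpers/modules):

    import datetime

    def subcloud_audits_bulk_end_audit(context, subcloud_ids):
        # Stamp audit_finished_at for all given subclouds with a single
        # UPDATE ... WHERE subcloud_id IN (...) statement, instead of
        # one subcloud_audits_end_audit() round trip per subcloud.
        values = {"audit_finished_at": datetime.datetime.utcnow()}
        with write_session():
            model_query(context, models.SubcloudAudits). \
                filter_by(deleted=0). \
                filter(models.SubcloudAudits.subcloud_id.in_(subcloud_ids)). \
                update(values, synchronize_session='fetch')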

Test Plan:
Perform a batch subcloud deployment
  - Verify that subclouds being deployed are excluded from the
    audit_subclouds RPC requests to dcmanager audit workers.
  - Verify that the end audit timestamp of skipped subclouds
    is set in one database transaction.

Story: 2011106
Task: 50218

Change-Id: Ie4f9804a0ef870f81eb726fb9cd451b5284962ab
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>


@@ -32,6 +32,7 @@ from dcmanager.audit import patch_audit
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.audit import software_audit
from dcmanager.audit import utils as audit_utils
+from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common.i18n import _
from dcmanager.common import manager
@@ -466,14 +467,65 @@ class SubcloudAuditManager(manager.Manager):
LOG.info("Fixup took %s seconds" % (end - start))
subcloud_ids = []
skipped_subcloud_ids = []
pruned_subcloud_audits = []
subcloud_audits = db_api.subcloud_audits_get_all_need_audit(
self.context, last_audit_threshold
)
LOG.debug(
f"Number of subclouds need audit based on audit ts: "
f"{len(subcloud_audits)}"
)
# Remove subclouds that don't qualify for this round of audit
for audit, subcloud_name, deploy_status, availability_status in (
list(subcloud_audits)):
# Include failure deploy status states in the auditable list
# so that the subcloud can be set as offline
if (deploy_status not in
[consts.DEPLOY_STATE_DONE,
consts.DEPLOY_STATE_CONFIGURING,
consts.DEPLOY_STATE_CONFIG_FAILED,
consts.DEPLOY_STATE_CONFIG_ABORTED,
consts.DEPLOY_STATE_PRE_CONFIG_FAILED,
consts.DEPLOY_STATE_INSTALL_FAILED,
consts.DEPLOY_STATE_INSTALL_ABORTED,
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
consts.DEPLOY_STATE_RESTORING,
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
consts.DEPLOY_STATE_RESTORE_FAILED,
consts.DEPLOY_STATE_REHOME_PENDING]) or (
(deploy_status in [
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_REHOME_PENDING])
and availability_status ==
dccommon_consts.AVAILABILITY_OFFLINE):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud_name, deploy_status))
skipped_subcloud_ids.append(audit.subcloud_id)
else:
pruned_subcloud_audits.append(audit)
# Set the audit_finished_at timestamp for non qualified subclouds in bulk
LOG.debug(
f"Set end audit timestamp for non-qualified subclouds "
f"({len(skipped_subcloud_ids)}) in bulk"
)
db_api.subcloud_audits_bulk_end_audit(self.context, skipped_subcloud_ids)
LOG.debug(
f"Number of subclouds qualified for audit: "
f"{len(pruned_subcloud_audits)}"
)
# Now check whether any of these subclouds need patch audit or firmware
# audit data and grab it if needed.
if not audit_patch:
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
# Currently the load audit is done as part of the patch audit.
# It might make sense to split it out.
if audit.patch_audit_requested or audit.load_audit_requested:
@@ -481,25 +533,25 @@ class SubcloudAuditManager(manager.Manager):
LOG.debug("DB says patch audit needed")
break
if not audit_firmware:
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
if audit.firmware_audit_requested:
LOG.debug("DB says firmware audit needed")
audit_firmware = True
break
if not audit_kubernetes:
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
if audit.kubernetes_audit_requested:
LOG.debug("DB says kubernetes audit needed")
audit_kubernetes = True
break
if not audit_kube_rootca_update:
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
if audit.kube_rootca_update_audit_requested:
LOG.debug("DB says kube-rootca-update audit needed")
audit_kube_rootca_update = True
break
if not audit_software:
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
if audit.spare_audit_requested:
LOG.debug("DB says software audit needed")
audit_software = True
@@ -544,10 +596,10 @@ class SubcloudAuditManager(manager.Manager):
)
# We want a chunksize of at least 1 so add the number of workers.
-chunksize = (len(subcloud_audits) + CONF.audit_worker_workers) // (
+chunksize = (len(pruned_subcloud_audits) + CONF.audit_worker_workers) // (
CONF.audit_worker_workers
)
-for audit in subcloud_audits:
+for audit in pruned_subcloud_audits:
subcloud_ids.append(audit.subcloud_id)
if len(subcloud_ids) == chunksize:
# We've gathered a batch of subclouds, send it for processing.
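
The net effect on the skip path is sketched below (simplified; the
worker-side code that previously did this per subcloud is removed in
the next hunk):

    # Before: each audit worker stamped skipped subclouds one at a
    # time, one DB write per subcloud:
    #     db_api.subcloud_audits_end_audit(self.context, subcloud_id, [])
    # After: the audit manager filters first, then stamps all skipped
    # subclouds with one bulk call:
    db_api.subcloud_audits_bulk_end_audit(self.context, skipped_subcloud_ids)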


@@ -113,38 +113,6 @@ class SubcloudAuditWorkerManager(manager.Manager):
LOG.debug("PID: %s, starting audit of subcloud: %s." %
(self.pid, subcloud.name))
# Include failure deploy status states in the auditable list
# so that the subcloud can be set as offline
if (subcloud.deploy_status not in
[consts.DEPLOY_STATE_DONE,
consts.DEPLOY_STATE_CONFIGURING,
consts.DEPLOY_STATE_CONFIG_FAILED,
consts.DEPLOY_STATE_CONFIG_ABORTED,
consts.DEPLOY_STATE_PRE_CONFIG_FAILED,
consts.DEPLOY_STATE_INSTALL_FAILED,
consts.DEPLOY_STATE_INSTALL_ABORTED,
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
consts.DEPLOY_STATE_RESTORING,
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
consts.DEPLOY_STATE_RESTORE_FAILED,
consts.DEPLOY_STATE_REHOME_PENDING]) or (
(subcloud.deploy_status in [
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_REHOME_PENDING])
and subcloud.availability_status ==
dccommon_consts.AVAILABILITY_OFFLINE):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud.name, subcloud.deploy_status))
# This DB API call will set the "audit_finished_at" timestamp
# so it won't get audited again for a while.
audits_done = []
db_api.subcloud_audits_end_audit(self.context,
subcloud_id, audits_done)
continue
# Check the per-subcloud audit flags
do_load_audit = subcloud_audits.load_audit_requested
# Currently we do the load audit as part of the patch audit,


@@ -89,6 +89,11 @@ def subcloud_audits_end_audit(context, subcloud_id, audits_done):
    return IMPL.subcloud_audits_end_audit(context, subcloud_id, audits_done)


def subcloud_audits_bulk_end_audit(context, subcloud_ids):
    """Set the 'audit finished' timestamp for the main audit in bulk."""
    return IMPL.subcloud_audits_bulk_end_audit(context, subcloud_ids)


def subcloud_audits_fix_expired_audits(context, last_audit_threshold,
                                       trigger_audits=False):
    return IMPL.subcloud_audits_fix_expired_audits(context,


@@ -193,7 +193,12 @@ def subcloud_audits_update(context, subcloud_id, values):
@require_context
def subcloud_audits_get_all_need_audit(context, last_audit_threshold):
with read_session() as session:
-result = session.query(models.SubcloudAudits).\
+result = session.query(models.SubcloudAudits,
+                       models.Subcloud.name,
+                       models.Subcloud.deploy_status,
+                       models.Subcloud.availability_status).\
+    join(models.Subcloud,
+         models.Subcloud.id == models.SubcloudAudits.subcloud_id).\
filter_by(deleted=0).\
filter(models.SubcloudAudits.audit_started_at <=
models.SubcloudAudits.audit_finished_at).\
@@ -245,6 +250,18 @@ def subcloud_audits_end_audit(context, subcloud_id, audits_done):
        return subcloud_audits_ref


@require_context
def subcloud_audits_bulk_end_audit(context, subcloud_ids):
    values = {
        "audit_finished_at": datetime.datetime.utcnow()
    }
    with write_session():
        model_query(context, models.SubcloudAudits). \
            filter_by(deleted=0). \
            filter(models.SubcloudAudits.subcloud_id.in_(subcloud_ids)). \
            update(values, synchronize_session='fetch')


# Find and fix up subcloud audits where the audit has taken too long.
# We want to find subclouds that started an audit but never finished
# it and update the "finished at" timestamp to be the same as
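
Note that subcloud_audits_get_all_need_audit() now returns joined
4-tuples (SubcloudAudits row, subcloud name, deploy_status,
availability_status) rather than bare SubcloudAudits rows, so callers
unpack the result; a minimal sketch of the consuming loop:

    for audit, subcloud_name, deploy_status, availability_status in audits:
        # audit is the SubcloudAudits row; the other three fields come
        # from the joined Subcloud table and drive the skip decision.
        ...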


@@ -20,6 +20,7 @@ import mock
from dccommon import consts as dccommon_consts
from dcmanager.audit import subcloud_audit_manager
from dcmanager.common import consts
from dcmanager.db.sqlalchemy import api as db_api
from dcmanager.tests import base
@@ -295,6 +296,20 @@ class TestAuditManager(base.DCManagerTestCase):
        am = subcloud_audit_manager.SubcloudAuditManager()
        am._periodic_subcloud_audit_loop()

    @mock.patch.object(subcloud_audit_manager.db_api,
                       'subcloud_audits_bulk_end_audit')
    def test_skip_subcloud_audit(self, mock_subcloud_audits_bulk_end_audit):
        subcloud = self.create_subcloud_static(self.ctx)
        am = subcloud_audit_manager.SubcloudAuditManager()
        subcloud = db_api.subcloud_update(
            self.ctx, subcloud.id,
            management_state='unmanaged',
            availability_status=dccommon_consts.AVAILABILITY_OFFLINE,
            deploy_status=consts.DEPLOY_STATE_CREATED)
        am._periodic_subcloud_audit_loop()
        # Verify that the audit is skipped
        mock_subcloud_audits_bulk_end_audit.assert_called_once()

    def test_audit_one_subcloud(self):
        subcloud = self.create_subcloud_static(self.ctx)
        am = subcloud_audit_manager.SubcloudAuditManager()


@@ -1045,38 +1045,6 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
            mock.ANY, dccommon_consts.AVAILABILITY_OFFLINE,
            False, audit_fail_count)

    @mock.patch.object(scheduler.ThreadGroupManager, 'start')
    @mock.patch.object(subcloud_audit_worker_manager.db_api,
                       'subcloud_audits_end_audit')
    def test_offline_subcloud_audit_skip_while_installing(
            self, mock_subcloud_audits_end_audit, mock_thread_start):
        subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
        self.assertIsNotNone(subcloud)
        wm = subcloud_audit_worker_manager.SubcloudAuditWorkerManager()
        # Set the subcloud to unmanaged/offline/installing
        subcloud = db_api.subcloud_update(
            self.ctx, subcloud.id,
            management_state='unmanaged',
            first_identity_sync_complete=True,
            availability_status=dccommon_consts.AVAILABILITY_OFFLINE,
            deploy_status=consts.DEPLOY_STATE_INSTALLING)
        wm.audit_subclouds(context=self.ctx,
                           subcloud_ids=[subcloud.id],
                           patch_audit_data=True,
                           firmware_audit_data=True,
                           kubernetes_audit_data=True,
                           do_openstack_audit=False,
                           kube_rootca_update_audit_data=True,
                           software_audit_data=False)
        # Verify if audit was skipped
        mock_subcloud_audits_end_audit.assert_called_once()
        mock_thread_start.assert_not_called()

    def test_audit_subcloud_offline_update_audit_fail_count_only(self):
        subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
        self.assertIsNotNone(subcloud)


@@ -144,7 +144,7 @@ class DBAPISubcloudAuditsTest(base.DCManagerTestCase):
        audits = db_api.subcloud_audits_get_all_need_audit(
            self.ctx, last_audit_threshold
        )
-        subcloud_ids = [audit.subcloud_id for audit in audits]
+        subcloud_ids = [audit[0].subcloud_id for audit in audits]
        self.assertEqual(len(subcloud_ids), 2)
        self.assertNotIn(1, subcloud_ids)
        # Set one of the special audits to make sure it overrides.
@@ -215,3 +215,13 @@ class DBAPISubcloudAuditsTest(base.DCManagerTestCase):
self.assertEqual(result["load_audit_requested"], False)
self.assertEqual(result["kubernetes_audit_requested"], False)
self.assertEqual(result["kube_rootca_update_audit_requested"], False)
def test_subcloud_audits_bulk_end_audit(self):
db_api.subcloud_audits_bulk_end_audit(self.ctx, [1, 2])
subcloud_audits1 = db_api.subcloud_audits_get(self.ctx, 1)
subcloud_audits2 = db_api.subcloud_audits_get(self.ctx, 2)
subcloud_audits3 = db_api.subcloud_audits_get(self.ctx, 3)
self.assertEqual(subcloud_audits1["audit_finished_at"],
subcloud_audits2["audit_finished_at"])
self.assertTrue(subcloud_audits1["audit_finished_at"] >
subcloud_audits3["audit_finished_at"])
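
A note on the assertions above: the bulk helper computes utcnow() once
and applies it to every row it touches, so subclouds 1 and 2 end up
with the identical audit_finished_at value, while subcloud 3, left out
of the call, keeps its older timestamp.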