diff --git a/distributedcloud/dcmanager/audit/subcloud_audit_manager.py b/distributedcloud/dcmanager/audit/subcloud_audit_manager.py index 6d0c25638..83822ed1d 100644 --- a/distributedcloud/dcmanager/audit/subcloud_audit_manager.py +++ b/distributedcloud/dcmanager/audit/subcloud_audit_manager.py @@ -32,6 +32,7 @@ from dcmanager.audit import patch_audit from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client from dcmanager.audit import software_audit from dcmanager.audit import utils as audit_utils +from dcmanager.common import consts from dcmanager.common import context from dcmanager.common.i18n import _ from dcmanager.common import manager @@ -466,14 +467,65 @@ class SubcloudAuditManager(manager.Manager): LOG.info("Fixup took %s seconds" % (end - start)) subcloud_ids = [] + skipped_subcloud_ids = [] + pruned_subcloud_audits = [] + subcloud_audits = db_api.subcloud_audits_get_all_need_audit( self.context, last_audit_threshold ) + LOG.debug( + f"Number of subclouds need audit based on audit ts: " + f"{len(subcloud_audits)}" + ) + + # Remove subclouds that don't qualify for this round of audit + for audit, subcloud_name, deploy_status, availability_status in ( + list(subcloud_audits)): + # Include failure deploy status states in the auditable list + # so that the subcloud can be set as offline + if (deploy_status not in + [consts.DEPLOY_STATE_DONE, + consts.DEPLOY_STATE_CONFIGURING, + consts.DEPLOY_STATE_CONFIG_FAILED, + consts.DEPLOY_STATE_CONFIG_ABORTED, + consts.DEPLOY_STATE_PRE_CONFIG_FAILED, + consts.DEPLOY_STATE_INSTALL_FAILED, + consts.DEPLOY_STATE_INSTALL_ABORTED, + consts.DEPLOY_STATE_PRE_INSTALL_FAILED, + consts.DEPLOY_STATE_INSTALLING, + consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, + consts.DEPLOY_STATE_UPGRADE_ACTIVATED, + consts.DEPLOY_STATE_RESTORING, + consts.DEPLOY_STATE_RESTORE_PREP_FAILED, + consts.DEPLOY_STATE_RESTORE_FAILED, + consts.DEPLOY_STATE_REHOME_PENDING]) or ( + (deploy_status in [ + consts.DEPLOY_STATE_INSTALLING, + consts.DEPLOY_STATE_REHOME_PENDING]) + and availability_status == + dccommon_consts.AVAILABILITY_OFFLINE): + LOG.debug("Skip subcloud %s audit, deploy_status: %s" % + (subcloud_name, deploy_status)) + skipped_subcloud_ids.append(audit.subcloud_id) + else: + pruned_subcloud_audits.append(audit) + + # Set the audit_finished_at timestamp for non qualified subclouds in bulk + LOG.debug( + f"Set end audit timestamp for non-qualified subclouds " + f"({len(skipped_subcloud_ids)}) in bulk" + ) + db_api.subcloud_audits_bulk_end_audit(self.context, skipped_subcloud_ids) + + LOG.debug( + f"Number of subclouds qualified for audit: " + f"{len(pruned_subcloud_audits)}" + ) # Now check whether any of these subclouds need patch audit or firmware # audit data and grab it if needed. if not audit_patch: - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: # Currently the load audit is done as part of the patch audit. # It might make sense to split it out. if audit.patch_audit_requested or audit.load_audit_requested: @@ -481,25 +533,25 @@ class SubcloudAuditManager(manager.Manager): LOG.debug("DB says patch audit needed") break if not audit_firmware: - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: if audit.firmware_audit_requested: LOG.debug("DB says firmware audit needed") audit_firmware = True break if not audit_kubernetes: - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: if audit.kubernetes_audit_requested: LOG.debug("DB says kubernetes audit needed") audit_kubernetes = True break if not audit_kube_rootca_update: - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: if audit.kube_rootca_update_audit_requested: LOG.debug("DB says kube-rootca-update audit needed") audit_kube_rootca_update = True break if not audit_software: - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: if audit.spare_audit_requested: LOG.debug("DB says software audit needed") audit_software = True @@ -544,10 +596,10 @@ class SubcloudAuditManager(manager.Manager): ) # We want a chunksize of at least 1 so add the number of workers. - chunksize = (len(subcloud_audits) + CONF.audit_worker_workers) // ( + chunksize = (len(pruned_subcloud_audits) + CONF.audit_worker_workers) // ( CONF.audit_worker_workers ) - for audit in subcloud_audits: + for audit in pruned_subcloud_audits: subcloud_ids.append(audit.subcloud_id) if len(subcloud_ids) == chunksize: # We've gathered a batch of subclouds, send it for processing. diff --git a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py index 6c766c54e..0ee928b3c 100644 --- a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py @@ -113,38 +113,6 @@ class SubcloudAuditWorkerManager(manager.Manager): LOG.debug("PID: %s, starting audit of subcloud: %s." % (self.pid, subcloud.name)) - # Include failure deploy status states in the auditable list - # so that the subcloud can be set as offline - if (subcloud.deploy_status not in - [consts.DEPLOY_STATE_DONE, - consts.DEPLOY_STATE_CONFIGURING, - consts.DEPLOY_STATE_CONFIG_FAILED, - consts.DEPLOY_STATE_CONFIG_ABORTED, - consts.DEPLOY_STATE_PRE_CONFIG_FAILED, - consts.DEPLOY_STATE_INSTALL_FAILED, - consts.DEPLOY_STATE_INSTALL_ABORTED, - consts.DEPLOY_STATE_PRE_INSTALL_FAILED, - consts.DEPLOY_STATE_INSTALLING, - consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, - consts.DEPLOY_STATE_UPGRADE_ACTIVATED, - consts.DEPLOY_STATE_RESTORING, - consts.DEPLOY_STATE_RESTORE_PREP_FAILED, - consts.DEPLOY_STATE_RESTORE_FAILED, - consts.DEPLOY_STATE_REHOME_PENDING]) or ( - (subcloud.deploy_status in [ - consts.DEPLOY_STATE_INSTALLING, - consts.DEPLOY_STATE_REHOME_PENDING]) - and subcloud.availability_status == - dccommon_consts.AVAILABILITY_OFFLINE): - LOG.debug("Skip subcloud %s audit, deploy_status: %s" % - (subcloud.name, subcloud.deploy_status)) - # This DB API call will set the "audit_finished_at" timestamp - # so it won't get audited again for a while. - audits_done = [] - db_api.subcloud_audits_end_audit(self.context, - subcloud_id, audits_done) - continue - # Check the per-subcloud audit flags do_load_audit = subcloud_audits.load_audit_requested # Currently we do the load audit as part of the patch audit, diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index f0bfeca4c..328e8f30e 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -89,6 +89,11 @@ def subcloud_audits_end_audit(context, subcloud_id, audits_done): return IMPL.subcloud_audits_end_audit(context, subcloud_id, audits_done) +def subcloud_audits_bulk_end_audit(context, subcloud_ids): + """Set the 'audit finished' timestamp for the main audit in bulk.""" + return IMPL.subcloud_audits_bulk_end_audit(context, subcloud_ids) + + def subcloud_audits_fix_expired_audits(context, last_audit_threshold, trigger_audits=False): return IMPL.subcloud_audits_fix_expired_audits(context, diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 481736e62..a36ff71b6 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -193,7 +193,12 @@ def subcloud_audits_update(context, subcloud_id, values): @require_context def subcloud_audits_get_all_need_audit(context, last_audit_threshold): with read_session() as session: - result = session.query(models.SubcloudAudits).\ + result = session.query(models.SubcloudAudits, + models.Subcloud.name, + models.Subcloud.deploy_status, + models.Subcloud.availability_status).\ + join(models.Subcloud, + models.Subcloud.id == models.SubcloudAudits.subcloud_id).\ filter_by(deleted=0).\ filter(models.SubcloudAudits.audit_started_at <= models.SubcloudAudits.audit_finished_at).\ @@ -245,6 +250,18 @@ def subcloud_audits_end_audit(context, subcloud_id, audits_done): return subcloud_audits_ref +@require_context +def subcloud_audits_bulk_end_audit(context, subcloud_ids): + values = { + "audit_finished_at": datetime.datetime.utcnow() + } + with write_session(): + model_query(context, models.SubcloudAudits). \ + filter_by(deleted=0). \ + filter(models.SubcloudAudits.subcloud_id.in_(subcloud_ids)). \ + update(values, synchronize_session='fetch') + + # Find and fix up subcloud audits where the audit has taken too long. # We want to find subclouds that started an audit but never finished # it and update the "finished at" timestamp to be the same as diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_manager.py index d9884dfe3..a1c6cd9cf 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_manager.py @@ -20,6 +20,7 @@ import mock from dccommon import consts as dccommon_consts from dcmanager.audit import subcloud_audit_manager +from dcmanager.common import consts from dcmanager.db.sqlalchemy import api as db_api from dcmanager.tests import base @@ -295,6 +296,20 @@ class TestAuditManager(base.DCManagerTestCase): am = subcloud_audit_manager.SubcloudAuditManager() am._periodic_subcloud_audit_loop() + @mock.patch.object(subcloud_audit_manager.db_api, + 'subcloud_audits_bulk_end_audit') + def test_skip_subcloud_audit(self, mock_subcloud_audits_bulk_end_audit): + subcloud = self.create_subcloud_static(self.ctx) + am = subcloud_audit_manager.SubcloudAuditManager() + subcloud = db_api.subcloud_update( + self.ctx, subcloud.id, + management_state='unmanaged', + availability_status=dccommon_consts.AVAILABILITY_OFFLINE, + deploy_status=consts.DEPLOY_STATE_CREATED) + am._periodic_subcloud_audit_loop() + # Verify that the audit is skipped + mock_subcloud_audits_bulk_end_audit.assert_called_once() + def test_audit_one_subcloud(self): subcloud = self.create_subcloud_static(self.ctx) am = subcloud_audit_manager.SubcloudAuditManager() diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py index cc658d0de..8aa7dc083 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py @@ -1045,38 +1045,6 @@ class TestAuditWorkerManager(base.DCManagerTestCase): mock.ANY, dccommon_consts.AVAILABILITY_OFFLINE, False, audit_fail_count) - @mock.patch.object(scheduler.ThreadGroupManager, 'start') - @mock.patch.object(subcloud_audit_worker_manager.db_api, - 'subcloud_audits_end_audit') - def test_offline_subcloud_audit_skip_while_installing( - self, mock_subcloud_audits_end_audit, mock_thread_start): - - subcloud = self.create_subcloud_static(self.ctx, name='subcloud1') - self.assertIsNotNone(subcloud) - - wm = subcloud_audit_worker_manager.SubcloudAuditWorkerManager() - - # Set the subcloud to unmanaged/offline/installing - subcloud = db_api.subcloud_update( - self.ctx, subcloud.id, - management_state='unmanaged', - first_identity_sync_complete=True, - availability_status=dccommon_consts.AVAILABILITY_OFFLINE, - deploy_status=consts.DEPLOY_STATE_INSTALLING) - - wm.audit_subclouds(context=self.ctx, - subcloud_ids=[subcloud.id], - patch_audit_data=True, - firmware_audit_data=True, - kubernetes_audit_data=True, - do_openstack_audit=False, - kube_rootca_update_audit_data=True, - software_audit_data=False) - - # Verify if audit was skipped - mock_subcloud_audits_end_audit.assert_called_once() - mock_thread_start.assert_not_called() - def test_audit_subcloud_offline_update_audit_fail_count_only(self): subcloud = self.create_subcloud_static(self.ctx, name='subcloud1') self.assertIsNotNone(subcloud) diff --git a/distributedcloud/dcmanager/tests/unit/db/test_subcloud_audits.py b/distributedcloud/dcmanager/tests/unit/db/test_subcloud_audits.py index 65cb10cf5..b7ce6e49c 100644 --- a/distributedcloud/dcmanager/tests/unit/db/test_subcloud_audits.py +++ b/distributedcloud/dcmanager/tests/unit/db/test_subcloud_audits.py @@ -144,7 +144,7 @@ class DBAPISubcloudAuditsTest(base.DCManagerTestCase): audits = db_api.subcloud_audits_get_all_need_audit( self.ctx, last_audit_threshold ) - subcloud_ids = [audit.subcloud_id for audit in audits] + subcloud_ids = [audit[0].subcloud_id for audit in audits] self.assertEqual(len(subcloud_ids), 2) self.assertNotIn(1, subcloud_ids) # Set one of the special audits to make sure it overrides. @@ -215,3 +215,13 @@ class DBAPISubcloudAuditsTest(base.DCManagerTestCase): self.assertEqual(result["load_audit_requested"], False) self.assertEqual(result["kubernetes_audit_requested"], False) self.assertEqual(result["kube_rootca_update_audit_requested"], False) + + def test_subcloud_audits_bulk_end_audit(self): + db_api.subcloud_audits_bulk_end_audit(self.ctx, [1, 2]) + subcloud_audits1 = db_api.subcloud_audits_get(self.ctx, 1) + subcloud_audits2 = db_api.subcloud_audits_get(self.ctx, 2) + subcloud_audits3 = db_api.subcloud_audits_get(self.ctx, 3) + self.assertEqual(subcloud_audits1["audit_finished_at"], + subcloud_audits2["audit_finished_at"]) + self.assertTrue(subcloud_audits1["audit_finished_at"] > + subcloud_audits3["audit_finished_at"])