From a7ca6368cdf6e5aa98f24f6b458906f3733a739e Mon Sep 17 00:00:00 2001 From: Christopher Souza Date: Tue, 16 May 2023 09:32:07 -0300 Subject: [PATCH] Trigger subcloud audits when identity status is updated In this commit, the trigger of the audits were removed from the manage action and moved to trigger only when the identity endpoint is updated from unknown to other statuses. This was done to avoid a race condition between dcdbsync and the subcloud audit where the subcloud audit would fail to role not found. The first_identity_sync_complete field was add to not allow the audit of any endpoint to be done before the first identity sync is complete, in order to avoid keystone errors. Test Plan: PASS: Add 250 subclouds and manage them all at the same time and verify that all subclouds went in-sync without errors from any endpoint audit. PASS: Unmanage a subcloud and manage it back and verify that the subcloud is in-sync PASS: Create a role to update the identity status from in-sync -> out-of-sync -> in-sync and verify that no extra subcloud audits were triggered. PASS: Re-install a subcloud that was managed and then manage that subcloud and verify that the others endpoints audit were only triggered after the identity sync was complete Closes-Bug: 2019857 Signed-off-by: Christopher Souza Change-Id: Ied7a3ed99112daeb7af9dd39ad7780c60f451e0e --- .../drivers/openstack/sdk_platform.py | 6 +- .../dcmanager/api/controllers/v1/subclouds.py | 1 + .../audit/subcloud_audit_worker_manager.py | 20 ++++- distributedcloud/dcmanager/db/api.py | 6 +- .../dcmanager/db/sqlalchemy/api.py | 7 +- .../013_first_identity_sync_complete.py | 30 +++++++ .../dcmanager/db/sqlalchemy/models.py | 3 +- .../dcmanager/manager/subcloud_manager.py | 9 +- .../dcmanager/state/subcloud_state_manager.py | 12 +-- .../test_subcloud_audit_worker_manager.py | 82 ++++++++++++++++++- .../unit/manager/test_subcloud_manager.py | 24 ++---- 11 files changed, 159 insertions(+), 41 deletions(-) create mode 100644 distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/013_first_identity_sync_complete.py diff --git a/distributedcloud/dccommon/drivers/openstack/sdk_platform.py b/distributedcloud/dccommon/drivers/openstack/sdk_platform.py index c035a3bf3..17d95fb42 100644 --- a/distributedcloud/dccommon/drivers/openstack/sdk_platform.py +++ b/distributedcloud/dccommon/drivers/openstack/sdk_platform.py @@ -1,4 +1,4 @@ -# Copyright 2017-2021 Wind River Inc +# Copyright 2017-2023 Wind River Inc # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -94,6 +94,10 @@ class OpenStackDriver(object): LOG.debug('keystone_client region %s error: %s' % (region_name, str(exception))) raise exception + except keystone_exceptions.NotFound as exception: + LOG.debug('keystone_client region %s error: %s' % + (region_name, str(exception))) + raise exception except Exception as exception: LOG.error('keystone_client region %s error: %s' % diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index dff840229..536020285 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -1483,6 +1483,7 @@ class SubcloudsController(object): software_version=payload['software_version'], management_state=dccommon_consts.MANAGEMENT_UNMANAGED, deploy_status=consts.DEPLOY_STATE_PRE_INSTALL, + first_identity_sync_complete=False, data_install=data_install) self.dcmanager_rpc_client.reinstall_subcloud( diff --git a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py index ddd9e1c0a..68111de43 100644 --- a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py @@ -364,7 +364,6 @@ class SubcloudAuditWorkerManager(manager.Manager): sysinv_client = None fm_client = None avail_to_set = dccommon_consts.AVAILABILITY_OFFLINE - try: os_client = OpenStackDriver(region_name=subcloud_name, thread_name='subcloud-audit', @@ -382,6 +381,19 @@ class SubcloudAuditWorkerManager(manager.Manager): LOG.error("Identity or Platform endpoint for online " "subcloud: %s not found." % subcloud_name) + except keystone_exceptions.NotFound: + if subcloud.first_identity_sync_complete \ + and avail_status_current == dccommon_consts.AVAILABILITY_ONLINE: + # The first identity sync is already complete + # Therefore this is an error + LOG.error("Identity or Platform endpoint for online " + "subcloud: %s not found." % subcloud_name) + else: + LOG.debug("Identity or Platform endpoint for %s not " + "found, ignoring for offline " + "subcloud or identity sync not done." % subcloud_name) + return audits_done, failures + except (keystone_exceptions.EndpointNotFound, keystone_exceptions.ConnectFailure, IndexError): @@ -455,9 +467,11 @@ class SubcloudAuditWorkerManager(manager.Manager): availability_status=avail_status_current, update_state_only=True) - # If subcloud is managed and online, audit additional resources + # If subcloud is managed and online and the identity was synced once, + # audit additional resources if (subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED and - avail_to_set == dccommon_consts.AVAILABILITY_ONLINE): + avail_to_set == dccommon_consts.AVAILABILITY_ONLINE and + subcloud.first_identity_sync_complete): # Get alarm summary and store in db, if fm_client: self.alarm_aggr.update_alarm_summary(subcloud_name, fm_client) diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index 086e44113..50d8c0644 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -1,5 +1,5 @@ # Copyright (c) 2015 Ericsson AB. -# Copyright (c) 2017-2022 Wind River Systems, Inc. +# Copyright (c) 2017-2023 Wind River Systems, Inc. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -174,7 +174,7 @@ def subcloud_update(context, subcloud_id, management_state=None, deploy_status=None, backup_status=None, backup_datetime=None, error_description=None, openstack_installed=None, group_id=None, - data_install=None, data_upgrade=None): + data_install=None, data_upgrade=None, first_identity_sync_complete=None): """Update a subcloud or raise if it does not exist.""" return IMPL.subcloud_update(context, subcloud_id, management_state, availability_status, software_version, @@ -182,7 +182,7 @@ def subcloud_update(context, subcloud_id, management_state=None, management_start_ip, management_end_ip, location, audit_fail_count, deploy_status, backup_status, backup_datetime, error_description, openstack_installed, - group_id, data_install, data_upgrade) + group_id, data_install, data_upgrade, first_identity_sync_complete) def subcloud_bulk_update_by_ids(context, subcloud_ids, update_form): diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 4b47b355d..1b753a07e 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1,5 +1,5 @@ # Copyright (c) 2015 Ericsson AB. -# Copyright (c) 2017-2022 Wind River Systems, Inc. +# Copyright (c) 2017-2023 Wind River Systems, Inc. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -382,7 +382,8 @@ def subcloud_update(context, subcloud_id, management_state=None, openstack_installed=None, group_id=None, data_install=None, - data_upgrade=None): + data_upgrade=None, + first_identity_sync_complete=None): with write_session() as session: subcloud_ref = subcloud_get(context, subcloud_id) if management_state is not None: @@ -421,6 +422,8 @@ def subcloud_update(context, subcloud_id, management_state=None, subcloud_ref.openstack_installed = openstack_installed if group_id is not None: subcloud_ref.group_id = group_id + if first_identity_sync_complete is not None: + subcloud_ref.first_identity_sync_complete = first_identity_sync_complete subcloud_ref.save(session) return subcloud_ref diff --git a/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/013_first_identity_sync_complete.py b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/013_first_identity_sync_complete.py new file mode 100644 index 000000000..abf4f876b --- /dev/null +++ b/distributedcloud/dcmanager/db/sqlalchemy/migrate_repo/versions/013_first_identity_sync_complete.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023 Wind River Systems, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + +import sqlalchemy + + +def upgrade(migrate_engine): + meta = sqlalchemy.MetaData() + meta.bind = migrate_engine + subcloud = sqlalchemy.Table('subclouds', meta, autoload=True) + # Add the first_identity_sync_complete column + subcloud.create_column(sqlalchemy.Column('first_identity_sync_complete', + sqlalchemy.Boolean, + default=False)) + + +def downgrade(migrate_engine): + raise NotImplementedError('Database downgrade not supported - ' + 'would drop all tables') diff --git a/distributedcloud/dcmanager/db/sqlalchemy/models.py b/distributedcloud/dcmanager/db/sqlalchemy/models.py index a08bf08ab..e2a7445a3 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/models.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/models.py @@ -1,5 +1,5 @@ # Copyright (c) 2015 Ericsson AB -# Copyright (c) 2017-2022 Wind River Systems, Inc. +# Copyright (c) 2017-2023 Wind River Systems, Inc. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -137,6 +137,7 @@ class Subcloud(BASE, DCManagerBase): openstack_installed = Column(Boolean, nullable=False, default=False) systemcontroller_gateway_ip = Column(String(255)) audit_fail_count = Column(Integer) + first_identity_sync_complete = Column(Boolean, default=False) # multiple subclouds can be in a particular group group_id = Column(Integer, diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index 7a3758a69..62aaf2097 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -1764,16 +1764,9 @@ class SubcloudManager(manager.Manager): elif management_state == dccommon_consts.MANAGEMENT_MANAGED: # Subcloud is managed # Tell cert-mon to audit endpoint certificate - LOG.info('Request for managed audit for %s' % subcloud.name) + LOG.info('Request certmon audit for %s' % subcloud.name) dc_notification = dcmanager_rpc_client.DCManagerNotifications() dc_notification.subcloud_managed(context, subcloud.name) - # Since sysinv user is sync'ed during bootstrap, trigger the - # related audits. Patch and load audits are delayed until the - # identity resource synchronized by dcdbsync is complete. - exclude_endpoints = [dccommon_consts.ENDPOINT_TYPE_PATCHING, - dccommon_consts.ENDPOINT_TYPE_LOAD] - self.audit_rpc_client.trigger_subcloud_audits( - context, subcloud_id, exclude_endpoints) return db_api.subcloud_db_model_to_dict(subcloud) diff --git a/distributedcloud/dcmanager/state/subcloud_state_manager.py b/distributedcloud/dcmanager/state/subcloud_state_manager.py index 125dbb2c0..ad0be2c15 100644 --- a/distributedcloud/dcmanager/state/subcloud_state_manager.py +++ b/distributedcloud/dcmanager/state/subcloud_state_manager.py @@ -10,7 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. # -# Copyright (c) 2017-2022 Wind River Systems, Inc. +# Copyright (c) 2017-2023 Wind River Systems, Inc. # # The right to copy, distribute, modify, or otherwise make use # of this software may be licensed only pursuant to the terms @@ -131,15 +131,17 @@ class SubcloudStateManager(manager.Manager): endpoint_type, sync_status) - # Trigger subcloud patch and load audits for the subcloud after + # Trigger subcloud audits for the subcloud after # its identity endpoint turns to other status from unknown if endpoint_type == dccommon_consts.ENDPOINT_TYPE_IDENTITY \ and sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN \ and original_identity_status == dccommon_consts.SYNC_STATUS_UNKNOWN: - LOG.debug('Request for patch and load audit for %s after updating ' + if not subcloud.first_identity_sync_complete: + db_api.subcloud_update(context, subcloud_id, + first_identity_sync_complete=True) + LOG.debug('Request for audits for %s after updating ' 'identity out of unknown' % subcloud.name) - self.audit_rpc_client.trigger_subcloud_patch_load_audits( - context, subcloud_id) + self.audit_rpc_client.trigger_subcloud_audits(context, subcloud_id) entity_instance_id = "subcloud=%s.resource=%s" % \ (subcloud.name, endpoint_type) diff --git a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py index 58d31dded..b537572b6 100644 --- a/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py +++ b/distributedcloud/dcmanager/tests/unit/audit/test_subcloud_audit_worker_manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2022 Wind River Systems, Inc. +# Copyright (c) 2017-2023 Wind River Systems, Inc. # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain # a copy of the License at @@ -391,7 +391,8 @@ class TestAuditWorkerManager(base.DCManagerTestCase): # Set the subcloud to managed subcloud = db_api.subcloud_update( self.ctx, subcloud.id, - management_state='managed') + management_state='managed', + first_identity_sync_complete=True) am = subcloud_audit_manager.SubcloudAuditManager() wm = subcloud_audit_worker_manager.SubcloudAuditWorkerManager() @@ -460,6 +461,78 @@ class TestAuditWorkerManager(base.DCManagerTestCase): self.fake_kube_rootca_update_audit.subcloud_audit.assert_called_with( subcloud.name, kube_rootca_update_audit_data) + def test_audit_subcloud_online_first_identity_sync_not_complete(self): + + subcloud = self.create_subcloud_static(self.ctx, name='subcloud1') + self.assertIsNotNone(subcloud) + + # Set the subcloud to managed + subcloud = db_api.subcloud_update( + self.ctx, subcloud.id, + management_state='managed') + + am = subcloud_audit_manager.SubcloudAuditManager() + wm = subcloud_audit_worker_manager.SubcloudAuditWorkerManager() + + # Audit the subcloud + update_subcloud_state = False + do_audit_openstack = False + do_patch_audit = True + do_load_audit = True + do_firmware_audit = True + do_kubernetes_audit = True + do_kube_rootca_update_audit = True + (patch_audit_data, firmware_audit_data, + kubernetes_audit_data, kube_rootca_update_audit_data) = \ + am._get_audit_data(do_patch_audit, + do_firmware_audit, + do_kubernetes_audit, + do_kube_rootca_update_audit) + # Convert to dict like what would happen calling via RPC + # Note: the other data should also be converted... + patch_audit_data = patch_audit_data.to_dict() + wm._audit_subcloud(subcloud, + update_subcloud_state, + do_audit_openstack, + patch_audit_data, + firmware_audit_data, + kubernetes_audit_data, + kube_rootca_update_audit_data, + do_patch_audit, + do_load_audit, + do_firmware_audit, + do_kubernetes_audit, + do_kube_rootca_update_audit) + + # Verify the subcloud was set to online + self.fake_dcmanager_state_api.update_subcloud_availability.assert_called_with( + mock.ANY, subcloud.name, dccommon_consts.AVAILABILITY_ONLINE, + False, 0) + + # Verify the _update_subcloud_audit_fail_count is not called + with mock.patch.object(wm, '_update_subcloud_audit_fail_count') as \ + mock_update_subcloud_audit_fail_count: + mock_update_subcloud_audit_fail_count.assert_not_called() + + # Verify the openstack endpoints were not added + self.fake_dcmanager_api.update_subcloud_sync_endpoint_type.\ + assert_not_called() + + # Verify alarm update is not called + self.fake_alarm_aggr.update_alarm_summary.assert_not_called() + + # Verify patch audit is not called + self.fake_patch_audit.subcloud_patch_audit.assert_not_called() + + # Verify firmware audit is not called + self.fake_firmware_audit.subcloud_firmware_audit.assert_not_called() + + # Verify kubernetes audit is not called + self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_not_called() + + # Verify kube rootca update audit is not called + self.fake_kube_rootca_update_audit.subcloud_audit.assert_not_called() + def test_audit_subcloud_online_unmanaged(self): subcloud = self.create_subcloud_static(self.ctx, name='subcloud1') @@ -635,6 +708,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): subcloud = db_api.subcloud_update( self.ctx, subcloud.id, management_state='managed', + first_identity_sync_complete=True, availability_status=dccommon_consts.AVAILABILITY_ONLINE) # Mark a service group as inactive @@ -895,6 +969,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): subcloud = db_api.subcloud_update( self.ctx, subcloud.id, management_state='managed', + first_identity_sync_complete=True, availability_status=dccommon_consts.AVAILABILITY_ONLINE) # Audit the subcloud @@ -954,6 +1029,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): self.ctx, subcloud.id, management_state='managed', availability_status=dccommon_consts.AVAILABILITY_ONLINE, + first_identity_sync_complete=True, openstack_installed=True) # Remove stx-openstack application @@ -1014,6 +1090,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): subcloud = db_api.subcloud_update( self.ctx, subcloud.id, management_state='managed', + first_identity_sync_complete=True, availability_status=dccommon_consts.AVAILABILITY_ONLINE, openstack_installed=True) @@ -1071,6 +1148,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase): # Set the subcloud to managed subcloud = db_api.subcloud_update( self.ctx, subcloud.id, + first_identity_sync_complete=True, management_state='managed') am = subcloud_audit_manager.SubcloudAuditManager() diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py index 38725f5ab..da47ae8ee 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_subcloud_manager.py @@ -658,11 +658,6 @@ class TestSubcloudManager(base.DCManagerTestCase): fake_dcmanager_notification.subcloud_managed.assert_called_once_with( self.ctx, subcloud.name) - exclude_endpoints = [dccommon_consts.ENDPOINT_TYPE_PATCHING, - dccommon_consts.ENDPOINT_TYPE_LOAD] - self.fake_dcmanager_audit_api.trigger_subcloud_audits.\ - assert_called_once_with(self.ctx, subcloud.id, exclude_endpoints) - # Verify subcloud was updated with correct values updated_subcloud = db_api.subcloud_get_by_name(self.ctx, subcloud.name) self.assertEqual(dccommon_consts.MANAGEMENT_MANAGED, @@ -758,10 +753,6 @@ class TestSubcloudManager(base.DCManagerTestCase): fake_dcmanager_cermon_api.subcloud_managed.assert_called_once_with( self.ctx, subcloud.name) - exclude_endpoints = [dccommon_consts.ENDPOINT_TYPE_PATCHING, - dccommon_consts.ENDPOINT_TYPE_LOAD] - self.fake_dcmanager_audit_api.trigger_subcloud_audits.\ - assert_called_once_with(self.ctx, subcloud.id, exclude_endpoints) # Verify subcloud was updated with correct values updated_subcloud = db_api.subcloud_get_by_name(self.ctx, subcloud.name) @@ -1224,7 +1215,7 @@ class TestSubcloudManager(base.DCManagerTestCase): # We trigger a subcloud audits after updating the identity from unknown # to in-sync - self.fake_dcmanager_audit_api.trigger_subcloud_patch_load_audits.\ + self.fake_dcmanager_audit_api.trigger_subcloud_audits.\ assert_called_once_with(self.ctx, subcloud.id) # Audit fails once @@ -1280,6 +1271,7 @@ class TestSubcloudManager(base.DCManagerTestCase): # Set the subcloud to online/managed db_api.subcloud_update(self.ctx, subcloud.id, management_state=dccommon_consts.MANAGEMENT_MANAGED, + first_identity_sync_complete=True, availability_status=dccommon_consts.AVAILABILITY_ONLINE) # Update identity endpoints statuses @@ -1299,19 +1291,19 @@ class TestSubcloudManager(base.DCManagerTestCase): sync_status=original_sync_status) # Get the count of the trigger already called - original_trigger_subcloud_patch_load_audits_count = \ - self.fake_dcmanager_audit_api.trigger_subcloud_patch_load_audits.call_count + trigger_subcloud_audits = \ + self.fake_dcmanager_audit_api.trigger_subcloud_audits.call_count # Update identity to new status and get the count of the trigger again ssm.update_subcloud_endpoint_status( self.ctx, subcloud_name=subcloud.name, endpoint_type=endpoint, sync_status=new_sync_status) - new_trigger_subcloud_patch_load_audits_count = \ - self.fake_dcmanager_audit_api.trigger_subcloud_patch_load_audits.call_count + new_trigger_subcloud_audits = \ + self.fake_dcmanager_audit_api.trigger_subcloud_audits.call_count - trigger_count = new_trigger_subcloud_patch_load_audits_count - \ - original_trigger_subcloud_patch_load_audits_count + trigger_count = new_trigger_subcloud_audits - \ + trigger_subcloud_audits if original_sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN and \ new_sync_status != dccommon_consts.SYNC_STATUS_UNKNOWN: