Fix subclouds going offline due to auth failure
This update contains the following changes that prevent subclouds from going offline due to authentication failure: 1. The os region client cache is cleared when a new keystone client is created. The os region client will be re-created using the new keystone session. 2. When the user's access info (such as role id) is changed, create a new keystone client and new os region clients. This could happen after system controller keystone role ids were synced to subclouds. 3. Remove get_admin_backup_session, which was only required when upgrading to stx 4.0. 4. Increase AVAIL_FAIL_COUNT_TO_ALARM to 2, as we don't want to alarm on the first failure since there are cases where we expect a transient failure in the subcloud (e.g. a haproxy process restart to update certificates). Tested on DC-6: 1. Adding 50 subclouds twice. 2. Soaking the fix over the weekend. Closes-Bug: 1927007 Signed-off-by: Tao Liu <tao.liu@windriver.com> Change-Id: I86fdc9a2f062409e704bdfac2119dc488123f7de
This commit is contained in:
parent
8acd9699ac
commit
17b5505d9e
|
@ -83,6 +83,9 @@ class OpenStackDriver(object):
|
|||
OpenStackDriver.update_region_clients(region_name,
|
||||
KEYSTONE_CLIENT_NAME,
|
||||
self.keystone_client)
|
||||
# Clear client object cache
|
||||
OpenStackDriver.os_clients_dict[region_name] = \
|
||||
collections.defaultdict(dict)
|
||||
except Exception as exception:
|
||||
LOG.error('keystone_client region %s error: %s' %
|
||||
(region_name, str(exception)))
|
||||
|
@ -185,14 +188,18 @@ class OpenStackDriver(object):
|
|||
OpenStackDriver._identity_tokens[region_name],
|
||||
include_catalog=False)
|
||||
if token != OpenStackDriver._identity_tokens[region_name]:
|
||||
LOG.debug("%s: updating token %s to %s" %
|
||||
LOG.debug("%s: AccessInfo changed %s to %s" %
|
||||
(region_name,
|
||||
OpenStackDriver._identity_tokens[region_name],
|
||||
token))
|
||||
OpenStackDriver._identity_tokens[region_name] = token
|
||||
OpenStackDriver._identity_tokens[region_name] = None
|
||||
OpenStackDriver.os_clients_dict[region_name] = \
|
||||
collections.defaultdict(dict)
|
||||
return False
|
||||
|
||||
except Exception as exception:
|
||||
LOG.info('_is_token_valid handle: %s', str(exception))
|
||||
LOG.info('_is_token_valid handle: region: %s error: %s',
|
||||
(region_name, str(exception)))
|
||||
# Reset the cached dictionary
|
||||
OpenStackDriver.os_clients_dict[region_name] = \
|
||||
collections.defaultdict(dict)
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
import collections
|
||||
import threading
|
||||
|
||||
from keystoneauth1 import exceptions as keystone_exceptions
|
||||
from keystoneauth1 import loading
|
||||
from keystoneauth1 import session
|
||||
|
||||
|
@ -107,10 +106,6 @@ class EndpointCache(object):
|
|||
CONF.endpoint_cache.password,
|
||||
CONF.endpoint_cache.project_name,
|
||||
CONF.endpoint_cache.project_domain_name)
|
||||
# check if the current session is valid and get an admin session
|
||||
# if necessary
|
||||
self.admin_session = EndpointCache.get_admin_backup_session(
|
||||
self.admin_session, CONF.endpoint_cache.username, sc_auth_url)
|
||||
|
||||
self.keystone_client = ks_client.Client(
|
||||
session=self.admin_session,
|
||||
|
@ -140,33 +135,6 @@ class EndpointCache(object):
|
|||
auth=user_auth, additional_headers=consts.USER_HEADER,
|
||||
timeout=timeout)
|
||||
|
||||
@classmethod
|
||||
def get_admin_backup_session(cls, admin_session, user_name, auth_url):
|
||||
"""Validate a session and open an admin session if it fails.
|
||||
|
||||
This method is require to handle an upgrade to stx 4.0 and it
|
||||
can be removed in stx 5.0.
|
||||
|
||||
"""
|
||||
|
||||
try:
|
||||
admin_session.get_auth_headers()
|
||||
except keystone_exceptions.Unauthorized:
|
||||
# this will only happen briefly during an upgrade to stx 4.0
|
||||
# just until the dcorch has synced the dcmanager user to each
|
||||
# subcloud
|
||||
LOG.info("Failed to authenticate user:%s, use %s user instead"
|
||||
% (user_name,
|
||||
CONF.cache.admin_username))
|
||||
admin_session = EndpointCache.get_admin_session(
|
||||
auth_url,
|
||||
CONF.cache.admin_username,
|
||||
CONF.cache.admin_user_domain_name,
|
||||
CONF.cache.admin_password,
|
||||
CONF.cache.admin_tenant,
|
||||
CONF.cache.admin_project_domain_name)
|
||||
return admin_session
|
||||
|
||||
@staticmethod
|
||||
def _is_central_cloud(region_id):
|
||||
central_cloud_regions = [consts.CLOUD_0, consts.VIRTUAL_MASTER_CLOUD]
|
||||
|
|
|
@ -75,7 +75,11 @@ ENDPOINT_TYPE = "endpoint_type"
|
|||
SERVICE_GROUP_STATUS_ACTIVE = "active"
|
||||
|
||||
# Availability fail count
|
||||
AVAIL_FAIL_COUNT_TO_ALARM = 1
|
||||
# we don't want to alarm first failure since there are
|
||||
# cases where we expect a transient failure in the
|
||||
# subcloud (e.g. haproxy process restart to update
|
||||
# certificates)
|
||||
AVAIL_FAIL_COUNT_TO_ALARM = 2
|
||||
AVAIL_FAIL_COUNT_MAX = 9999
|
||||
|
||||
# Software update strategy types
|
||||
|
|
|
@ -583,8 +583,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
|||
audit_fail_count = 1
|
||||
self.fake_dcmanager_api.update_subcloud_availability.\
|
||||
assert_called_with(mock.ANY, subcloud.name,
|
||||
consts.AVAILABILITY_OFFLINE,
|
||||
False, audit_fail_count)
|
||||
None, False, audit_fail_count)
|
||||
|
||||
# Update the DB like dcmanager would do.
|
||||
subcloud = db_api.subcloud_update(
|
||||
|
@ -605,23 +604,27 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
|
|||
|
||||
audit_fail_count = audit_fail_count + 1
|
||||
|
||||
# Verify the subcloud availability didn't change, just the fail count
|
||||
# Verify the subcloud goes offline
|
||||
self.fake_dcmanager_api.update_subcloud_availability.\
|
||||
assert_called_with(mock.ANY, subcloud.name,
|
||||
None, False,
|
||||
audit_fail_count)
|
||||
|
||||
# Verify alarm update is not called
|
||||
self.fake_alarm_aggr.update_alarm_summary.assert_not_called()
|
||||
# Verify alarm update is called only once
|
||||
self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
|
||||
subcloud.name, self.fake_openstack_client.fm_client)
|
||||
|
||||
# Verify patch audit is not called
|
||||
self.fake_patch_audit.subcloud_patch_audit.assert_not_called()
|
||||
# Verify patch audit is called only once
|
||||
self.fake_patch_audit.subcloud_patch_audit.assert_called_once_with(
|
||||
subcloud.name, mock.ANY, True)
|
||||
|
||||
# Verify firmware audit is not called
|
||||
self.fake_firmware_audit.subcloud_firmware_audit.assert_not_called()
|
||||
# Verify firmware audit is called
|
||||
self.fake_firmware_audit.subcloud_firmware_audit.assert_called_once_with(
|
||||
subcloud.name, mock.ANY)
|
||||
|
||||
# Verify firmware audit is not called
|
||||
self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_not_called()
|
||||
# Verify firmware audit is called
|
||||
self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_called_once_with(
|
||||
subcloud.name, mock.ANY)
|
||||
|
||||
def test_audit_subcloud_offline_no_change(self):
|
||||
subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
|
||||
|
|
|
@ -193,10 +193,6 @@ class SyncThread(object):
|
|||
config.admin_project_domain_name,
|
||||
timeout=60)
|
||||
|
||||
if config is cfg.CONF.endpoint_cache:
|
||||
self.sc_admin_session = EndpointCache.get_admin_backup_session(
|
||||
self.sc_admin_session, config.username, sc_auth_url)
|
||||
|
||||
def initial_sync(self):
|
||||
# Return True to indicate initial sync success
|
||||
return True
|
||||
|
|
Loading…
Reference in New Issue