Fix subclouds going offline due to auth failure
This update contains the following changes that prevent subclouds from
going offline due to authentication failure (sketches of the client
cache and alarm threshold patterns follow this list):
1. The os region client cache is cleared when a new keystone client
is created. The os region clients will be re-created using the new
keystone session.
2. When the user's access info (such as a role id) changes, create a
new keystone client and new os region clients. This can happen after
the system controller keystone role ids are synced to the subclouds.
3. Remove get_admin_backup_session, which was only required when
upgrading to stx 4.0.
4. Increase AVAIL_FAIL_COUNT_TO_ALARM to 2, since we don't want to
alarm the first failure: there are cases where we expect a transient
failure in the subcloud (e.g. a haproxy process restart to update
certificates).
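
The cache handling in items 1 and 2 follows a simple pattern: the
per-region client dictionary is thrown away whenever the keystone
client is rebuilt or the cached access info no longer matches what
keystone reports. A minimal, illustrative sketch of that pattern is
below; it is not the actual OpenStackDriver code, and only the names
os_clients_dict and _identity_tokens are taken from the diff.

    # Simplified sketch of the region client cache reset pattern.
    import collections


    class RegionClientCache(object):
        # region name -> client name -> client object
        os_clients_dict = collections.defaultdict(dict)
        # region name -> cached access info (token / role ids)
        _identity_tokens = {}

        @classmethod
        def rebuild_keystone_client(cls, region_name, new_keystone_client):
            # Item 1: a new keystone client invalidates the cached os
            # region clients so they get re-created on the new session.
            cls.os_clients_dict[region_name] = collections.defaultdict(dict)
            cls.os_clients_dict[region_name]['keystone'] = new_keystone_client

        @classmethod
        def is_token_valid(cls, region_name, current_access_info):
            # Item 2: if the access info (e.g. role ids) changed, drop
            # the cached token and clients and report the token as
            # invalid so callers build fresh clients.
            if current_access_info != cls._identity_tokens.get(region_name):
                cls._identity_tokens[region_name] = None
                cls.os_clients_dict[region_name] = collections.defaultdict(dict)
                return False
            return True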
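
For item 4, the threshold is applied against the consecutive audit
failure count, so a single transient failure no longer raises the
availability alarm. A minimal sketch, assuming a hypothetical
should_alarm() helper rather than the real dcmanager audit code:

    # Only alarm once the consecutive failure count reaches the
    # threshold; the first failure is tolerated as possibly transient.
    AVAIL_FAIL_COUNT_TO_ALARM = 2


    def should_alarm(audit_fail_count):
        return audit_fail_count >= AVAIL_FAIL_COUNT_TO_ALARM


    assert not should_alarm(1)  # first failure: no alarm yet
    assert should_alarm(2)      # second consecutive failure: alarm
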
Tested on DC-6:
1. Adding 50 subclouds twice
2. Soaking the fix over the weekend
Closes-Bug: 1927007
Signed-off-by: Tao Liu <tao.liu@windriver.com>
Change-Id: I86fdc9a2f062409e704bdfac2119dc488123f7de
(cherry picked from commit 17b5505d9e)
parent de0fef663a
commit d8ce118e50
@@ -83,6 +83,9 @@ class OpenStackDriver(object):
                 OpenStackDriver.update_region_clients(region_name,
                                                       KEYSTONE_CLIENT_NAME,
                                                       self.keystone_client)
+                # Clear client object cache
+                OpenStackDriver.os_clients_dict[region_name] = \
+                    collections.defaultdict(dict)
             except Exception as exception:
                 LOG.error('keystone_client region %s error: %s' %
                           (region_name, str(exception)))
@@ -185,14 +188,18 @@ class OpenStackDriver(object):
                 OpenStackDriver._identity_tokens[region_name],
                 include_catalog=False)
             if token != OpenStackDriver._identity_tokens[region_name]:
-                LOG.debug("%s: updating token %s to %s" %
+                LOG.debug("%s: AccessInfo changed %s to %s" %
                           (region_name,
                            OpenStackDriver._identity_tokens[region_name],
                            token))
-                OpenStackDriver._identity_tokens[region_name] = token
+                OpenStackDriver._identity_tokens[region_name] = None
+                OpenStackDriver.os_clients_dict[region_name] = \
+                    collections.defaultdict(dict)
+                return False

         except Exception as exception:
-            LOG.info('_is_token_valid handle: %s', str(exception))
+            LOG.info('_is_token_valid handle: region: %s error: %s',
+                     (region_name, str(exception)))
             # Reset the cached dictionary
             OpenStackDriver.os_clients_dict[region_name] = \
                 collections.defaultdict(dict)
@@ -23,7 +23,6 @@
 import collections
 import threading

-from keystoneauth1 import exceptions as keystone_exceptions
 from keystoneauth1 import loading
 from keystoneauth1 import session

@@ -107,10 +106,6 @@ class EndpointCache(object):
             CONF.endpoint_cache.password,
             CONF.endpoint_cache.project_name,
             CONF.endpoint_cache.project_domain_name)
-        # check if the current session is valid and get an admin session
-        # if necessary
-        self.admin_session = EndpointCache.get_admin_backup_session(
-            self.admin_session, CONF.endpoint_cache.username, sc_auth_url)

         self.keystone_client = ks_client.Client(
             session=self.admin_session,
@@ -140,33 +135,6 @@ class EndpointCache(object):
             auth=user_auth, additional_headers=consts.USER_HEADER,
             timeout=timeout)

-    @classmethod
-    def get_admin_backup_session(cls, admin_session, user_name, auth_url):
-        """Validate a session and open an admin session if it fails.
-
-        This method is require to handle an upgrade to stx 4.0 and it
-        can be removed in stx 5.0.
-
-        """
-
-        try:
-            admin_session.get_auth_headers()
-        except keystone_exceptions.Unauthorized:
-            # this will only happen briefly during an upgrade to stx 4.0
-            # just until the dcorch has synced the dcmanager user to each
-            # subcloud
-            LOG.info("Failed to authenticate user:%s, use %s user instead"
-                     % (user_name,
-                        CONF.cache.admin_username))
-            admin_session = EndpointCache.get_admin_session(
-                auth_url,
-                CONF.cache.admin_username,
-                CONF.cache.admin_user_domain_name,
-                CONF.cache.admin_password,
-                CONF.cache.admin_tenant,
-                CONF.cache.admin_project_domain_name)
-        return admin_session
-
     @staticmethod
     def _is_central_cloud(region_id):
         central_cloud_regions = [consts.CLOUD_0, consts.VIRTUAL_MASTER_CLOUD]
@@ -75,7 +75,11 @@ ENDPOINT_TYPE = "endpoint_type"
 SERVICE_GROUP_STATUS_ACTIVE = "active"

 # Availability fail count
-AVAIL_FAIL_COUNT_TO_ALARM = 1
+# we don't want to alarm first failure since there are
+# cases where we expect a transient failure in the
+# subcloud (e.g. haproxy process restart to update
+# certificates)
+AVAIL_FAIL_COUNT_TO_ALARM = 2
 AVAIL_FAIL_COUNT_MAX = 9999

 # Software update strategy types
@@ -583,8 +583,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
         audit_fail_count = 1
         self.fake_dcmanager_api.update_subcloud_availability.\
             assert_called_with(mock.ANY, subcloud.name,
-                               consts.AVAILABILITY_OFFLINE,
-                               False, audit_fail_count)
+                               None, False, audit_fail_count)

         # Update the DB like dcmanager would do.
         subcloud = db_api.subcloud_update(
@@ -605,23 +604,27 @@ class TestAuditWorkerManager(base.DCManagerTestCase):

         audit_fail_count = audit_fail_count + 1

-        # Verify the subcloud availability didn't change, just the fail count
+        # Verify the subcloud goes offline
         self.fake_dcmanager_api.update_subcloud_availability.\
             assert_called_with(mock.ANY, subcloud.name,
                                None, False,
                                audit_fail_count)

-        # Verify alarm update is not called
-        self.fake_alarm_aggr.update_alarm_summary.assert_not_called()
+        # Verify alarm update is called only once
+        self.fake_alarm_aggr.update_alarm_summary.assert_called_once_with(
+            subcloud.name, self.fake_openstack_client.fm_client)

-        # Verify patch audit is not called
-        self.fake_patch_audit.subcloud_patch_audit.assert_not_called()
+        # Verify patch audit is called only once
+        self.fake_patch_audit.subcloud_patch_audit.assert_called_once_with(
+            subcloud.name, mock.ANY, True)

-        # Verify firmware audit is not called
-        self.fake_firmware_audit.subcloud_firmware_audit.assert_not_called()
+        # Verify firmware audit is called
+        self.fake_firmware_audit.subcloud_firmware_audit.assert_called_once_with(
+            subcloud.name, mock.ANY)

-        # Verify firmware audit is not called
-        self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_not_called()
+        # Verify firmware audit is called
+        self.fake_kubernetes_audit.subcloud_kubernetes_audit.assert_called_once_with(
+            subcloud.name, mock.ANY)

     def test_audit_subcloud_offline_no_change(self):
         subcloud = self.create_subcloud_static(self.ctx, name='subcloud1')
@@ -193,10 +193,6 @@ class SyncThread(object):
             config.admin_project_domain_name,
             timeout=60)

-        if config is cfg.CONF.endpoint_cache:
-            self.sc_admin_session = EndpointCache.get_admin_backup_session(
-                self.sc_admin_session, config.username, sc_auth_url)
-
     def initial_sync(self):
         # Return True to indicate initial sync success
         return True