Files
distcloud/distributedcloud/dcmanager/manager/service.py
Gustavo Herzmann 2ac4be0d5a Add in-memory token caching for DC services
This commit introduces an in-memory, dictionary-based token caching
mechanism to reduce the number of token requests made to subclouds'
identity APIs.

The caching is implemented by subclassing the v3.Password
authentication class, which normally handles HTTP requests to the
identity API. The cache first checks if a valid, non-expired token
exists and returns it if found. If not, it proceeds with the actual
request and caches the new token for future use.

Tokens can be invalidated early when all fernet keys are rotated
(e.g., during the initial sync between subcloud and system controller).
The cache leverages Keystone's session reauthentication mechanism to
automatically invalidate cached tokens when necessary.

This commit also raises the open file descriptor limit for the DC
orchestrator service. With the use of sessions, TCP connections are
reused and are not closed immediately after each request.

Test Plan:
01. PASS - Deploy a subcloud and verify token caching behavior.
02. PASS - Deploy a subcloud with remote install, ensuring the token
    cache works.
03. PASS - Prestage a subcloud for install and software deployment,
    validating token caching during the process.
04. PASS - Run prestage orchestration and verify proper use of the
    token cache.
05. PASS - Manage a subcloud for the first time and verify that the
    initial sync functions as expected. Ensure fernet key rotation
    causes cached tokens to invalidate, and confirm reauthentication
    requests are made.
06. PASS - Unmanage a subcloud, rotate all fernet keys manually, then
    manage the subcloud again. Verify token invalidation and
    reauthentication function as expected.
07. PASS - Create a subcloud backup and ensure no token cache issues
    arise.
08. PASS - Restore a subcloud from backup and verify proper
    functionality of the token cache.
09. PASS - Deploy an N-1 subcloud and validate token caching for this
    subcloud.
10. PASS - Verify that audits correctly identify an N-1 subcloud
    without the USM patch as missing the USM service.
11. PASS - Apply the USM patch to the N-1 subcloud and verify that
    the audit detects the USM service and prestage orchestration for
    software deployment functions correctly.
12. PASS - Test DC orchestration audit and sync by creating a new
    OpenStack user, and verify the user is replicated to the subcloud.
13. PASS - Apply a patch to subclouds using software deployment
    orchestration, verifying token cache performance.
14. PASS - Test dcmanager API commands that send requests to
    subclouds (e.g., 'dcmanager subcloud show <subcloud> --details'),
    ensuring token cache is used.
15. PASS - Conduct a soak test of all DC services to verify token
    expiration, renewal, and cache behavior over extended use.
16. PASS - Monitor TCP connections to ensure they are properly
    closed after each use, preventing lingering open connections during
    token caching or HTTP request handling.
17. PASS - Run end-to-end geo-redundancy operation and verify that it
    completes successfully.
18. PASS - Run kube rootca update orchestration and verify that it
    completes successfully.
19. PASS - Verify that the number of POST token requests made by the DC
    audit to the subcloud per hour is equal to the number of DC audit
    workers on the system controller.
20. PASS - Monitor the number of open file descriptors to ensure it
    does not reach the new limit while executing a DC kube rootca
    update strategy with the maximum number of supported subclouds.
    Additionally, verify that all sessions are closed after the
    strategy is complete.

Closes-Bug: 2084490

Change-Id: Ie3c17f58c09ae08df8cd9f0c92f50ab0c556c263
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
2024-10-22 16:37:06 -03:00

452 lines
16 KiB
Python

# Copyright (c) 2017-2024 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
import functools
import os
import threading
from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service
from oslo_utils import uuidutils
from dccommon import consts as dccommon_consts
from dccommon.subprocess_cleanup import SubprocessCleanup
from dcmanager.audit import rpcapi as dcmanager_audit_rpc_client
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import exceptions
from dcmanager.common.i18n import _
from dcmanager.common import messaging as rpc_messaging
from dcmanager.common import utils
from dcmanager.manager.peer_monitor_manager import PeerMonitorManager
from dcmanager.manager.subcloud_manager import SubcloudManager
from dcmanager.manager.system_peer_manager import SystemPeerManager
# Handle to the global oslo.config configuration object (cfg.CONF).
CONF = cfg.CONF
# Module-level logger, named after this module via __name__.
LOG = logging.getLogger(__name__)
# The RPC server has a thread limit (defaults to 64). By manually running
# these functions in their own threads, the RPC cast returns earlier, which
# allows multiple operations to run in parallel past the RPC limit.
def run_in_thread(fn):
    """Decorator to run a function in a separate thread.

    The decorated callable starts ``fn`` in a new ``threading.Thread`` and
    returns ``None`` immediately, without waiting for ``fn`` to finish. Any
    value returned by ``fn`` is therefore discarded.

    :param fn: the callable to execute in the background thread.
    :returns: a wrapper that forwards its positional and keyword arguments
        to ``fn`` inside the newly started thread.
    """

    # Keep fn's name/docstring on the wrapper (as request_context does), so
    # decorated handlers do not all show up as "wrapper" when introspected.
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        thread = threading.Thread(target=fn, args=args, kwargs=kwargs)
        thread.start()

    return wrapper
def request_context(func):
    """Decorator that normalizes the RPC context and translates errors.

    The wrapped method receives ``ctx`` as its first argument after ``self``.
    A ``ctx`` that is neither ``None`` nor already a
    ``context.RequestContext`` is rebuilt through
    ``RequestContext.from_dict(ctx.to_dict())`` before ``func`` is called.

    A ``DCManagerException`` raised by ``func`` is re-raised as an
    ``oslo_messaging`` ``ExpectedException``; other exceptions propagate
    unchanged.
    """

    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        already_usable = ctx is None or isinstance(ctx, context.RequestContext)
        if not already_usable:
            ctx = context.RequestContext.from_dict(ctx.to_dict())

        try:
            result = func(self, ctx, *args, **kwargs)
        except exceptions.DCManagerException:
            # ExpectedException is constructed while the original exception
            # is still being handled.
            raise oslo_messaging.rpc.dispatcher.ExpectedException()
        return result

    return wrapped
class DCManagerService(service.Service):
    """Lifecycle manager for a running service.

    - All the methods in here are called from the RPC client.
    - If a RPC call does not have a corresponding method here, an exception
      will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.

    Handlers decorated with ``run_in_thread`` execute in their own thread:
    the RPC call itself returns ``None`` right away and the value returned
    by the manager is discarded.
    """

    def __init__(self, host, topic, manager=None):
        """Set up the service attributes.

        The host and topic actually used come from ``cfg.CONF.host`` and
        ``consts.TOPIC_DC_MANAGER``; the ``host``, ``topic`` and ``manager``
        arguments are accepted but not read here.
        """
        super(DCManagerService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_DC_MANAGER
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.dcmanager_id = None
        self.target = None
        self._rpc_server = None
        self.subcloud_manager = None
        self.peer_monitor_manager = None
        self.system_peer_manager = None
        self.audit_rpc_client = None
        self.context = context.get_admin_context()

    def init_managers(self):
        """Instantiate the subcloud, peer monitor and system peer managers."""
        self.subcloud_manager = SubcloudManager()
        self.peer_monitor_manager = PeerMonitorManager(self.subcloud_manager)
        self.system_peer_manager = SystemPeerManager(self.peer_monitor_manager)

    def start(self):
        """Start the service.

        Raises the open file limit, creates the managers, starts the RPC
        server, ensures the ansible directories exist and asks the managers
        to handle operations that were left in progress.
        """
        utils.set_open_file_limit(cfg.CONF.dcmanager_worker_rlimit_nofile)
        self.dcmanager_id = uuidutils.generate_uuid()
        self.init_managers()
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()

        # Used to notify dcmanager-audit
        self.audit_rpc_client = dcmanager_audit_rpc_client.ManagerAuditClient()

        # exist_ok avoids the check-then-create race when several worker
        # processes run start() at the same time.
        os.makedirs(consts.DC_ANSIBLE_LOG_DIR, 0o755, exist_ok=True)
        # NOTE(review): 0o600 leaves out the search (execute) bit on this
        # directory; presumably acceptable for the user the service runs
        # as -- confirm.
        os.makedirs(dccommon_consts.ANSIBLE_OVERRIDES_PATH, 0o600, exist_ok=True)

        self.subcloud_manager.handle_subcloud_operations_in_progress()
        self.system_peer_manager.handle_association_operations_in_progress()

        # Send notify to peer monitor.
        self.peer_monitor_manager.peer_monitor_notify(self.context)

        super(DCManagerService, self).start()

    @run_in_thread
    @request_context
    def add_subcloud(self, context, subcloud_id, payload):
        """Add a subcloud in a separate thread via SubcloudManager."""
        LOG.info("Handling add_subcloud request for: %s", payload.get("name"))
        return self.subcloud_manager.add_subcloud(context, subcloud_id, payload)

    @request_context
    def add_secondary_subcloud(self, context, subcloud_id, payload):
        """Add a secondary subcloud; delegates to SubcloudManager.add_subcloud."""
        LOG.info(
            "Handling add_secondary_subcloud request for: %s", payload.get("name")
        )
        return self.subcloud_manager.add_subcloud(context, subcloud_id, payload)

    @request_context
    def delete_subcloud(self, context, subcloud_id):
        """Delete a subcloud via SubcloudManager."""
        LOG.info("Handling delete_subcloud request for: %s", subcloud_id)
        return self.subcloud_manager.delete_subcloud(context, subcloud_id)

    @request_context
    def rename_subcloud(
        self, context, subcloud_id, curr_subcloud_name, new_subcloud_name=None
    ):
        """Rename a subcloud and return the manager's result."""
        LOG.info("Handling rename_subcloud request for: %s", curr_subcloud_name)
        subcloud = self.subcloud_manager.rename_subcloud(
            context, subcloud_id, curr_subcloud_name, new_subcloud_name
        )
        return subcloud

    @request_context
    def get_subcloud_name_by_region_name(self, context, subcloud_region):
        """Look up a subcloud name from its region name via SubcloudManager."""
        LOG.debug(
            "Handling get_subcloud_name_by_region_name request for region: %s",
            subcloud_region,
        )
        subcloud = self.subcloud_manager.get_subcloud_name_by_region_name(
            context, subcloud_region
        )
        return subcloud

    @request_context
    def update_subcloud(
        self,
        context,
        subcloud_id,
        management_state=None,
        description=None,
        location=None,
        group_id=None,
        data_install=None,
        force=None,
        deploy_status=None,
        peer_group_id=None,
        bootstrap_values=None,
        bootstrap_address=None,
    ):
        """Update a subcloud, forwarding every field to SubcloudManager."""
        LOG.info("Handling update_subcloud request for: %s", subcloud_id)
        subcloud = self.subcloud_manager.update_subcloud(
            context,
            subcloud_id,
            management_state,
            description,
            location,
            group_id,
            data_install,
            force,
            deploy_status,
            peer_group_id,
            bootstrap_values,
            bootstrap_address,
        )
        return subcloud

    @request_context
    def update_subcloud_with_network_reconfig(self, context, subcloud_id, payload):
        """Update a subcloud with a network reconfiguration payload."""
        LOG.info(
            "Handling update_subcloud_with_network_reconfig request for: %s",
            subcloud_id,
        )
        return self.subcloud_manager.update_subcloud_with_network_reconfig(
            context, subcloud_id, payload
        )

    @run_in_thread
    @request_context
    def redeploy_subcloud(self, context, subcloud_id, payload, previous_version):
        """Redeploy a subcloud in a separate thread via SubcloudManager."""
        LOG.info("Handling redeploy_subcloud request for: %s", subcloud_id)
        return self.subcloud_manager.redeploy_subcloud(
            context, subcloud_id, payload, previous_version
        )

    @request_context
    def backup_subclouds(self, context, payload):
        """Back up a subcloud or a group of subclouds.

        The target is ``payload["subcloud"]`` when set, otherwise
        ``payload["group"]``.
        """
        entity = "subcloud" if payload.get("subcloud") else "group"
        LOG.info(
            "Handling backup_subclouds request for %s ID: %s",
            entity,
            payload.get("subcloud") or payload.get("group"),
        )
        return self.subcloud_manager.create_subcloud_backups(context, payload)

    @request_context
    def delete_subcloud_backups(self, context, release_version, payload):
        """Delete the backups of a subcloud or a group of subclouds."""
        entity = "subcloud" if payload.get("subcloud") else "group"
        LOG.info(
            "Handling delete_subcloud_backups request for %s ID: %s",
            entity,
            payload.get("subcloud") or payload.get("group"),
        )
        return self.subcloud_manager.delete_subcloud_backups(
            context, release_version, payload
        )

    @request_context
    def restore_subcloud_backups(self, context, payload):
        """Restore the backup of a subcloud or of a group of subclouds."""
        entity = "subcloud" if payload.get("subcloud") else "group"
        LOG.info(
            "Handling restore_subcloud_backups request for %s ID: %s",
            entity,
            payload.get("subcloud") or payload.get("group"),
        )
        return self.subcloud_manager.restore_subcloud_backups(context, payload)

    @request_context
    def update_subcloud_sync_endpoint_type(
        self, context, subcloud_name, endpoint_type_list, openstack_installed
    ):
        """Update the sync endpoint types of a subcloud; returns None."""
        LOG.info(
            "Handling update_subcloud_sync_endpoint_type request for: %s",
            subcloud_name,
        )
        self.subcloud_manager.update_subcloud_sync_endpoint_type(
            context, subcloud_name, endpoint_type_list, openstack_installed
        )

    @request_context
    def prestage_subcloud(self, context, payload):
        """Prestage the subcloud named by ``payload["subcloud_name"]``."""
        LOG.info("Handling prestage_subcloud request for: %s", payload["subcloud_name"])
        return self.subcloud_manager.prestage_subcloud(context, payload)

    @request_context
    def subcloud_deploy_create(self, context, subcloud_id, payload):
        """Run the deploy-create phase for a subcloud via SubcloudManager."""
        LOG.info(
            "Handling subcloud_deploy_create request for: %s", payload.get("name")
        )
        return self.subcloud_manager.subcloud_deploy_create(
            context, subcloud_id, payload
        )

    @run_in_thread
    @request_context
    def subcloud_deploy_bootstrap(
        self, context, subcloud_id, payload, initial_deployment
    ):
        """Bootstrap a subcloud in a separate thread."""
        LOG.info(
            "Handling subcloud_deploy_bootstrap request for: %s", payload.get("name")
        )
        return self.subcloud_manager.subcloud_deploy_bootstrap(
            context, subcloud_id, payload, initial_deployment
        )

    @run_in_thread
    @request_context
    def subcloud_deploy_config(self, context, subcloud_id, payload, initial_deployment):
        """Configure a subcloud in a separate thread."""
        LOG.info("Handling subcloud_deploy_config request for: %s", subcloud_id)
        return self.subcloud_manager.subcloud_deploy_config(
            context, subcloud_id, payload, initial_deployment
        )

    @run_in_thread
    @request_context
    def subcloud_deploy_install(
        self, context, subcloud_id, payload, initial_deployment, previous_version
    ):
        """Install a subcloud in a separate thread."""
        LOG.info("Handling subcloud_deploy_install request for: %s", subcloud_id)
        return self.subcloud_manager.subcloud_deploy_install(
            context, subcloud_id, payload, initial_deployment, previous_version
        )

    @run_in_thread
    @request_context
    def subcloud_deploy_enroll(self, context, subcloud_id, payload):
        """Enroll a subcloud in a separate thread."""
        LOG.info("Handling subcloud_deploy_enroll request for: %s", subcloud_id)
        return self.subcloud_manager.subcloud_deploy_enroll(
            context, subcloud_id, payload
        )

    @request_context
    def subcloud_deploy_complete(self, context, subcloud_id):
        """Complete the subcloud deployment."""
        LOG.info("Handling subcloud_deploy_complete request for: %s", subcloud_id)
        return self.subcloud_manager.subcloud_deploy_complete(context, subcloud_id)

    @run_in_thread
    @request_context
    def subcloud_deploy_abort(self, context, subcloud_id, deploy_status):
        """Abort the subcloud deployment in a separate thread."""
        LOG.info("Handling subcloud_deploy_abort request for: %s", subcloud_id)
        return self.subcloud_manager.subcloud_deploy_abort(
            context, subcloud_id, deploy_status
        )

    @run_in_thread
    @request_context
    def subcloud_deploy_resume(
        self,
        context,
        subcloud_id,
        subcloud_name,
        payload,
        deploy_states_to_run,
        previous_version,
    ):
        """Resume a subcloud deployment in a separate thread.

        ``deploy_states_to_run`` is forwarded untouched to SubcloudManager.
        """
        LOG.info("Handling subcloud_deploy_resume request for: %s", subcloud_name)
        return self.subcloud_manager.subcloud_deploy_resume(
            context,
            subcloud_id,
            subcloud_name,
            payload,
            deploy_states_to_run,
            previous_version,
        )

    @request_context
    def batch_migrate_subcloud(self, context, payload):
        """Batch-migrate the subclouds of ``payload["peer_group"]``."""
        LOG.info(
            "Handling batch_migrate_subcloud request for peer_group: %s",
            payload["peer_group"],
        )
        return self.subcloud_manager.batch_migrate_subcloud(context, payload)

    @request_context
    def peer_monitor_notify(self, context):
        """Forward a peer monitor notification to PeerMonitorManager."""
        LOG.info("Handling peer monitor notify")
        return self.peer_monitor_manager.peer_monitor_notify(context)

    @request_context
    def peer_group_audit_notify(self, context, peer_group_name, payload):
        """Forward a peer group audit notification to PeerMonitorManager."""
        # The name is passed as a logging argument; the previous message
        # contained an un-interpolated "{peer_group_name}" placeholder.
        LOG.info(
            "Handling peer group audit notify of peer group %s", peer_group_name
        )
        return self.peer_monitor_manager.peer_group_audit_notify(
            context, peer_group_name, payload
        )

    @request_context
    def sync_subcloud_peer_group(self, context, association_id, sync_subclouds=True):
        """Sync a subcloud peer group for the given association."""
        LOG.info("Handling sync_subcloud_peer_group request for: %s", association_id)
        return self.system_peer_manager.sync_subcloud_peer_group(
            context, association_id, sync_subclouds
        )

    @request_context
    def update_subcloud_peer_group(
        self,
        context,
        peer_group_id,
        group_state,
        max_subcloud_rehoming,
        group_name,
        new_group_name=None,
    ):
        """Update a subcloud peer group via SystemPeerManager."""
        LOG.info(
            "Handling update_subcloud_peer_group request for peer group %s",
            peer_group_id,
        )
        return self.system_peer_manager.update_subcloud_peer_group(
            context,
            peer_group_id,
            group_state,
            max_subcloud_rehoming,
            group_name,
            new_group_name,
        )

    @request_context
    def delete_peer_group_association(self, context, association_id):
        """Delete a peer group association via SystemPeerManager."""
        LOG.info(
            "Handling delete_peer_group_association request for: %s", association_id
        )
        return self.system_peer_manager.delete_peer_group_association(
            context, association_id
        )

    @request_context
    def update_association_sync_status(
        self, context, peer_group_id, sync_status, sync_message=None
    ):
        """Update the sync_status of a peer group association."""
        LOG.info(
            "Handling update_peer_association_sync_status request for: %s",
            peer_group_id,
        )
        return self.system_peer_manager.update_association_sync_status(
            context, peer_group_id, sync_status, sync_message
        )

    def _stop_rpc_server(self):
        """Stop the RPC server; failures are logged, not raised."""
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop RPC service..."))
        try:
            self._rpc_server.stop()
            self._rpc_server.wait()
            LOG.info("RPC service stopped successfully")
        except Exception as ex:
            # Best effort: shutdown continues even if the RPC server
            # cannot be stopped cleanly.
            LOG.error("Failed to stop RPC service: %s", str(ex))

    def stop(self):
        """Stop the service: clean up subprocesses, then the RPC server."""
        SubprocessCleanup.shutdown_cleanup(origin="service")
        self._stop_rpc_server()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super(DCManagerService, self).stop()