
1. Refactor dcorch's generic_sync_manager.py and initial_sync_manager.py into a main process manager and a worker manager. The main manager will handle the allocation of eligible subclouds to each worker. 2. Rename the current EngineService to EngineWorkerService and introduce a new EngineService for the main process, similar to DCManagerAuditService and DCManagerAuditWorkerService. 3. Rename the current RPC EngineClient to EngineWorkerClient and introduce a new EngineClient. Adapt the RPC methods to accommodate the modifications in these main process managers and worker managers. 4. Move master resources data retrieval from each sync_thread to engine workers. 5. Implement 2 new db APIs for subcloud batch sync and state updates. 6. Remove code related to sync_lock and its associated db table schema. 7. Add an OCF script for managing the start and stop of the dcorch engine-worker service, and make changes in packaging accordingly. 8. Fix issues related to the usage of base64.urlsafe_b64encode and base64.urlsafe_b64decode in Python 3. 9. Update unit tests for the main process and worker managers. Test Plan: PASS: Verify that the dcorch audit runs properly every 5 minutes. PASS: Verify that the initial sync runs properly every 10 seconds. PASS: Verify that the sync subclouds operation runs properly every 5 seconds. PASS: Successfully start and stop the dcorch-engine and dcorch-engine-worker services using the sm commands. PASS: Change the admin password on the system controller using the command "openstack --os-region-name SystemController user password set". Verify that the admin password is synchronized to the subcloud and the dcorch receives the corresponding sync request, followed by successful execution of sync resources for the subcloud. PASS: Unmanage and then manage a subcloud, and verify that the initial sync is executed successfully for that subcloud. PASS: Verify the removal of the sync_lock table from the dcorch db.
Story: 2011106 Task: 50013 Change-Id: I329847bd1107ec43e67ec59bdd1e3111b7b37cd3 Signed-off-by: lzhu1 <li.zhu@windriver.com>
353 lines
13 KiB
Python
353 lines
13 KiB
Python
# Copyright (c) 2020-2024 Wind River Systems, Inc.
|
|
# All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import functools
|
|
import resource
|
|
import time
|
|
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
import oslo_messaging
|
|
from oslo_service import service
|
|
from oslo_utils import uuidutils
|
|
import six
|
|
|
|
from dccommon import consts as dccommon_consts
|
|
from dcorch.common import consts
|
|
from dcorch.common import context
|
|
from dcorch.common import exceptions
|
|
from dcorch.common.i18n import _
|
|
from dcorch.common import messaging as rpc_messaging
|
|
from dcorch.engine.fernet_key_manager import FernetKeyManager
|
|
from dcorch.engine.generic_sync_manager import GenericSyncManager
|
|
from dcorch.engine.generic_sync_worker_manager import GenericSyncWorkerManager
|
|
from dcorch.engine.initial_sync_manager import InitialSyncManager
|
|
from dcorch.engine.initial_sync_worker_manager import InitialSyncWorkerManager
|
|
from dcorch.engine.quota_manager import QuotaManager
|
|
from dcorch.engine import scheduler
|
|
|
|
CONF = cfg.CONF
|
|
LOG = logging.getLogger(__name__)
|
|
|
|
|
|
def request_context(func):
    """Decorate an RPC endpoint method to normalize its request context.

    The wrapper converts a non-None ``ctx`` that is not already a
    ``context.RequestContext`` into one (via ``to_dict()``/``from_dict()``),
    calls the wrapped method, and re-raises any
    ``exceptions.OrchestratorException`` as an oslo.messaging
    ``ExpectedException``.
    """

    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        # None and genuine RequestContext objects are passed through as-is.
        needs_conversion = not (
            ctx is None or isinstance(ctx, context.RequestContext)
        )
        if needs_conversion:
            ctx = context.RequestContext.from_dict(ctx.to_dict())

        try:
            result = func(self, ctx, *args, **kwargs)
        except exceptions.OrchestratorException:
            # Must be constructed while the original exception is still
            # being handled, so it stays inside the except clause.
            raise oslo_messaging.rpc.dispatcher.ExpectedException()
        return result

    return wrapped
|
|
|
|
|
|
class EngineService(service.Service):
    """Lifecycle manager for the dcorch engine main process.

    Exposes an RPC endpoint on ``consts.TOPIC_ORCH_ENGINE`` and owns the
    thread group that runs the GenericSyncManager and InitialSyncManager
    threads. When ``scheduler.periodic_enable`` is set, it also schedules
    the periodic fernet key rotation.
    """

    def __init__(self):
        super(EngineService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_ORCH_ENGINE
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.TG = None
        self.periodic_enable = cfg.CONF.scheduler.periodic_enable
        self.target = None
        self._rpc_server = None
        self.qm = None
        self.gsm = None
        self.fkm = None
        self.ism = None

    def start(self):
        """Start the RPC server, create the managers and schedule timers."""
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        # NOTE(review): the RPC server is started before the managers below
        # exist; presumably no request is dispatched until they are set up
        # - confirm.
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()

        # Order matters: the thread group is needed by init_gsm/init_ism,
        # and the fernet key manager is built on top of the sync manager.
        self.init_tgm()
        self.init_qm()
        self.init_gsm()
        self.init_fkm()
        self.init_ism()

        super(EngineService, self).start()

        if self.periodic_enable:
            LOG.info("Adding periodic tasks for the engine to perform")
            # key_rotation_interval is multiplied by SECONDS_IN_HOUR; the
            # same value is used as both the period and the initial delay.
            rotation_interval = (
                CONF.fernet.key_rotation_interval
                * dccommon_consts.SECONDS_IN_HOUR
            )
            self.TG.add_timer(
                rotation_interval,
                self.periodic_key_rotation,
                initial_delay=rotation_interval,
            )

    def init_tgm(self):
        """Create the thread group manager used for background threads."""
        self.TG = scheduler.ThreadGroupManager()

    def init_qm(self):
        """Create the quota manager."""
        self.qm = QuotaManager()

    def init_gsm(self):
        """Create the generic sync manager and start its two threads."""
        self.gsm = GenericSyncManager()
        self.TG.start(self.gsm.sync_job_thread)
        self.TG.start(self.gsm.sync_audit_thread)

    def init_fkm(self):
        """Create the fernet key manager (requires ``self.gsm``)."""
        self.fkm = FernetKeyManager(self.gsm)

    def init_ism(self):
        """Create the initial sync manager and start its thread."""
        self.ism = InitialSyncManager()
        self.ism.init_actions()
        self.TG.start(self.ism.initial_sync_thread)

    # The sync job info has been written to the DB, alert the sync engine
    # that there is work to do.
    # TODO(lzhu1): add authentication since ctxt not actually needed later
    @request_context
    def sync_request(self, ctxt, endpoint_type):
        """Forward a sync request for ``endpoint_type`` to the sync manager."""
        self.gsm.sync_request(ctxt, endpoint_type)

    def periodic_balance_all(self):
        """Run the automated quota sync for all the keystone projects.

        NOTE(review): start() in this class does not register this method
        as a timer; presumably it is invoked elsewhere - confirm.
        """
        LOG.info("Periodic quota sync job started at: %s",
                 time.strftime("%c"))
        self.qm.periodic_balance_all()

    @request_context
    def get_usage_for_project_and_user(self, context, endpoint_type,
                                       project_id, user_id=None):
        """Return the quota manager's usage for a project (and user).

        The request context is not forwarded to the quota manager.
        """
        # Returns cached usage as of last quota sync audit so will be
        # slightly stale.
        return self.qm.get_usage_for_project_and_user(endpoint_type,
                                                      project_id, user_id)

    @request_context
    def quota_sync_for_project(self, context, project_id, user_id):
        """Run an on-demand quota sync for one project/user."""
        # On Demand Quota Sync for a project, will be triggered by KB-API
        LOG.info("On Demand Quota Sync Called for: %s %s",
                 project_id, user_id)
        self.qm.quota_sync_for_project(project_id, user_id)

    def _stop_rpc_server(self):
        """Stop the RPC server (if one was started) and wait for it.

        Failures are logged rather than raised so that stop() can carry on
        with the rest of the shutdown.
        """
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine service..."))
        try:
            # _rpc_server is still None if stop() runs before start().
            if self._rpc_server:
                self._rpc_server.stop()
                self._rpc_server.wait()
            LOG.info("Engine service stopped successfully")
        except Exception as ex:
            LOG.error("Failed to stop engine service: %s", six.text_type(ex))

    def stop(self):
        """Stop the RPC server, the thread group and then the service."""
        self._stop_rpc_server()

        if self.TG:
            self.TG.stop()

        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super(EngineService, self).stop()

    def periodic_key_rotation(self):
        """Periodic key rotation."""
        LOG.info("Periodic key rotation started at: %s", time.strftime("%c"))
        return self.fkm.rotate_fernet_keys()
|
|
|
|
|
|
class EngineWorkerService(service.Service):
    """Lifecycle manager for a running service engine.

    - All the methods in here are called from the RPC client.
    - If an RPC call does not have a corresponding method here, an exception
      will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.
    """

    def __init__(self):
        super(EngineWorkerService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_ORCH_ENGINE_WORKER
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.target = None
        self._rpc_server = None
        self.gswm = None
        self.iswm = None

    def init_gswm(self):
        """Create the generic sync worker manager for this engine id."""
        self.gswm = GenericSyncWorkerManager(self.engine_id)

    def init_iswm(self):
        """Create the initial sync worker manager (requires ``self.gswm``)."""
        self.iswm = InitialSyncWorkerManager(self.gswm, self.engine_id)

    def start(self):
        """Generate an engine id, start RPC, and create the worker managers."""
        LOG.info("Starting %s", self.__class__.__name__)
        self.engine_id = uuidutils.generate_uuid()
        target = oslo_messaging.Target(version=self.rpc_api_version,
                                       server=self.host,
                                       topic=self.topic)
        self.target = target
        # NOTE(review): the RPC server is started before the managers below
        # exist; presumably no request is dispatched until they are set up
        # - confirm.
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()

        # iswm is built on top of gswm, so gswm must be created first.
        self.init_gswm()
        self.init_iswm()

        self.set_resource_limit()

        super(EngineWorkerService, self).start()

    def set_resource_limit(self):
        """Set the soft and hard NOFILE limits to ``CONF.rlimit_nofile``.

        Best effort: a failure is logged and otherwise ignored.
        """
        nofile = cfg.CONF.rlimit_nofile
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (nofile, nofile))
        except Exception as ex:
            LOG.error('Engine id %s: failed to set the NOFILE resource limit: '
                      '%s', self.engine_id, ex)

    @request_context
    def add_subcloud(self, ctxt, subcloud_name, sw_version):
        """Delegate adding a subcloud to the generic sync worker manager."""
        self.gswm.add_subcloud(ctxt, subcloud_name, sw_version)

    # todo: add authentication since ctxt not actually needed later
    @request_context
    def del_subcloud(self, ctxt, subcloud_name):
        """Delegate deleting a subcloud to the generic sync worker manager."""
        self.gswm.del_subcloud(ctxt, subcloud_name)

    # todo: add authentication since ctxt not actually needed later
    @request_context
    def update_subcloud_states(self, ctxt, subcloud_name,
                               management_state,
                               availability_status):
        """Handle subcloud state updates from dcmanager

        These state updates must be processed quickly. Any work triggered by
        these state updates must be done asynchronously, without delaying the
        reply to the dcmanager. For example, it is not acceptable to
        communicate with a subcloud while handling the state update.
        """

        # Check if state has changed before doing anything
        if self.gswm.subcloud_state_matches(
                subcloud_name,
                management_state=management_state,
                availability_status=availability_status):
            # No change in state - nothing to do.
            LOG.debug('Ignoring unchanged state update for %s', subcloud_name)
            return

        # Check if the subcloud is ready to sync.
        ready_to_sync = (
            management_state == dccommon_consts.MANAGEMENT_MANAGED
            and availability_status == dccommon_consts.AVAILABILITY_ONLINE
        )
        # Managed + online: schedule an initial sync; otherwise cancel it.
        if ready_to_sync:
            initial_sync_state = consts.INITIAL_SYNC_STATE_REQUESTED
        else:
            initial_sync_state = consts.INITIAL_SYNC_STATE_NONE

        self.gswm.update_subcloud_state(
            ctxt,
            subcloud_name,
            management_state=management_state,
            availability_status=availability_status,
            initial_sync_state=initial_sync_state)

    @request_context
    def update_subcloud_state(self, ctxt, subcloud_name,
                              management_state=None,
                              availability_status=None,
                              initial_sync_state=None):
        """Pass the given subcloud states straight to the worker manager."""
        LOG.info("Trigger update state for subcloud %s", subcloud_name)
        self.gswm.update_subcloud_state(ctxt, subcloud_name,
                                        management_state,
                                        availability_status,
                                        initial_sync_state)

    @request_context
    def add_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
                                        endpoint_type_list=None):
        """Add sync endpoint types for a subcloud.

        Any failure is logged as a warning and then re-raised.
        """
        try:
            self.gswm.add_subcloud_sync_endpoint_type(
                ctxt, subcloud_name,
                endpoint_type_list=endpoint_type_list)
        except Exception as ex:
            LOG.warning('Add subcloud endpoint type failed for %s: %s',
                        subcloud_name, six.text_type(ex))
            raise

    @request_context
    def remove_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
                                           endpoint_type_list=None):
        """Remove sync endpoint types for a subcloud.

        Any failure is logged as a warning and then re-raised.
        """
        try:
            self.gswm.remove_subcloud_sync_endpoint_type(
                ctxt, subcloud_name,
                endpoint_type_list=endpoint_type_list)
        except Exception as ex:
            LOG.warning('Remove subcloud endpoint type failed for %s: %s',
                        subcloud_name, six.text_type(ex))
            raise

    @request_context
    def sync_subclouds(self, ctxt, subcloud_sync_list):
        """Delegate syncing the given subclouds to the worker manager."""
        self.gswm.sync_subclouds(ctxt, subcloud_sync_list)

    @request_context
    def run_sync_audit(self, ctxt, subcloud_sync_list):
        """Delegate the sync audit of the given subclouds to the manager."""
        self.gswm.run_sync_audit(ctxt, subcloud_sync_list)

    @request_context
    def initial_sync_subclouds(self, ctxt, subcloud_capabilities):
        """Delegate the initial sync to the initial sync worker manager."""
        self.iswm.initial_sync_subclouds(ctxt, subcloud_capabilities)

    # todo: add authentication since ctxt not actually needed later
    @request_context
    def update_subcloud_version(self, ctxt, subcloud_name, sw_version):
        """Delegate a subcloud software version update to the manager."""
        self.gswm.update_subcloud_version(ctxt, subcloud_name, sw_version)

    @request_context
    def update_subcloud_endpoints(self, ctxt, subcloud_name, endpoints):
        """Delegate a subcloud endpoints update to the manager."""
        self.gswm.update_subcloud_endpoints(ctxt, subcloud_name, endpoints)

    def _stop_rpc_server(self):
        """Stop the RPC server (if one was started) and wait for it.

        Failures are logged rather than raised so that stop() can carry on
        with the rest of the shutdown.
        """
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine-worker service..."))
        try:
            if self._rpc_server:
                self._rpc_server.stop()
                self._rpc_server.wait()
            LOG.info('Engine-worker service stopped successfully')
        except Exception as ex:
            LOG.error("Failed to stop engine-worker service: %s",
                      six.text_type(ex))

    def stop(self):
        """Stop the RPC server and then the service itself."""
        self._stop_rpc_server()

        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine-worker")
        super(EngineWorkerService, self).stop()
|