Files
distcloud/distributedcloud/dcorch/engine/service.py
Li Zhu 287246bf4f DCorch Engine Update for Scalability
1. Refactor dcorch's generic_sync_manager.py and initial_sync_manager
   into a main process manager and a worker manager. The main manager
   will handle the allocation of eligible subclouds to each worker
   (see the allocation sketch below this list).
2. Rename the current EngineService to EngineWorkerService and introduce
   a new EngineService for the main process, similar to
   DCManagerAuditService and DCManagerAuditWorkerService.
3. Rename the current RPC EngineClient to EngineWorkerClient and
   introduce a new EngineClient. Adapt the RPC methods to accommodate
   the modifications in these main process managers and worker managers.
4. Move master resources data retrieval from each sync_thread to engine
   workers.
5. Implement 2 new db APIs for subcloud batch sync and state updates.
6. Remove code related to sync_lock and its associated db table schema.
7. Add ocf script for managing the start and stop of the dcorch
   engine-worker service, and make changes in packaging accordingly.
8. Bug fixes for issues with the usage of base64.urlsafe_b64encode and
   base64.urlsafe_b64decode under Python 3 (see the example below this
   list).
9. Update unit tests for the main process and worker managers.
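
Regarding item 1, a minimal sketch of the allocation idea, assuming
hypothetical helper names (not the actual dcorch manager or RPC API):
the main-process manager splits the eligible subclouds into batches and
hands each batch to an engine worker.

    # Illustrative only: hypothetical batching of eligible subclouds in
    # the main process; names do not match the real dcorch code.
    def distribute_subclouds(subclouds, batch_size, dispatch_batch):
        """Fan eligible subclouds out to workers in fixed-size batches."""
        for i in range(0, len(subclouds), batch_size):
            # dispatch_batch is expected to be a non-blocking RPC cast
            # to an engine-worker process (e.g. a sync or initial-sync
            # request for the whole batch).
            dispatch_batch(subclouds[i:i + batch_size])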

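Regarding item 8, the usual Python 3 pitfall with these functions, as a
rough illustration (not the actual dcorch fix): base64.urlsafe_b64encode
accepts and returns bytes, so str values must be encoded first and the
result decoded back to str before being stored or compared.

    import base64

    # In Python 3, urlsafe_b64encode expects bytes and returns bytes,
    # so a str payload has to be round-tripped explicitly.
    payload = "example-payload"
    encoded = base64.urlsafe_b64encode(payload.encode("utf-8")).decode("utf-8")
    decoded = base64.urlsafe_b64decode(encoded.encode("utf-8")).decode("utf-8")
    assert decoded == payload
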
Test Plan:
PASS: Verify that the dcorch audit runs properly every 5 minutes.
PASS: Verify that the initial sync runs properly every 10 seconds.
PASS: Verify that the sync subclouds operation runs properly every 5
      seconds.
PASS: Successfully start and stop the dcorch-engine and
      dcorch-engine-worker services using the sm commands.
PASS: Change the admin password on the system controller using
      the command "openstack --os-region-name SystemController user
      password set". Verify that the admin password is synchronized
      to the subcloud and the dcorch receives the corresponding sync
      request, followed by successful execution of sync resources for
      the subcloud.
PASS: Unmanage and then manage a subcloud, and verify that the initial
      sync is executed successfully for that subcloud.
PASS: Verify the removal of the sync_lock table from the dcorch db.

Story: 2011106
Task: 50013

Change-Id: I329847bd1107ec43e67ec59bdd1e3111b7b37cd3
Signed-off-by: lzhu1 <li.zhu@windriver.com>
2024-05-15 10:49:13 -04:00

353 lines
13 KiB
Python

# Copyright (c) 2020-2024 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import functools
import resource
import time

from oslo_config import cfg
from oslo_log import log as logging
import oslo_messaging
from oslo_service import service
from oslo_utils import uuidutils
import six

from dccommon import consts as dccommon_consts
from dcorch.common import consts
from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common.i18n import _
from dcorch.common import messaging as rpc_messaging
from dcorch.engine.fernet_key_manager import FernetKeyManager
from dcorch.engine.generic_sync_manager import GenericSyncManager
from dcorch.engine.generic_sync_worker_manager import GenericSyncWorkerManager
from dcorch.engine.initial_sync_manager import InitialSyncManager
from dcorch.engine.initial_sync_worker_manager import InitialSyncWorkerManager
from dcorch.engine.quota_manager import QuotaManager
from dcorch.engine import scheduler

CONF = cfg.CONF
LOG = logging.getLogger(__name__)
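
# Two engine services are defined in this module:
#  - EngineService: the main engine process. It hosts the main-process
#    managers (GenericSyncManager, InitialSyncManager), the quota and
#    fernet key managers, and allocates work for eligible subclouds to
#    the engine workers.
#  - EngineWorkerService: an engine-worker process that carries out the
#    sync, audit and initial-sync requests for the subclouds assigned
#    to it.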


def request_context(func):
    @functools.wraps(func)
    def wrapped(self, ctx, *args, **kwargs):
        if ctx is not None and not isinstance(ctx, context.RequestContext):
            ctx = context.RequestContext.from_dict(ctx.to_dict())
        try:
            return func(self, ctx, *args, **kwargs)
        except exceptions.OrchestratorException:
            raise oslo_messaging.rpc.dispatcher.ExpectedException()

    return wrapped


class EngineService(service.Service):
    """Lifecycle manager for a running engine service."""

    def __init__(self):
        super(EngineService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_ORCH_ENGINE
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.TG = None
        self.periodic_enable = cfg.CONF.scheduler.periodic_enable
        self.target = None
        self._rpc_server = None
        self.qm = None
        self.gsm = None
        self.fkm = None
        self.ism = None

    def start(self):
        target = oslo_messaging.Target(
            version=self.rpc_api_version, server=self.host, topic=self.topic
        )
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        self.init_tgm()
        self.init_qm()
        self.init_gsm()
        self.init_fkm()
        self.init_ism()
        super(EngineService, self).start()
        if self.periodic_enable:
            LOG.info("Adding periodic tasks for the engine to perform")
            self.TG.add_timer(CONF.fernet.key_rotation_interval *
                              dccommon_consts.SECONDS_IN_HOUR,
                              self.periodic_key_rotation,
                              initial_delay=(CONF.fernet.key_rotation_interval
                                             * dccommon_consts.SECONDS_IN_HOUR))

    def init_tgm(self):
        self.TG = scheduler.ThreadGroupManager()

    def init_qm(self):
        self.qm = QuotaManager()

    def init_gsm(self):
        self.gsm = GenericSyncManager()
        self.TG.start(self.gsm.sync_job_thread)
        self.TG.start(self.gsm.sync_audit_thread)

    def init_fkm(self):
        self.fkm = FernetKeyManager(self.gsm)

    def init_ism(self):
        self.ism = InitialSyncManager()
        self.ism.init_actions()
        self.TG.start(self.ism.initial_sync_thread)

    @request_context
    # The sync job info has been written to the DB, alert the sync engine
    # that there is work to do.
    # TODO(lzhu1): add authentication since ctxt not actually needed later
    def sync_request(self, ctxt, endpoint_type):
        self.gsm.sync_request(ctxt, endpoint_type)

    def periodic_balance_all(self):
        # Automated Quota Sync for all the keystone projects
        LOG.info("Periodic quota sync job started at: %s",
                 time.strftime("%c"))
        self.qm.periodic_balance_all()

    @request_context
    def get_usage_for_project_and_user(self, context, endpoint_type,
                                       project_id, user_id=None):
        # Returns cached usage as of last quota sync audit so will be
        # slightly stale.
        return self.qm.get_usage_for_project_and_user(endpoint_type,
                                                      project_id, user_id)

    @request_context
    def quota_sync_for_project(self, context, project_id, user_id):
        # On Demand Quota Sync for a project, will be triggered by KB-API
        LOG.info("On Demand Quota Sync Called for: %s %s",
                 project_id, user_id)
        self.qm.quota_sync_for_project(project_id, user_id)

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine service..."))
        try:
            self._rpc_server.stop()
            self._rpc_server.wait()
            LOG.info("Engine service stopped successfully")
        except Exception as ex:
            LOG.error(f"Failed to stop engine service: {six.text_type(ex)}")

    def stop(self):
        self._stop_rpc_server()
        if self.TG:
            self.TG.stop()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine")
        super(EngineService, self).stop()

    def periodic_key_rotation(self):
        """Periodic key rotation."""
        LOG.info("Periodic key rotation started at: %s", time.strftime("%c"))
        return self.fkm.rotate_fernet_keys()


class EngineWorkerService(service.Service):
    """Lifecycle manager for a running engine-worker service.

    - All the methods in here are called from the RPC client.
    - If an RPC call does not have a corresponding method here, an
      exception will be thrown.
    - Arguments to these calls are added dynamically and will be treated as
      keyword arguments by the RPC client.
    """

    def __init__(self):
        super(EngineWorkerService, self).__init__()
        self.host = cfg.CONF.host
        self.rpc_api_version = consts.RPC_API_VERSION
        self.topic = consts.TOPIC_ORCH_ENGINE_WORKER
        # The following are initialized here, but assigned in start() which
        # happens after the fork when spawning multiple worker processes
        self.engine_id = None
        self.target = None
        self._rpc_server = None
        self.gswm = None
        self.iswm = None

    def init_gswm(self):
        self.gswm = GenericSyncWorkerManager(self.engine_id)

    def init_iswm(self):
        self.iswm = InitialSyncWorkerManager(self.gswm, self.engine_id)

    def start(self):
        LOG.info("Starting %s", self.__class__.__name__)
        self.engine_id = uuidutils.generate_uuid()
        target = oslo_messaging.Target(version=self.rpc_api_version,
                                       server=self.host,
                                       topic=self.topic)
        self.target = target
        self._rpc_server = rpc_messaging.get_rpc_server(self.target, self)
        self._rpc_server.start()
        self.init_gswm()
        self.init_iswm()
        self.set_resource_limit()
        super(EngineWorkerService, self).start()

    def set_resource_limit(self):
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (cfg.CONF.rlimit_nofile,
                                                        cfg.CONF.rlimit_nofile))
        except Exception as ex:
            LOG.error('Engine id %s: failed to set the NOFILE resource limit: '
                      '%s' % (self.engine_id, ex))

    @request_context
    def add_subcloud(self, ctxt, subcloud_name, sw_version):
        self.gswm.add_subcloud(ctxt, subcloud_name, sw_version)

    @request_context
    # todo: add authentication since ctxt not actually needed later
    def del_subcloud(self, ctxt, subcloud_name):
        self.gswm.del_subcloud(ctxt, subcloud_name)

    @request_context
    # todo: add authentication since ctxt not actually needed later
    def update_subcloud_states(self, ctxt, subcloud_name,
                               management_state,
                               availability_status):
        """Handle subcloud state updates from dcmanager

        These state updates must be processed quickly. Any work triggered by
        these state updates must be done asynchronously, without delaying the
        reply to the dcmanager. For example, it is not acceptable to
        communicate with a subcloud while handling the state update.
        """
        # Check if state has changed before doing anything
        if self.gswm.subcloud_state_matches(
                subcloud_name,
                management_state=management_state,
                availability_status=availability_status):
            # No change in state - nothing to do.
            LOG.debug('Ignoring unchanged state update for %s' % subcloud_name)
            return

        # Check if the subcloud is ready to sync.
        if (management_state == dccommon_consts.MANAGEMENT_MANAGED) and \
                (availability_status == dccommon_consts.AVAILABILITY_ONLINE):
            # Update the subcloud state and schedule an initial sync
            self.gswm.update_subcloud_state(
                ctxt,
                subcloud_name,
                management_state=management_state,
                availability_status=availability_status,
                initial_sync_state=consts.INITIAL_SYNC_STATE_REQUESTED)
        else:
            # Update the subcloud state and cancel the initial sync
            self.gswm.update_subcloud_state(
                ctxt,
                subcloud_name,
                management_state=management_state,
                availability_status=availability_status,
                initial_sync_state=consts.INITIAL_SYNC_STATE_NONE)

    @request_context
    def update_subcloud_state(self, ctxt, subcloud_name,
                              management_state=None,
                              availability_status=None,
                              initial_sync_state=None):
        LOG.info("Trigger update state for subcloud %s", subcloud_name)
        self.gswm.update_subcloud_state(ctxt, subcloud_name,
                                        management_state,
                                        availability_status,
                                        initial_sync_state)

    @request_context
    def add_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
                                        endpoint_type_list=None):
        try:
            self.gswm.add_subcloud_sync_endpoint_type(
                ctxt, subcloud_name,
                endpoint_type_list=endpoint_type_list)
        except Exception as ex:
            LOG.warning('Add subcloud endpoint type failed for %s: %s',
                        subcloud_name, six.text_type(ex))
            raise

    @request_context
    def remove_subcloud_sync_endpoint_type(self, ctxt, subcloud_name,
                                           endpoint_type_list=None):
        try:
            self.gswm.remove_subcloud_sync_endpoint_type(
                ctxt, subcloud_name,
                endpoint_type_list=endpoint_type_list)
        except Exception as ex:
            LOG.warning('Remove subcloud endpoint type failed for %s: %s',
                        subcloud_name, six.text_type(ex))
            raise

    @request_context
    def sync_subclouds(self, ctxt, subcloud_sync_list):
        self.gswm.sync_subclouds(ctxt, subcloud_sync_list)

    @request_context
    def run_sync_audit(self, ctxt, subcloud_sync_list):
        self.gswm.run_sync_audit(ctxt, subcloud_sync_list)

    @request_context
    def initial_sync_subclouds(self, ctxt, subcloud_capabilities):
        self.iswm.initial_sync_subclouds(ctxt, subcloud_capabilities)

    @request_context
    # todo: add authentication since ctxt not actually needed later
    def update_subcloud_version(self, ctxt, subcloud_name, sw_version):
        self.gswm.update_subcloud_version(ctxt, subcloud_name, sw_version)

    @request_context
    def update_subcloud_endpoints(self, ctxt, subcloud_name, endpoints):
        self.gswm.update_subcloud_endpoints(ctxt, subcloud_name, endpoints)

    def _stop_rpc_server(self):
        # Stop RPC connection to prevent new requests
        LOG.debug(_("Attempting to stop engine-worker service..."))
        try:
            if self._rpc_server:
                self._rpc_server.stop()
                self._rpc_server.wait()
                LOG.info('Engine-worker service stopped successfully')
        except Exception as ex:
            LOG.error(f"Failed to stop engine-worker service: {six.text_type(ex)}")

    def stop(self):
        self._stop_rpc_server()
        # Terminate the engine process
        LOG.info("All threads were gone, terminating engine-worker")
        super(EngineWorkerService, self).stop()