
1. Refactor dcorch's generic_sync_manager.py and initial_sync_manager into a main process manager and a worker manager. The main manager will handle the allocation of eligible subclouds to each worker. 2. Rename the current EngineService to EngineWorkerService and introduce a new EngineService for the main process, similar to DCManagerAuditService and DCManagerAuditWorkerService. 3. Rename the current RPC EngineClient to EngineWorkerClient and introduce a new EngineClient. Adapt the RPC methods to accommodate the modifications in these main process managers and worker managers. 4. Move master resources data retrieval from each sync_thread to engine workers. 5. Implement 2 new db APIs for subcloud batch sync and state updates. 6. Remove code related to sync_lock and its associated db table schema. 7. Add ocf script for managing the start and stop of the dcorch engine-worker service, and make changes in packaging accordingly. 8. Bug fixes for the issues related to the usage of base64.urlsafe_b64encode and base64.urlsafe_b64decode in python3. 9. Update unit tests for the main process and worker managers. Test Plan: PASS: Verify that the dcorch audit runs properly every 5 minutes. PASS: Verify that the initial sync runs properly every 10 seconds. PASS: Verify that the sync subclouds operation runs properly every 5 seconds. PASS: Successfully start and stop the dcorch-engine and dcorch-engine-worker services using the sm commands. PASS: Change the admin password on the system controller using the command "openstack --os-region-name SystemController user password set". Verify that the admin password is synchronized to the subcloud and the dcorch receives the corresponding sync request, followed by successful execution of sync resources for the subcloud. PASS: Unmanage and then manage a subcloud, and verify that the initial sync is executed successfully for that subcloud. PASS: Verify the removal of the sync_lock table from the dcorch db. 
Story: 2011106 Task: 50013 Change-Id: I329847bd1107ec43e67ec59bdd1e3111b7b37cd3 Signed-off-by: lzhu1 <li.zhu@windriver.com>
120 lines
4.8 KiB
Python
120 lines
4.8 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# Copyright (c) 2020, 2024 Wind River Systems, Inc.
|
|
#
|
|
|
|
import eventlet
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
|
|
from dcorch.common import consts
|
|
from dcorch.common import context
|
|
from dcorch.db import api as db_api
|
|
from dcorch.rpc import client
|
|
|
|
# Global oslo.config object; this module reads CONF.workers from it.
CONF = cfg.CONF
LOG = logging.getLogger(__name__)

# How often the initial sync thread will wake up, in seconds (the value is
# passed to eventlet.greenthread.sleep).
SYNC_INTERVAL = 10
# How long to wait after a failed sync before retrying
# NOTE(review): presumably also in seconds; not referenced in this module's
# visible code -- confirm against the users of this constant.
SYNC_FAIL_HOLD_OFF = 60
|
|
|
|
|
|
class InitialSyncManager(object):
    """Manages the initial sync for each subcloud.

    Looks up the subclouds whose initial sync state is "requested" and hands
    them, in batches, to the engine workers over RPC.
    """

    def __init__(self, *args, **kwargs):
        """Create the admin context and the engine worker RPC client.

        Positional and keyword arguments are accepted but not used.
        """
        self.context = context.get_admin_context()
        self.engine_worker_rpc_client = client.EngineWorkerClient()

    def init_actions(self):
        """Perform actions on initialization.

        Moves every subcloud whose initial sync state is "in progress" or
        "failed" back to "requested".
        """
        # NOTE(review): the value returned by
        # subcloud_update_all_initial_state is presumably the number of
        # updated subclouds; it is only compared against zero here.

        # Since we are starting up, any initial syncs that were in progress
        # should be considered failed and must be redone.
        interrupted = db_api.subcloud_update_all_initial_state(
            self.context,
            pre_initial_sync_state=consts.INITIAL_SYNC_STATE_IN_PROGRESS,
            initial_sync_state=consts.INITIAL_SYNC_STATE_REQUESTED)
        if interrupted > 0:
            LOG.info("Initial sync for subclouds were in progress and "
                     "will be re-attempted.")

        # Since we are starting up, any failed syncs won't be re-attempted
        # because the timer will not be running. Reattempt them.
        failed = db_api.subcloud_update_all_initial_state(
            self.context,
            pre_initial_sync_state=consts.INITIAL_SYNC_STATE_FAILED,
            initial_sync_state=consts.INITIAL_SYNC_STATE_REQUESTED)
        if failed > 0:
            LOG.info(
                "Initial sync for subclouds were failed and will be "
                "re-attempted.")

    def initial_sync_thread(self):
        """Perform initial sync for subclouds as required.

        Loops forever: sleeps SYNC_INTERVAL, then dispatches the pending
        initial syncs. Returns only when the greenthread is killed.
        """
        while True:
            # Catch exceptions so the thread does not die.
            try:
                eventlet.greenthread.sleep(SYNC_INTERVAL)
                self._initial_sync_subclouds()
            except eventlet.greenlet.GreenletExit:
                # We have been told to exit
                return
            except Exception as e:
                LOG.exception(e)

    def _initial_sync_subclouds(self):
        """Perform initial sync for subclouds that require it.

        Fetches the capabilities of all subclouds in the "requested" state
        and sends them to the engine workers in batches sized from
        CONF.workers.
        """
        subclouds = db_api.subcloud_capabilities_get_all(
            self.context,
            initial_sync_state=consts.INITIAL_SYNC_STATE_REQUESTED)
        if not subclouds:
            LOG.debug("No eligible subclouds for initial sync.")
            return

        LOG.info("Starting initial sync loop.")

        chunksize = self._chunk_size(len(subclouds), CONF.workers)
        batches = self._chunk_subclouds(subclouds, chunksize)
        for subcloud_capabilities in batches:
            self._send_initial_sync_request(subcloud_capabilities)
        LOG.debug("Done sending initial sync request messages.")

    @staticmethod
    def _chunk_size(num_subclouds, num_workers):
        """Return how many subclouds go into each batch.

        :param num_subclouds: number of subclouds to distribute
        :param num_workers: configured number of workers; a missing, zero
            or negative value is treated as a single worker
        :returns: the batch size, always at least 1
        """
        # Guard against a zero/unset worker count, which would otherwise
        # raise ZeroDivisionError on every wakeup.
        num_workers = max(num_workers or 1, 1)
        # We want a chunksize of at least 1 so add the number of workers.
        return (num_subclouds + num_workers) // num_workers

    @staticmethod
    def _chunk_subclouds(subclouds, chunksize):
        """Yield successive dicts holding at most chunksize subclouds.

        :param subclouds: mapping of region name to capabilities
        :param chunksize: maximum number of entries per yielded dict
        """
        batch = {}
        for region_name, capabilities in subclouds.items():
            batch[region_name] = capabilities
            if len(batch) == chunksize:
                yield batch
                batch = {}
        if batch:
            # Trailing partial batch.
            yield batch

    def _send_initial_sync_request(self, subcloud_capabilities):
        """Send one batch of subclouds to the engine worker for processing.

        A failure is logged and swallowed so that the remaining batches are
        still sent.

        :param subcloud_capabilities: mapping of region name to capabilities
        """
        try:
            self.engine_worker_rpc_client.initial_sync_subclouds(
                self.context,
                subcloud_capabilities)
            LOG.debug("Sent initial sync request message for %s subclouds",
                      len(subcloud_capabilities))
        except Exception as e:
            LOG.error("Exception occurred in initial_sync for subclouds "
                      "%s: %s", list(subcloud_capabilities.keys()), e)
|