1. Refactor dcorch's generic_sync_manager.py and initial_sync_manager
into a main process manager and a worker manager. The main manager
will handle the allocation of eligible subclouds to each worker.
2. Rename the current EngineService to EngineWorkerService and introduce
a new EngineService for the main process, similar to
DCManagerAuditService and DCManagerAuditWorkerService.
3. Rename the current RPC EngineClient to EngineWorkerClient and
introduce a new EngineClient. Adapt the RPC methods to accommodate
the modifications in these main process managers and worker managers.
4. Move master resources data retrieval from each sync_thread to engine
workers.
5. Implement 2 new db APIs for subcloud batch sync and state updates.
6. Remove code related to sync_lock and its associated db table schema.
7. Add ocf script for managing the start and stop of the dcorch
engine-worker service, and make changes in packaging accordingly.
8. Bug fixes for the issues related to the usage of
base64.urlsafe_b64encode and base64.urlsafe_b64decode in python3.
9. Update unit tests for the main process and worker managers.
Test Plan:
PASS: Verify that the dcorch audit runs properly every 5 minutes.
PASS: Verify that the initial sync runs properly every 10 seconds.
PASS: Verify that the sync subclouds operation runs properly every 5
seconds.
PASS: Successfully start and stop the dcorch-engine and
dcorch-engine-worker services using the sm commands.
PASS: Change the admin password on the system controller using
the command "openstack --os-region-name SystemController user
password set". Verify that the admin password is synchronized
to the subcloud and the dcorch receives the corresponding sync
request, followed by successful execution of sync resources for
the subcloud.
PASS: Unmanage and then manage a subcloud, and verify that the initial
sync is executed successfully for that subcloud.
PASS: Verify the removal of the sync_lock table from the dcorch db.
Story: 2011106
Task: 50013
Change-Id: I329847bd1107ec43e67ec59bdd1e3111b7b37cd3
Signed-off-by: lzhu1 <li.zhu@windriver.com>
324 lines
9.9 KiB
Bash
324 lines
9.9 KiB
Bash
#!/bin/sh
|
|
# OpenStack DC Orchestrator Engine-worker Service (dcorch-engine-worker)
|
|
#
|
|
# Description: Manages an OpenStack DC Orchestrator Engine-worker Service (dcorch-engine-worker) process as an HA resource
|
|
#
|
|
# Copyright (c) 2024 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
#
|
|
# See usage() function below for more details ...
|
|
#
|
|
# OCF instance parameters:
|
|
# OCF_RESKEY_binary
|
|
# OCF_RESKEY_config
|
|
# OCF_RESKEY_user
|
|
# OCF_RESKEY_pid
|
|
# OCF_RESKEY_additional_parameters
|
|
#######################################################################
|
|
# Initialization:
|
|
|
|
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
|
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
|
|
|
#######################################################################
|
|
|
|
# Fill in some defaults if no values are specified
|
|
|
|
OCF_RESKEY_binary_default="/usr/bin/dcorch-engine-worker"
|
|
OCF_RESKEY_config_default="/etc/dcorch/dcorch.conf"
|
|
OCF_RESKEY_user_default="root"
|
|
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
|
|
|
|
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
|
|
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
|
|
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
|
|
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
|
|
|
|
#######################################################################
|
|
|
|
usage() {
|
|
cat <<UEND
|
|
usage: $0 (start|stop|validate-all|meta-data|status|monitor)
|
|
|
|
$0 manages an OpenStack DC Orchestrator Engine-worker (dcorch-engine-worker) process as an HA resource
|
|
|
|
The 'start' operation starts the dcorch-engine-worker service.
|
|
The 'stop' operation stops the dcorch-engine-worker service.
|
|
The 'validate-all' operation reports whether the parameters are valid
|
|
The 'meta-data' operation reports this RA's meta-data information
|
|
The 'status' operation reports whether the dcorch-engine-worker service is running
|
|
The 'monitor' operation reports whether the dcorch-engine-worker service seems to be working
|
|
|
|
UEND
|
|
}
|
|
|
|
meta_data() {
|
|
cat <<END
|
|
<?xml version="1.0"?>
|
|
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
|
|
<resource-agent name="dcorch-engine-worker">
|
|
<version>1.0</version>
|
|
|
|
<longdesc lang="en">
|
|
Resource agent for the DC Orchestrator Engine-worker Service (dcorch-engine-worker)
|
|
</longdesc>
|
|
<shortdesc lang="en">Manages the OpenStack DC Orchestrator Engine-worker Service(dcorch-engine-worker)</shortdesc>
|
|
<parameters>
|
|
|
|
<parameter name="binary" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Location of the DC Orchestrator Engine-worker binary (dcorch-engine-worker)
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Orchestrator Engine-worker binary (dcorch-engine-worker)</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_binary_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="config" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Location of the DC Orchestrator Engine-worker (dcorch-engine-worker) configuration file
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Orchestrator Engine-worker (dcorch-engine-worker registry) config file</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_config_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="user" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
User running DC Orchestrator Engine-worker (dcorch-engine-worker)
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Orchestrator Engine-worker (dcorch-engine-worker) user</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_user_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="pid" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
The pid file to use for this DC Orchestrator Engine-worker (dcorch-engine-worker) instance
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Orchestrator Engine-worker (dcorch-engine-worker) pid file</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_pid_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="additional_parameters" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Additional parameters to pass on to the OpenStack DC Orchestrator Engine-worker (dcorch-engine-worker)
|
|
</longdesc>
|
|
<shortdesc lang="en">Additional parameters for dcorch-engine-worker</shortdesc>
|
|
<content type="string" />
|
|
</parameter>
|
|
|
|
</parameters>
|
|
|
|
<actions>
|
|
<action name="start" timeout="20" />
|
|
<action name="stop" timeout="20" />
|
|
<action name="status" timeout="20" />
|
|
<action name="monitor" timeout="10" interval="5" />
|
|
<action name="validate-all" timeout="5" />
|
|
<action name="meta-data" timeout="5" />
|
|
</actions>
|
|
</resource-agent>
|
|
END
|
|
}
|
|
|
|
#######################################################################
|
|
# Functions invoked by resource manager actions
|
|
|
|
dcorch_engine_worker_validate() {
|
|
local rc
|
|
|
|
check_binary $OCF_RESKEY_binary
|
|
check_binary curl
|
|
check_binary tr
|
|
check_binary grep
|
|
check_binary cut
|
|
check_binary head
|
|
|
|
# A config file on shared storage that is not available
|
|
# during probes is OK.
|
|
if [ ! -f $OCF_RESKEY_config ]; then
|
|
if ! ocf_is_probe; then
|
|
ocf_log err "Config $OCF_RESKEY_config doesn't exist"
|
|
return $OCF_ERR_INSTALLED
|
|
fi
|
|
ocf_log_warn "Config $OCF_RESKEY_config not available during a probe"
|
|
fi
|
|
|
|
getent passwd $OCF_RESKEY_user >/dev/null 2>&1
|
|
rc=$?
|
|
if [ $rc -ne 0 ]; then
|
|
ocf_log err "User $OCF_RESKEY_user doesn't exist"
|
|
return $OCF_ERR_INSTALLED
|
|
fi
|
|
|
|
true
|
|
}
|
|
|
|
dcorch_engine_worker_status() {
|
|
local pid
|
|
local rc
|
|
|
|
if [ ! -f $OCF_RESKEY_pid ]; then
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) is not running"
|
|
return $OCF_NOT_RUNNING
|
|
else
|
|
pid=`cat $OCF_RESKEY_pid`
|
|
fi
|
|
|
|
ocf_run -warn kill -s 0 $pid
|
|
rc=$?
|
|
if [ $rc -eq 0 ]; then
|
|
return $OCF_SUCCESS
|
|
else
|
|
ocf_log info "Old PID file found, but DC Orchestrator Engine-worker (dcorch-engine-worker) is not running"
|
|
rm -f $OCF_RESKEY_pid
|
|
return $OCF_NOT_RUNNING
|
|
fi
|
|
}
|
|
|
|
dcorch_engine_worker_monitor() {
|
|
local rc
|
|
|
|
dcorch_engine_worker_status
|
|
rc=$?
|
|
|
|
# If status returned anything but success, return that immediately
|
|
if [ $rc -ne $OCF_SUCCESS ]; then
|
|
return $rc
|
|
fi
|
|
|
|
# Further verify the service availibility.
|
|
|
|
ocf_log debug "DC Orchestrator Engine-worker (dcorch-engine-worker) monitor succeeded"
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
dcorch_engine_worker_start() {
|
|
local rc
|
|
|
|
dcorch_engine_worker_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_SUCCESS ]; then
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) already running"
|
|
return $OCF_SUCCESS
|
|
fi
|
|
|
|
# Change the working dir to /, to be sure it's accesible
|
|
cd /
|
|
|
|
# run the actual dcorch-engine-worker daemon. Don't use ocf_run as we're sending the tool's output
|
|
# straight to /dev/null anyway and using ocf_run would break stdout-redirection here.
|
|
su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
|
|
$OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
|
|
|
|
# Spin waiting for the server to come up.
|
|
# Let the CRM/LRM time us out if required
|
|
while true; do
|
|
dcorch_engine_worker_monitor
|
|
rc=$?
|
|
[ $rc -eq $OCF_SUCCESS ] && break
|
|
if [ $rc -ne $OCF_NOT_RUNNING ]; then
|
|
ocf_log err "DC Orchestrator Engine-worker (dcorch-engine-worker) start failed"
|
|
exit $OCF_ERR_GENERIC
|
|
fi
|
|
sleep 1
|
|
done
|
|
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) started"
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
dcorch_engine_worker_confirm_stop() {
|
|
local my_bin
|
|
local my_processes
|
|
|
|
my_binary=`which ${OCF_RESKEY_binary}`
|
|
my_processes=`pgrep -l -f "^(python|/usr/bin/python|/usr/bin/python3) ${my_binary}([^\w-]|$)"`
|
|
|
|
if [ -n "${my_processes}" ]
|
|
then
|
|
ocf_log info "About to SIGKILL the following: ${my_processes}"
|
|
pkill -KILL -f "^(python|/usr/bin/python|/usr/bin/python3) ${my_binary}([^\w-]|$)"
|
|
fi
|
|
}
|
|
|
|
dcorch_engine_worker_stop() {
|
|
local rc
|
|
local pid
|
|
|
|
dcorch_engine_worker_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) already stopped"
|
|
dcorch_engine_worker_confirm_stop
|
|
return $OCF_SUCCESS
|
|
fi
|
|
|
|
# Try SIGTERM
|
|
pid=`cat $OCF_RESKEY_pid`
|
|
ocf_run kill -s TERM $pid
|
|
rc=$?
|
|
if [ $rc -ne 0 ]; then
|
|
ocf_log err "DC Orchestrator Engine-worker (dcorch-engine-worker) couldn't be stopped"
|
|
dcorch_engine_worker_confirm_stop
|
|
exit $OCF_ERR_GENERIC
|
|
fi
|
|
|
|
# stop waiting
|
|
shutdown_timeout=15
|
|
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|
|
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
|
|
fi
|
|
count=0
|
|
while [ $count -lt $shutdown_timeout ]; do
|
|
dcorch_engine_worker_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
|
break
|
|
fi
|
|
count=`expr $count + 1`
|
|
sleep 1
|
|
ocf_log debug "DC Orchestrator Engine-worker (dcorch-engine-worker) still hasn't stopped yet. Waiting ..."
|
|
done
|
|
|
|
dcorch_engine_worker_status
|
|
rc=$?
|
|
if [ $rc -ne $OCF_NOT_RUNNING ]; then
|
|
# SIGTERM didn't help either, try SIGKILL
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) failed to stop after ${shutdown_timeout}s \
|
|
using SIGTERM. Trying SIGKILL ..."
|
|
ocf_run kill -s KILL $pid
|
|
fi
|
|
dcorch_engine_worker_confirm_stop
|
|
|
|
ocf_log info "DC Orchestrator Engine-worker (dcorch-engine-worker) stopped"
|
|
|
|
rm -f $OCF_RESKEY_pid
|
|
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
#######################################################################
|
|
|
|
case "$1" in
|
|
meta-data) meta_data
|
|
exit $OCF_SUCCESS;;
|
|
usage|help) usage
|
|
exit $OCF_SUCCESS;;
|
|
esac
|
|
|
|
# Anything except meta-data and help must pass validation
|
|
dcorch_engine_worker_validate || exit $?
|
|
|
|
# What kind of method was invoked?
|
|
case "$1" in
|
|
start) dcorch_engine_worker_start;;
|
|
stop) dcorch_engine_worker_stop;;
|
|
status) dcorch_engine_worker_status;;
|
|
monitor) dcorch_engine_worker_monitor;;
|
|
validate-all) ;;
|
|
*) usage
|
|
exit $OCF_ERR_UNIMPLEMENTED;;
|
|
esac
|
|
|