The DC OCF scripts were not updated over the switch to Debian
in StarlingX 8.0. As a result, it could lead to orphan processes
over the service restart or controller swact. The orphan processes
consume resources and perform duplicate/obsolete tasks (e.g.
auditing the same subclouds as the corresponding worker processes)
until their work queues are empty.
This commit fixes up the pgrep option to restore the functionality
of the confirm_stop function of the OCF script. Processes that
fail to be terminated will get killed.
Test Plan:
- Deploy a small DC system. Verify that all DC services can
be started, stopped and restarted by SM.
- Deploy a large DC system with many subclouds. Reduce the
thread_pool_size of dcmanager-audit-worker. Let the system
soak for a couple of hours. Restart the service in the
middle of the audit cycle. Verify that dcmanager-audit-worker
sevice was successfully restarted and there are no orphan
processes.
Closes-Bug: 2064368
Change-Id: Ie5cbc89cde374e32d4e0a3799a9f8833c071d206
Signed-off-by: Tee Ngo <tee.ngo@windriver.com>
328 lines
9.5 KiB
Bash
328 lines
9.5 KiB
Bash
#!/bin/sh
|
|
# OpenStack DC Manager State Service (dcmanager-state)
|
|
#
|
|
# Description:
|
|
# Manages an OpenStack DC Manager State Service (dcmanager-state)
|
|
# process as an HA resource
|
|
#
|
|
# Copyright (c) 2022,2024 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
#
|
|
# See usage() function below for more details ...
|
|
#
|
|
# OCF instance parameters:
|
|
# OCF_RESKEY_binary
|
|
# OCF_RESKEY_config
|
|
# OCF_RESKEY_user
|
|
# OCF_RESKEY_pid
|
|
# OCF_RESKEY_additional_parameters
|
|
#######################################################################
|
|
# Initialization:
|
|
|
|
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
|
|
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
|
|
|
|
#######################################################################
|
|
|
|
# Fill in some defaults if no values are specified
|
|
|
|
OCF_RESKEY_binary_default="/usr/bin/dcmanager-state"
|
|
OCF_RESKEY_config_default="/etc/dcmanager/dcmanager.conf"
|
|
OCF_RESKEY_user_default="root"
|
|
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
|
|
|
|
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
|
|
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
|
|
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
|
|
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
|
|
|
|
export TMPDIR=/var/run/dcmanager
|
|
|
|
#######################################################################
|
|
|
|
usage() {
|
|
cat <<UEND
|
|
usage: $0 (start|stop|validate-all|meta-data|status|monitor)
|
|
|
|
$0 manages an OpenStack DC Manager State service (dcmanager-state) process as an HA resource
|
|
|
|
The 'start' operation starts the dcmanager-state service.
|
|
The 'stop' operation stops the dcmanager-state service.
|
|
The 'validate-all' operation reports whether the parameters are valid
|
|
The 'meta-data' operation reports this RA's meta-data information
|
|
The 'status' operation reports whether the dcmanager-state service is running
|
|
The 'monitor' operation reports whether the dcmanager-state service seems to be working
|
|
|
|
UEND
|
|
}
|
|
|
|
meta_data() {
|
|
cat <<END
|
|
<?xml version="1.0"?>
|
|
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
|
|
<resource-agent name="dcmanager-state">
|
|
<version>1.0</version>
|
|
|
|
<longdesc lang="en">
|
|
Resource agent for the DC Manager service (dcmanager-state)
|
|
</longdesc>
|
|
<shortdesc lang="en">Manages the OpenStack DC Manager State Service (dcmanager-state)</shortdesc>
|
|
<parameters>
|
|
|
|
<parameter name="binary" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Location of the DC Manager State Service binary (dcmanager-state)
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Manager State Service binary (dcmanager-state)</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_binary_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="config" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Location of the DC Manager State Service (dcmanager-state) configuration file
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Manager State Service (dcmanager-state registry) config file</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_config_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="user" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
User running DC Manager State Service (dcmanager-state)
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Manager State Service (dcmanager-state) user</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_user_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="pid" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
The pid file to use for this DC Manager State Service (dcmanager-state) instance
|
|
</longdesc>
|
|
<shortdesc lang="en">DC Manager State Service (dcmanager-state) pid file</shortdesc>
|
|
<content type="string" default="${OCF_RESKEY_pid_default}" />
|
|
</parameter>
|
|
|
|
<parameter name="additional_parameters" unique="0" required="0">
|
|
<longdesc lang="en">
|
|
Additional parameters to pass on to the OpenStack NovaAPI (dcmanager-state)
|
|
</longdesc>
|
|
<shortdesc lang="en">Additional parameters for dcmanager-state</shortdesc>
|
|
<content type="string" />
|
|
</parameter>
|
|
|
|
</parameters>
|
|
|
|
<actions>
|
|
<action name="start" timeout="20" />
|
|
<action name="stop" timeout="20" />
|
|
<action name="status" timeout="20" />
|
|
<action name="monitor" timeout="10" interval="5" />
|
|
<action name="validate-all" timeout="5" />
|
|
<action name="meta-data" timeout="5" />
|
|
</actions>
|
|
</resource-agent>
|
|
END
|
|
}
|
|
|
|
#######################################################################
|
|
# Functions invoked by resource manager actions
|
|
|
|
dcmanager_state_validate() {
|
|
local rc
|
|
|
|
check_binary $OCF_RESKEY_binary
|
|
check_binary curl
|
|
check_binary tr
|
|
check_binary grep
|
|
check_binary cut
|
|
check_binary head
|
|
|
|
# A config file on shared storage that is not available
|
|
# during probes is OK.
|
|
if [ ! -f $OCF_RESKEY_config ]; then
|
|
if ! ocf_is_probe; then
|
|
ocf_log err "Config $OCF_RESKEY_config doesn't exist"
|
|
return $OCF_ERR_INSTALLED
|
|
fi
|
|
ocf_log_warn "Config $OCF_RESKEY_config not available during a probe"
|
|
fi
|
|
|
|
getent passwd $OCF_RESKEY_user >/dev/null 2>&1
|
|
rc=$?
|
|
if [ $rc -ne 0 ]; then
|
|
ocf_log err "User $OCF_RESKEY_user doesn't exist"
|
|
return $OCF_ERR_INSTALLED
|
|
fi
|
|
|
|
true
|
|
}
|
|
|
|
dcmanager_state_status() {
|
|
local pid
|
|
local rc
|
|
|
|
if [ ! -f $OCF_RESKEY_pid ]; then
|
|
ocf_log info "DC Manager State Service (dcmanager-state) is not running"
|
|
return $OCF_NOT_RUNNING
|
|
else
|
|
pid=`cat $OCF_RESKEY_pid`
|
|
fi
|
|
|
|
ocf_run -warn kill -s 0 $pid
|
|
rc=$?
|
|
if [ $rc -eq 0 ]; then
|
|
return $OCF_SUCCESS
|
|
else
|
|
ocf_log info "Old PID file found, but DC Manager State Service (dcmanager-state) is not running"
|
|
rm -f $OCF_RESKEY_pid
|
|
return $OCF_NOT_RUNNING
|
|
fi
|
|
}
|
|
|
|
dcmanager_state_monitor() {
|
|
local rc
|
|
|
|
dcmanager_state_status
|
|
rc=$?
|
|
|
|
# If status returned anything but success, return that immediately
|
|
if [ $rc -ne $OCF_SUCCESS ]; then
|
|
return $rc
|
|
fi
|
|
|
|
# Further verify the service availibility.
|
|
|
|
ocf_log debug "DC Manager State Service (dcmanager-state) monitor succeeded"
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
dcmanager_state_start() {
|
|
local rc
|
|
|
|
dcmanager_state_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_SUCCESS ]; then
|
|
ocf_log info "DC Manager State Service (dcmanager-state) already running"
|
|
return $OCF_SUCCESS
|
|
fi
|
|
|
|
# Change the working dir to /, to be sure it's accesible
|
|
cd /
|
|
|
|
# run the actual dcmanager-state daemon. Don't use ocf_run as we're sending the tool's output
|
|
# straight to /dev/null anyway and using ocf_run would break stdout-redirection here.
|
|
su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
|
|
$OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
|
|
|
|
# Spin waiting for the server to come up.
|
|
# Let the CRM/LRM time us out if required
|
|
while true; do
|
|
dcmanager_state_monitor
|
|
rc=$?
|
|
[ $rc -eq $OCF_SUCCESS ] && break
|
|
if [ $rc -ne $OCF_NOT_RUNNING ]; then
|
|
ocf_log err "DC Manager State Service (dcmanager-state) start failed"
|
|
exit $OCF_ERR_GENERIC
|
|
fi
|
|
sleep 1
|
|
done
|
|
|
|
ocf_log info "DC Manager State Service (dcmanager-state) started"
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
dcmanager_state_confirm_stop() {
|
|
local my_bin
|
|
local my_processes
|
|
|
|
my_binary=`which ${OCF_RESKEY_binary}`
|
|
my_processes=`pgrep -l -f "^(python|/usr/bin/python|/usr/bin/python3) ${my_binary}([^\w-]|$)"`
|
|
|
|
if [ -n "${my_processes}" ]
|
|
then
|
|
ocf_log info "About to SIGKILL the following: ${my_processes}"
|
|
pkill -KILL -f "^(python|/usr/bin/python|/usr/bin/python3) ${my_binary}([^\w-]|$)"
|
|
fi
|
|
}
|
|
|
|
dcmanager_state_stop() {
|
|
local rc
|
|
local pid
|
|
|
|
dcmanager_state_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
|
ocf_log info "DC Manager State Service (dcmanager-state) already stopped"
|
|
dcmanager_state_confirm_stop
|
|
return $OCF_SUCCESS
|
|
fi
|
|
|
|
# Try SIGTERM
|
|
pid=`cat $OCF_RESKEY_pid`
|
|
ocf_run kill -s TERM $pid
|
|
rc=$?
|
|
if [ $rc -ne 0 ]; then
|
|
ocf_log err "DC Manager State Service (dcmanager-state) couldn't be stopped"
|
|
dcmanager_state_confirm_stop
|
|
exit $OCF_ERR_GENERIC
|
|
fi
|
|
|
|
# stop waiting
|
|
shutdown_timeout=15
|
|
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
|
|
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
|
|
fi
|
|
count=0
|
|
while [ $count -lt $shutdown_timeout ]; do
|
|
dcmanager_state_status
|
|
rc=$?
|
|
if [ $rc -eq $OCF_NOT_RUNNING ]; then
|
|
break
|
|
fi
|
|
count=`expr $count + 1`
|
|
sleep 1
|
|
ocf_log debug "DC Manager State Service (dcmanager-state) still hasn't stopped yet. Waiting ..."
|
|
done
|
|
|
|
dcmanager_state_status
|
|
rc=$?
|
|
if [ $rc -ne $OCF_NOT_RUNNING ]; then
|
|
# SIGTERM didn't help either, try SIGKILL
|
|
ocf_log info "DC Manager State Service (dcmanager-state) failed to stop after ${shutdown_timeout}s \
|
|
using SIGTERM. Trying SIGKILL ..."
|
|
ocf_run kill -s KILL $pid
|
|
fi
|
|
dcmanager_state_confirm_stop
|
|
|
|
ocf_log info "DC Manager State Service (dcmanager-state) stopped"
|
|
|
|
rm -f $OCF_RESKEY_pid
|
|
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
#######################################################################
|
|
|
|
case "$1" in
|
|
meta-data) meta_data
|
|
exit $OCF_SUCCESS;;
|
|
usage|help) usage
|
|
exit $OCF_SUCCESS;;
|
|
esac
|
|
|
|
# Anything except meta-data and help must pass validation
|
|
dcmanager_state_validate || exit $?
|
|
|
|
# What kind of method was invoked?
|
|
case "$1" in
|
|
start) dcmanager_state_start;;
|
|
stop) dcmanager_state_stop;;
|
|
status) dcmanager_state_status;;
|
|
monitor) dcmanager_state_monitor;;
|
|
validate-all) ;;
|
|
*) usage
|
|
exit $OCF_ERR_UNIMPLEMENTED;;
|
|
esac
|
|
|