distcloud/distributedcloud/ocf/dcmanager-manager
Andy Ning e8d916756d Set up /var/run/dcmanager as dcmanager's default temp files location
dcmanager call the k8s python client to perform a number of
operations. The k8s python client creates temp files under /tmp and
continues use these tmp files for the life-cycle of the processes.

However systemd-tmpfiles-clean.service will run every day to clean up
files in /tmp dir that are older than 10 days. If the k8s client code
is not triggered for more than 10 days (thus its temp files are not
accessed for more than 10 days), these temp files will be removed as
part of the cleanup. Certain dcmanager operations then starts to fail
with an error that the tmp file is no longer there.

This is a known issue of kubernetes python client:
https://github.com/kubernetes-client/python/issues/765

The commit fixes this issue by setting TMPDIR to /var/run/dcmanager
when sm starts dcmanager-manager.

Change-Id: Ib147c2ab26e303032e18da51a506e3768bc471e0
Closes-Bug: 1883599
Signed-off-by: Andy Ning <andy.ning@windriver.com>
2020-06-17 10:40:23 -04:00

328 lines
9.5 KiB
Bash

#!/bin/sh
# OpenStack DC Manager Service (dcmanager-manager)
#
# Description:
# Manages an OpenStack DC Manager Service (dcmanager-manager)
# process as an HA resource
#
# Copyright (c) 2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# See usage() function below for more details ...
#
# OCF instance parameters:
# OCF_RESKEY_binary
# OCF_RESKEY_config
# OCF_RESKEY_user
# OCF_RESKEY_pid
# OCF_RESKEY_additional_parameters
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#######################################################################
# Fill in some defaults if no values are specified
OCF_RESKEY_binary_default="/usr/bin/dcmanager-manager"
OCF_RESKEY_config_default="/etc/dcmanager/dcmanager.conf"
OCF_RESKEY_user_default="root"
OCF_RESKEY_pid_default="$HA_RSCTMP/$OCF_RESOURCE_INSTANCE.pid"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
export TMPDIR=/var/run/dcmanager
#######################################################################
usage() {
cat <<UEND
usage: $0 (start|stop|validate-all|meta-data|status|monitor)
$0 manages an OpenStack DC Orchestrator Api Proxy service (dcmanager-manager) process as an HA resource
The 'start' operation starts the dcmanager-manager service.
The 'stop' operation stops the dcmanager-manager service.
The 'validate-all' operation reports whether the parameters are valid
The 'meta-data' operation reports this RA's meta-data information
The 'status' operation reports whether the dcmanager-manager service is running
The 'monitor' operation reports whether the dcmanager-manager service seems to be working
UEND
}
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="dcmanager-manager">
<version>1.0</version>
<longdesc lang="en">
Resource agent for the DC Manager service (dcmanager-manager)
</longdesc>
<shortdesc lang="en">Manages the OpenStack DC Manager Service (dcmanager-manager)</shortdesc>
<parameters>
<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the DC Manager Service binary (dcmanager-manager)
</longdesc>
<shortdesc lang="en">DC Manager Service binary (dcmanager-manager)</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>
<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Location of the DC Manager Service (dcmanager-manager) configuration file
</longdesc>
<shortdesc lang="en">DC Manager Service (dcmanager-manager registry) config file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>
<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running DC Manager Service (dcmanager-manager)
</longdesc>
<shortdesc lang="en">DC Manager Service (dcmanager-manager) user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>
<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pid file to use for this DC Manager Service (dcmanager-manager) instance
</longdesc>
<shortdesc lang="en">DC Manager Service (dcmanager-manager) pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>
<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters to pass on to the OpenStack NovaAPI (dcmanager-manager)
</longdesc>
<shortdesc lang="en">Additional parameters for dcmanager-manager</shortdesc>
<content type="string" />
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="status" timeout="20" />
<action name="monitor" timeout="10" interval="5" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}
#######################################################################
# Functions invoked by resource manager actions
dcmanager_manager_validate() {
local rc
check_binary $OCF_RESKEY_binary
check_binary curl
check_binary tr
check_binary grep
check_binary cut
check_binary head
# A config file on shared storage that is not available
# during probes is OK.
if [ ! -f $OCF_RESKEY_config ]; then
if ! ocf_is_probe; then
ocf_log err "Config $OCF_RESKEY_config doesn't exist"
return $OCF_ERR_INSTALLED
fi
ocf_log_warn "Config $OCF_RESKEY_config not available during a probe"
fi
getent passwd $OCF_RESKEY_user >/dev/null 2>&1
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "User $OCF_RESKEY_user doesn't exist"
return $OCF_ERR_INSTALLED
fi
true
}
dcmanager_manager_status() {
local pid
local rc
if [ ! -f $OCF_RESKEY_pid ]; then
ocf_log info "DC Manager Service (dcmanager-manager) is not running"
return $OCF_NOT_RUNNING
else
pid=`cat $OCF_RESKEY_pid`
fi
ocf_run -warn kill -s 0 $pid
rc=$?
if [ $rc -eq 0 ]; then
return $OCF_SUCCESS
else
ocf_log info "Old PID file found, but DC Manager Service (dcmanager-manager) is not running"
rm -f $OCF_RESKEY_pid
return $OCF_NOT_RUNNING
fi
}
dcmanager_manager_monitor() {
local rc
dcmanager_manager_status
rc=$?
# If status returned anything but success, return that immediately
if [ $rc -ne $OCF_SUCCESS ]; then
return $rc
fi
# Further verify the service availibility.
ocf_log debug "DC Manager Service (dcmanager-manager) monitor succeeded"
return $OCF_SUCCESS
}
dcmanager_manager_start() {
local rc
dcmanager_manager_status
rc=$?
if [ $rc -eq $OCF_SUCCESS ]; then
ocf_log info "DC Manager Service (dcmanager-manager) already running"
return $OCF_SUCCESS
fi
# Change the working dir to /, to be sure it's accesible
cd /
# run the actual dcmanager-manager daemon. Don't use ocf_run as we're sending the tool's output
# straight to /dev/null anyway and using ocf_run would break stdout-redirection here.
su ${OCF_RESKEY_user} -s /bin/sh -c "${OCF_RESKEY_binary} --config-file=$OCF_RESKEY_config \
$OCF_RESKEY_additional_parameters"' >> /dev/null 2>&1 & echo $!' > $OCF_RESKEY_pid
# Spin waiting for the server to come up.
# Let the CRM/LRM time us out if required
while true; do
dcmanager_manager_monitor
rc=$?
[ $rc -eq $OCF_SUCCESS ] && break
if [ $rc -ne $OCF_NOT_RUNNING ]; then
ocf_log err "DC Manager Service (dcmanager-manager) start failed"
exit $OCF_ERR_GENERIC
fi
sleep 1
done
ocf_log info "DC Manager Service (dcmanager-manager) started"
return $OCF_SUCCESS
}
dcmanager_manager_confirm_stop() {
local my_bin
local my_processes
my_binary=`which ${OCF_RESKEY_binary}`
my_processes=`pgrep -l -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"`
if [ -n "${my_processes}" ]
then
ocf_log info "About to SIGKILL the following: ${my_processes}"
pkill -KILL -f "^(python|/usr/bin/python|/usr/bin/python2) ${my_binary}([^\w-]|$)"
fi
}
dcmanager_manager_stop() {
local rc
local pid
dcmanager_manager_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
ocf_log info "DC Manager Service (dcmanager-manager) already stopped"
dcmanager_manager_confirm_stop
return $OCF_SUCCESS
fi
# Try SIGTERM
pid=`cat $OCF_RESKEY_pid`
ocf_run kill -s TERM $pid
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "DC Manager Service (dcmanager-manager) couldn't be stopped"
dcmanager_manager_confirm_stop
exit $OCF_ERR_GENERIC
fi
# stop waiting
shutdown_timeout=15
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
fi
count=0
while [ $count -lt $shutdown_timeout ]; do
dcmanager_manager_status
rc=$?
if [ $rc -eq $OCF_NOT_RUNNING ]; then
break
fi
count=`expr $count + 1`
sleep 1
ocf_log debug "DC Manager Service (dcmanager-manager) still hasn't stopped yet. Waiting ..."
done
dcmanager_manager_status
rc=$?
if [ $rc -ne $OCF_NOT_RUNNING ]; then
# SIGTERM didn't help either, try SIGKILL
ocf_log info "DC Manager Service (dcmanager-manager) failed to stop after ${shutdown_timeout}s \
using SIGTERM. Trying SIGKILL ..."
ocf_run kill -s KILL $pid
fi
dcmanager_manager_confirm_stop
ocf_log info "DC Manager Service (dcmanager-manager) stopped"
rm -f $OCF_RESKEY_pid
return $OCF_SUCCESS
}
#######################################################################
case "$1" in
meta-data) meta_data
exit $OCF_SUCCESS;;
usage|help) usage
exit $OCF_SUCCESS;;
esac
# Anything except meta-data and help must pass validation
dcmanager_manager_validate || exit $?
# What kind of method was invoked?
case "$1" in
start) dcmanager_manager_start;;
stop) dcmanager_manager_stop;;
status) dcmanager_manager_status;;
monitor) dcmanager_manager_monitor;;
validate-all) ;;
*) usage
exit $OCF_ERR_UNIMPLEMENTED;;
esac