metal/mtce/src/scripts/mtcAgent

526 lines
14 KiB
Bash
Executable File

#!/bin/sh
#
# Copyright (c) 2013-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
#
# Support: www.windriver.com
#
# Purpose: This resource agent manages the Titanium Cloud Controller
#          Maintenance Daemon (mtcAgent).
#
#
# RA Spec:
#
# http://www.opencf.org/cgi-bin/viewcvs.cgi/specs/ra/resource-agent-api.txt?rev=HEAD
#
#######################################################################
# Initialization:
# Resolve the OCF helper directory (a value already present in the
# environment wins) and source the shared OCF shell helpers from it.
: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"
#######################################################################
# Resource parameters.
# Each OCF_RESKEY_<name> keeps whatever value the caller exported (even an
# empty one, since '=' rather than ':=' is used) and otherwise falls back to
# the matching OCF_RESKEY_<name>_default declared right above it.
OCF_RESKEY_binary_default="mtcAgent"
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"

OCF_RESKEY_config_default="/etc/mtc.ini"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"

OCF_RESKEY_dbg_default="false"
: "${OCF_RESKEY_dbg=${OCF_RESKEY_dbg_default}}"

OCF_RESKEY_logging_default="true"
: "${OCF_RESKEY_logging=${OCF_RESKEY_logging_default}}"

OCF_RESKEY_mode_default="normal"
: "${OCF_RESKEY_mode=${OCF_RESKEY_mode_default}}"

OCF_RESKEY_user_default="admin"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"

OCF_RESKEY_pid_default="/var/run/mtcAgent.pid"
: "${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}"

OCF_RESKEY_state_default="standby"
: "${OCF_RESKEY_state=${OCF_RESKEY_state_default}}"

# Paths derived from the (possibly overridden) binary name.
mydaemon="/usr/local/bin/${OCF_RESKEY_binary}"
statusfile="/var/run/${OCF_RESKEY_binary}.info"
#######################################################################
# Write the command synopsis, followed by a one-line summary of every
# operation, to stdout. $0 is expanded so the text names the script as it
# was invoked.
usage() {
    cat <<USAGE_TEXT
usage: $0 (start|stop|reload|status|monitor|validate-all|meta-data)
$0 manages the Platform's Controller Maintenance (mtcAgent) process as an HA resource
The 'start' ..... operation starts the maintenance service in the active state.
The 'stop' ...... operation stops the maintenance service.
The 'reload' .... operation stops and then starts the maintenance service.
The 'status' .... operation checks the status of the maintenance service.
The 'monitor' ... operation indicates the in-service status of the maintenance service.
The 'validate-all' operation reports whether the parameters are valid.
The 'meta-data' . operation reports the mtcAgent's meta-data information.
USAGE_TEXT
}
#######################################################################
# Emit the resource agent's OCF meta-data XML document on stdout.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_state_default,
#               OCF_RESKEY_mode_default, OCF_RESKEY_logging_default,
#               OCF_RESKEY_dbg_default, OCF_SUCCESS
# Returns:      OCF_SUCCESS
#
# The here-document delimiter is unquoted on purpose so that the
# *_default values are expanded into the <content default="..."/> attributes.
meta_data() {
    # Quoted so that an empty dbg value compares as "not true" instead of
    # producing a '[' syntax error on stderr.
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "mtcAgent:meta_data"
    fi
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mtcAgent">
<version>1.0</version>
<longdesc lang="en">
This 'mtcAgent' is an OCF Compliant Resource Agent that manages start, stop
and in-service monitoring of the Host Maintenance Process on Wind River's
Titanium Cloud in the active mode.
</longdesc>
<shortdesc lang="en">
Manages the Titanium Cloud's Maintenance (mtcAgent) Daemon.
</shortdesc>
<parameters>
<parameter name="state" unique="0" required="0">
<longdesc lang="en">
state = standby ... run maintenance daemon in 'standby' mode (default)
state = active ... run maintenance daemon in 'active' mode
</longdesc>
<shortdesc lang="en">Maintenance Activity State Option</shortdesc>
<content type="string" default="${OCF_RESKEY_state_default}"/>
</parameter>
<parameter name="mode" unique="0" required="0">
<longdesc lang="en">
mode = normal ... run maintenance daemon in 'normal' mode (default)
mode = passive ... run maintenance daemon in 'passive' mode
</longdesc>
<shortdesc lang="en">Maintenance Mode Option</shortdesc>
<content type="string" default="${OCF_RESKEY_mode_default}"/>
</parameter>
<parameter name="logging" unique="0" required="0">
<longdesc lang="en">
This option is used to direct the mtcAgent daemon log stream.
logging = true ... /var/log/mtcAgent.log (default)
logging = false ... /dev/null
See also debug option which sets the verbosity of logging.
</longdesc>
<shortdesc lang="en">Service Logging Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_logging_default}"/>
</parameter>
<parameter name="dbg" unique="0" required="0">
<longdesc lang="en">
dbg = false ... info, warn and err logs sent to output stream (default)
dbg = true ... Additional dbg logs are also sent to the output stream
</longdesc>
<shortdesc lang="en">Service Debug Control Option</shortdesc>
<content type="boolean" default="${OCF_RESKEY_dbg_default}"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="10s" />
<action name="monitor" timeout="10s" interval="300s" />
<action name="meta-data" timeout="10s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
    return ${OCF_SUCCESS}
}
# Validate the agent's environment: required binaries and the ini file.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary, OCF_RESKEY_config
# Returns:      OCF_SUCCESS when the configuration file exists,
#               OCF_ERR_CONFIGURED when it does not.
#
# check_binary comes from the sourced OCF helper library; presumably it
# terminates the agent itself when a binary is missing - confirm against
# ocf-shellfuncs.
mtcAgent_validate() {
    local msg

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "mtcAgent:validate"
    fi

    check_binary "/usr/local/bin/${OCF_RESKEY_binary}"
    check_binary "/usr/local/bin/hbsAgent"
    check_binary "/usr/local/bin/mtcClient"
    check_binary "/usr/local/bin/hbsClient"
    check_binary sysinv-api
    check_binary pidof

    # The path must be quoted: unquoted, an empty value collapses the test
    # to '[ ! -f ]', which is false, so a missing configuration would be
    # accepted; a path containing spaces would break the test the same way.
    if [ ! -f "${OCF_RESKEY_config}" ] ; then
        msg="${OCF_RESKEY_binary} ini file missing ${OCF_RESKEY_config}"
        ocf_log err "${msg}"
        return ${OCF_ERR_CONFIGURED}
    fi
    return ${OCF_SUCCESS}
}
# Log the scheduler statistics and kernel stack of the daemon named in the
# pid file, one ocf_log info line per line of /proc/<pid>/sched and
# /proc/<pid>/stack. Intended as diagnostics for an unresponsive daemon.
#
# Globals read: OCF_RESKEY_pid
# Outputs:      "stack file: <path>" on stdout; everything else via ocf_log
# Returns:      0; nothing is logged when the pid file is missing or empty.
log_procfs() {
    # Locals keep the caller's 'pid' (a global in the other functions) intact.
    local pid line sched_file stack_file contents

    [ -r "${OCF_RESKEY_pid}" ] || return 0
    pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    [ -n "${pid}" ] || return 0

    sched_file="/proc/${pid}/sched"
    if [ -r "${sched_file}" ] ; then
        # stderr is captured too so that a read error ends up in the log
        contents="$(cat "${sched_file}" 2>&1)"
        # read -r: keep backslashes in the procfs text verbatim
        printf '%s\n' "${contents}" | while read -r line ; do
            ocf_log info "sched: ${line}"
        done
    fi

    stack_file="/proc/${pid}/stack"
    echo "stack file: ${stack_file}"
    if [ -r "${stack_file}" ] ; then
        contents="$(cat "${stack_file}" 2>&1)"
        printf '%s\n' "${contents}" | while read -r line ; do
            ocf_log info "stack: ${line}"
        done
    fi
    return 0
}
# Total worst case timeout of this status check is 13 seconds
# (up to 3 x 1s waiting for the pid file plus 10 x 1s waiting for the
# status file). This is 2 seconds under SM's default 15 second timeout.
#
# In-service test: remove the status file, send USR1 to the daemon named in
# the pid file and wait for the status file to reappear.
#
# Globals read:    OCF_RESKEY_dbg, OCF_RESKEY_pid, OCF_RESKEY_binary,
#                  statusfile
# Globals written: proc, pid
# Returns:         OCF_SUCCESS     status file appeared
#                  OCF_ERR_GENERIC process exists but never produced status
#                  OCF_NOT_RUNNING no pid file / no such process
mtcAgent_status () {
    local loop pid_stat log_sig

    proc="mtcAgent:status"
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "mtcAgent:status"
    fi

    # remove the status file before we request a new
    rm -f "${statusfile}"

    # Verify the pid file exists as part of status (wait up to 3 seconds)
    loop=0
    while [ "${loop}" -lt 3 ] ; do
        if [ -f "${OCF_RESKEY_pid}" ] ; then
            break
        fi
        sleep 1
        loop=$((loop + 1))
    done

    # See if the daemon is running. A missing or empty pid file simply
    # means "not running"; do not let cat/kill complain about it.
    pid=""
    if [ -f "${OCF_RESKEY_pid}" ] ; then
        pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    fi
    if [ -z "${pid}" ] || ! kill -0 "${pid}" 2> /dev/null ; then
        return ${OCF_NOT_RUNNING}
    fi

    log_sig="${OCF_RESKEY_binary} In-Service Active Monitor Test"

    # Ask the daemon to produce status
    ocf_run kill -s USR1 "${pid}"

    # Wait for the response
    loop=0
    while [ "${loop}" -lt 10 ] ; do
        sleep 1
        if [ -f "${statusfile}" ] ; then
            ocf_log info "${log_sig} Passed ($loop)"
            return ${OCF_SUCCESS}
        elif [ "${loop}" -eq 5 ] ; then
            # send the signal again
            ocf_run kill -s USR1 "${pid}"
            pid_stat="$(cat "/proc/${pid}/stat")"
            ocf_log notice "${log_sig} is slow to respond"
            ocf_log notice "$pid_stat"
        elif [ "${loop}" -eq 8 ] ; then
            pid_stat="$(cat "/proc/${pid}/stat")"
            ocf_log warn "${log_sig} is very slow to respond"
            ocf_log warn "$pid_stat"
        fi
        loop=$((loop + 1))
    done

    # No response at all: capture scheduler/stack diagnostics, then fail
    log_procfs
    ocf_log err "${log_sig} Failed"
    return ${OCF_ERR_GENERIC}
}
# OCF 'monitor' operation: report OCF_NOT_RUNNING when no live process is
# recorded in the pid file, otherwise defer to the full in-service test in
# mtcAgent_status and return its result.
#
# Globals read:    OCF_RESKEY_dbg, OCF_RESKEY_pid, OCF_RESKEY_binary
# Globals written: proc, pid
mtcAgent_monitor () {
    proc="mtcAgent:monitor"
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${proc}"
    fi

    # Uncomment if you want the monitor function to force-pass
    # return ${OCF_SUCCESS}

    # A missing or empty pid file is the normal "stopped" case, so read it
    # quietly rather than letting cat report an error on every monitor pass.
    pid=""
    if [ -f "${OCF_RESKEY_pid}" ] ; then
        pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    fi
    if [ -z "${pid}" ] || ! kill -0 "${pid}" 2> /dev/null ; then
        if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
            ocf_log info "${proc} called while ${OCF_RESKEY_binary} not running."
        fi
        return ${OCF_NOT_RUNNING}
    fi

    mtcAgent_status
    return $?
}
# OCF 'start' operation.
#
# If a live instance is already recorded in the pid file it is health
# checked with mtcAgent_status: a healthy instance is left alone and start
# succeeds; an unhealthy one is stopped and a fresh daemon is launched.
#
# Globals read:    OCF_RESKEY_dbg, OCF_RESKEY_pid, OCF_RESKEY_binary,
#                  OCF_RESKEY_state, OCF_RESKEY_mode, OCF_RESKEY_logging,
#                  mydaemon, statusfile
# Globals written: start_proc, pid
# Returns:         OCF_SUCCESS        daemon running (already, or just started)
#                  OCF_RUNNING_MASTER an existing instance could not be stopped
#                  OCF_NOT_RUNNING    any launch or startup-health failure
mtcAgent_start () {
    local rc loop

    start_proc="mtcAgent:start"
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${start_proc}"
    fi

    # Uncomment if you want the start function to force-pass without starting
    # return ${OCF_SUCCESS}

    # If running then issue a ping test
    pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    if [ -n "${pid}" ] && kill -0 "${pid}" 2> /dev/null ; then
        mtcAgent_status
        rc=$?
        if [ "${rc}" -ne "${OCF_SUCCESS}" ] ; then
            ocf_log err "${start_proc} ping test failed rc=${rc}"
            mtcAgent_stop
        else
            # Spec says to return success if process is already running for start
            pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
            if [ -n "${pid}" ] && kill -0 "${pid}" 2> /dev/null ; then
                ocf_log info "${start_proc} called while ${OCF_RESKEY_binary} is already running"
                return ${OCF_SUCCESS}
            fi
        fi
    fi

    # should not be running now or error
    pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    if [ -n "${pid}" ] && kill -0 "${pid}" 2> /dev/null ; then
        ocf_log err "${start_proc} cannot kill off existing instance of ${OCF_RESKEY_binary}"
        return ${OCF_RUNNING_MASTER}
    fi

    rm -f "${statusfile}"

    # Build the daemon's argument list in the function's positional
    # parameters (state, logging, mode, debug - in that order) so that no
    # option depends on word-splitting of a string.
    set --
    if [ "${OCF_RESKEY_state}" = "active" ] ; then
        set -- "$@" -a
    fi
    if [ "${OCF_RESKEY_logging}" = "true" ] ; then
        set -- "$@" -l
    fi
    if [ "${OCF_RESKEY_mode}" = "passive" ] ; then
        set -- "$@" -p
    fi
    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        set -- "$@" -d debug
    fi

    # default PID to null
    pid=""

    # Try to Start the daemon
    "${mydaemon}" "$@"
    rc=$?

    # verify it was started and set return code appropriately
    if [ "${rc}" -eq "${OCF_SUCCESS}" ] ; then
        # Verify the pid file exists as part of status (wait up to 3 seconds)
        loop=0
        while [ "${loop}" -lt 3 ] ; do
            if [ -f "${OCF_RESKEY_pid}" ] ; then
                break
            fi
            ocf_log info "${start_proc} waiting ... loop=${loop}"
            sleep 1
            loop=$((loop + 1))
        done

        pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
        if [ -z "${pid}" ] || ! kill -0 "${pid}" 2> /dev/null ; then
            rc=${OCF_FAILED_MASTER}
        elif [ ! -f "${statusfile}" ] ; then
            # The status file was removed before the launch above, so its
            # absence here fails the start. No spaces around '=': with
            # spaces the shell runs a command named 'rc' and the failure
            # is silently lost.
            ocf_log info "mtcAgent: Startup Health Test Failed - missing info"
            rc=${OCF_ERR_GENERIC}
        fi
    else
        ocf_log info "${start_proc} failed ${mydaemon} daemon rc=${rc}"
        rc=${OCF_ERR_GENERIC}
    fi

    # Record success or failure and return status
    if [ "${rc}" -eq "${OCF_SUCCESS}" ] ; then
        ocf_log info "${start_proc}ed pid=${pid}"
    else
        ocf_log err "${start_proc} failed rc=${rc}"
        # every start failure is reported to the caller as "not running"
        rc=${OCF_NOT_RUNNING}
    fi
    return ${rc}
}
# Last-resort cleanup after a stop attempt: any process still reported by
# pidof for the daemon binary is sent SIGKILL, then the pid file is removed.
# ${pid} is expanded unquoted on purpose - pidof may print several pids.
mtcAgent_confirm_stop () {
    proc="mtcAgent:confirm_stop"
    ocf_log info "${proc}"

    pid=$(pidof ${OCF_RESKEY_binary})
    if kill -0 ${pid} 2> /dev/null ; then
        ocf_log info "${proc} 'kill -9 ${pid}'"
        kill -9 ${pid}
        ocf_log info "${proc}ed (by emergency kill -9 ${pid})"
        sleep 1
    fi

    rm -f ${OCF_RESKEY_pid}
}
# OCF 'stop' operation.
#
# Sends SIGINT to the pid recorded in the pid file, up to three times one
# second apart, and then always hands over to mtcAgent_confirm_stop for the
# final cleanup.
#
# Globals read:    OCF_RESKEY_binary, OCF_RESKEY_pid
# Globals written: proc, pid
# Returns:         OCF_SUCCESS (always)
mtcAgent_stop () {
    local loop max

    proc="mtcAgent:stop"

    # See if any instance of the binary is running (by name, not pid file).
    # pidof may print several pids, so ${pid} is deliberately left unquoted
    # in the kill below.
    pid=$(pidof "${OCF_RESKEY_binary}")
    ocf_log info "${proc} PID:${pid}"
    # shellcheck disable=SC2086
    if [ -z "${pid}" ] || ! kill -0 ${pid} 2> /dev/null ; then
        ocf_log info "${proc} called while already stopped (no process)"
        mtcAgent_confirm_stop
        return ${OCF_SUCCESS}
    fi

    max=3
    loop=0
    while [ "${loop}" -lt "${max}" ] ; do
        # verify stop with pidfile
        if [ -f "${OCF_RESKEY_pid}" ] ; then
            pid="$(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
            if [ -z "${pid}" ] ; then
                # pid file is empty: nothing left to signal, we are done
                ocf_log info "${proc}ped (by -int)"
                break
            elif ! kill -0 "${pid}" 2> /dev/null ; then
                # the recorded process is gone
                ocf_log info "${proc}ped (by pid)"
                break
            else
                # still alive: ask it to terminate and give it a second
                ocf_log info "${proc}ping (by -int - loop:${loop})"
                kill -s INT "${pid}"
                sleep 1
            fi
        fi
        loop=$((loop + 1))
    done

    mtcAgent_confirm_stop
    return ${OCF_SUCCESS}
}
# OCF 'reload' operation: stop the daemon and, if that succeeded, start it
# again.
#
# Globals read: OCF_RESKEY_dbg, OCF_RESKEY_binary
# Returns:      OCF_SUCCESS when both stop and start succeeded, otherwise
#               the return code of the step that failed.
mtcAgent_reload () {
    local rc msg
    # Kept in a local: mtcAgent_stop overwrites the global 'proc', which
    # would otherwise corrupt the "reloaded" message below.
    local reload_proc="mtcAgent:reload"

    if [ "${OCF_RESKEY_dbg}" = "true" ] ; then
        ocf_log info "${reload_proc}"
    fi

    mtcAgent_stop
    rc=$?
    if [ "${rc}" -eq "${OCF_SUCCESS}" ] ; then
        mtcAgent_start
        rc=$?
        if [ "${rc}" -eq "${OCF_SUCCESS}" ] ; then
            msg="${reload_proc}ed"
            ocf_log info "${msg}"
        fi
    fi

    if [ "${rc}" -ne "${OCF_SUCCESS}" ] ; then
        msg="${OCF_RESKEY_binary}: failed to restart rc=${rc}"
        ocf_log err "${msg}"
    fi
    return ${rc}
}
# Action dispatch.
# __OCF_ACTION is not assigned anywhere in this script; it is presumably set
# from the first command-line argument by the sourced OCF helpers - confirm.
# It is quoted everywhere so that an empty or unset action falls through to
# the usage/unimplemented arm instead of tripping a '[' syntax error.

# meta-data and help are answered without validating the environment.
case "${__OCF_ACTION}" in
    meta-data)
        meta_data
        exit ${OCF_SUCCESS}
        ;;
    usage|help)
        usage
        exit ${OCF_SUCCESS}
        ;;
esac

# monitor is announced at debug level only, every other action at info.
if [ "${__OCF_ACTION}" = "monitor" ] ; then
    ocf_log debug "mtcAgent:${__OCF_ACTION} action"
else
    ocf_log info "mtcAgent:${__OCF_ACTION} action"
fi

# Anything except meta-data and help must pass validation
mtcAgent_validate || exit $?

# The handler's return status is the last status seen at this point.
case "${__OCF_ACTION}" in
    start)
        mtcAgent_start
        ;;
    stop)
        mtcAgent_stop
        ;;
    status)
        mtcAgent_status
        ;;
    reload)
        mtcAgent_reload
        ;;
    monitor)
        mtcAgent_monitor
        ;;
    validate-all)
        mtcAgent_validate
        ;;
    *)
        usage
        exit ${OCF_ERR_UNIMPLEMENTED}
        ;;
esac