Remove deprecated shell scripts
As per deprecation notes [1], removed all unused shell scripts from hostmonitor and processmonitor. [1]: https://docs.openstack.org/releasenotes/masakari-monitors/ocata.html Change-Id: I93761ce4685c258058cb2d6b2ccb2323636f33ff
This commit is contained in:
parent
d0e53dfd56
commit
ae3ab24f9a
26
README.rst
26
README.rst
@ -52,32 +52,6 @@ Configure masakari-monitors
|
||||
$ masakari-hostmonitor
|
||||
$ masakari-instancemonitor
|
||||
|
||||
If you intend to use the bash scripts of masakari-processmonitor and
|
||||
masakari-hostmonitor, use following steps to install them.
|
||||
However, those bash shell scripts are deprecated as of the Ocata release and
|
||||
will be removed in the Queens release.
|
||||
Use the masakari-hostmonitor implemented in Python (described above) instead.
|
||||
|
||||
#. Clone masakari using::
|
||||
|
||||
$ git clone https://github.com/openstack/masakari-monitors.git
|
||||
|
||||
#. Create masakarimonitors directory in /etc/.
|
||||
|
||||
#. Remove '.sample' from files hostmonitor.conf.sample,
|
||||
processmonitor.conf.sample and proc.list.sample which exist at
|
||||
masakari-monitors/etc/.
|
||||
|
||||
#. Copy hostmonitor.conf, processmonitor.conf and proc.list files from
|
||||
masakari-monitors/etc/ to /etc/masakarimonitors folder and make necessary
|
||||
changes to the hostmonitor.conf, processmonitor.conf and proc.list files.
|
||||
|
||||
#. To run bash scripts of masakari-processmonitor and masakari-hostmonitor
|
||||
simply use following binary::
|
||||
|
||||
$ masakari-processmonitor.sh /etc/masakarimonitors/processmonitor.conf /etc/masakarimonitors/proc.list
|
||||
$ masakari-hostmonitor.sh /etc/masakarimonitors/hostmonitor.conf
|
||||
|
||||
|
||||
Features
|
||||
--------
|
||||
|
@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Thin launcher: validates the argument count and runs hostmonitor.sh
# through sudo with the given configuration file.
SCRIPT_DIR=/usr/local/lib/python2.7/dist-packages/masakarimonitors/hostmonitor
SCRIPT_FILE=${SCRIPT_DIR}/hostmonitor.sh

# Argument check
if [ $# -ne 1 ]; then
    echo "Usage: $0 <configuration file path>"
    exit 1
fi
SCRIPT_CONF_FILE=$1

# Quoted so that a configuration path containing spaces stays one argument.
sudo bash "${SCRIPT_FILE}" "${SCRIPT_CONF_FILE}"
|
@ -1,28 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Thin launcher: validates the argument count and runs processmonitor.sh
# through sudo with the given configuration file and process list.
SCRIPT_DIR=/usr/local/lib/python2.7/dist-packages/masakarimonitors/processmonitor
SCRIPT_FILE=${SCRIPT_DIR}/processmonitor.sh

# Argument check
if [ $# -ne 2 ]; then
    echo "Usage: $0 <configuration file path> <proc.list file path>"
    exit 1
fi
SCRIPT_CONF_FILE=$1
PROC_LIST=$2

# Quoted so that paths containing spaces stay single arguments.
sudo bash "${SCRIPT_FILE}" "${SCRIPT_CONF_FILE}" "${PROC_LIST}"
|
@ -1,964 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Define variables.
|
||||
# Script identity; "$0" is quoted so a script path with spaces is handled.
BASE_NAME=$(basename -- "$0")
HOST_NAME=$(hostname)
# Lower-cased host name, used in log messages about this node.
MY_NODE_NAME=${HOST_NAME,,}
LOGTAG=${BASE_NAME}

# Scratch files used during one monitoring cycle.
TMP_DIR="/var/tmp"
TMP_CRM_MON_FILE="$TMP_DIR/crm_mon.tmp"
STATUS_FILE="$TMP_DIR/node_status.tmp"
TMP_CRMADM_FILE="$TMP_DIR/crmadmin.tmp"
TMP_IFCONFIG_FILE="$TMP_DIR/ifconfig.tmp"
NOTICE_OUTPUT="$TMP_DIR/${BASE_NAME}_resp.out"
NOTICE_PROGRAM="curl"

# Number of resource agents per node; 0 means "not counted yet".
RA_COUNT=0

LOGDIR="/var/log/masakari"
LOGFILE="${LOGDIR}/masakari-hostmonitor.log"
CLUSTER_STATUS="ONLINE"
HOST_STATUS="NORMAL"

# Define the node state.
NODE_STATUS_STARTED="Started"
NODE_STATUS_STOPPED="Stopped"
NODE_STATUS_STARTING="Starting"
NODE_STATUS_STOPPING="Stopping"
NODE_STATUS_UNKNOWN="Unknown"
|
||||
|
||||
# This function outputs the debug log
|
||||
# Argument
|
||||
# $1 : Message
|
||||
log_debug () {
    # Writes the message "$1" through log_output, but only when LOG_LEVEL
    # is "debug". The log directory is created first if it is missing.
    # LOGDIR is quoted so that a path containing spaces is still created.
    if [[ ! -e "${LOGDIR}" ]]; then
        mkdir -p -- "${LOGDIR}"
    fi

    if [[ "${LOG_LEVEL}" == "debug" ]]; then
        log_output "$1"
    fi
}
|
||||
|
||||
# This function outputs the info log
|
||||
# Argument
|
||||
# $1 : Message
|
||||
log_info () {
    # Writes the message "$1" through log_output unconditionally.
    # The log directory is created first if it is missing; LOGDIR is
    # quoted so that a path containing spaces is still created.
    if [[ ! -e "${LOGDIR}" ]]; then
        mkdir -p -- "${LOGDIR}"
    fi

    log_output "$1"
}
|
||||
|
||||
# This function outputs the log
|
||||
# Argument
|
||||
# $1 : Message
|
||||
log_output () {
    # Appends one line "<timestamp> <host name> <log tag>: <message>" to
    # LOGFILE. The redirect target is quoted; unquoted, a path with spaces
    # fails with "ambiguous redirect".
    printf '%s %s %s: %s\n' \
        "$(date +'%Y-%m-%d %H:%M:%S')" "${HOST_NAME}" "${LOGTAG}" "$1" >> "${LOGFILE}"
}
|
||||
|
||||
# This function locks a file
|
||||
# Argument
|
||||
# $1 : Path of the file to lock
|
||||
file_lock () {
    # Opens the file "$1" for appending on file descriptor 9 (creating it
    # if needed) and takes an exclusive flock on that descriptor.
    # The path is quoted so that a file name with spaces can be locked.
    exec 9>>"$1"
    flock -x 9
}
|
||||
|
||||
# This function unlocks a file
|
||||
file_unlock () {
|
||||
# Close file descriptor 9, the descriptor that file_lock opens and locks;
# closing it gives up the lock held through that descriptor.
exec 9>&-
|
||||
}
|
||||
|
||||
# Initialization function
|
||||
script_initialize () {
    # Starts one monitoring cycle: generates a new cycle ID (global ID,
    # logged at debug level) and removes scratch files left over from an
    # earlier cycle. Always returns 0.
    ID=$(uuidgen)
    log_debug "begin loop ID:$ID"

    # All paths are quoted so names containing spaces are handled.
    if [[ -f "$TMP_CRM_MON_FILE" ]]; then
        sudo rm -f -- "$TMP_CRM_MON_FILE"
    fi
    if [[ -f "$NOTICE_OUTPUT" ]]; then
        sudo rm -f -- "$NOTICE_OUTPUT"
    fi
    if [[ -e "$TMP_CRMADM_FILE" ]]; then
        sudo rm -rf -- "$TMP_CRMADM_FILE"
    fi
    if [[ -e "$TMP_IFCONFIG_FILE" ]]; then
        sudo rm -rf -- "$TMP_IFCONFIG_FILE"
    fi
    return 0
}
|
||||
|
||||
# Finalization function
|
||||
# Argument
|
||||
# $1 : The flag indicating whether delete the node state file.
|
||||
# 0 -> The node state file is deleted.
|
||||
# 1 -> The node state file is not deleted.
|
||||
script_finalize () {
    # Ends one monitoring cycle by removing the scratch files.
    #   $1 : 0 -> the node state file is deleted as well
    #        1 -> the node state file is kept
    # Logs the end of the cycle at debug level and always returns 0.
    # All paths are quoted so names containing spaces are handled.
    if [ "$1" -eq 0 ]; then
        if [[ -f "$STATUS_FILE" ]]; then
            sudo rm -f -- "$STATUS_FILE"
        fi
    fi
    if [[ -f "$TMP_CRM_MON_FILE" ]]; then
        sudo rm -f -- "$TMP_CRM_MON_FILE"
    fi
    if [[ -f "$NOTICE_OUTPUT" ]]; then
        sudo rm -f -- "$NOTICE_OUTPUT"
    fi
    if [[ -e "$TMP_CRMADM_FILE" ]]; then
        sudo rm -rf -- "$TMP_CRMADM_FILE"
    fi
    if [[ -e "$TMP_IFCONFIG_FILE" ]]; then
        sudo rm -rf -- "$TMP_IFCONFIG_FILE"
    fi
    log_debug "end loop ID:$ID"
    return 0
}
|
||||
|
||||
# Check the value is correct type
|
||||
# Argument
|
||||
# $1: Type
|
||||
# $2: Parameter Name
|
||||
# $3: Value
|
||||
# Return
|
||||
# 0: The value is correct type
|
||||
# 1: The value is not correct type
|
||||
check_config_type() {
    # Validates one configuration value.
    #   $1 : expected type, "int" or "string"
    #   $2 : parameter name (used in log messages only)
    #   $3 : value to check
    # Returns 0 when the value is valid. On an invalid value or an unknown
    # type, the error is logged and the whole script exits with status 1.
    local expected_type=$1
    local parameter_name=$2
    local value=$3
    local ret=0

    case "${expected_type}" in
        int)
            # A plain (optionally negative) decimal integer. The former
            # expr-based check also accepted an empty value and whole
            # expressions such as "5 + 5".
            [[ "${value}" =~ ^-?[0-9]+$ ]] || ret=1
            ;;
        string)
            [[ -n "${value}" ]] || ret=1
            ;;
        *)
            ret=1
            ;;
    esac

    if (( ret == 1 )); then
        log_info "config file parameter error. [${SCRIPT_CONF_FILE}:${parameter_name}]"
        exit 1
    fi

    log_info "config file parameter : ${parameter_name}=${value}"
    return 0
}
|
||||
|
||||
# This function reads the configuration file and set the value.
|
||||
# If the value is omitted, set the default value.
|
||||
# If invalid value is set, return 1.
|
||||
# Note) The default value for each item are as follows.
|
||||
# MONITOR_INTERVAL (default : 60)
|
||||
# NOTICE_TIMEOUT (default : 10)
|
||||
# NOTICE_RETRY_COUNT (default : 12)
|
||||
# NOTICE_RETRY_INTERVAL (default : 10)
|
||||
# STONITH_WAIT (default : 30)
|
||||
# MAX_CHILD_PROCESS (default : 3)
|
||||
# TCPDUMP_TIMEOUT (default : 10)
|
||||
# IPMI_TIMEOUT (default : 5)
|
||||
# IPMI_RETRY_MAX (default : 3)
|
||||
# IPMI_RETRY_INTERVAL (default : 10)
|
||||
# HA_CONF (default : "/etc/corosync/corosync.conf")
|
||||
# LOG_LEVEL (default : "info")
|
||||
# DOMAIN (default : "")
|
||||
# ADMIN_USER (default : "")
|
||||
# ADMIN_PASS (default : "")
|
||||
# PROJECT (default : "")
|
||||
# AUTH_URL (default : "")
|
||||
# REGION (default : "")
|
||||
# IGNORE_RESOURCE_GROUP_NAME_PATTERN (default : "stonith")
|
||||
#
|
||||
# Return value
|
||||
# 0 : Setting completion
|
||||
# 1 : Reading failure of the configuration or invalid setting value
|
||||
set_conf_value () {
    # Sources SCRIPT_CONF_FILE, fills in a default for every omitted
    # parameter and validates each one with check_config_type (which logs
    # the value and terminates the script on an invalid one).
    # Returns 0 on success, 1 when the configuration file cannot be read.

    # Read the configuration file
    source "$SCRIPT_CONF_FILE" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        log_info "config file read error. [$SCRIPT_CONF_FILE]"
        return 1
    fi

    # Values are passed quoted so that the complete value is validated,
    # not just its first word.
    MONITOR_INTERVAL=${MONITOR_INTERVAL:-60}
    check_config_type 'int' MONITOR_INTERVAL "$MONITOR_INTERVAL"

    NOTICE_TIMEOUT=${NOTICE_TIMEOUT:-10}
    check_config_type 'int' NOTICE_TIMEOUT "$NOTICE_TIMEOUT"

    NOTICE_RETRY_COUNT=${NOTICE_RETRY_COUNT:-12}
    check_config_type 'int' NOTICE_RETRY_COUNT "$NOTICE_RETRY_COUNT"

    NOTICE_RETRY_INTERVAL=${NOTICE_RETRY_INTERVAL:-10}
    check_config_type 'int' NOTICE_RETRY_INTERVAL "$NOTICE_RETRY_INTERVAL"

    STONITH_WAIT=${STONITH_WAIT:-30}
    check_config_type 'int' STONITH_WAIT "$STONITH_WAIT"

    MAX_CHILD_PROCESS=${MAX_CHILD_PROCESS:-3}
    check_config_type 'int' MAX_CHILD_PROCESS "$MAX_CHILD_PROCESS"

    TCPDUMP_TIMEOUT=${TCPDUMP_TIMEOUT:-10}
    check_config_type 'int' TCPDUMP_TIMEOUT "$TCPDUMP_TIMEOUT"

    IPMI_TIMEOUT=${IPMI_TIMEOUT:-5}
    check_config_type 'int' IPMI_TIMEOUT "$IPMI_TIMEOUT"

    IPMI_RETRY_MAX=${IPMI_RETRY_MAX:-3}
    check_config_type 'int' IPMI_RETRY_MAX "$IPMI_RETRY_MAX"

    IPMI_RETRY_INTERVAL=${IPMI_RETRY_INTERVAL:-10}
    check_config_type 'int' IPMI_RETRY_INTERVAL "$IPMI_RETRY_INTERVAL"

    HA_CONF=${HA_CONF:-"/etc/corosync/corosync.conf"}
    check_config_type 'string' HA_CONF "$HA_CONF"

    LOG_LEVEL=${LOG_LEVEL:-"info"}
    check_config_type 'string' LOG_LEVEL "$LOG_LEVEL"

    # The following have no usable default: when omitted they stay empty
    # and are rejected by the 'string' check.
    DOMAIN=${DOMAIN:-""}
    check_config_type 'string' DOMAIN "$DOMAIN"

    ADMIN_USER=${ADMIN_USER:-""}
    check_config_type 'string' ADMIN_USER "$ADMIN_USER"

    ADMIN_PASS=${ADMIN_PASS:-""}
    check_config_type 'string' ADMIN_PASS "$ADMIN_PASS"

    PROJECT=${PROJECT:-""}
    check_config_type 'string' PROJECT "$PROJECT"

    AUTH_URL=${AUTH_URL:-""}
    check_config_type 'string' AUTH_URL "$AUTH_URL"

    REGION=${REGION:-""}
    check_config_type 'string' REGION "$REGION"

    # Default is "stonith", the documented default for this parameter. The
    # former empty default always failed the 'string' check, so omitting
    # the parameter aborted the script.
    IGNORE_RESOURCE_GROUP_NAME_PATTERN=${IGNORE_RESOURCE_GROUP_NAME_PATTERN:-"stonith"}
    check_config_type 'string' IGNORE_RESOURCE_GROUP_NAME_PATTERN "$IGNORE_RESOURCE_GROUP_NAME_PATTERN"

    return 0
}
|
||||
|
||||
# This function gets the NIC that is used for intercommunication of corosync based on
|
||||
# the contents of /etc/corosync/corosync.conf.
|
||||
#
|
||||
# Argument
|
||||
# $1 : Value of bindnetaddr that is set in /etc/corosync/corosync.conf
|
||||
# Return value
|
||||
# 0 : Success to get
|
||||
# 1 : Fail to get(Detect /etc/corosync/corosync.conf of invalid setting value)
|
||||
get_mcast_nic () {
    # Finds the network interface whose ifconfig entry contains the
    # corosync bind network address.
    #   $1 : bindnetaddr value (a single trailing ".0" is dropped first)
    # Sets the globals BIND_NET_ADDR and, on success, MCAST_NIC. The
    # ifconfig output is saved in TMP_IFCONFIG_FILE.
    # Returns 0 when an interface was found, 1 otherwise.
    local nic

    BIND_NET_ADDR=${1%.0}
    sudo ifconfig > "${TMP_IFCONFIG_FILE}"

    # Single pass: remember the most recent interface header (a line that
    # starts in column one with a letter or digit) and stop at the first
    # line containing the address. The address is matched as a literal
    # string, so its dots are not regex wildcards. A trailing ":" on the
    # interface name is removed.
    # NOTE(review): the ":" handling assumes the "eth0: flags=..." header
    # style of newer ifconfig versions - confirm against target systems.
    nic=$(awk -v addr="${BIND_NET_ADDR}" '
        /^[a-z0-9]/ { nic = $1; sub(/:$/, "", nic) }
        addr != "" && nic != "" && index($0, addr) { print nic; found = 1; exit }
        END { exit !found }
    ' "${TMP_IFCONFIG_FILE}") || return 1

    MCAST_NIC=${nic}
    return 0
}
|
||||
|
||||
# Check whether masakari-hostmonitor works on pacemaker-remote
|
||||
# Return value
|
||||
# 0 : works on pacemaker-remote
|
||||
# 1 : doesn't work on pacemaker-remote
|
||||
is_pacemaker_remote() {
|
||||
# Query the pacemaker_remote service status, discarding all output;
# only the exit status of the query is of interest.
sudo service pacemaker_remote status > /dev/null 2>&1
|
||||
# Hand the status query's exit code straight back to the caller.
return $?
|
||||
}
|
||||
|
||||
# This function checks whether the HB line is alive
|
||||
# Return value
|
||||
# 0 : The HB line is alive.
|
||||
# 1 : The HB line is not alive.
|
||||
# 2 : Detected an invalid setting value in /etc/corosync/corosync.conf
|
||||
check_hb_line () {
    # Checks whether the heartbeat (HB) line is alive by waiting for one
    # packet on each interface/port pair configured in HA_CONF.
    # Returns:
    #   0 : a packet was seen on at least one pair, or pacemaker/corosync
    #       are not both running but pacemaker-remote is
    #   1 : no packet was seen, or neither pacemaker nor pacemaker-remote
    #       is running
    #   2 : HA_CONF holds invalid mcastport/bindnetaddr settings
    local results loop_count result

    # If the heartbeat is not starting, it is not required to execute tcpdump command.
    sudo service corosync status > /dev/null 2>&1
    RET_CORO=$?
    sudo service pacemaker status > /dev/null 2>&1
    RET_PACE=$?
    is_pacemaker_remote
    RET_REMOTE=$?
    if (( RET_CORO != 0 || RET_PACE != 0 )); then
        if (( RET_REMOTE != 0 )); then
            log_debug "neither pacemaker nor pacemaker-remote is running."
            return 1
        fi
        log_debug "works on pacemaker-remote."
        return 0
    fi

    # Get all the setting of mcastport and bindnetaddr.
    MCAST_PORTS=$(grep "mcastport:" "${HA_CONF}" | awk '{print $2}')
    BIND_NET_ADDRS=$(grep "bindnetaddr:" "${HA_CONF}" | awk '{print $2}')

    # Split on whitespace without pathname expansion. read returns
    # non-zero at end of input, which is expected here.
    read -r -d '' -a array_mcast_ports <<< "${MCAST_PORTS}"
    read -r -d '' -a array_bind_net_addrs <<< "${BIND_NET_ADDRS}"

    # Every bindnetaddr needs exactly one mcastport.
    if [[ -z "${MCAST_PORTS}" || -z "${BIND_NET_ADDRS}" ]] ||
       (( ${#array_bind_net_addrs[@]} != ${#array_mcast_ports[@]} )); then
        log_debug "${HA_CONF} has incorrect parameters."
        return 2
    fi

    NIC_SUCCESS_FLG=0
    results=""
    for (( loop_count = 0; loop_count < ${#array_bind_net_addrs[@]}; loop_count++ )); do
        MCAST_PORT=${array_mcast_ports[loop_count]}
        MCAST_NIC=""
        # Get the NIC that is used for multicast from the values set in bindnetaddr.
        if ! get_mcast_nic "${array_bind_net_addrs[loop_count]}"; then
            log_debug "${HA_CONF} has incorrect parameters."
            return 2
        fi

        log_debug "read mcast port from ${HA_CONF} -> ${MCAST_PORT}"
        log_debug "read mcast nic from ${HA_CONF} -> ${MCAST_NIC}"

        # One captured packet within TCPDUMP_TIMEOUT seconds counts as alive.
        timeout "$TCPDUMP_TIMEOUT" sudo tcpdump -c 1 -p -i "${MCAST_NIC}" port "${MCAST_PORT}" > /dev/null 2>&1
        result=$?
        if (( result == 0 )); then
            NIC_SUCCESS_FLG=1
            log_debug "tcpdump hb line (${MCAST_NIC}) ok."
            break
        fi
        log_debug "tcpdump hb line (${MCAST_NIC}) fail. [exit-code: $result]"
        results+="$result "
    done

    if (( NIC_SUCCESS_FLG == 0 )); then
        log_info "tcpdump hb line fail. [exit-code: $results]"
        return 1
    fi
    return 0
}
|
||||
|
||||
# This function checks the heartbeat state of the own node
|
||||
# Return value
|
||||
# 0 : Stable state
|
||||
# 1 : The heartbeat is stopped state
|
||||
# 2 : Unstable state (during state transitions)
|
||||
check_hb_status() {
    # Checks the cluster state of the own node with "crmadmin -S".
    # Returns:
    #   0 : stable (every output line mentions S_IDLE or S_NOT_DC)
    #   1 : crmadmin failed, i.e. the heartbeat is not running
    #   2 : unstable (some output line shows a different state)
    # The scratch file TMP_CRMADM_FILE is removed before returning; its
    # path is quoted so that names with spaces are handled.
    OWN_NODE=$(uname -n)

    if ! sudo crmadmin -S "${OWN_NODE,,}" 1> "$TMP_CRMADM_FILE" 2>/dev/null; then
        # The heartbeat is not running (or during get state).
        log_debug "Heartbeat in the own node doesn't run."
        rm -f -- "$TMP_CRMADM_FILE"
        return 1
    fi

    # grep -v succeeds when at least one line names neither stable state.
    if grep -v -e S_IDLE -e S_NOT_DC "$TMP_CRMADM_FILE" 1>/dev/null 2>&1; then
        # The heartbeat is unstable state (or during state transitions).
        log_debug "Heartbeat is in an unstable state."
        rm -f -- "$TMP_CRMADM_FILE"
        return 2
    fi

    rm -f -- "$TMP_CRMADM_FILE"
    log_debug "Heartbeat is in a stable state."
    return 0
}
|
||||
|
||||
# This function executes the crm_mon command and hold result
|
||||
# Return value
|
||||
# 0 : Normal termination
|
||||
# 1 : Fail to execute the crm_command
|
||||
run_crm_mon () {
    # Saves the output of "crm_mon -A -1" in TMP_CRM_MON_FILE and, while
    # RA_COUNT is still 0, derives RA_COUNT from the last resource group
    # definition that does not match IGNORE_RESOURCE_GROUP_NAME_PATTERN.
    # Returns 0 on success, 1 when crm_mon fails or no such group exists.
    local result group_define
    local -a tmp_array

    sudo crm_mon -A -1 > "$TMP_CRM_MON_FILE"
    result=$?
    if (( result != 0 )); then
        log_debug "crm_mon fail. [exit-code: $result]"
        return 1
    fi

    # Count the number of RA.
    if [ "$RA_COUNT" -eq 0 ]; then
        # Fields 3.. of the group line are the member resources.
        group_define=$(sudo crm configure show \
            | grep "^group " \
            | grep -vi "$IGNORE_RESOURCE_GROUP_NAME_PATTERN" \
            | sed -n '$p' \
            | cut -d" " -f3-)
        # Status of the pipeline's last stage only; kept as a safety net.
        result=$?
        if [[ -z "$group_define" ]] || (( result != 0 )); then
            log_debug "cib is not configured."
            return 1
        fi

        # Split into words without pathname expansion.
        read -r -a tmp_array <<< "$group_define"

        # A trailing "\" is a line-continuation marker, not a resource.
        # The last character is taken from the string itself; the former
        # echo/cut approach collapsed repeated blanks first and then
        # looked past the end of the collapsed string.
        if [[ "${group_define: -1}" != "\\" ]]; then
            RA_COUNT=${#tmp_array[@]}
        else
            RA_COUNT=$(( ${#tmp_array[@]} - 1 ))
        fi
    fi

    log_debug "$(cat "$TMP_CRM_MON_FILE")"

    # Check whether there is the quorum.
    if grep "partition WITHOUT quorum" "$TMP_CRM_MON_FILE" > /dev/null 2>&1; then
        log_info "$MY_NODE_NAME is no-quorum."
    fi

    return 0
}
|
||||
|
||||
# This function creates the node state file
|
||||
make_status_file () {
    # Creates the node state file and records the current state of every
    # cluster node found by count_cluster_nodes.
    local node result

    touch -- "$STATUS_FILE"

    # count_cluster_nodes fills the global nodes_array. The array is
    # iterated directly: the function's return value is an exit status
    # and therefore wraps around at 256 nodes.
    count_cluster_nodes

    for node in "${nodes_array[@]}"; do
        check_node_status "$node"
        result=$?
        append_status_file "$node" "$result"
    done
}
|
||||
|
||||
# This function analyzes the output of crm_mon and count the number of cluster node.
|
||||
# And it stores node name in array in this function.
|
||||
# Return value
|
||||
# The number of cluster node
|
||||
count_cluster_nodes () {
    # Parses the saved crm_mon output (TMP_CRM_MON_FILE) and stores every
    # node name in the global nodes_array: first the (Remote)Online nodes,
    # then the (Remote)OFFLINE nodes, then nodes listed on "Node ..."
    # lines. Returns the number of nodes as exit status (so the value
    # wraps around at 256; callers can read ${#nodes_array[@]} instead).
    local -a words

    # Initialize the array
    nodes_array=()

    # Online nodes: "Online: [ a b ]" -> "a b "
    online_nodes=$(grep '^Online\|^RemoteOnline' "$TMP_CRM_MON_FILE" \
        | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3- | tr '\n' ' ')
    log_debug "online nodes : $online_nodes"
    if [[ -n "$online_nodes" ]]; then
        # Word-split without pathname expansion.
        read -r -d '' -a words <<< "$online_nodes"
        nodes_array+=("${words[@]}")
    fi

    # OFFLINE nodes, same line format as above.
    offline_nodes=$(grep '^OFFLINE\|^RemoteOFFLINE' "$TMP_CRM_MON_FILE" \
        | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3- | tr '\n' ' ')
    log_debug "offline nodes : $offline_nodes"
    if [[ -n "$offline_nodes" ]]; then
        read -r -d '' -a words <<< "$offline_nodes"
        nodes_array+=("${words[@]}")
    fi

    # Nodes in any other state: second word of each "Node ..." line,
    # skipping the "Node Attributes" heading.
    other_nodes=$(grep ^Node "$TMP_CRM_MON_FILE" | grep -v Attributes \
        | sed -e 's/\s\{1,\}/ /g' | cut -d" " -f2)
    log_debug "other nodes : $other_nodes"
    if [[ -n "$other_nodes" ]]; then
        read -r -d '' -a words <<< "$other_nodes"
        nodes_array+=("${words[@]}")
    fi

    return ${#nodes_array[*]}
}
|
||||
|
||||
# This function checks startup state of node's RA.
|
||||
# Argument
|
||||
# $1 : Node name
|
||||
# Return value
|
||||
# 0 : Started state
|
||||
# Node is online, and state of all RA is "Started"
|
||||
# 1 : Stopped state
|
||||
# UNCLEAN, OFFLINE, pending, standby
|
||||
# 2 : Starting or Stopping state
|
||||
# Node is online, and mixed "RA of Started" and "RA of Stopped"
|
||||
check_node_status () {
    # Classifies the node "$1" from the saved crm_mon output.
    # Returns:
    #   0 : online and the number of its "Started" resources (stonith
    #       lines excluded) equals RA_COUNT, or RA_COUNT is -1
    #   1 : not online and no "Started" resource is reported on it
    #   2 : anything in between (starting or stopping)
    online_nodes=$(grep '^Online\|^RemoteOnline' "$TMP_CRM_MON_FILE" \
        | sed -e 's/\s\{1,\}/ /g' | sed -e 's/ \]$//g' | cut -d" " -f3-)

    # Check whether the node of argument is "Online".
    # The name must equal a whole word of the list; the former substring
    # match also accepted "node1" when only "a-node1" was online.
    if [[ " ${online_nodes//$'\n'/ } " == *" $1 "* ]]; then
        # Check whether the node of state of all RA is "Started".
        # In some cases "unmanaged" may not exist.
        START_RA_COUNT=$(grep -E -e "Started\s+$1\s*(\(unmanaged\))*\s*$" "$TMP_CRM_MON_FILE" \
            | grep -c -v stonith)
        if [ "$START_RA_COUNT" -eq "$RA_COUNT" ] || [ "$RA_COUNT" -eq -1 ]; then
            # Node is online and state of all RA is "Started"(startup state)
            return 0
        fi
        # There is "Stopped" even one(Starting or Stopping).
        return 2
    fi

    # In spite of "UNCLEAN" or "OFFLINE" or "pending" or "standby",
    # if RA of "Started" exists, consider state as starting state or stopping state.
    other_node_ra=$(grep "Started $1 " "$TMP_CRM_MON_FILE" | grep -c -v stonith)
    if [ "$other_node_ra" -ne 0 ]; then
        return 2
    fi
    # "UNCLEAN" or "OFFLINE" or "pending" or "standby"(stopped)
    return 1
}
|
||||
|
||||
# This function writes in the node state file
|
||||
# Argument
|
||||
# $1 : node name
|
||||
# $2 : node state(0:Started, 1:Stopped, 2:Starting or Stopping)
|
||||
append_status_file () {
    # Appends the line "<node name> <state>" to the node state file while
    # holding the file lock.
    #   $1 : node name
    #   $2 : node state (0: Started, 1: Stopped, anything else: Unknown)
    # Arguments and the file path are quoted so that empty values or
    # spaces do not break the tests or the redirect.
    if [ "$2" -eq 0 ]; then
        node_status="$NODE_STATUS_STARTED"
    elif [ "$2" -eq 1 ]; then
        node_status="$NODE_STATUS_STOPPED"
    else
        node_status="$NODE_STATUS_UNKNOWN"
    fi

    file_lock "$STATUS_FILE"
    printf '%s %s\n' "$1" "$node_status" >> "$STATUS_FILE"
    file_unlock
}
|
||||
|
||||
# This function analyzes the state of the node specified by the argument from the result of crm_mon,
|
||||
# and if the nodes state are different from the last state, notify to the resource management.
|
||||
# Argument
|
||||
# $1 : Node name(1)
|
||||
# $2 : Node name(2)
|
||||
# ...
|
||||
# $n : node name(n)
|
||||
# Node name that are passed by arguments is multiple.
|
||||
# If nothing is passed to the argument, immediate return.
|
||||
parse_node_status () {
    # For every node name given as an argument: determines the current
    # state, compares it with the state file and, when compare_status_file
    # asks for it (return value 1), builds and sends a notification.
    # Sets the globals EVENT and TIME, which the notification functions
    # read. Returns 0, also when no node name is given.
    local node

    # Each name is passed on quoted, exactly as received.
    for node in "$@"; do
        check_node_status "$node"
        result1=$?
        # EVENT is only updated for the two definite states; for state 2
        # no notification is requested, so the value is not used.
        if [ "$result1" -eq 0 ]; then
            EVENT="STARTED"
        elif [ "$result1" -eq 1 ]; then
            EVENT="STOPPED"
        fi
        TIME=$(date -u +'%Y-%m-%d %H:%M:%S')

        compare_status_file "$node" "$result1"
        result2=$?
        if [ "$result2" -eq 1 ]; then
            make_notice_data "$node"
            send_notification "$node"
        fi
    done

    return 0
}
|
||||
|
||||
# This function compares state of last node with state of this time node,
|
||||
# and if they are different, rewrite the state file.
|
||||
# It is called from child process.
|
||||
#
|
||||
# Arguments
|
||||
# $1 : Node name
|
||||
# $2 : Node state(0:Started, 1:Stopped, 2:Starting or Stopping)
|
||||
# return value
|
||||
# 0 : There is not change from the last state and notification to the resource is not required.
|
||||
# 1 : There is change from the last state and notification to the resource is required.
|
||||
# 2 : There is change from the last state and notification to the resource is not required.
|
||||
compare_status_file () {
    # Compares the current state of a node with the state recorded in the
    # node state file and updates the file when they differ.
    #   $1 : node name
    #   $2 : current state (0: Started, 1: Stopped, 2: Starting or Stopping)
    # Returns:
    #   0 : unchanged, no notification required
    #   1 : changed, notification required
    #   2 : changed (or node newly recorded), no notification required
    # Sets the global last_node_status.

    # Recorded state of exactly this node: the first field must equal the
    # name as a string. The former 'grep "$1 "' also matched other nodes
    # whose name merely ends with this name.
    last_node_status=$(awk -v node="$1" '($1 "") == node { print $2; exit }' "$STATUS_FILE")

    # If node name that does not exist in the node state file, add it's node name to the file.
    if [[ -z "$last_node_status" ]]; then
        append_status_file "$1" "$2"
        return 2
    fi

    # Nothing to do when a definite state (Started/Stopped) is unchanged.
    if [ "$2" -eq 0 ] && [[ "$last_node_status" == "$NODE_STATUS_STARTED" ]]; then
        return 0
    fi
    if [ "$2" -eq 1 ] && [[ "$last_node_status" == "$NODE_STATUS_STOPPED" ]]; then
        return 0
    fi

    # Every other combination is a state change; change_status_file
    # rewrites the file and decides whether a notification is needed.
    change_status_file "$1" "$2" "$last_node_status"
    return $?
}
|
||||
|
||||
# This function rewrites the state file.
|
||||
# Return the necessity of notification return code
|
||||
#
|
||||
# Argument
|
||||
# $1 : Node name
|
||||
# $2 : Node state(0:Started, 1:Stopped, 2:Starting or Stopping)
|
||||
# $3 : State of the last node is specified in the node state file
|
||||
# Return value
|
||||
# 1 : Notification to the resource management is required
|
||||
# 2 : Notification to the resource management is not required
|
||||
change_status_file () {
    # Rewrites the recorded state of one node in the node state file and
    # tells the caller whether a notification has to be sent.
    #   $1 : node name
    #   $2 : current state (0: Started, 1: Stopped, 2: Starting or Stopping)
    #   $3 : state recorded for the node so far
    # Returns:
    #   1 : notification to the resource management is required
    #   2 : notification to the resource management is not required
    local retval node_status node_pattern

    if [ "$2" -eq 0 ]; then
        # Now "Started": no notification when the recorded state was
        # "Stopping" or "Unknown".
        node_status="$NODE_STATUS_STARTED"
        if [[ "$3" == "$NODE_STATUS_STOPPING" || "$3" == "$NODE_STATUS_UNKNOWN" ]]; then
            retval=2
        else
            retval=1
        fi
    elif [ "$2" -eq 1 ]; then
        # Now "Stopped": no notification when the recorded state was
        # "Starting" or "Unknown".
        node_status="$NODE_STATUS_STOPPED"
        if [[ "$3" == "$NODE_STATUS_STARTING" || "$3" == "$NODE_STATUS_UNKNOWN" ]]; then
            retval=2
        else
            retval=1
        fi
    else
        # In transition: record the direction derived from the previous
        # definite state, otherwise keep the recorded state.
        if [[ "$3" == "$NODE_STATUS_STARTED" ]]; then
            node_status="$NODE_STATUS_STOPPING"
        elif [[ "$3" == "$NODE_STATUS_STOPPED" ]]; then
            node_status="$NODE_STATUS_STARTING"
        else
            node_status="$3"
        fi
        # Notification is not sent.
        retval=2
    fi

    # Replace only the line that is exactly "<node> <recorded state>".
    # The pattern is anchored and dots in the node name are escaped, so
    # lines of other nodes (e.g. "a-node1" for "node1") stay untouched.
    node_pattern=${1//./\\.}
    file_lock "$STATUS_FILE"
    sed -i "s/^${node_pattern} $3\$/$1 ${node_status}/" "$STATUS_FILE"
    file_unlock

    return "$retval"
}
|
||||
|
||||
|
||||
# This function creates data to be notified to the resource management.
|
||||
# It is called from the child process.
|
||||
#
|
||||
# Argument
|
||||
# $1 : Node name
|
||||
make_notice_data () {
    # Builds the notification data for the node "$1": sets the globals
    # P_HOST, CLUSTER_STATUS, HOST_STATUS and PAYLOAD. Reads the globals
    # EVENT and STONITH_TYPE. For a STOPPED event with STONITH_TYPE
    # "ipmi", the power state of the host is polled with ipmitool;
    # HOST_STATUS becomes "UNKNOWN" when power-off cannot be confirmed.

    # Rule line for this node (stonith rules excluded); its sixth word is
    # used as the physical host name. Multiple lines are joined first so
    # that a single word is picked.
    TMP_RULE=$(sudo crm configure show | grep "rule" \
        | grep -i -e "100: #uname eq $1 " -e "100: #uname eq $1$" | grep -vi "stonith")
    P_HOST=$(awk '{print $6}' <<< "${TMP_RULE//$'\n'/ }")
    if [[ "${STONITH_TYPE}" == "ssh" ]]; then
        P_HOST=$1
    fi

    # Normally not reached: abnormal states such as a resource group named
    # "_grp" or an empty physical host name.
    if [[ -z "${P_HOST}" ]]; then P_HOST="UnknownPhysicalHost"; fi

    CLUSTER_STATUS="ONLINE"
    HOST_STATUS="NORMAL"

    # In the case of a stop notification, check whether the opposing node
    # has stopped for certain.
    if [[ "${EVENT}" == "STOPPED" ]]; then
        CLUSTER_STATUS="OFFLINE"
        HOST_STATUS="NORMAL"

        # adhoc setting for test
        if [[ "${STONITH_TYPE}" == "ipmi" ]]; then

            # Get the value which is required for ipmitool command execution.
            IPMI_RAS=$(sudo crm configure show | grep "^primitive.*stonith:external/ipmi" | awk '{print $2}')
            # NOTE(review): when no resource matches P_HOST, the last
            # resource in the list is used - confirm this is intended.
            for IPMI_RA in ${IPMI_RAS}; do
                IPMI_HOST=$(sudo crm resource param "${IPMI_RA}" show hostname)
                # Right-hand side quoted: literal comparison, not a glob.
                if [[ "${IPMI_HOST}" == "${P_HOST}" ]]; then
                    break
                fi
            done
            userid=$(sudo crm resource param "${IPMI_RA}" show userid)
            passwd=$(sudo crm resource param "${IPMI_RA}" show passwd)
            interface=$(sudo crm resource param "${IPMI_RA}" show interface)
            ipaddr=$(sudo crm resource param "${IPMI_RA}" show ipaddr)

            # Up to IPMI_RETRY_MAX + 1 attempts.
            LOOP_COUNT=0
            while (( LOOP_COUNT <= IPMI_RETRY_MAX )); do
                # NOTE(review): the IPMI password is passed on the command
                # line and is visible in the process list.
                POWER_STATUS=$(timeout "${IPMI_TIMEOUT}" sudo ipmitool -U "${userid}" -P "${passwd}" -I "${interface}" -H "${ipaddr}" power status 2>&1)
                RET1=$?
                grep "Power is off" <<< "${POWER_STATUS}" > /dev/null 2>&1
                RET2=$?
                # If the opposing node has stopped securely, pass route of the notification.
                if (( RET1 == 0 && RET2 == 0 )); then
                    log_debug "Node $1 power is off."
                    break
                fi
                # Power-off is not confirmed yet: recheck after sleep.
                log_debug "Sleep to get power status of node $1"
                sleep "${IPMI_RETRY_INTERVAL}"
                LOOP_COUNT=$(( LOOP_COUNT + 1 ))
            done

            # All attempts used up without confirming power-off.
            if (( LOOP_COUNT == IPMI_RETRY_MAX + 1 )); then
                HOST_STATUS="UNKNOWN"
                if (( RET1 == 0 )); then
                    # ipmitool worked but never reported "Power is off".
                    log_info "$1 info : Node $1 power is still on."
                else
                    # ipmitool itself failed on the final attempt.
                    log_info "$1 info : Couldn't get power status of node $1."
                fi
            fi
        fi
    fi

    PAYLOAD="{\"event\": \"${EVENT}\",\"host_status\": \"${HOST_STATUS}\",\"cluster_status\": \"${CLUSTER_STATUS}\"}"
}
|
||||
|
||||
|
||||
# This function notifies to the resource management.
|
||||
# It posts the event with the "openstack notification create" command.
|
||||
#
|
||||
# Argument
|
||||
# $1 : Node name
|
||||
send_notification () {
    # Sends the prepared notification for the node "$1" with
    # "openstack notification create" and logs the outcome.
    # Reads the globals DOMAIN, PROJECT, REGION, AUTH_URL, ADMIN_USER,
    # ADMIN_PASS, P_HOST, TIME and PAYLOAD; sets TYPE, TARGET and RESP.
    local -a auth_args
    local masked_auth result

    TYPE="COMPUTE_HOST"
    TARGET="post_event"

    # Built as an array so every value stays a single argument, even when
    # it contains spaces.
    auth_args=(
        --os-domain-name "${DOMAIN}"
        --os-project-name "${PROJECT}"
        --os-region-name "${REGION}"
        --os-auth-url "${AUTH_URL}"
        --os-username "${ADMIN_USER}"
        --os-password "${ADMIN_PASS}"
    )
    # Same options for the log, with the password masked so that the
    # secret is not written to the log file.
    masked_auth="--os-domain-name ${DOMAIN} --os-project-name ${PROJECT} --os-region-name ${REGION} --os-auth-url ${AUTH_URL} --os-username ${ADMIN_USER} --os-password ********"

    log_info "$1 info : Send a notification."
    log_info "$1 info : openstack ${masked_auth} notification create ${TYPE} ${P_HOST} \"${TIME}\" \"${PAYLOAD}\""

    RESP=$(openstack "${auth_args[@]}" notification create "${TYPE}" "${P_HOST}" "${TIME}" "${PAYLOAD}")
    result=$?

    if (( result == 0 )); then
        log_info "$1 info : Succeeded in sending a notification."
        log_info "$1 info : $RESP"
    else
        log_info "$1 info : Failed to send a notification. [exit-code: $result]"
        log_info "$1 info : $RESP"
    fi

    return
}
|
||||
|
||||
# Argument check
# The script takes exactly one argument: the path of the configuration file.
if [ $# -ne 1 ]; then
    echo "Usage: $0 <configuration file path>"
    exit 1
else
    SCRIPT_CONF_FILE=$1
fi

# Output warning message.
log_info "WARNING : $0 is deprecated as of the Ocata release and will be removed in the Queens release. Use masakari-hostmonitor implemented in python instead of $0."

# main route
log_info "begin"

# If the node state file exists at the initial startup, delete the file.
if [ -f "$STATUS_FILE" ]; then
    sudo rm -f "$STATUS_FILE"
fi

while true
do
    # If invalid value is set in the configuration file, set the default value.
    # The monitoring loop is left when set_conf_value reports a failure.
    set_conf_value
    if [ $? -ne 0 ]; then
        break
    fi

    # Initialize
    script_initialize

    # Check whether HB line is normal.
    #   1     : wait for STONITH_WAIT seconds, then carry on with this cycle
    #   2     : finish this cycle and start the next one after MONITOR_INTERVAL
    #   other : carry on with this cycle
    check_hb_line
    ret=$?
    case "$ret" in
        1)
            sleep "$STONITH_WAIT"
            ;;
        2)
            script_finalize 1
            sleep "$MONITOR_INTERVAL"
            continue
            ;;
    esac

    # Check the heartbeat state of the own node.
    # It only checks hb status when this process runs on the full
    # cluster stack of corosync.
    if ! is_pacemaker_remote ; then
        check_hb_status
        ret=$?
        if [ "$ret" -ne 0 ]; then
            case "$ret" in
                1)
                    script_finalize 0
                    ;;
                2)
                    script_finalize 1
                    ;;
            esac
            sleep "$MONITOR_INTERVAL"
            continue
        fi
    fi

    # Get output result of crm_mon.
    run_crm_mon
    ret=$?
    if [ "$ret" -ne 0 ]; then
        script_finalize 0
        sleep "$MONITOR_INTERVAL"
        continue
    fi

    # If the state file of the last cycle does not exist, create the state file,
    # and write the current state to it.
    if [ ! -e "$STATUS_FILE" ]; then
        make_status_file
        log_debug "$(cat "$STATUS_FILE")"
        sleep "$MONITOR_INTERVAL"
        continue
    fi

    # Count the number of cluster nodes (reported through the exit status).
    count_cluster_nodes
    result=$?
    if [ "$result" -eq 0 ]; then
        script_finalize 0
        sleep "$MONITOR_INTERVAL"
        continue
    fi

    # If the number of nodes is fewer than the maximum number of child process,
    # child process should start only the number of the node.
    if [ "$result" -le "$MAX_CHILD_PROCESS" ]; then
        MAX_CHILD_PROCESS=$result
    fi

    # Get the minimum number of nodes that are taken care of by the child process.
    child_min_work=$((result / MAX_CHILD_PROCESS))
    # Get the maximum number of nodes that are taken care of by the child process.
    child_max_work=$((child_min_work + 1))
    # Get the number of the child process
    # that takes care of the number of child_max_work nodes.
    max_work_count=$((result % MAX_CHILD_PROCESS))

    # Get the node name(multiple) that is processed by the child process,
    # pass its node name to child process
    jobsrunning=0
    n=0
    m=0
    # Loop processing is executed only by the MAX_CHILD_PROCESS.
    while [ "$jobsrunning" -lt "$MAX_CHILD_PROCESS" ]
    do
        param=""
        # The first max_work_count children take child_max_work nodes,
        # all remaining children take child_min_work nodes.
        if [ "$m" -lt "$max_work_count" ]; then
            quota=$child_max_work
        else
            quota=$child_min_work
        fi

        work=0
        while [ "$work" -lt "$quota" ]
        do
            # Only if node name is not empty string
            # and it is not own node name, pass it to child process.
            if [ -n "${nodes_array[$n]}" ] && [[ ${nodes_array[$n]} != "$MY_NODE_NAME" ]]; then
                param+="${nodes_array[$n]} "
            fi
            work=$((work + 1))
            n=$((n + 1))
        done

        # Word splitting is intended: every node name becomes one argument.
        # shellcheck disable=SC2086
        parse_node_status $param &
        jobsrunning=$((jobsrunning + 1))
        # Without this increment every child would take child_max_work nodes
        # and the node index would run past the end of nodes_array.
        m=$((m + 1))
    done
    # Wait until all child processes have finished.
    wait

    log_debug "$(cat "$STATUS_FILE")"

    script_finalize 1
    sleep "$MONITOR_INTERVAL"
done

log_info "end"
|
||||
|
@ -1,196 +0,0 @@
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Tag, host name and destination of the log lines written by log_output.
LOGTAG=$(basename -- "$0")
HOST_NAME=$(hostname)
LOGDIR="/var/log/masakari"
LOGFILE="${LOGDIR}/masakari-processmonitor.log"
|
||||
|
||||
# Debug log output function
# Creates ${LOGDIR} when it does not exist yet; the message is handed to
# log_output only when LOG_LEVEL is "debug".
# Argument
#   $1 : Message
log_debug () {
    if [ ! -e "${LOGDIR}" ]; then
        mkdir -p "${LOGDIR}"
    fi

    if [ "${LOG_LEVEL}" == "debug" ]; then
        log_output "$1"
    fi
}
|
||||
|
||||
# Info log output function
# Creates ${LOGDIR} when it does not exist yet, then hands the message to
# log_output regardless of LOG_LEVEL.
# Argument
#   $1 : Message
log_info () {
    if [ ! -e "${LOGDIR}" ]; then
        mkdir -p "${LOGDIR}"
    fi

    log_output "$1"
}
|
||||
|
||||
# This function outputs the log
# One line "<date> <time> <HOST_NAME> <LOGTAG>: <message>" is appended to
# ${LOGFILE}.
# Argument
#   $1 : Message
log_output () {
    printf '%s %s %s: %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${HOST_NAME}" "${LOGTAG}" "$1" >> "$LOGFILE"
}
|
||||
|
||||
# Some sanity checks on the check target processing list.
# Format of the proc.list (each column must be separated by a comma):
#  The first column   : Process ID (two digits of leading zeros) : cannot be omitted.
#  The second column  : The keyword looked up in the process list (empty is NG.) : cannot be omitted.
#  The third column   : The initial startup command (it's required to include word of "start".)
#  The fourth column  : Rebooting command (it's required to include word of "start".)
#  The fifth column   : Shell file name for special processing at the initial startup (before the startup)
#  The sixth column   : Shell file name for special processing at the initial startup (after the startup)
#  The seventh column : Shell file name for special processing at the restart (before the startup)
#  The eighth column  : Shell file name for special processing at the restart (after the startup)
#
# When an abnormal condition is detected in proc.list, the function terminates
# the calling shell with "exit 2".
#
# Globals read    : PROC_LIST, column_num
# Globals written : proc_list (array of the non-empty lines), LINE_NO and the
#                   column values of the line checked last (PROC_ID, KEY_WORD,
#                   START_CMD, RESTART_CMD, START_SP_CMDFILE_BEFORE/AFTER,
#                   RESTART_SP_CMDFILE_BEFORE/AFTER)
column_num=8

# Print column $2 of the comma separated line $1.
# Blanks are normalized the way an unquoted expansion would do it (leading and
# trailing blanks dropped, runs of blanks collapsed), but characters such as
# "*", "?" or "[" are never expanded to file names.
_proc_list_field () {
    local IFS=$' \t\n'
    local -a words=()
    read -r -a words <<< "$1"
    printf '%s\n' "${words[*]}" | cut -d"," -f "$2"
}

# Check one special processing shell file.
#   $1 : Column name used in the log message
#   $2 : File name taken from proc.list (an empty name is accepted)
# Exits with 2 when the file does not exist or is not executable.
_check_special_cmdfile () {
    if [ -z "$2" ]; then
        return 0
    fi
    if [ ! -e "$2" ]; then
        log_info "$PROC_LIST format error ($1) not exists. line $LINE_NO"
        exit 2
    fi
    if [ ! -x "$2" ]; then
        log_info "$PROC_LIST format error ($1) not exeutable. line $LINE_NO"
        exit 2
    fi
    return 0
}

check_proc_file_common (){
    local line commas

    # Check the existence and validity of the proc.list.
    if [ ! -e "$PROC_LIST" ]; then
        log_info "$PROC_LIST(proc_list) is not exists."
        exit 2
    fi

    if [ ! -s "$PROC_LIST" ]; then
        log_info "$PROC_LIST(proc_list) is empty file."
        exit 2
    fi

    if [ ! -r "$PROC_LIST" ]; then
        log_info "$PROC_LIST(proc_list) is not readable."
        exit 2
    fi

    # Load the non-empty lines verbatim (a last line without a trailing
    # newline is kept as well).
    proc_list=()
    while IFS= read -r line || [ -n "$line" ]; do
        if [ -n "$line" ]; then
            proc_list+=("$line")
        fi
    done < "$PROC_LIST"

    LINE_NO=1

    for line in "${proc_list[@]}"
    do
        # A line with column_num columns holds column_num - 1 commas.
        commas=${line//[^,]/}
        if [ "${#commas}" -ne $((column_num - 1)) ]; then
            log_info "$PROC_LIST format error (column_num) line $LINE_NO"
            exit 2
        fi

        PROC_ID=$(_proc_list_field "$line" 1)
        if [ -z "$PROC_ID" ]; then
            log_info "$PROC_LIST format error (PROC_ID) empty. line $LINE_NO"
            exit 2
        fi
        # expr ends with a status greater than 1 when PROC_ID is not a number.
        expr "$PROC_ID" + 1 >/dev/null 2>&1
        if [ 1 -lt $? ]; then
            log_info "$PROC_LIST format error (PROC_ID) not number. line $LINE_NO"
            exit 2
        fi

        KEY_WORD=$(_proc_list_field "$line" 2)
        if [ -z "$KEY_WORD" ]; then
            log_info "$PROC_LIST format error (KEY_WORD) empty. line $LINE_NO"
            exit 2
        fi

        # A given initial startup command must contain the word "start".
        START_CMD=$(_proc_list_field "$line" 3)
        if [ -n "$START_CMD" ] && [[ "$START_CMD" != *start* ]]; then
            log_info "$PROC_LIST format error (START_CMD) line $LINE_NO"
            exit 2
        fi

        # A given restart command must contain the word "start".
        RESTART_CMD=$(_proc_list_field "$line" 4)
        if [ -n "$RESTART_CMD" ] && [[ "$RESTART_CMD" != *start* ]]; then
            log_info "$PROC_LIST format error (RESTART_CMD) line $LINE_NO"
            exit 2
        fi

        # Check the existence and validity of special processing shell file
        # to be executed before and after start processing.
        START_SP_CMDFILE_BEFORE=$(_proc_list_field "$line" 5)
        _check_special_cmdfile START_SP_CMDFILE_BEFORE "$START_SP_CMDFILE_BEFORE"

        START_SP_CMDFILE_AFTER=$(_proc_list_field "$line" 6)
        _check_special_cmdfile START_SP_CMDFILE_AFTER "$START_SP_CMDFILE_AFTER"

        # Check the existence and validity of special processing shell file
        # to be executed before and after restart processing.
        RESTART_SP_CMDFILE_BEFORE=$(_proc_list_field "$line" 7)
        _check_special_cmdfile RESTART_SP_CMDFILE_BEFORE "$RESTART_SP_CMDFILE_BEFORE"

        RESTART_SP_CMDFILE_AFTER=$(_proc_list_field "$line" 8)
        _check_special_cmdfile RESTART_SP_CMDFILE_AFTER "$RESTART_SP_CMDFILE_AFTER"

        LINE_NO=$((LINE_NO + 1))
    done
}
|
||||
|
@ -1,24 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Delete the child processes as the required step to restart the nova_compute process.

# Collect the second "ps -ef" column (the PID) of every line that mentions
# nova-compute, leaving out the grep process itself.
KILL_PS_LIST=($(ps -ef | grep nova-compute | grep -v grep | awk '{ print $2; }'))

for PS_ID in "${KILL_PS_LIST[@]}"
do
    sudo kill -9 "${PS_ID}"
done
|
@ -1,53 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Define constants
SCRIPT_DIR=$(cd "$(dirname "$0")"; pwd)
SCRIPT_COMMON_SH="$SCRIPT_DIR/common.sh"

TMP_DIR="/var/tmp"
# $1 : path of the proc.list file
PROC_LIST=$1
BAD_CODE_LIST_FILE="$TMP_DIR/badproc.list"

# Common processing (check of proc.list)
. "$SCRIPT_COMMON_SH"
check_proc_file_common

# Get the process list.
ps_result=$(ps -ef)

# Initialize abnormal condition list
: > "${BAD_CODE_LIST_FILE}"

# Process check main processing
# "|| [ -n ... ]" keeps a last line that has no trailing newline.
while read -r line || [ -n "$line" ]
do
    PROC_NO=$(printf '%s\n' "$line" | cut -d"," -f 1)
    PROC_NAME=$(printf '%s\n' "$line" | cut -d"," -f 2)
    # Count the lines of the process list that match the keyword. The list is
    # quoted so that entries such as "[kworker/0:0]" are not glob-expanded.
    PROC_CHECK=$(printf '%s\n' "$ps_result" | grep -c -- "${PROC_NAME}")
    # If the process was not detected, register its ID in the abnormal condition list.
    if [ "${PROC_CHECK}" -eq 0 ]; then
        log_info "down process id_no : ${PROC_NO}"
        printf '%s\n' "${PROC_NO}" >> "${BAD_CODE_LIST_FILE}"
    fi
done < "${PROC_LIST}"

# If failing process ID was detected, decide state as abnormal termination(exit code:1).
if [ -s "${BAD_CODE_LIST_FILE}" ]; then
    exit 1
fi

exit 0
|
@ -1,497 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright(c) 2016 Nippon Telegraph and Telephone Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Define constants.
BASE_NAME=$(basename -- "$0")
TMP_DIR="/var/tmp"
TMP_CRM_MON_FILE="$TMP_DIR/crm_mon.tmp"
STATUS_FILE="$TMP_DIR/node_status.tmp"
TMP_CRMADM_FILE="$TMP_DIR/crmadmin.tmp"
NOTICE_OUTPUT="$TMP_DIR/${BASE_NAME}_resp.out"

# Directory of this script; the helper scripts are looked up next to it.
SCRIPT_DIR=$(cd "$(dirname -- "$0")"; pwd)
SCRIPT_CHECK_PROCESS="$SCRIPT_DIR/process_status_checker.sh"
SCRIPT_COMMON_SH="$SCRIPT_DIR/common.sh"

DOWN_PROCESS_LIST="$TMP_DIR/badproc.list"

MASAKARI_API_SEND_PROGRAM=curl
# Switched to "on" by send_notification after a failed notification.
MASAKARI_API_SEND_FAIL_FLG="off"

# Process IDs for which a notification was already issued.
ALREADY_SEND_ID_LIST=()
LOGTAG=$(basename -- "$0")
P_HOST=$(uname -n)

# Define the default setting.
DEFAULT_PROCESS_CHECK_INTERVAL=5
DEFAULT_PROCESS_REBOOT_RETRY=3
DEFAULT_REBOOT_INTERVAL=10
DEFAULT_MASAKARI_API_SEND_TIMEOUT=10
DEFAULT_MASAKARI_API_SEND_RETRY=12
DEFAULT_MASAKARI_API_SEND_DELAY=10
|
||||
|
||||
|
||||
# This function locks a file
# The file is opened for appending on file descriptor 9 of the current shell
# and an exclusive flock is requested on that descriptor.
# Argument:
#   $1 : File name
file_lock () {
    # Quoted: an unquoted target containing blanks is an "ambiguous redirect".
    exec 9>>"$1"
    flock -x 9
}
|
||||
|
||||
# This function unlocks a file
# It closes file descriptor 9 of the current shell, the descriptor that
# file_lock opens.
file_unlock () {
    exec 9>&-
}
|
||||
|
||||
# This function reads the configuration file and sets the values.
# If a numeric item is omitted (or empty), the matching default is used:
#   PROCESS_CHECK_INTERVAL     (default : DEFAULT_PROCESS_CHECK_INTERVAL)
#   PROCESS_REBOOT_RETRY       (default : DEFAULT_PROCESS_REBOOT_RETRY)
#   REBOOT_INTERVAL            (default : DEFAULT_REBOOT_INTERVAL)
#   MASAKARI_API_SEND_TIMEOUT  (default : DEFAULT_MASAKARI_API_SEND_TIMEOUT)
#   MASAKARI_API_SEND_RETRY    (default : DEFAULT_MASAKARI_API_SEND_RETRY)
#   MASAKARI_API_SEND_DELAY    (default : DEFAULT_MASAKARI_API_SEND_DELAY)
# The items DOMAIN, PROJECT, ADMIN_USER, ADMIN_PASS, AUTH_URL and REGION are
# required and have no default.
#
# Globals read : SCRIPT_CONF_FILE and the DEFAULT_* values
#
# Return value:
#   0 : Setting completion
#   1 : Reading failure of the configuration, invalid setting value
#       or omission of a required item

# Validate the numeric item named $1; use $2 when the item is empty.
# Returns 1 when the value contains anything else than digits.
_conf_numeric_or_default () {
    local name=$1 default=$2
    local value=${!name}
    local IFS=$' \t\n'
    local -a words=()
    local leftover

    if [ -z "$value" ]; then
        printf -v "$name" '%s' "$default"
    else
        # Surrounding blanks are tolerated; every other non-digit is an error.
        read -r -a words <<< "$value"
        leftover=${words[*]}
        leftover=${leftover//[0-9]/}
        if [ -n "$leftover" ]; then
            log_info "config file parameter error. [$SCRIPT_CONF_FILE:$name]"
            return 1
        fi
    fi
    log_debug "config file parameter : $name=${!name}"
    return 0
}

# Check that the required item named $1 is not empty.
# Returns 1 when it is omitted.
_conf_required () {
    local name=$1
    local shown=${!name}

    if [ -z "$shown" ]; then
        log_info "config file parameter error. [$SCRIPT_CONF_FILE:$name]"
        return 1
    fi
    # The password must not end up in the log file.
    if [ "$name" = "ADMIN_PASS" ]; then
        shown="********"
    fi
    log_debug "config file parameter : $name=$shown"
    return 0
}

set_conf_value () {
    # Initialize setting
    unset PROCESS_CHECK_INTERVAL
    unset PROCESS_REBOOT_RETRY
    unset REBOOT_INTERVAL
    unset MASAKARI_API_SEND_TIMEOUT
    unset MASAKARI_API_SEND_RETRY
    unset MASAKARI_API_SEND_DELAY
    unset DOMAIN
    unset PROJECT
    unset ADMIN_USER
    unset ADMIN_PASS
    unset AUTH_URL
    unset REGION

    # Read configuration file
    source "$SCRIPT_CONF_FILE" > /dev/null 2>&1
    if [ $? -ne 0 ]; then
        log_info "config file read error. [$SCRIPT_CONF_FILE]"
        return 1
    fi

    # Numeric items: an omitted item gets its default, an invalid one is an error.
    _conf_numeric_or_default PROCESS_CHECK_INTERVAL "$DEFAULT_PROCESS_CHECK_INTERVAL" || return 1
    _conf_numeric_or_default PROCESS_REBOOT_RETRY "$DEFAULT_PROCESS_REBOOT_RETRY" || return 1
    _conf_numeric_or_default REBOOT_INTERVAL "$DEFAULT_REBOOT_INTERVAL" || return 1
    _conf_numeric_or_default MASAKARI_API_SEND_TIMEOUT "$DEFAULT_MASAKARI_API_SEND_TIMEOUT" || return 1
    _conf_numeric_or_default MASAKARI_API_SEND_RETRY "$DEFAULT_MASAKARI_API_SEND_RETRY" || return 1
    _conf_numeric_or_default MASAKARI_API_SEND_DELAY "$DEFAULT_MASAKARI_API_SEND_DELAY" || return 1

    # Required items: an omitted item is an error.
    _conf_required DOMAIN || return 1
    _conf_required PROJECT || return 1
    _conf_required ADMIN_USER || return 1
    _conf_required ADMIN_PASS || return 1
    _conf_required AUTH_URL || return 1
    _conf_required REGION || return 1

    return 0
}
|
||||
|
||||
# Initial startup command execution method:
# For every entry of the global array proc_list the startup command (third
# column) is executed. A startup command string that was already executed by
# this call is not executed a second time. The special processing of the fifth
# and sixth column is run before/after every entry that has a startup command,
# also when the command itself is skipped.
#
# Globals read    : proc_list
# Globals written : CMD_LIST, ALREADY_FLG, CMD, SPECIAL_BEFORE, SPECIAL_AFTER,
#                   CMD_SPLIT_LIST
init_boot() {
    local line normalized
    local -a words fields

    log_debug "init_boot start"
    CMD_LIST=()
    for line in "${proc_list[@]}"
    do
        ALREADY_FLG="off"
        # Normalize blanks by word splitting (without file name expansion),
        # then split the entry into its comma separated columns.
        read -r -a words <<< "$line"
        normalized="${words[*]}"
        IFS=',' read -r -a fields <<< "$normalized"
        CMD=${fields[2]}
        SPECIAL_BEFORE=${fields[4]}
        SPECIAL_AFTER=${fields[5]}

        # If there is no startup command, can proceed to the next entry.
        if [ -z "$CMD" ]; then
            continue
        fi

        # Check whether the command was already executed.
        for CHECK_CMD in "${CMD_LIST[@]}"
        do
            if [ "$CHECK_CMD" = "$CMD" ]; then
                ALREADY_FLG="on"
                break
            fi
        done

        # Execute special processing before the initial startup.
        if [ -n "$SPECIAL_BEFORE" ]; then
            $SPECIAL_BEFORE
        fi

        # If not executed yet, execute the start command.
        if [ "$ALREADY_FLG" = "off" ]; then
            # Several commands may be joined by ";".
            IFS=';' read -r -a CMD_SPLIT_LIST <<< "$CMD"
            for SPLIT_CMD in "${CMD_SPLIT_LIST[@]}"
            do
                # Word splitting is intended: command name plus arguments.
                $SPLIT_CMD > /dev/null 2>&1
            done

            # Append; rebuilding the array from "$CMD_LIST" would keep only
            # its first element and forget the commands executed earlier.
            CMD_LIST+=("$CMD")
        fi

        # Execute special processing after the initial startup.
        if [ -n "$SPECIAL_AFTER" ]; then
            $SPECIAL_AFTER
        fi
    done
    log_debug "init_boot end"
}
|
||||
|
||||
# This function creates data that is notified to the masakari api.
# It is called from the child process.
#
# Globals read    : PROCESS_NAME
# Globals written : TIME (current UTC time), PAYLOAD (JSON text of the event)
make_notice_data () {
    local name

    TIME=$(date -u +'%Y-%m-%d %H:%M:%S')

    # Escape backslashes and double quotes so that the payload stays valid JSON.
    name=${PROCESS_NAME//\\/\\\\}
    name=${name//\"/\\\"}

    PAYLOAD="{\"event\": \"STOPPED\", \"process_name\": \"${name}\"}"
}
|
||||
|
||||
|
||||
|
||||
# This function notifies to the masakari api by running
# "openstack notification create".
#
# Globals read    : DOMAIN, PROJECT, REGION, AUTH_URL, ADMIN_USER, ADMIN_PASS,
#                   P_HOST, TIME, PAYLOAD
# Globals written : TYPE, TARGET, RESP (client output), result (client exit
#                   code), MASAKARI_API_SEND_FAIL_FLG ("on" after a failure)
send_notification () {
    TYPE="PROCESS"
    TARGET="post_event"

    # Keep every option value as its own word so that values containing
    # blanks reach the client unchanged.
    local -a auth_args=(
        --os-domain-name "${DOMAIN}"
        --os-project-name "${PROJECT}"
        --os-region-name "${REGION}"
        --os-auth-url "${AUTH_URL}"
        --os-username "${ADMIN_USER}"
        --os-password "${ADMIN_PASS}"
    )
    # The log file must never contain the password: log a masked copy.
    local auth_log="--os-domain-name ${DOMAIN} --os-project-name ${PROJECT} --os-region-name ${REGION} --os-auth-url ${AUTH_URL} --os-username ${ADMIN_USER} --os-password ********"

    log_info "info : Send a notification."
    log_info "info : openstack ${auth_log} notification create ${TYPE} ${P_HOST} \"${TIME}\" \"${PAYLOAD}\""

    RESP=$(openstack "${auth_args[@]}" notification create "${TYPE}" "${P_HOST}" "${TIME}" "${PAYLOAD}")
    result=$?

    if [ "$result" -eq 0 ]; then
        log_info "info : Succeeded in sending a notification."
        log_info "info : $RESP"
    else
        log_info "info : Failed to send a notification. [exit-code: $result]"
        log_info "info : $RESP"
        MASAKARI_API_SEND_FAIL_FLG="on"
    fi

    return
}
|
||||
|
||||
# Attempt to restart the failed processes.
# The process IDs are read from ${DOWN_PROCESS_LIST}; for each of them the
# restart command (fourth column of the matching proc_list entry) is executed,
# framed by the special processing of the seventh and eighth column.
# If the restart still fails after PROCESS_REBOOT_RETRY retries, notify the
# masakari api and remember the ID in ALREADY_SEND_ID_LIST; such IDs are
# skipped by later calls.
#
# Globals read    : DOWN_PROCESS_LIST, proc_list, PROCESS_REBOOT_RETRY,
#                   REBOOT_INTERVAL, MASAKARI_API_SEND_FAIL_FLG
# Globals written : ALREADY_REBOOT_CMD_LIST, ALREADY_SEND_ID_LIST, ALREADY_FLG,
#                   PROC_ID, PROCESS_NAME, CMD, SPECIAL_BEFORE, SPECIAL_AFTER,
#                   CMD_SPLIT_LIST

# Execute every command of CMD_SPLIT_LIST that was not executed successfully
# before. Returns 0 when none of them failed, 1 at the first failing command.
_run_restart_commands () {
    for SPLIT_CMD in "${CMD_SPLIT_LIST[@]}"
    do
        ALREADY_FLG="off"
        # Check whether the command was already executed.
        for CHECK_CMD in "${ALREADY_REBOOT_CMD_LIST[@]}"
        do
            if [ "$CHECK_CMD" = "$SPLIT_CMD" ]; then
                ALREADY_FLG="on"
                break
            fi
        done
        # If it was already executed, skip.
        if [ "$ALREADY_FLG" = "on" ]; then
            continue
        fi

        log_debug "reboot cmd:$SPLIT_CMD"
        # Word splitting is intended: command name plus arguments.
        $SPLIT_CMD > /dev/null 2>&1
        if [ $? -ne 0 ]; then
            return 1
        fi
        # Append; rebuilding the array from "$ALREADY_REBOOT_CMD_LIST" would
        # keep only its first element and forget the earlier commands.
        ALREADY_REBOOT_CMD_LIST+=("$SPLIT_CMD")
    done
    return 0
}

down_process_reboot(){
    local line proc normalized retry
    local -a words fields

    ALREADY_REBOOT_CMD_LIST=()
    while read -r line
    do
        ALREADY_FLG="off"
        # No processing is executed about process id included in the send list.
        for already_id in "${ALREADY_SEND_ID_LIST[@]}"
        do
            if [ "$line" = "$already_id" ]; then
                ALREADY_FLG="on"
                break
            fi
        done

        if [ "$ALREADY_FLG" = "on" ]; then
            continue
        fi

        # Forget the values of the previous ID so that an ID without a
        # proc_list entry cannot rerun somebody else's commands.
        CMD=""
        PROCESS_NAME=""
        SPECIAL_BEFORE=""
        SPECIAL_AFTER=""
        for proc in "${proc_list[@]}"
        do
            # Normalize blanks by word splitting (without file name
            # expansion), then split the entry into its columns.
            read -r -a words <<< "$proc"
            normalized="${words[*]}"
            IFS=',' read -r -a fields <<< "$normalized"
            PROC_ID=${fields[0]}
            if [ "$line" = "$PROC_ID" ] ; then
                PROCESS_NAME=${fields[1]}
                CMD=${fields[3]}
                SPECIAL_BEFORE=${fields[6]}
                SPECIAL_AFTER=${fields[7]}
                break
            fi
        done

        # Special processing before restart.
        if [ -n "$SPECIAL_BEFORE" ]; then
            $SPECIAL_BEFORE
        fi

        # If there is no restart command, can proceed to the next ID.
        if [ -z "$CMD" ]; then
            continue
        fi

        # Decompose multiple commands joined by ";" and execute them.
        IFS=';' read -r -a CMD_SPLIT_LIST <<< "$CMD"

        # If the restart fails, retry it.
        if ! _run_restart_commands; then
            for ((retry = 1; retry <= PROCESS_REBOOT_RETRY; retry++))
            do
                sleep "$REBOOT_INTERVAL"
                if _run_restart_commands; then
                    break
                elif [ "$retry" -eq "$PROCESS_REBOOT_RETRY" ]; then
                    # The number of retries is exhausted: notify to the
                    # masakari api unless an earlier notification failed.
                    make_notice_data
                    if [ "$MASAKARI_API_SEND_FAIL_FLG" = "off" ]; then
                        send_notification
                    fi
                    # Add the ID to the sent list.
                    ALREADY_SEND_ID_LIST+=("${line}")
                fi
            done
        fi

        # Special processing after restart.
        if [ -n "$SPECIAL_AFTER" ]; then
            $SPECIAL_AFTER
        fi
    done < "$DOWN_PROCESS_LIST"
}
|
||||
|
||||
|
||||
# Argument check
# The script takes exactly two arguments: the configuration file path and the
# proc.list file path.
if [ $# -ne 2 ]; then
    echo "Usage: $0 <configuration file path> <proc.list file path>"
    exit 1
else
    SCRIPT_CONF_FILE=$1
    PROC_LIST=$2
fi

# Initial processing (check proc.list and read conf file)
. "$SCRIPT_COMMON_SH"

# Output warning message.
log_info "WARNING : $0 is deprecated as of the Ocata release and will be removed in the Queens release. Use masakari-processmonitor implemented in python instead of $0."

log_debug "processmonitor start!!"
check_proc_file_common
set_conf_value
if [ $? -ne 0 ]; then
    exit 1
fi

if [ -e "$NOTICE_OUTPUT" ]; then
    sudo rm -rf "$NOTICE_OUTPUT"
fi

# Initial startup
init_boot

while true
do
    # Recheck and reload of the proc.list.
    check_proc_file_common
    # Reload the configuration file; stop when set_conf_value reports a failure.
    set_conf_value

    if [ $? -ne 0 ]; then
        exit 1
    fi

    # Execute process check processing.
    "${SCRIPT_CHECK_PROCESS}" "${PROC_LIST}"
    RESULT_CODE=$?

    # If the return code is 2, because can't continue functionally, stop.
    if [ "$RESULT_CODE" -eq 2 ]; then
        log_debug "process_status_checker down!"
        exit 1
    fi

    # If a failing process is detected by the check script, retry the restart.
    if [ "$RESULT_CODE" -ne 0 ]; then
        down_process_reboot
    fi

    sleep "${PROCESS_CHECK_INTERVAL}"
done
|
Loading…
Reference in New Issue
Block a user