
This update introduces interface monitoring for the oam, mgmt and infra networks as a collectd plugin.

The interface plugin runs every 10 seconds and queries the new maintenance Link Monitor daemon (lmond) for the Link Model and Information. The plugin then manages port and interface alarms based on the link model, similar to how rmon did in the past.

Severity: Interface and Port levels

    Alarm Level  Minor  Major                  Critical
    -----------  -----  ---------------------  ----------------------------
    Interface    N/A    One of lag pair is Up  All Interface ports are Down
    Port         N/A    Physical Link is Down  N/A

Degrade support for interface monitoring is added to the mtce degrade notifier. Any link down condition results in a host degrade condition, as it did in rmon.

Sample Data: represented as % of total links Up for that network interface

    100 or 100% percent used - all links of the interface are Up
     50 or  50% percent used - one of a lag pair is Up and the other is Down
      0 or   0% percent used - all ports for that network are Down

The plugin documents all of this in its header.

This update also

 1. Adds the new lmond process to the syslog-ng config file.
 2. Adds the new lmond process to the mtce patch script.
 3. Modifies the cpu, df and memory threshold settings by -1. rmon thresholds were precise, whereas collectd requires that the samples cross the thresholds, not just meet them. So, for example, for an action at 90% usage the threshold needs to be 89.
Test Plan: (WIP but almost complete)

PASS: Verify interface plugin startup
PASS: Verify interface plugin logging
PASS: Verify interface plugin Link Status Query and response handling
PASS: Verify monitor, sample storage and grafana display
PASS: Verify port and interface alarm matches what rmon produced
PASS: Verify lmon port config from manifest configured plugin
PASS: Verify lmon port config from lmon.conf
PASS: Verify single interface failure handling and recovery
PASS: Verify lagged interface failure handling and recovery
PASS: Verify link loss of lagged interface shared between mgmt and oam (hp380)
PASS: Verify network interface failure handling ; single port
PASS: Verify network interface degrade handling ; lagged interface
PEND: Verify network interface degrade handling ; vlan interface
PASS: Verify HTTP request timeout period and handling
PASS: Verify link status query failure handling - invalid uri (timeout)
PASS: Verify link status query failure handling - missing uri (timeout)
PASS: Verify link status query failure handling - status fail
PASS: Verify link status query failure handling - bad json resp

Change-Id: I2e2dfe6ddfa06a46770245540c7153d330bdf196
Story: 2002823
Task: 28635
Depends-On: https://review.openstack.org/#/c/633264
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
480 lines
15 KiB
Bash
Executable File
480 lines
15 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# Copyright (c) 2016 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
##############################################################################
|
|
#
|
|
# This script supports no-reboot patching of any single or
|
|
# combination of maintenance processes specified on the command line.
|
|
#
|
|
# Calling sequence:
|
|
#
|
|
# rc=mtce-restart process1 process2 process3 ...
|
|
# if [ $? != 0 ] ; then
|
|
# restart action failed
|
|
#
|
|
#
|
|
###############################################################################
|
|
#
|
|
# The patching subsystem provides a patch-functions bash source file
|
|
# with useful function and variable definitions.
|
|
#
|
|
if [ -e "/etc/patching/patch-functions" ] ; then
|
|
. /etc/patching/patch-functions
|
|
fi
|
|
|
|
loginfo "----------------------------------------------"
|
|
loginfo "Maintenance No-Reboot Patching Restart Request"
|
|
|
|
#
|
|
# Declare an overall script return code.
# It starts out as the failed status and is only changed to PATCH_STATUS_OK
# once the script decides that all requested restarts are accounted for.
#
declare -i GLOBAL_RC=$PATCH_STATUS_FAILED

#if [ ! -e $PATCH_FLAGDIR ] ; then
#    mkdir -p $PATCH_FLAGDIR
#fi

# If set with the -c or --clean option then the '.restarted' flag file of
# each requested process is removed before the restart is attempted.
CLEAN=false

#
# Completion status ; stored in the PID field of a process entry
#
DISABLED="disabled"
NOPID="not-running"
SKIPPED="skipped"
RESTARTED="restarted"

#
# process query and restart executables
#
SM_RESTART_EXEC="sm-restart-safe"
SM_QUERY_EXEC="sm-query"
PMON_RESTART_EXEC="pmon-restart"

#
# Struct indexes ; position of each ':' separated field in a process entry
#
PROCESS_INDEX=0
PID_INDEX=1
ALIAS_INDEX=2

#
# Process Struct and List
#
#   SM   managed entry:  name:<pid | status>:alias
#   PMON managed entry:  name:<pid | status>
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# NOTE TO PATCH WRITERS: Simply Un-Comment processes you want no-reboot patch restarted.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

# The process restart control structures.
# Declared as (empty) arrays because every user of these lists appends to
# and indexes them as arrays; "${list}" still expands to the empty string
# while the list is empty, so the emptiness checks keep working.
declare -a sm_managed_processes=()
declare -a pmon_managed_processes=()
|
|
|
|
# Build the process list.
|
|
# All arguements should be a valid maintenance process name.
|
|
# The name of the binary, not the SM alias.
|
|
# See the list below for supported process names.
|
|
while [[ ${#} > 0 ]]
|
|
do
|
|
process="${1}"
|
|
case $process in
|
|
|
|
-c|--clean)
|
|
CLEAN=true
|
|
;;
|
|
|
|
# Maintenance Processes - SM managed
|
|
"mtcAgent")
|
|
sm_managed_processes=( ${sm_managed_processes[@]} "mtcAgent:0:mtc-agent")
|
|
;;
|
|
"guestAgent")
|
|
sm_managed_processes=( ${sm_managed_processes[@]} "guestAgent:0:guest-agent")
|
|
;;
|
|
"hwmond")
|
|
sm_managed_processes=( ${sm_managed_processes[@]} "hwmond:0:hw-mon")
|
|
;;
|
|
|
|
# Maintenance Processes - PMON managed
|
|
"pmond")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "pmond:0")
|
|
;;
|
|
"guestServer")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "guestServer:0")
|
|
;;
|
|
"hbsAgent")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "hbsAgent:0")
|
|
;;
|
|
"mtcClient")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "mtcClient:0")
|
|
;;
|
|
"hbsClient")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "hbsClient:0")
|
|
;;
|
|
"rmond")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "rmond:0")
|
|
;;
|
|
"hostwd")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "hostwd:0")
|
|
;;
|
|
"fsmond")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "fsmond:0")
|
|
;;
|
|
"mtclogd")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "mtclogd:0")
|
|
;;
|
|
"mtcalarmd")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "mtcalarmd:0")
|
|
;;
|
|
"lmond")
|
|
pmon_managed_processes=(${pmon_managed_processes[@]} "lmond:0")
|
|
;;
|
|
|
|
*)
|
|
loginfo "Unknown process:${process}"
|
|
;;
|
|
esac
|
|
shift
|
|
done
|
|
|
|
# Assume both groupings are done until we know there are not
|
|
sm_done=true
|
|
pmon_done=true
|
|
|
|
#if [ ${#sm_managed_processes[@]} -ne 0 -a is_controller ] ; then
|
|
#######################################
# Restart the requested SM managed maintenance processes.
#
# Pass 1 optionally removes each process's '.restarted' flag file (CLEAN)
# and records whether the process is currently running.
# Pass 2 skips processes that already carry a '.restarted' flag, and for
# the others asks SM for the service state:
#   enabled-active + running  -> flag file touched, SM restart requested,
#                                sm_done set to false, entry keeps old PID
#   enabled-active + no pid   -> entry marked NOPID
#   is enabling               -> entry marked NOPID
#   anything else             -> entry marked DISABLED
#
# Globals:
#   sm_managed_processes (read/written), sm_done (written), CLEAN,
#   PATCH_FLAGDIR, SM_QUERY_EXEC, SM_RESTART_EXEC, the *_INDEX values and
#   the status strings (read)
#######################################
mtce_restart_sm_processes() {
    local entry name pid svc_alias query
    local -a fields
    local -i index

    # Nothing requested
    [ -n "${sm_managed_processes}" ] || return 0

    # SM managed processes are only handled on controllers. The helper is
    # now actually called; the former '[ ... -a is_controller ]' only tested
    # the literal word and was therefore always true.
    # NOTE(review): is_controller is not defined in this file; it is
    # expected to come from the sourced patch-functions library - confirm.
    is_controller || return 0

    # Pass 1: record current process IDs
    index=0
    for entry in "${sm_managed_processes[@]}"; do
        # Split on ':' only, so a multi-PID field ("12 34") stays one field
        IFS=':' read -r -a fields <<< "${entry}"
        name="${fields[${PROCESS_INDEX}]}"
        svc_alias="${fields[${ALIAS_INDEX}]}"

        if [ "${CLEAN}" = true ]; then
            rm -f "${PATCH_FLAGDIR}/${name}.restarted"
        fi

        pid=$(pidof "${name}")
        if [ -z "${pid}" ]; then
            loginfo "${name} is not running"
            pid="${NOPID}"
        fi

        # Save the PID or NOPID status to the process line
        sm_managed_processes[${index}]="${name}:${pid}:${svc_alias}"
        ((index++))
    done

    # Pass 2: restart the processes
    index=0
    for entry in "${sm_managed_processes[@]}"; do
        IFS=':' read -r -a fields <<< "${entry}"
        name="${fields[${PROCESS_INDEX}]}"
        svc_alias="${fields[${ALIAS_INDEX}]}"

        if [ -e "${PATCH_FLAGDIR}/${name}.restarted" ]; then
            # Already restarted by an earlier invocation
            pid="${SKIPPED}"
        else
            query=$("${SM_QUERY_EXEC}" service "${svc_alias}")
            if [[ "${query}" == *"enabled-active"* ]]; then
                # Save the original PID
                pid=$(pidof "${name}")
                if [ -n "${pid}" ]; then
                    loginfo "sm-restart of ${name} [pid:${pid}]"
                    touch "${PATCH_FLAGDIR}/${name}.restarted"
                    "${SM_RESTART_EXEC}" service "${svc_alias}"
                    sm_done=false
                    sleep 5
                else
                    loginfo "${name} is not running ; must be on inactive controller"
                    pid="${NOPID}"
                fi
            elif [[ "${query}" == *"is enabling"* ]]; then
                pid="${NOPID}"
                loginfo "sm-restart ${name} ; [in progress] ; [pid:${pid}]"
            else
                pid="${DISABLED}"
                loginfo "${name} is not active"
            fi
        fi

        # Save the PID or status to the process line
        sm_managed_processes[${index}]="${name}:${pid}:${svc_alias}"
        ((index++))
    done
}

mtce_restart_sm_processes
|
|
|
|
#######################################
# Restart the requested PMON managed maintenance processes.
#
# For each entry: optionally remove its '.restarted' flag file (CLEAN),
# skip it if the flag file exists, otherwise request a pmon restart when
# the process is running (entry keeps the pre-restart PID and pmon_done is
# set to false) or mark it DISABLED when it is not running.
#
# Globals:
#   pmon_managed_processes (read/written), pmon_done (written), CLEAN,
#   PATCH_FLAGDIR, PMON_RESTART_EXEC, the *_INDEX values and the status
#   strings (read)
#######################################
mtce_restart_pmon_processes() {
    local entry name pid
    local -a fields
    local -i index

    # Nothing requested
    [ -n "${pmon_managed_processes}" ] || return 0

    # Formerly a leftover 'echo "DEBUG: ..."' on stdout that printed only
    # the first list element; now the whole list goes to the log.
    loginfo "pmon_managed_processes: ${pmon_managed_processes[*]}"

    index=0
    for entry in "${pmon_managed_processes[@]}"; do
        # Split on ':' only, so a multi-PID field ("12 34") stays one field
        IFS=':' read -r -a fields <<< "${entry}"
        name="${fields[${PROCESS_INDEX}]}"

        if [ "${CLEAN}" = true ]; then
            rm -f "${PATCH_FLAGDIR}/${name}.restarted"
        fi

        if [ -e "${PATCH_FLAGDIR}/${name}.restarted" ]; then
            # Already restarted by an earlier invocation
            pid="${SKIPPED}"
        else
            # Save the original PID
            pid=$(pidof "${name}")
            if [ -n "${pid}" ]; then
                loginfo "pmon-restart of ${name} [pid:${pid}]"
                touch "${PATCH_FLAGDIR}/${name}.restarted"
                "${PMON_RESTART_EXEC}" "${name}"
                pmon_done=false
                sleep 2

                ############################################################
                # Special Handling Section
                #
                # - pmond needs 30 seconds to restart before it will start
                #   monitoring processes. We can maybe remove that in the
                #   daemon config file but for now its there and we have
                #   to wait.
                ############################################################
                if [ "${name}" == "pmond" ]; then
                    sleep 30
                fi
            else
                pid="${DISABLED}"
                loginfo "${name} is not active"
            fi
        fi

        # Save the updated PID or other status to the process line
        pmon_managed_processes[${index}]="${name}:${pid}"
        ((index++))
    done
}

mtce_restart_pmon_processes
|
|
|
|
# check for done. If this is not met in timeout then fail is returned
|
|
if [ "$sm_done" = true -a "$pmon_done" = true ] ; then
|
|
GLOBAL_RC=$PATCH_STATUS_OK
|
|
loginfo " SM Processes: ${sm_managed_processes[@]}"
|
|
loginfo "PMON Processes: ${pmon_managed_processes[@]}"
|
|
loginfo "Maintenance No-Reboot Patching Status: ${GLOBAL_RC} - nothing to do."
|
|
exit ${GLOBAL_RC}
|
|
fi
|
|
|
|
# Monitor the restart of the SM and PMON managed processes.
#
# Polls once a second until every process that was asked to restart shows
# a PID different from the one recorded before the restart, or until
# TIMEOUT seconds have passed. GLOBAL_RC is only set to PATCH_STATUS_OK
# when both groupings complete in time; the script then logs and exits
# with GLOBAL_RC.

# Succeeds when the status field ($1) of a process entry is one of the
# final states that need no further monitoring.
_mtce_restart_is_settled() {
    case "$1" in
        "${SKIPPED}" | "${RESTARTED}" | "${DISABLED}" | "${NOPID}")
            return 0
            ;;
    esac
    return 1
}

# Log every entry whose status field matches, preceded once by a header.
#   $1   - status to match
#   $2   - header line, logged only if at least one entry matches
#   $3.. - process entries ; name:status[:alias]
_mtce_restart_log_group() {
    local wanted="$1"
    local header="$2"
    local entry
    local header_logged=false
    local -a fields
    shift 2

    for entry in "$@"; do
        IFS=':' read -r -a fields <<< "${entry}"
        if [ "${fields[${PID_INDEX}]}" == "${wanted}" ]; then
            if [ "${header_logged}" = false ]; then
                loginfo "${header}"
                header_logged=true
            fi
            loginfo "... process: ${fields[${PROCESS_INDEX}]}"
        fi
    done
}

#
# Don't want to start from the beginning of the shell.
# Want time zero now plus TIMEOUT seconds.
#
SECONDS=0
TIMEOUT=120
UNTIL=$(( SECONDS + TIMEOUT ))
loginfo "restart timeout is ${TIMEOUT}"

while (( SECONDS <= UNTIL )); do

    #########################################################################
    # For controllers ; SM managed processes
    #########################################################################
    if [ "${sm_done}" = false ]; then

        # The helpers are now actually called; the former
        # '[ is_controller -o is_cpe ]' tested two literal words and was
        # always true.
        # NOTE(review): is_controller / is_cpe are not defined in this
        # file; expected from the sourced patch-functions library - confirm.
        if is_controller || is_cpe; then
            sm_not_done=false
            index=0
            for DAEMON in "${sm_managed_processes[@]}"; do
                # Split on ':' only, so a multi-PID field stays one field
                IFS=':' read -r -a info <<< "${DAEMON}"
                name="${info[${PROCESS_INDEX}]}"
                status="${info[${PID_INDEX}]}"
                svc_alias="${info[${ALIAS_INDEX}]}"

                # Don't waste time on processes that were skipped, have
                # already restarted, are disabled or were not running
                if _mtce_restart_is_settled "${status}"; then
                    ((index++))
                    continue
                fi

                # Restarted once SM reports the service active again and
                # the process runs under a PID other than the original one
                if [[ "$("${SM_QUERY_EXEC}" service "${svc_alias}")" == *"enabled-active"* ]]; then
                    if new_pid=$(pidof "${name}") \
                        && [ -n "${new_pid}" ] \
                        && [ "${status}" != "${new_pid}" ]; then
                        loginfo "${name} ${RESTARTED} ok [pid:${status} -> ${new_pid}]"
                        status="${RESTARTED}"
                    fi
                fi

                # Not done as long as there is one process not restarted
                if [ "${status}" != "${RESTARTED}" ]; then
                    sm_not_done=true
                fi

                # Save the PID or status to the process line
                sm_managed_processes[${index}]="${name}:${status}:${svc_alias}"
                ((index++))
            done
        fi

        # When the SM restarts are done, print a summary ; only once
        if [ "${sm_not_done}" = false ] && [ "${sm_done}" = false ]; then
            sm_done=true
            _mtce_restart_log_group "${RESTARTED}" \
                "The following 'sm managed' processes have been 'restarted'" \
                "${sm_managed_processes[@]}"
            _mtce_restart_log_group "${SKIPPED}" \
                "The following 'sm managed' processes have been 'skipped' ; due to previous restart" \
                "${sm_managed_processes[@]}"
        fi
    fi

    #########################################################################
    # For all nodes ; PMON managed processes
    #########################################################################

    # Loop over all PMON processes looking for complete restarts.
    # Update the process struct PID field as status is learned.
    if [ "${pmon_done}" = false ]; then
        # Start by assuming we are done ; cleared below if any is pending
        pmon_not_done=false
        index=0
        for DAEMON in "${pmon_managed_processes[@]}"; do
            IFS=':' read -r -a info <<< "${DAEMON}"
            name="${info[${PROCESS_INDEX}]}"
            status="${info[${PID_INDEX}]}"

            # Don't waste time on processes that were skipped, have
            # already restarted, are disabled or were not running
            if _mtce_restart_is_settled "${status}"; then
                ((index++))
                continue
            fi

            # Set the process as restarted as soon as it has a new PID.
            # The PID is now compared against the one recorded before the
            # restart (as the SM branch does); previously any running PID,
            # including the old one, counted as restarted.
            if new_pid=$(pidof "${name}") \
                && [ -n "${new_pid}" ] \
                && [ "${status}" != "${new_pid}" ]; then
                loginfo "${name} ${RESTARTED} ok [PID: ${status} -> ${new_pid}]"
                status="${RESTARTED}"
            fi

            # Set not done as long as there is one process not restarted
            if [ "${status}" != "${RESTARTED}" ]; then
                pmon_not_done=true
            fi

            # Save the PID or status to the process line
            pmon_managed_processes[${index}]="${name}:${status}"
            ((index++))
        done
    fi

    # When all pmond restarts are done, print a summary ; only once
    if [ "${pmon_not_done}" = false ] && [ "${pmon_done}" = false ]; then
        pmon_done=true
        _mtce_restart_log_group "${RESTARTED}" \
            "The following 'pmon managed' processes have been 'restarted'" \
            "${pmon_managed_processes[@]}"
        _mtce_restart_log_group "${SKIPPED}" \
            "The following 'pmon managed' processes have been 'skipped' ; due to previous restart" \
            "${pmon_managed_processes[@]}"
    fi

    # Check for done. If this is not met within the timeout then the
    # failed status that GLOBAL_RC was initialised with is returned.
    if [ "${sm_done}" = true ] && [ "${pmon_done}" = true ]; then
        GLOBAL_RC=$PATCH_STATUS_OK
        break
    fi

    sleep 1
done

loginfo "Maintenance No-Reboot Patching Status: ${GLOBAL_RC}"

exit ${GLOBAL_RC}
|