Add parallel option to the collect tool
The current implementation of collect cycles through the specified host list, one after the other. This update adds a parallel (-p|--parallel) option to collect with the goal of decreasing the time it takes to collect logs/data from all hosts in larger systems. This update does not change any of the current collect default options. The collect tool will take advantage of this new feature if the -p or --parallel option is specified on the command line when starting collect. This update also refactors/consolidates fault reporting. See review comments showing examples of collect output in various success and failure modes. Unless otherwise specified, all of the following test cases were executed for both serial and parallel collects. Test Plan: PASS: Verify successful collect on AIO DX and 8 host storage system PASS: Verify collect output and logging PASS: Verify bash shellcheck static analysis against changed code Failure Cases: Failure Handling = FH PASS: Verify collect FH for never unlocked online host PASS: Verify collect FH for an offline host PASS: Verify collect FH for host that recently rebooted PASS: Verify collect FH for host that reboots during collect PASS: Verify collect FH for host management network drop during collect PASS: Verify collect FH for remote host with <25% free /scratch space PASS: Verify collect FH for local host with <25% free /scratch space PASS: Verify collect FH when no tarballs are collected PASS: Verify collect FH when not all tarballs are collected PASS: Verify collect FH of various bad command line options PASS: Verify collect FH when collect of that host fills its filesystem PASS: Verify parallel collect overall timeout failure handling PASS: Verify collect host timeout failure handling PASS: Verify collect in system with many never unlocked hosts Regression: PASS: Verify dated collect PASS: Verify handling of unknown host PASS: Verify ^C|TERM|KILL running collect removes all child processes PASS: Verify Single host collect (any host) PASS:
Verify Listed hosts collect (many different groupings) Soak: PASS: Verify repeated collects (50+) until after local fs is full Change-Id: I91814d14341cdc438a6d5af999b6c12d39c7d97c Story: 2009055 Task: 42835 Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
a1c63fc8ba
commit
71f0e30113
File diff suppressed because it is too large
Load Diff
|
@ -84,7 +84,6 @@ COLLECT_INCLUDE="/var/run /etc /root"
|
|||
FLIGHT_RECORDER_PATH="var/lib/sm/"
|
||||
FLIGHT_RECORDER_FILE="sm.eru.v1"
|
||||
VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst"
|
||||
COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"
|
||||
COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}"
|
||||
COLLECT_DATE="/usr/local/sbin/collect_date"
|
||||
COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv"
|
||||
|
@ -98,26 +97,7 @@ function log_space()
|
|||
ilog "${COLLECT_BASE_DIR} ${msg} ${space1}"
|
||||
}
|
||||
|
||||
# Check how full the filesystem holding ${COLLECT_BASE_DIR} is before
# starting a collect.
#
# Globals read : COLLECT_DIR_PCENT_CMD, COLLECT_BASE_DIR, HOSTNAME,
#                MIN_PERCENT_SPACE_REQUIRED, FAIL_INSUFFICIENT_SPACE,
#                FAIL_INSUFFICIENT_SPACE_STR
# Globals set  : space, space1, size (left global, as in the original;
#                NOTE(review): other functions use same-named variables -
#                confirm before making these local)
# Outputs      : the usage percentage via ilog; on failure three warnings
#                via wlog and FAIL_INSUFFICIENT_SPACE_STR on stdout
# Exits        : with FAIL_INSUFFICIENT_SPACE when usage is at or above
#                MIN_PERCENT_SPACE_REQUIRED percent
function space_precheck()
{
    # COLLECT_DIR_PCENT_CMD holds a whole command line, so it is expanded
    # unquoted on purpose to get word splitting.
    # shellcheck disable=SC2086
    space="$(${COLLECT_DIR_PCENT_CMD})"

    # Drop the 'Use%' header line that df prints above the value.
    space1="$(echo "${space}" | grep -v Use)"

    # Keep what precedes the first '%' and strip the padding df adds.
    size="${space1%%%*}"
    size="${size//[[:space:]]/}"

    # Validate before any numeric comparison: an empty or non-numeric value
    # (df failed, unexpected output) must reach the warning below rather
    # than produce a '[' syntax error.
    if [[ "${size}" =~ ^[0-9]{1,3}$ ]] && [ "${size}" -le 100 ] ; then
        ilog "${COLLECT_BASE_DIR} is $size% full"
        if [ "${size}" -ge "${MIN_PERCENT_SPACE_REQUIRED}" ] ; then
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
            echo "${FAIL_INSUFFICIENT_SPACE_STR}"
            exit "${FAIL_INSUFFICIENT_SPACE}"
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}
|
||||
|
||||
space_precheck
|
||||
space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR}
|
||||
|
||||
CURR_DIR=`pwd`
|
||||
mkdir -p ${COLLECT_NAME_DIR}
|
||||
|
@ -187,7 +167,7 @@ function collect_extra()
|
|||
|
||||
# Collect process, thread and scheduling, and elapsed time
|
||||
# This has everything that ps-sched.sh does, except for cpu affinity mask,
|
||||
# adds: stime,etime,time,wchan,tty).
|
||||
# adds: stime,etime,time,wchan,tty).
|
||||
delimiter ${LOGFILE} "ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command"
|
||||
ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command >> ${LOGFILE}
|
||||
|
||||
|
|
|
@ -36,17 +36,20 @@ FAIL_INACTIVE=35
|
|||
FAIL_PERMISSION_SKIP=36
|
||||
FAIL_OUT_OF_SPACE=37
|
||||
FAIL_INSUFFICIENT_SPACE=38
|
||||
FAIL_OUT_OF_SPACE_LOCAL=39
|
||||
FAIL_CREATE=39
|
||||
FAIL_INTERNAL=39
|
||||
FAIL_NO_TARDIR=40
|
||||
FAIL_NO_TARBALLS=41
|
||||
|
||||
# Warnings are above 200
|
||||
WARN_WARNING=200
|
||||
WARN_HOSTNAME=201
|
||||
|
||||
# Failure Strings
|
||||
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
|
||||
FAIL_OUT_OF_SPACE_STR="No space left on device"
|
||||
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
|
||||
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
|
||||
FAIL_UNREACHABLE_STR="Unreachable"
|
||||
|
||||
# The minimum amount of % free space on /scratch to allow collect to proceed
|
||||
MIN_PERCENT_SPACE_REQUIRED=75
|
||||
|
@ -121,19 +124,18 @@ function ilog
|
|||
{
|
||||
echo "$@"
|
||||
logger -t ${COLLECT_TAG} $@
|
||||
#logger -p local3.info -t ${COLLECT_TAG} $@
|
||||
}
|
||||
|
||||
function elog
|
||||
{
|
||||
echo "Error: $@"
|
||||
logger -t ${COLLECT_TAG} $@
|
||||
logger -t ${COLLECT_TAG} "Error: $@"
|
||||
}
|
||||
|
||||
function wlog
|
||||
{
|
||||
echo "Warning: $@"
|
||||
logger -t ${COLLECT_TAG} $@
|
||||
logger -t ${COLLECT_TAG} "Warning: $@"
|
||||
}
|
||||
|
||||
function set_debug_mode()
|
||||
|
@ -144,8 +146,8 @@ function set_debug_mode()
|
|||
function dlog()
|
||||
{
|
||||
if [ "$DEBUG" == true ] ; then
|
||||
logger -t ${COLLECT_TAG} $@
|
||||
echo "Debug: $@"
|
||||
logger -t ${COLLECT_TAG} "Debug: $@"
|
||||
echo "$(date) Debug: $@"
|
||||
fi
|
||||
}
|
||||
|
||||
|
@ -235,3 +237,35 @@ function collect_errors()
|
|||
fi
|
||||
return ${RC}
|
||||
}
|
||||
|
||||
############################################################################
#
# Name       : space_precheck
#
# Purpose    : Refuse to proceed when the filesystem holding the collect
#              base directory is already too full.
#
# Parameters : $1 - host name, used only in the log messages
#              $2 - directory whose filesystem usage is checked
#
# Globals    : reads MIN_PERCENT_SPACE_REQUIRED, FAIL_INSUFFICIENT_SPACE
#              and FAIL_INSUFFICIENT_SPACE_STR.
#              sets  HOSTNAME, COLLECT_BASE_DIR, COLLECT_DIR_PCENT_CMD,
#              space, space1 and size.
#
# Description: Reads the 'Use%' value that df reports for $2.
#              If the value is at or above MIN_PERCENT_SPACE_REQUIRED the
#              usage is logged, FAIL_INSUFFICIENT_SPACE_STR is echoed to
#              stdout, three warnings are logged and the shell exits with
#              FAIL_INSUFFICIENT_SPACE.
#              If the value cannot be parsed as a 0..100 integer a warning
#              is logged and the function returns without exiting.
#
############################################################################

function space_precheck()
{
    # Assigned globally (not 'local') exactly as before, so that code
    # running after this call keeps seeing the values passed in here.
    HOSTNAME=${1}
    COLLECT_BASE_DIR=${2}
    COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"

    # Run df directly with the directory quoted; COLLECT_DIR_PCENT_CMD is
    # retained for the diagnostic message below.
    space="$(df --output=pcent "${COLLECT_BASE_DIR}")"

    # Drop the 'Use%' header line that df prints above the value.
    space1="$(echo "${space}" | grep -v Use)"

    # Keep what precedes the first '%' and strip the padding df adds.
    size="${space1%%%*}"
    size="${size//[[:space:]]/}"

    # Validate before any numeric comparison: an empty or non-numeric value
    # (df failed, unexpected output) must reach the warning below rather
    # than produce a '[' syntax error.
    if [[ "${size}" =~ ^[0-9]{1,3}$ ]] && [ "${size}" -le 100 ] ; then
        if [ "${size}" -ge "${MIN_PERCENT_SPACE_REQUIRED}" ] ; then
            ilog "${COLLECT_BASE_DIR} is $size% full"
            echo "${FAIL_INSUFFICIENT_SPACE_STR}"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
            exit "${FAIL_INSUFFICIENT_SPACE}"
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue