Add parallel option to the collect tool

The current implementation of collect cycles through
the specified host list, one after the other.

This update adds a parallel (-p|--parallel) option to
collect with the goal of decreasing the time it takes to
collect logs/data from all hosts in larger systems.

This update does not change any of the current collect
default options. The collect tool will take advantage
of this new feature if the -p or --parallel option is
specified on the command line when starting collect.

This update also refactors/consolidates fault reporting.

See review comments showing examples of collect output
in various success and failure modes.

Unless specified, all of the following test cases
were executed for both serial and parallel collects.

Test Plan:

PASS: Verify successful collect on
      AIO DX and 8 host storage system
PASS: Verify collect output and logging
PASS: Verify bash shellcheck static analysis against changed code

Failure Cases: Failure Handling = FH

PASS: Verify collect FH for never unlocked online host
PASS: Verify collect FH for an offline host
PASS: Verify collect FH for host that recently rebooted
PASS: Verify collect FH for host that reboots during collect
PASS: Verify collect FH for host management network drop during collect
PASS: Verify collect FH for remote host with <25% free /scratch space
PASS: Verify collect FH for local host with <25% free /scratch space
PASS: Verify collect FH when no tarballs are collected
PASS: Verify collect FH when not all tarballs are collected
PASS: Verify collect FH of various bad command line options
PASS: Verify collect FH when collect of that host fills its filesystem
PASS: Verify parallel collect overall timeout failure handling
PASS: Verify collect host timeout failure handling
PASS: Verify collect in system with many never unlocked hosts

Regression:

PASS: Verify dated collect
PASS: Verify handling of unknown host
PASS: Verify ^C|TERM|KILL running collect removes all child processes
PASS: Verify Single host collect (any host)
PASS: Verify Listed hosts collect (many different groupings)

Soak:

PASS: Verify repeated collects (50+) until after local fs is full

Change-Id: I91814d14341cdc438a6d5af999b6c12d39c7d97c
Story: 2009055
Task: 42835
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2021-07-14 12:34:27 -04:00
parent a1c63fc8ba
commit 71f0e30113
3 changed files with 705 additions and 372 deletions

File diff suppressed because it is too large Load Diff

View File

@ -84,7 +84,6 @@ COLLECT_INCLUDE="/var/run /etc /root"
# Flight recorder location and file name.
# NOTE(review): the path has no leading '/', so it is relative - confirm
# that callers prepend a root directory.
FLIGHT_RECORDER_PATH="var/lib/sm/"
FLIGHT_RECORDER_FILE="sm.eru.v1"

# Per-collect list file kept under /tmp, named after COLLECT_NAME.
VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst"

# df command lines held as strings. COLLECT_DIR_PCENT_CMD is executed by
# space_precheck to read the usage percentage of COLLECT_BASE_DIR.
# COLLECT_DIR_USAGE_CMD is the human readable variant; presumably it is
# used for space logging - confirm against its users.
COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"
COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}"

# Paths of helper tools referenced by this script.
COLLECT_DATE="/usr/local/sbin/collect_date"
COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv"
@ -98,26 +97,7 @@ function log_space()
ilog "${COLLECT_BASE_DIR} ${msg} ${space1}"
}
############################################################################
#
# Name       : space_precheck
#
# Purpose    : Refuse to start a collect when the collect base directory
#              is already too full.
#
# Globals    : COLLECT_DIR_PCENT_CMD      (read) df command line to run
#              COLLECT_BASE_DIR           (read) directory being checked
#              HOSTNAME                   (read) used in warning text
#              MIN_PERCENT_SPACE_REQUIRED (read) usage % that blocks collect
#              FAIL_INSUFFICIENT_SPACE    (read) exit code used on failure
#              FAIL_INSUFFICIENT_SPACE_STR(read) failure string echoed
#
# Outputs    : Logs the current usage through ilog. On insufficient space,
#              logs three warnings through wlog and echoes the failure
#              string to stdout.
#
# Returns    : Returns normally when there is enough space or when the df
#              output cannot be parsed (a warning is logged in that case).
#              Exits the shell with FAIL_INSUFFICIENT_SPACE otherwise.
#
############################################################################
function space_precheck()
{
    local df_out
    local used_line
    local size

    # COLLECT_DIR_PCENT_CMD is a whole command line held in a string, so
    # it is deliberately expanded unquoted to let the shell split it.
    # shellcheck disable=SC2086
    df_out=$(${COLLECT_DIR_PCENT_CMD})

    # Drop the 'Use%' heading, keep the first data line and strip the
    # trailing '%' plus any padding so only the number remains.
    used_line=$(printf '%s\n' "${df_out}" | grep -v Use | head -n 1)
    size=${used_line%%"%"*}
    size=${size//[[:space:]]/}

    # Validate before any numeric comparison so that an empty or
    # non-numeric df result cannot trigger '[' syntax errors.
    if [[ "${size}" =~ ^[0-9]{1,3}$ ]] && [ "${size}" -le 100 ] ; then
        ilog "${COLLECT_BASE_DIR} is $size% full"
        if [ "${size}" -ge "${MIN_PERCENT_SPACE_REQUIRED}" ] ; then
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
            echo "${FAIL_INSUFFICIENT_SPACE_STR}"
            exit ${FAIL_INSUFFICIENT_SPACE}
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}
# Check the collect base directory usage before doing any work.
# NOTE(review): space_precheck is invoked twice in a row here, once with
# no arguments and once with the host name and base directory. This looks
# like the old and new form of the same call - confirm that only one of
# them is intended.
space_precheck
space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR}
# Remember the directory the script was started from.
CURR_DIR=`pwd`
# Create the collect directory, including any missing parents.
mkdir -p ${COLLECT_NAME_DIR}
@ -187,7 +167,7 @@ function collect_extra()
# Collect process, thread and scheduling, and elapsed time
# This has everything that ps-sched.sh does, except for cpu affinity mask,
# adds: stime,etime,time,wchan,tty).
# adds: stime,etime,time,wchan,tty).
delimiter ${LOGFILE} "ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command"
ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command >> ${LOGFILE}

View File

@ -36,17 +36,20 @@ FAIL_INACTIVE=35
# Failure codes (continued). FAIL_INSUFFICIENT_SPACE is used as the
# shell exit status by space_precheck.
FAIL_PERMISSION_SKIP=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
# NOTE(review): the next three names all carry the value 39, so they
# cannot be told apart by exit status - confirm which of them is current.
FAIL_OUT_OF_SPACE_LOCAL=39
FAIL_CREATE=39
FAIL_INTERNAL=39
FAIL_NO_TARDIR=40
FAIL_NO_TARBALLS=41
# Warnings are above 200
WARN_WARNING=200
WARN_HOSTNAME=201
# Failure Strings
# FAIL_INSUFFICIENT_SPACE_STR is echoed to stdout by space_precheck just
# before it exits; presumably the other strings are matched against or
# reported for command output - confirm against their users.
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
FAIL_UNREACHABLE_STR="Unreachable"
# Usage threshold, in percent USED, of the collect base directory.
# space_precheck refuses to run collect when usage is at or above this
# value, so 75 means that more than 25% of the filesystem must be free.
MIN_PERCENT_SPACE_REQUIRED=75
@ -121,19 +124,18 @@ function ilog
{
echo "$@"
logger -t ${COLLECT_TAG} $@
#logger -p local3.info -t ${COLLECT_TAG} $@
}
############################################################################
#
# Name       : elog
#
# Purpose    : Report an error message to the console and to syslog.
#
# Globals    : COLLECT_TAG (read) syslog tag passed to logger
#
# Arguments  : $@ - words of the message; joined with single spaces
#
# Outputs    : "Error: <message>" on stdout and one syslog entry carrying
#              the same prefixed text.
#
############################################################################
function elog
{
    # Join the arguments with a single space regardless of the caller's
    # IFS, which is how echo and logger present multiple arguments.
    local IFS=' '

    echo "Error: $*"
    # Log exactly once, with the severity prefix, and keep the tag and
    # message quoted so neither is word-split or glob-expanded.
    logger -t "${COLLECT_TAG}" "Error: $*"
}
############################################################################
#
# Name       : wlog
#
# Purpose    : Report a warning message to the console and to syslog.
#
# Globals    : COLLECT_TAG (read) syslog tag passed to logger
#
# Arguments  : $@ - words of the message; joined with single spaces
#
# Outputs    : "Warning: <message>" on stdout and one syslog entry
#              carrying the same prefixed text.
#
############################################################################
function wlog
{
    # Join the arguments with a single space regardless of the caller's
    # IFS, which is how echo and logger present multiple arguments.
    local IFS=' '

    echo "Warning: $*"
    # Log exactly once, with the severity prefix, and keep the tag and
    # message quoted so neither is word-split or glob-expanded.
    logger -t "${COLLECT_TAG}" "Warning: $*"
}
function set_debug_mode()
@ -144,8 +146,8 @@ function set_debug_mode()
############################################################################
#
# Name       : dlog
#
# Purpose    : Report a debug message, only when debug mode is enabled.
#
# Globals    : DEBUG       (read) messages are emitted only when "true"
#              COLLECT_TAG (read) syslog tag passed to logger
#
# Arguments  : $@ - words of the message; joined with single spaces
#
# Outputs    : When DEBUG is true: one syslog entry "Debug: <message>"
#              and one stdout line "<date> Debug: <message>".
#              Nothing otherwise.
#
############################################################################
function dlog()
{
    # Join the arguments with a single space regardless of the caller's
    # IFS, which is how echo and logger present multiple arguments.
    local IFS=' '

    if [ "$DEBUG" == true ] ; then
        # One syslog entry and one timestamped console line per message.
        logger -t "${COLLECT_TAG}" "Debug: $*"
        echo "$(date) Debug: $*"
    fi
}
@ -235,3 +237,35 @@ function collect_errors()
fi
return ${RC}
}
############################################################################
#
# Name       : space_precheck
#
# Purpose    : Refuse to start a collect when the given directory is
#              already too full.
#
# Parameters : $1 - name of the host being checked (used in messages)
#              $2 - directory whose filesystem usage is checked
#
# Globals    : HOSTNAME, COLLECT_BASE_DIR and COLLECT_DIR_PCENT_CMD are
#              assigned from the parameters and remain set after return.
#              MIN_PERCENT_SPACE_REQUIRED, FAIL_INSUFFICIENT_SPACE and
#              FAIL_INSUFFICIENT_SPACE_STR are read.
#
# Description: Runs 'df --output=pcent' on the directory and extracts the
#              usage percentage. If the usage is at or above
#              MIN_PERCENT_SPACE_REQUIRED the usage is logged, the failure
#              string is echoed to stdout, three warnings are logged and
#              the shell exits with FAIL_INSUFFICIENT_SPACE. If the df
#              output cannot be parsed a warning is logged and the
#              function returns without failing the collect.
#
############################################################################
function space_precheck()
{
    # These three are intentionally left global, as the messages and any
    # later users read them by these names.
    HOSTNAME=${1}
    COLLECT_BASE_DIR=${2}
    COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"

    local df_out
    local used_line
    local size

    # COLLECT_DIR_PCENT_CMD is a whole command line held in a string, so
    # it is deliberately expanded unquoted to let the shell split it.
    # shellcheck disable=SC2086
    df_out=$(${COLLECT_DIR_PCENT_CMD})

    # Drop the 'Use%' heading, keep the first data line and strip the
    # trailing '%' plus any padding so only the number remains.
    used_line=$(printf '%s\n' "${df_out}" | grep -v Use | head -n 1)
    size=${used_line%%"%"*}
    size=${size//[[:space:]]/}

    # Validate before any numeric comparison so that an empty or
    # non-numeric df result cannot trigger '[' syntax errors.
    if [[ "${size}" =~ ^[0-9]{1,3}$ ]] && [ "${size}" -le 100 ] ; then
        if [ "${size}" -ge "${MIN_PERCENT_SPACE_REQUIRED}" ] ; then
            ilog "${COLLECT_BASE_DIR} is $size% full"
            echo "${FAIL_INSUFFICIENT_SPACE_STR}"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect"
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation."
            exit ${FAIL_INSUFFICIENT_SPACE}
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}