diff --git a/tools/collector/scripts/collect b/tools/collector/scripts/collect index 2ed3d1aa..9ad1e45c 100755 --- a/tools/collector/scripts/collect +++ b/tools/collector/scripts/collect @@ -74,28 +74,34 @@ TOOL_NAME=collect TOOL_VER=2 -TOOL_REV=0 +TOOL_REV=1 -# collect must be run as sysadmin -if [ ${UID} -eq 0 ]; then - echo "Error: Cannot run collect as 'root' user" - exit 1 -fi +# only supported username +UN="sysadmin" # pull in common utils and environment source /usr/local/sbin/collect_utils + +# collect must be run as sysadmin +if [ ${UID} -eq 0 ]; then + elog "Cannot run collect as 'root' user" + exit 1 +fi + source_openrc_if_needed function clean_up() { - `reset` - echo "" + # kill all processes whose parent is this process + pkill -P $$ + $(reset) + echo " clean up called" } function control_c() { echo "" - echo "... received exit signal ..." + echo -n "... received exit signal ..." clean_up exit 0 } @@ -104,25 +110,20 @@ function control_c() trap control_c SIGINT trap control_c SIGTERM - - # static expect log level control ; # 0 = hide expect output # 1 = show expect outout USER_LOG_MODE=0 -# static execution status 'return value' -RETVAL=0 - # limit scp bandwidth to 1MB/s # increase limit of scp bandwidth from 1MB/s to 10MB/s SCP_CMD="scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PreferredAuthentications=password -o PubkeyAuthentication=no -l $((10*8*1000))" SCP_TIMEOUT="600" SSH_CMD="ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o PreferredAuthentications=password -o PubkeyAuthentication=no" -NOWDATE=`date +"%Y%m%d.%H%M%S"` +NOWDATE=$(date +"%Y%m%d.%H%M%S") COLLECT_BASE_DIR="/scratch" collect_host="/usr/local/sbin/collect_host" -CURR_DIR=`pwd` +CURR_DIR=$(pwd) # common permission error strings @@ -145,6 +146,8 @@ function print_help() echo "Host data collection scope can be the current host, any single specified hostname," echo "a --list of hostnames or --all hosts in the system using a single command." echo "" + echo "Optionally specify --parallel or -p to collect from hosts in parallel" + echo "" echo "Optionally specify --start-date and/or --end-date options to limit" echo " the date range and therefore size of the collect." echo "" @@ -159,7 +162,8 @@ function print_help() echo " collect host1 ... collect logs for single named host" echo " collect host1 host2 host3 ... collect logs for stacked host list" echo " collect [--list | -l] host1 host2 host3 ... collect logs for list of named hosts" - echo " collect [--all | -a] ... collect data for all hosts" + echo " collect [--all | -a] ... collect logs for all hosts" + echo " collect -a -p ... collect logs for all hosts in parallel" echo "" echo "Dated Collect:" echo "" @@ -197,6 +201,7 @@ function print_help() # command line arguement variables ; defaulted DEBUG=false CLEAN=false +ASYNC=false VERBOSE=false SKIP_MASK=false INVENTORY=false @@ -214,7 +219,9 @@ LISTING=false ALLHOSTS=false HOSTS=1 HOSTLIST=(${HOSTNAME}) -THISHOST=false + +# overall collect timeout +TIMEOUT=1000 COLLECT_TARNAME="" @@ -226,8 +233,13 @@ function clear_variable_args() GETENDDATE=false } +space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR} + +############################################################################ # -# Utility function to print a status message and record the last error code +# Name : report_error +# +# Purpose : Report error to console and logfile # # Assumptions: Handles specific cases of invalid password and permission errors # by exiting so as to avoid repeated errors during multi-host @@ -236,54 +248,39 @@ function clear_variable_args() # $1 - status string # $2 - status code number # -function print_status() +function report_error() { local string=${1} local code=${2} - logger -t ${COLLECT_TAG} "${string} (reason:${code})" + if [ ${code} -eq ${FAIL_PASSWORD} ] ; then + elog "Invalid password" + exit ${code} - # if the status code is in the FAIL range ( less than WARNING ) then update RETVAL - if [ ${code} -lt ${WARN_WARNING} ] ; then - RETVAL=${code} - fi + elif [ ${code} -eq ${FAIL_PERMISSION} ] ; then + elog "Permission error ; exiting (${string})" + exit ${code} - if [ ${RETVAL} -eq ${FAIL_PASSWORD} ] ; then + elif [ ${code} -eq ${FAIL_UNREACHABLE} ] ; then + elog "${string} (reason:${code}:host unreachable)" - echo "Invalid password ; exiting (${string})" - exit ${RETVAL} + elif [ ${code} -eq ${FAIL_PERMISSION_SKIP} ] ; then + elog "${string} (reason:${code}:permission error)" - elif [ ${RETVAL} -eq ${FAIL_PERMISSION} ] ; then + elif [ ${code} -eq ${FAIL_OUT_OF_SPACE} ] ; then + elog "${string} (reason:${code}:${FAIL_NOT_ENOUGH_SPACE_STR}) ; need to increase available space ${COLLECT_BASE_DIR}" - echo "Permission error ; exiting (${string})" - exit ${RETVAL} + elif [ ${code} -eq ${FAIL_INSUFFICIENT_SPACE} ] ; then + elog "${string} (reason:${code}:${FAIL_NOT_ENOUGH_SPACE_STR}) ; ${COLLECT_BASE_DIR} usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%" - elif [ ${RETVAL} -eq ${FAIL_UNREACHABLE} ] ; then + elif [ ${code} -ge ${FAIL_TIMEOUT} -a ${code} -le ${FAIL_TIMEOUT9} ] ; then + elog "${string} (reason:${code}:operation timeout)" - echo "${string} (reason:${code}:host unreachable)" - - elif [ ${RETVAL} -eq ${FAIL_PERMISSION_SKIP} -o ${RETVAL} -eq ${FAIL_PERMISSION} ] ; then - - echo "${string} (reason:${code}:permission error)" - - elif [ ${RETVAL} -eq ${FAIL_OUT_OF_SPACE} ] ; then - - echo "${string} (reason:${code}) ; need to increase available space in host ${COLLECT_BASE_DIR}" - - elif [ ${RETVAL} -eq ${FAIL_OUT_OF_SPACE_LOCAL} ] ; then - - echo "${string} (reason:${code}) ; need to increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR}" - - elif [ ${RETVAL} -eq ${FAIL_INSUFFICIENT_SPACE} ] ; then - - echo "${string} (reason:${code}) ; ${HOSTNAME}:${COLLECT_BASE_DIR} usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%" - - elif [ ${RETVAL} -ge ${FAIL_TIMEOUT} -a ${RETVAL} -le ${FAIL_TIMEOUT9} ] ; then - - echo "${string} (reason:${code}:operation timeout)" + elif [ ${code} -ge ${FAIL_HOSTNAME} ] ; then + wlog "${string} (reason:${code})" else - echo "${string} (reason:${code})" + elog "${string} (reason:${code})" fi } @@ -300,17 +297,17 @@ function is_valid_host() if [ "${this_hostname}" == "None" ] ; then return ${FAIL_HOSTNAME} elif [ "${this_hostname}" == "${HOSTNAME}" ] ; then - return $PASS + return ${PASS} elif [ "${ACTIVE}" = true ] ; then system host-show "${this_hostname}" 2>/dev/null 1>/dev/null if [ ${?} -ne 0 ] ; then return ${FAIL_HOSTNAME} fi else - print_status "Error: can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} + report_error "can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} exit ${FAIL_INACTIVE} fi - return $PASS + return ${PASS} } @@ -333,6 +330,7 @@ while [[ ${#} -gt 0 ]] ; do ;; -v|--verbose) + USER_LOG_MODE=1 VERBOSE=true ;; @@ -346,21 +344,19 @@ while [[ ${#} -gt 0 ]] ; do -l|--list) if [[ ${#} -lt 2 ]] ; then - print_status "Error: empty host --list" ${FAIL} + report_error "empty host list" ${FAIL_HOSTNAME} exit ${FAIL} fi is_valid_host "${2}" if [ ${?} -ne 0 ] ; then - print_status "Error: empty host --list or invalid first hostname" ${FAIL} - exit ${FAIL} + report_error "empty host list or invalid first hostname" ${FAIL_HOSTNAME} + exit ${FAIL_HOSTNAME} fi HOSTLIST=(${2}) HOSTS=1 - if [ "${2}" == "${HOSTNAME}" ] ; then - THISHOST=true - elif [ "${ACTIVE}" = false ] ; then - print_status "Error: can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} + if [ "${ACTIVE}" = false ] ; then + report_error "can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} exit ${FAIL_INACTIVE} fi LISTING=true @@ -371,13 +367,12 @@ while [[ ${#} -gt 0 ]] ; do -a|--all|all) if [ "${ACTIVE}" = false ] ; then - print_status "Error: can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} + report_error "can only run collect for remote hosts on active controller" ${FAIL_INACTIVE} exit ${FAIL_INACTIVE} fi ALLHOSTS=true HOSTLIST=(${HOSTNAME}) HOSTS=1 - THISHOST=true clear_variable_args ;; @@ -399,7 +394,6 @@ while [[ ${#} -gt 0 ]] ; do -d|--debug) DEBUG=true - USER_LOG_MODE=1 clear_variable_args ;; @@ -408,19 +402,23 @@ while [[ ${#} -gt 0 ]] ; do shift ;; + -p|--parallel) + ASYNC=true + SECONDS=0 + let UNTIL=${SECONDS}+${TIMEOUT} + dlog "collect timeout is ${TIMEOUT}" + ;; + *) if [ "${LISTING}" = true ] ; then is_valid_host ${key} if [ ${?} -eq 0 ] ; then - HOSTS=$((${HOSTS} + 1)) + HOSTS=$((HOSTS+1)) HOSTLIST=( "${HOSTLIST[@]}" ${key} ) - if [ "${key}" == "${HOSTNAME}" ] ; then - THISHOST=true - fi else # make the invalid hostname a warning only. # if we got here then at least the first hostname was valid - print_status "Warning: cannot collect data from unknown host '${key}'" ${WARN_HOSTNAME} + report_error "cannot collect data from unknown host '${key}'" ${WARN_HOSTNAME} fi elif [ "${GETSTARTDATE}" = true ] ; then dlog "accepting but ignoring legacy starttime specification" @@ -428,17 +426,14 @@ while [[ ${#} -gt 0 ]] ; do dlog "accepting but ignoring legacy endtime specification" else is_valid_host ${key} - RETVAL=${?} - if [ ${RETVAL} -eq 0 ] ; then + rc=${?} + if [ ${rc} -eq 0 ] ; then HOSTLIST=${key} HOSTS=1 LISTING=true - if [ "${key}" == "${HOSTNAME}" ] ; then - THISHOST=true - fi else - print_status "Error: cannot collect data from unknown host '${key}'" ${RETVAL} - exit ${RETVAL} + report_error "cannot collect data from unknown host '${key}'" ${rc} + exit ${rc} fi fi GETSTARTDATE=false @@ -448,13 +443,6 @@ while [[ ${#} -gt 0 ]] ; do shift # past argument or value done -if [ ${RETVAL} -ne 0 ]; then - echo "command line parse error (${RETVAL})" - print_help - exit ${RETVAL} -fi - - # # request root password and use it for # all the expect driven requests below @@ -480,46 +468,37 @@ if [ "${ALLHOSTS}" = true ] ; then for foreign_host in $(system host-list | grep '[0-9]' | cut -d '|' -f 3 | tr -d ' ' | grep -v ${HOSTNAME}); do if [ "${foreign_host}" != "None" ] ; then - HOSTS=$((${HOSTS} + 1)) + HOSTS=$((HOSTS+1)) HOSTLIST=( "${HOSTLIST[@]}" ${foreign_host}) - dlog "Host:${HOSTS}: ${foreign_host}" fi done elif [ ${HOSTS} == 0 ] ; then HOSTLIST=${HOSTNAME} - THISHOST=true COLLECT_TARNAME="${HOSTNAME}_${NOWDATE}" fi -# Print Summary -if [ "${DEBUG}" == true ] ; then +# debug logs +dlog "HOSTLIST = ${HOSTLIST[@]}" +dlog "HOSTS = ${HOSTS}" +dlog "ALLHOSTS = ${ALLHOSTS}" +dlog "STARTDATE= ${STARTDATE}" +dlog "ENDDATE = ${ENDDATE}" +dlog "SECONDS = ${SECONDS}" +for hosts in "${HOSTLIST[@]}" ; do + dlog "Host:${hosts}" +done - echo "HOSTLIST = <${HOSTLIST[@]}>" - echo "HOSTS = ${HOSTS}" - echo "ALLHOSTS = ${ALLHOSTS}" - echo "STARTDATE= ${STARTDATE}" - echo "ENDDATE = ${ENDDATE}" - - for hosts in "${HOSTLIST[@]}" ; do - echo "Host:${hosts}" - done - -elif [ ${HOSTS} -eq 0 ] ; then - - print_status "Error: no hosts specified" "${FAIL}" +if [ ${HOSTS} -eq 0 ] ; then + elog "no hosts specified" exit ${FAIL} - -elif [ "${CLEAN}" == false ] ; then - +fi +if [ "${CLEAN}" == false ] ; then ilog "collecting data from ${HOSTS} host(s): ${HOSTLIST[@]}" - else - ilog "cleaning scratch space on ${HOSTLIST[@]}" - fi # @@ -550,7 +529,7 @@ function clean_scratch_dir_local () EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "Error: clean_scratch_dir_local ${this_hostname} failed" ${rc} + report_error "clean_scratch_dir_local ${this_hostname} failed" ${rc} fi return ${rc} } @@ -571,7 +550,7 @@ function clean_scratch_dir_remote() spawn bash -i expect -re $ set timeout 60 - send "${SSH_CMD} sysadmin@${this_hostname}\n" + send "${SSH_CMD} ${UN}@${this_hostname}\n" expect { "assword:" { send "${pw}\r" @@ -608,7 +587,7 @@ function clean_scratch_dir_remote() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "Error: clean_scratch_dir_remote ${this_hostname} failed" ${rc} + report_error "clean_scratch_dir_remote ${this_hostname} failed" ${rc} fi return ${rc} } @@ -629,7 +608,7 @@ function delete_remote_dir_or_file() spawn bash -i expect -re $ set timeout 60 - send "${SSH_CMD} sysadmin@${this_hostname}\n" + send "${SSH_CMD} ${UN}@${this_hostname}\n" expect { "assword:" { send "${pw}\r" @@ -666,7 +645,7 @@ function delete_remote_dir_or_file() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "Error: delete_remote_dir_or_file ${this_hostname} failed" ${rc} + report_error "delete_remote_dir_or_file ${this_hostname} failed" ${rc} fi return ${rc} } @@ -691,7 +670,7 @@ function get_file_from_host() spawn bash -i set timeout ${SCP_TIMEOUT} expect -re $ - send "${SCP_CMD} sysadmin@${this_hostname}:${remote_src} ${local_dest} 2>>${HOST_COLLECT_ERROR_LOG}\n" + send "${SCP_CMD} ${UN}@${this_hostname}:${remote_src} ${local_dest} 2>>${HOST_COLLECT_ERROR_LOG}\n" expect { "assword:" { send "${pw}\r" @@ -713,7 +692,7 @@ function get_file_from_host() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "failed to get_file_from ${this_hostname}" ${rc} + report_error "failed to get_file_from ${this_hostname}" ${rc} else # Look for "No space left on device" error grep -q "${FAIL_OUT_OF_SPACE_STR}" ${HOST_COLLECT_ERROR_LOG} @@ -763,7 +742,8 @@ function create_collect_dir_local() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "failed to create_collect_dir_local for ${dir}" ${rc} + report_error "failed to create_collect_dir_local for ${dir}" ${rc} + exit ${rc} fi return ${rc} } @@ -797,7 +777,7 @@ function remove_file_local() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "failed to remove_file_local ${local_file}" ${rc} + report_error "failed to remove_file_local ${local_file}" ${rc} fi fi return ${rc} @@ -829,7 +809,7 @@ function remove_dir_local() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "failed to remove_dir_local ${dir}" ${rc} + report_error "failed to remove_dir_local ${dir}" ${rc} fi return ${rc} } @@ -862,7 +842,7 @@ function move_file_local() EOF local rc=${?} if [ ${rc} -ne ${PASS} ] ; then - print_status "failed to move_file_local ${src} to ${dst}" ${rc} + report_error "failed to move_file_local ${src} to ${dst}" ${rc} fi return ${rc} } @@ -872,22 +852,435 @@ EOF function echo_stats() { local secs=${1} - local file=${2} + local label="${2}" + local file="${3}" + MSG="" + if [ $label != "stats-only" ] ; then + if [ "${ASYNC}" = true ] ; then - echo -n " ($(date -d@${secs} -u +%H:%M:%S)" + MSG="collected " + len=${#label} + for ((i=len;i/dev/null) if [ $? -eq 0 ] ; then - printf " %5s)\n" "${size}" + printf "%s %5s)\n" "${MSG}" "${size}" return fi fi - echo ")" + printf "%s )\n" "${MSG}" +} + +############################################################################ +# +# Name : collect_host_run +# +# Purpose : Run collect host in selected mode +# +# Description: Run collect_host as a background task for each host if +# parallel option is specified. Otherwise, run collect in +# forground (legacy mode) for each host one after the other. +# +############################################################################ + +function collect_host_run() +{ + local host="${1}" + local rc=${PASS} + + if [ "${ASYNC}" = false ] ; then + MSG="collecting" + # line up the host names + len=${#host} + for ((i=len;i/dev/null + rc=${?} + if [ ${rc} -ne 0 ] ; then - exit ${RETVAL} + # the process is done ; get its exit code + wait "${info[${INDEX_PID}]}" + rc=${?} + if [ ${rc} == ${PASS} ] ; then + + # if it passed then fetch that host's tarball + if [ "${info[${INDEX_HOST}]}" == "${HOSTNAME}" ] ; then + collect_host_complete_local "${info[${INDEX_TARBALL}]}" + else + collect_host_complete_remote "${info[${INDEX_HOST}]}" \ + "${info[${INDEX_TARBALL}]}" + fi + + collect_host_done ${index} ${rc} + collect_host_stats ${index} ${rc} + else + collect_host_done ${index} ${rc} + report_error "failed to collect from ${info[${INDEX_HOST}]}" ${rc} + fi + else + monitoring=true + fi + + elif [ "${info[${INDEX_STAGE}]}" == "${STAGE_RUN}" ] ; then + monitoring=true + # update stage to Monitor + collect_host_monitor ${index} + fi + index=$((index+1)) + done + + if [ "${monitoring}" = false ] ; then + dlog "All hosts done ..." + break + fi + done + echo "" +fi + +# Report that the overall collect timed-out +if [ "$monitoring" = true ]; then + report_error "collect operation timeout after $TIMEOUT secs" ${FAIL_TIMEOUT} +fi + +# Don't create a tarball if the collect name dir does not exists or contain files +if [ -d ${COLLECT_DIR} ] ; then + stat ${COLLECT_DIR}/* 2>/dev/null 1>/dev/null + if [ $? -eq 0 ] ; then + tarballs=(${COLLECT_DIR}/*) + for tarball in ${tarballs[@]} + do + dlog "collected $tarball" + done + else + elog "No ${COLLECT_DIR} tarballs found ; refusing to create empty ${TARBALL_NAME}" + exit ${FAIL_NO_TARFILES} + fi +else + elog "${COLLECT_DIR} not present ; refusing to create empty ${TARBALL_NAME}" + exit ${FAIL_NO_TARDIR} fi echo -n "creating ${named} tarball ${TARBALL_NAME} ... " +remove_file_local ${COLLECT_ERROR_LOG} +remove_file_local ${HOST_COLLECT_ERROR_LOG} + /usr/bin/expect << EOF log_user ${USER_LOG_MODE} spawn bash -i @@ -1226,17 +1545,17 @@ echo -n "creating ${named} tarball ${TARBALL_NAME} ... " timeout { exit ${FAIL_TIMEOUT} } } EOF - RETVAL=${?} - if [ ${RETVAL} -ne ${PASS} ] ; then + rc=${?} + if [ ${rc} -ne ${PASS} ] ; then collect_errors ${HOSTNAME} - print_status "failed to create ${TARBALL_NAME}" ${RETVAL} + report_error "failed to create ${TARBALL_NAME}" ${rc} else collect_errors ${HOSTNAME} - RETVAL=$? - if [ ${RETVAL} -eq ${PASS} ] ; then + rc=$? + if [ ${rc} -eq ${PASS} ] ; then secs=$((SECONDS-COLLECT_START_TIME)) echo -n "done" - echo_stats $secs "${TARBALL_NAME}" + echo_stats $secs "stats-only" "${TARBALL_NAME}" logger -t ${COLLECT_TAG} "created ${named} tarball ${TARBALL_NAME}" else echo "removing incomplete collect: ${TARBALL_NAME}" @@ -1249,4 +1568,4 @@ EOF # return to callers dir cd ${CURR_DIR} -exit ${RETVAL} +exit ${rc} diff --git a/tools/collector/scripts/collect_host b/tools/collector/scripts/collect_host index 7a8efb8a..231742e0 100755 --- a/tools/collector/scripts/collect_host +++ b/tools/collector/scripts/collect_host @@ -84,7 +84,6 @@ COLLECT_INCLUDE="/var/run /etc /root" FLIGHT_RECORDER_PATH="var/lib/sm/" FLIGHT_RECORDER_FILE="sm.eru.v1" VAR_LOG_INCLUDE_LIST="/tmp/${COLLECT_NAME}.lst" -COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}" COLLECT_DIR_USAGE_CMD="df -h ${COLLECT_BASE_DIR}" COLLECT_DATE="/usr/local/sbin/collect_date" COLLECT_SYSINV="${COLLECT_PATH}/collect_sysinv" @@ -98,26 +97,7 @@ function log_space() ilog "${COLLECT_BASE_DIR} ${msg} ${space1}" } -function space_precheck() -{ - space="`${COLLECT_DIR_PCENT_CMD}`" - space1=`echo "${space}" | grep -v Use` - size=`echo ${space1} | cut -f 1 -d '%'` - if [ ${size} -ge 0 -a ${size} -le 100 ] ; then - ilog "${COLLECT_BASE_DIR} is $size% full" - if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then - wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect" - wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect" - wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation." - echo "${FAIL_INSUFFICIENT_SPACE_STR}" - exit ${FAIL_INSUFFICIENT_SPACE} - fi - else - wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output" - fi -} - -space_precheck +space_precheck ${HOSTNAME} ${COLLECT_BASE_DIR} CURR_DIR=`pwd` mkdir -p ${COLLECT_NAME_DIR} @@ -187,7 +167,7 @@ function collect_extra() # Collect process, thread and scheduling, and elapsed time # This has everything that ps-sched.sh does, except for cpu affinity mask, - # adds: stime,etime,time,wchan,tty). + # adds: stime,etime,time,wchan,tty). delimiter ${LOGFILE} "ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command" ps -eL -o pid,lwp,ppid,state,class,nice,rtprio,priority,psr,stime,etime,time,wchan:16,tty,comm,command >> ${LOGFILE} diff --git a/tools/collector/scripts/collect_utils b/tools/collector/scripts/collect_utils index 739e1612..2364cc58 100755 --- a/tools/collector/scripts/collect_utils +++ b/tools/collector/scripts/collect_utils @@ -36,17 +36,20 @@ FAIL_INACTIVE=35 FAIL_PERMISSION_SKIP=36 FAIL_OUT_OF_SPACE=37 FAIL_INSUFFICIENT_SPACE=38 -FAIL_OUT_OF_SPACE_LOCAL=39 -FAIL_CREATE=39 +FAIL_INTERNAL=39 +FAIL_NO_TARDIR=40 +FAIL_NO_TARBALLS=41 # Warnings are above 200 WARN_WARNING=200 WARN_HOSTNAME=201 # Failure Strings +FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space" FAIL_OUT_OF_SPACE_STR="No space left on device" FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable" FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device" +FAIL_UNREACHABLE_STR="Unreachable" # The minimum amount of % free space on /scratch to allow collect to proceed MIN_PERCENT_SPACE_REQUIRED=75 @@ -121,19 +124,18 @@ function ilog { echo "$@" logger -t ${COLLECT_TAG} $@ - #logger -p local3.info -t ${COLLECT_TAG} $@ } function elog { echo "Error: $@" - logger -t ${COLLECT_TAG} $@ + logger -t ${COLLECT_TAG} "Error: $@" } function wlog { echo "Warning: $@" - logger -t ${COLLECT_TAG} $@ + logger -t ${COLLECT_TAG} "Warning: $@" } function set_debug_mode() @@ -144,8 +146,8 @@ function set_debug_mode() function dlog() { if [ "$DEBUG" == true ] ; then - logger -t ${COLLECT_TAG} $@ - echo "Debug: $@" + logger -t ${COLLECT_TAG} "Debug: $@" + echo "$(date) Debug: $@" fi } @@ -235,3 +237,35 @@ function collect_errors() fi return ${RC} } + +############################################################################ +# +# Name : space_precheck +# +# Description: +# +############################################################################ + +function space_precheck() +{ + HOSTNAME=${1} + COLLECT_BASE_DIR=${2} + COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}" + + space="`${COLLECT_DIR_PCENT_CMD}`" + space1=`echo "${space}" | grep -v Use` + size=`echo ${space1} | cut -f 1 -d '%'` + if [ ${size} -ge 0 -a ${size} -le 100 ] ; then + if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then + ilog "${COLLECT_BASE_DIR} is $size% full" + echo "${FAIL_INSUFFICIENT_SPACE_STR}" + wlog "${HOSTNAME}:${COLLECT_BASE_DIR} does not have enough available space in to perform collect" + wlog "${HOSTNAME}:${COLLECT_BASE_DIR} must be below ${MIN_PERCENT_SPACE_REQUIRED}% to perform collect" + wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation." + exit ${FAIL_INSUFFICIENT_SPACE} + fi + else + wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output" + fi +} +