From 014314a07f875dbfd7aa47c5b46db02b1b5e81fc Mon Sep 17 00:00:00 2001
From: Eric MacDonald <eric.macdonald@windriver.com>
Date: Thu, 13 Jul 2023 16:28:07 +0000
Subject: [PATCH] Fix subcloud collect when a subcloud system node collect
 fails

An orchestrated subcloud collect fails if one or more of the
subcloud system nodes fail to collect. For example, if the inactive
controller-1 is unreachable.

This update makes collect error handling improvements to allow
for an incomplete subcloud collect to finish successfully while
there is at least one valid host collected from that subcloud.

Also, allow the --timeout option to specify a timeout that is
outside the recommended range. A warning message will be produced.

Test Plan: each test case verified in both parallel and inline modes

PASS: Verify subcloud collect completes successfully when the active
      controller collect succeeds but the inactive controller or other
      subcloud system node is unreachable.
PASS: Verify the same case as above but with a --clean operation
      rather than a collect operation.
PASS: Verify the above 2 test cases for local system collect all
      rather than subcloud collect.
PASS: Verify subcloud collect completes successfully when the active
      controller collect succeeds but the inactive controller or
      other subcloud system node is reachable but fails with host
      collect timeout.
PASS: Verify a warning is issued while still allowing the user to
      specify a timeout that is outside the recommended range.
      Note: It's useful to allow this for testing and extreme cases
PASS: Verify missing report tool does not fail a collect

Regression:

PASS: Verify error handling of collect from unknown host or subcloud
PASS: Verify subcloud collect fails if the subcloud is not reachable
PASS: Verify local collect failure handling when remote collect_host
      thread is killed.
PASS: Verify host collect failure handling when a reachable
      host collect fails with a timeout.

Closes-Bug: 2026768
Change-Id: Id0d53c42dae9c22323d798e23463dc636f7fbe38
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
---
 tools/collector/debian-scripts/collect       | 44 ++++++++++----------
 tools/collector/debian-scripts/collect_utils |  2 +
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/tools/collector/debian-scripts/collect b/tools/collector/debian-scripts/collect
index ac1e493a..78021867 100644
--- a/tools/collector/debian-scripts/collect
+++ b/tools/collector/debian-scripts/collect
@@ -491,7 +491,7 @@ function report_error()
         elog "Permission error ; exiting (${string})"
 
     elif [ ${code} -eq ${FAIL_UNREACHABLE} ] ; then
-        elog "${string} (reason:${code}:unreachable)"
+        wlog "${string} (reason:${code}:unreachable)"
 
     elif [ ${code} -eq ${FAIL_PERMISSION_SKIP} ] ; then
         elog "${string} (reason:${code}:permission error)"
@@ -503,7 +503,7 @@ function report_error()
         elog "${string} (reason:${code}:${FAIL_NOT_ENOUGH_SPACE_STR}) ; ${COLLECT_BASE_DIR} usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%"
 
     elif [ ${code} -ge ${FAIL_TIMEOUT} -a ${code} -le ${FAIL_TIMEOUT9} ] ; then
-        elog "${FAIL_TIMEOUT_STR} ; ${string} (reason:${code})"
+        wlog "${FAIL_TIMEOUT_STR} ; ${string} (reason:${code})"
 
     elif [ ${code} -eq ${FAIL_SUBCLOUD_TIMEOUT} ] ; then
         elog "${FAIL_SUBCLOUD_TIMEOUT_STR} ; ${string} (reason:${code})"
@@ -532,6 +532,9 @@ function report_error()
     elif [ ${code} -eq ${FAIL_MISSING_PARAMETER} ] ; then
         elog "${FAIL_MISSING_PARAMETER_STR} ; ${string} (reason:${code})"
 
+    elif [ ${code} -eq ${FAIL_TIMEOUT_ARG} ] ; then
+        wlog "${FAIL_TIMEOUT_ARG_STR} ; ${string} (reason:${code})"
+
     else
         elog "${string} (reason:${code})"
     fi
@@ -728,11 +731,9 @@ while [[ ${#} -gt 0 ]] ; do
         if [[ ${2} =~ ^[0-9]+$ ]] ; then
             if [ ${2} -lt ${TIMEOUT_MIN_MINS} -o \
                  ${2} -gt ${TIMEOUT_MAX_MINS} ] ; then
-                elog "timeout must be between ${TIMEOUT_MIN_MINS} and ${TIMEOUT_MAX_MINS} minutes"
-                collect_exit ${FAIL_TIMEOUT_ARG}
-            else
-                TIMEOUT="$((${2}*60))"
+                report_error "specified ${2} minute timeout is out-of-range ; should be between ${TIMEOUT_MIN_MINS} and ${TIMEOUT_MAX_MINS} minutes" ${FAIL_TIMEOUT_ARG}
             fi
+            TIMEOUT="$((${2}*60))"
         else
             elog "timeout value must be an integer"
             collect_exit ${FAIL_TIMEOUT_ARG}
@@ -924,7 +925,7 @@ else
                         HOSTLIST+=("${host}")
                     fi
                 else
-                    report_error "cannot collect data from unknown host '${host}'" ${WARN_HOSTNAME}
+                    wlog "cannot collect data from unknown host '${host}'"
                 fi
             done
         fi
@@ -981,7 +982,7 @@ else
                             SUBCLOUDLIST+=("${subcloud}")
                         fi
                     else
-                        report_error "cannot collect data from unknown subcloud '${subcloud}'" ${WARN_SUBCLOUD}
+                        wlog "cannot collect data from unknown subcloud '${subcloud}'"
                     fi
                 fi
             done
@@ -1004,7 +1005,7 @@ else
                         SUBCLOUDLIST+=("${subcloud}")
                     fi
                 else
-                    report_error "cannot collect data from unknown subcloud '${subcloud}'" ${WARN_SUBCLOUD}
+                    wlog "cannot collect data from unknown subcloud '${subcloud}'"
                 fi
             done
         fi
@@ -1405,7 +1406,7 @@ EOF
     local rc=${?}
     if [ ${rc} -ne ${PASS} ] ; then
         dlog "delete_remote_dir_or_file parms=${remote_hostname}:${login_prompt}:${dir_or_file}"
-        report_error "failed to delete ${dir_or_file} on ${remote_hostname} (${login_prompt})" ${rc}
+        wlog "failed to delete ${dir_or_file} on ${remote_hostname} (reason:${rc}:${login_prompt})"
     fi
     return ${rc}
 }
@@ -2136,7 +2137,7 @@ function collect_host_complete_local()
             collect_exit ${FAIL_OUT_OF_SPACE}
 
         else
-            report_error "failed to collect from ${HOSTNAME} [host complete]" ${rc}
+            wlog "failed to collect from ${HOSTNAME} (reason:${rc}:host complete:${COLLECT_DIR}:${tarname})"
             dlog "collect_host_complete_local failure: ${COLLECT_DIR}:${tarname}:${rc}"
         fi
     fi
@@ -2189,7 +2190,7 @@ function collect_host_complete_remote ()
             rc=${PASS}
         fi
     else
-        report_error "failed to collect from ${host} [get file]" ${rc}
+        wlog "failed to collect from ${host} (reason:${rc}:get file:${COLLECT_DIR}:${tarname}.${SUFFIX})"
         dlog "get_file_from_host failure: ${host}:${tarname}.${SUFFIX}:${COLLECT_DIR}"
     fi
     return ${rc}
@@ -2428,7 +2429,7 @@ function collect_subcloud_clean()
 
     check_host_reachable "${subcloud}"
     if [ ${?} -ne ${PASS} ] ; then
-        report_error "cannot clean ${subcloud}" ${FAIL_UNREACHABLE}
+        wlog "cannot clean unreachable subcloud ${subcloud}"
         return ${FAIL_UNREACHABLE}
     fi
 
@@ -2637,7 +2638,7 @@ function collect_hosts()
 
             check_host_reachable "${host}"
             if [ ${?} -ne ${PASS} ] ; then
-                report_error "cannot collect from ${host}" ${FAIL_UNREACHABLE}
+                wlog "cannot collect from ${host} (reason:${FAIL_UNREACHABLE}:${FAIL_UNREACHABLE_STR})"
                 continue
             fi
 
@@ -2688,16 +2689,16 @@ function collect_hosts()
                     rc=${?}
                     if [ ${rc} -ne ${PASS} ] ; then
                         # handle copy error here
-                        report_error "failed to collect from ${host} [host file get]" ${rc}
+                        wlog "failed to collect from ${host} (reason:${rc}:host file get)"
                     else
                         secs=$((SECONDS-HOST_START_TIME))
                         echo -n "done"
                         echo_stats $secs "${TARNAME}" "${COLLECT_DIR}/${TARNAME}.tgz"
                     fi
                 elif [ ${rc} -ge ${FAIL_TIMEOUT} -a ${rc} -le ${FAIL_TIMEOUT9} -a "${DCROLE}" == "${DCROLE_SUBCLOUD}" ] ; then
-                    report_error "failed to collect from ${host} [subcloud host run timeout]" ${FAIL_SUBCLOUD_TIMEOUT}
+                    wlog "failed to collect from ${host} (reason:${FAIL_SUBCLOUD_TIMEOUT}:subcloud host run timeout)"
                 else
-                    report_error "failed to collect from ${host} [host]" ${rc}
+                    wlog "failed to collect from ${host} (reason:${rc}:host)"
                 fi
             fi
         fi
@@ -2749,7 +2750,7 @@ function collect_hosts()
                             DONE_COUNT=$((DONE_COUNT+1))
                         else
                             collect_host_done ${index} ${rc}
-                            report_error "failed to collect from ${info[${INDEX_HOST}]} [target]" ${rc}
+                            wlog "failed to collect from ${info[${INDEX_HOST}]} (reason:${rc}:target)"
                         fi
                     else
                         if [ ${DONE_COUNT} -eq 0 ] ; then
@@ -2779,7 +2780,6 @@ function collect_hosts()
 
     # Report that the overall collect timed-out
     if [ "$monitoring" = true ]; then
-        # there may be partial collect worth keeping
         report_error "collect operation timeout after $TIMEOUT secs" ${FAIL_TIMEOUT}
     fi
 }
@@ -2826,7 +2826,7 @@ collect_subclouds()
 
             check_host_reachable "${subcloud}"
             if [ ${?} -ne ${PASS} ] ; then
-                report_error "cannot collect from ${subcloud}" ${FAIL_UNREACHABLE}
+                wlog "cannot collect from ${subcloud} (reason:${FAIL_UNREACHABLE}:${FAIL_UNREACHABLE_STR})"
                 continue
             fi
 
@@ -3061,7 +3061,7 @@ function get_report_tool()
 
     local rc=${?}
     if [ ${rc} -ne ${PASS} ] ; then
-        report_error "failed to get report tool from ${local_path}" ${rc}
+        wlog "failed to get report tool from ${local_path} (reason:${rc})"
     fi
 }
 
@@ -3085,7 +3085,7 @@ function get_report_plugins()
 
     local rc=${?}
     if [ ${rc} -ne ${PASS} ] ; then
-        report_error "failed to get report plugins from ${local_path}" ${rc}
+        wlog "failed to get report tool plugins from ${local_path} (reason:${rc})"
     fi
 }
 
diff --git a/tools/collector/debian-scripts/collect_utils b/tools/collector/debian-scripts/collect_utils
index f0b495e0..bc41ce04 100755
--- a/tools/collector/debian-scripts/collect_utils
+++ b/tools/collector/debian-scripts/collect_utils
@@ -77,6 +77,8 @@ FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
 FAIL_UNREACHABLE_STR="Unreachable"
 
 FAIL_TIMEOUT_STR="operation timeout"
+FAIL_TIMEOUT_ARG_STR="out-of-range timeout"
+
 FAIL_SUBCLOUD_TIMEOUT_STR="subcloud collect timeout"
 
 FAIL_NO_FILE_SPECIFIED_STR="no file specified"