Add control plane testing during update.

continuous-test.sh is a light shell daemon to start "workload_launch.sh
sanityfast" in a loop.

That daemon collects the log of each individual run in a "ct-<PID>"
directory and the result in control-plane-test-<PID>.log.  The pid
can be found in control-plane-test.pid.

In tripleo-upgrade we add the usual check integration at the relevant
part of the update. The new boolean option
"--update-controlplane-check" is added to the infrared interface.

By default this test is not activated. control_plane_check variable
controls that behavior outside of infrared.

We have two definitions of failure:
- how many successive failures we have;
- how many total failures we have.

This should avoid false positives while still catching sufficiently long outages.

We also activate debug output for the workload launch script to ease
error checking.

Change-Id: Ieff860667cf67a1c1f67221b05575e952a0636fa
(cherry picked from commit 57d9c8336b)
This commit is contained in:
Sofer Athlan-Guyot 2023-02-09 11:23:27 +01:00
parent a370edabc5
commit 701e880fc1
14 changed files with 379 additions and 1 deletions

View File

@ -188,6 +188,13 @@ log_playbook: "{{ working_dir }}/collect_log.yaml"
log_playbook_script: "{{ working_dir }}/collect_log"
log_stages: true
# enable control plane testing.
control_plane_check: false
# Max number of consecutive workload launch failures
tu_ctl_plane_max_successive_failure: 2
# Max number of total failures (heuristic for an update run)
tu_ctl_plane_max_failure: 6
# enable web load test
fip_http_check: false

198
files/continuous-test.sh Normal file
View File

@ -0,0 +1,198 @@
#!/bin/bash
set -eu
## ---------------------------------------------------------------------
## NAME:
## continuous-test.sh - run a script in a loop and gather the results.
##
## SYNOPSIS
## continuous-test.sh [OPTION] [SCRIPT]
##
## DESCRIPTION
## Run SCRIPT and collect date, time and exit status.
##
## The SCRIPT will be continuously run until we get a SIGUSR1
## signal. When the signal is caught, we will wait for the last
## run to end and dump to stdout the result of all commands.
##
## The output of each command will be saved into "ct-<pid>/" under
## the current directory.
##
## A /var/run/continuous-test.pid will register the pid of the
## running process.
##
## OPTIONS
## -d Enable debug mode.
## -l <PREFIX> Prefix used for:
## - Logfile: Default to ./continuous-test-<PID>.log
## - Done file: Default to ./continuous-test-<PID>.done
##
## The logfile will hold the result of each command run and the
## done file indicate that the last run is finished when we want
## to end the continuous test.
##
## Both those files will have the <PID> added to the prefix so that
## multiple command can be run in parallel if needed.
##
## The pid can be found in the PIDFILE.
##
## -p <PIDFILE> save the PID to that file.
## Default to ./continuous-test.pid
##
## -o <DIR> Directory where to save all those files. Default to
## the directory where continuous-test.sh is.
##
## FILES
##
## ./continuous-test.pid will hold the pid of the process
## ./continuous-test-<pid>.log holds the result of each run
## ./ct-<pid>/<files> will hold the output of each command.
##
## ENVIRONMENT
## CT_SCRIPT_ARGS A string holding any argument that should
## be passed to SCRIPT.
##
## AUTHOR
## Athlan-Guyot Sofer <sathlang@redhat.com>
## ---------------------------------------------------------------------
# Name of this script: re-executed by the parent (daemonize) and by the
# child (one run per iteration).  Quote "$0" so paths containing spaces
# do not break basename.
FILE=$(basename "$0")
# Role flags: the script re-invokes itself with these toggled to select
# the parent / child / single-run code paths below.
CT_PARENT=${CT_PARENT:-true}
CT_CHILD=${CT_CHILD:-false}
# Set to true by the signal handler to end the child's main loop.
CT_STOP=false
## ---------------------------------------------------------------------
## Function definitions.
# Signal handler for the child: record the stop request so the main
# loop exits after the run in progress completes.
process_sig() {
  printf '%s: received term signal\n' "$$" >&2
  CT_STOP=true
}
# Signal handler for the parent (debug mode): relay the stop request to
# the detached child as SIGUSR1 so it can finish its current run.
process_sigterm_parent() {
  echo "$$: Parent received term signal" >&2
  if [ -z "${CT_PID}" ]; then
    # Should not happen: no child pid recorded, signal the whole group.
    echo "$$: received term signal: killing group" >&2
    kill -s USR1 0
  else
    echo "$$: received term signal: killing $CT_PID" >&2
    kill -s USR1 "$CT_PID"
  fi
}
# Daemonize the process: the parent parses the options, exports the
# configuration through the environment, re-executes this script
# detached (setsid) as the child, then exits.
if "${CT_PARENT}"; then
  export DEBUG=false
  while getopts :p:l:o:d OPT; do
    case $OPT in
      l|+l)
        CT_PREFIX="$OPTARG"
        ;;
      p|+p)
        CT_PIDFILE="$OPTARG"
        ;;
      o|+o)
        CT_DIR="$OPTARG"
        ;;
      d|+d)
        DEBUG=true
        ;;
      *)
        echo "usage: ${0##*/} [-l LOGFILE] [-p PIDFILE] [-d] SCRIPT"
        exit 2
    esac
  done
  shift $(( OPTIND - 1 ))
  OPTIND=1
  # Default the output directory to this script's own directory.  Use
  # ${CT_DIR:-}: a bare ${CT_DIR} aborts under 'set -u' when -o was not
  # given and CT_DIR is not in the environment.
  if [ -z "${CT_DIR:-}" ]; then
    CT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  fi
  export CT_DIR
  # Debug keeps diagnostics on the current terminal; otherwise drop them.
  if $DEBUG; then
    export CT_TTY=$(tty)
  else
    export CT_TTY=/dev/null
  fi
  exec 2>"$CT_TTY"
  echo "entering parent $$ $FILE" >&2
  export CT_SCRIPT_ARGS=${CT_SCRIPT_ARGS:-""}
  export CT_SCRIPT="${@:?'SCRIPT cannot be empty.'}"
  export CT_PREFIX="${CT_PREFIX:-}"
  export CT_PIDFILE="${CT_PIDFILE:-}"
  export CT_CHILD=true
  export CT_PARENT=false
  # Start the child in its own session so it survives the caller's exit.
  setsid "${CT_DIR}/${FILE}" "$@" </dev/null >"$CT_TTY" 2>"$CT_TTY" &
  CT_PID=$!
  if $DEBUG ; then
    # In debug mode stay attached and relay SIGTERM/SIGINT as SIGUSR1.
    trap process_sigterm_parent SIGTERM SIGINT
    wait $CT_PID
    echo "leaving parent $$ after waiting for $CT_PID/$FILE" >&2
  else
    echo "leaving parent $$ $FILE" >&2
  fi
  sync
  exit 0
fi
if "${CT_CHILD}"; then
if [ -n "${CT_TTY}" ]; then
exec 2> ${CT_TTY}
exec 1> ${CT_TTY}
else
CT_TTY=/dev/null
fi
echo "entering child $$ running $FILE" >&2
if [ -z "${CT_PREFIX}" ]; then
CT_LOGFILE="${CT_DIR}/continuous-test-$$.log"
else
CT_LOGFILE="${CT_DIR}/${CT_PREFIX}-$$.log"
fi
if [ -z "${CT_PIDFILE}" ]; then
CT_PIDFILE="${CT_DIR}/continuous-test.pid"
fi
export CT_LOGFILE
export CT_PIDFILE
export CT_CMD_OUT_DIR="${CT_DIR}/ct-$$"
trap process_sig SIGTERM SIGUSR1
export CT_CHILD=false
export CT_PARENT=false
echo $$ > "${CT_PIDFILE}"
# Main loop where eventually run the script.
while ! $CT_STOP; do
setsid ${CT_DIR}/$FILE "$@" </dev/null 2>$CT_TTY
done
echo "Leaving child $$ running $FILE" >&2
if [ -z "${CT_PREFIX}" ]; then
CT_ENDFILE="${CT_DIR}/continuous-test-$$.done"
else
CT_ENDFILE="${CT_DIR}/${CT_PREFIX}-$$.done"
fi
date > $CT_ENDFILE
sync
exit 0
fi
# Single-run mode (both role flags false): execute CT_SCRIPT once and
# append a one-line record (date, duration, status) to the log file.
exec >>"$CT_LOGFILE"
mkdir -p "${CT_CMD_OUT_DIR}"
echo "entering loop $$ $CT_SCRIPT" >&2
# We cannot have two jobs in the same second, or else we would
# overwrite the per-run output file. sleep 1 prevents this.
sleep 1
start_time="$(date +%s)"
start_time_h="$(date -d@"${start_time}")"
echo -n "${start_time_h} (${start_time}) "
# Capture the script's exit code without aborting under 'set -e'.
set +e
# CT_SCRIPT_ARGS is intentionally unquoted: it may carry several
# whitespace-separated arguments.
"${CT_SCRIPT}" ${CT_SCRIPT_ARGS} &>> "${CT_CMD_OUT_DIR}/${start_time}.log"
RC="${?}"
set -e
end_time="$(date +%s)"
duration=$((end_time - start_time))
echo -n "${duration}s "
if [ $RC -eq 0 ]; then
  echo "SUCCESS (0)"
else
  echo "FAILED (${RC})"
fi
echo "leaving loop $$" >&2

View File

@ -110,6 +110,11 @@
updates_workarounds: true
when: install.updates.workarounds
# Translate the infrared --update-controlplane-check flag into the
# tripleo-upgrade control_plane_check variable consumed by the role.
- name: Set update control plane testing
  set_fact:
    control_plane_check: true
  when: install.get('update', {}).get('controlplane', {}).get('check', {})
- name: Set upgrade floating ip check
set_fact:
l3_agent_connectivity_check: true

View File

@ -223,6 +223,11 @@ subparsers:
the validations group execution.
Example: validations-extra-args: "--extra-vars min_undercloud_ram_gb=5"
default: ''
update-controlplane-check:
type: Bool
help: |
Check control plane during update.
default: false
- title: TripleO Options
options:

View File

@ -0,0 +1,5 @@
---
# Kick off the continuous control plane test via the generated start
# script in the working directory.
- name: Start control plane testing.
  shell: "{{ working_dir }}/control_plane_test_start.sh"
  when: control_plane_check|bool

View File

@ -0,0 +1,6 @@
---
# Stop the control plane test and evaluate its results.  Callers may
# override the failure budgets with max_cons_err / max_err; otherwise
# the role defaults apply.  Use the canonical default() filter: the
# original "var is defined|ternary(var, default)" form relies on
# fragile is/| precedence and evaluates the possibly-undefined
# variable eagerly.
- name: Stop control plane testing.
  shell: |
    {{ working_dir }}/control_plane_test_stop.sh \
        {{ max_cons_err | default(tu_ctl_plane_max_successive_failure) }} {{ max_err | default(tu_ctl_plane_max_failure) }}
  when: control_plane_check|bool

View File

@ -0,0 +1,20 @@
---
# Install the control plane testing tooling into the working
# directory: the generic continuous-test wrapper (static file) plus the
# start/stop helper scripts rendered from templates.  All three are
# made executable (0775).
- block:
    - name: Create control plane wrapper
      copy:
        src: "continuous-test.sh"
        dest: "{{ working_dir }}/continuous-test.sh"
        mode: 0775
    - name: Create control plane start script.
      template:
        src: "control_plane_test_start.sh.j2"
        dest: "{{ working_dir }}/control_plane_test_start.sh"
        mode: 0775
    - name: Create control plane stop script.
      template:
        src: "control_plane_test_stop.sh.j2"
        dest: "{{ working_dir }}/control_plane_test_stop.sh"
        mode: 0775
  when: control_plane_check|bool

View File

@ -47,6 +47,9 @@
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: Start control plane testing
import_tasks: ../common/control_plane_test_start.yaml
- name: Start Ceph update using cephadm
command:
cmd: >
@ -104,3 +107,8 @@
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"
- name: Stop control plane testing
import_tasks: ../common/control_plane_test_stop.yaml
vars:
max_cons_err: 0
max_err: 0

View File

@ -124,6 +124,9 @@
- name: Create L3-agent failover scripts
import_tasks: ../common/create_l3_agent_failover_check_script.yml
- name: include control plane test tasks
import_tasks: ../common/create_control_plane_test_scripts.yaml
- name: create nova actions check script
import_tasks: ../common/create_nova_actions_check_script.yml

View File

@ -10,6 +10,26 @@
- not overcloud_batch_update|bool
- log_stages|bool
# Batch update: the whole overcloud updates at once, start the test
# with the default failure budgets.
- name: Start control plane testing
  import_tasks: ../common/control_plane_test_start.yaml
  when:
    - overcloud_batch_update|bool
# NOTE(review): max_cons_err/max_err are consumed by the *stop* script;
# passing them as vars to the start tasks appears to have no effect —
# confirm intent (they may have been meant for the matching stop task).
- name: Start control plane testing (serial)
  import_tasks: ../common/control_plane_test_start.yaml
  vars:
    max_cons_err: 0
    max_err: 0
  when:
    - not overcloud_batch_update|bool
    - oc_current_role[0] != "Controller"
# Serial update of the Controller role itself: keep default budgets.
- name: Start control plane testing (serial - controller)
  import_tasks: ../common/control_plane_test_start.yaml
  when:
    - not overcloud_batch_update|bool
    - oc_current_role[0] == "Controller"
- name: run overcloud minor update in each of the roles/hostgroups
async: 25200
poll: 0
@ -40,3 +60,6 @@
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"
- name: Stop control plane testing
import_tasks: ../common/control_plane_test_stop.yaml

View File

@ -0,0 +1,12 @@
#!/bin/bash
#
# Script to test control plane by creating a vm in a loop during the
# update. Start sequence.
set -eu
continuous_test_wrapper={{ working_dir }}/continuous-test.sh
# Only start the test when the wrapper has been deployed; otherwise do
# nothing (best effort, no error).
if [ -e "${continuous_test_wrapper}" ]; then
    # "sanityfast" is forwarded by the wrapper to workload_launch.sh.
    export CT_SCRIPT_ARGS=sanityfast
    # Results go to control-plane-test-<pid>.log, pid to
    # control-plane-test.pid, all under the working directory (-o).
    ${continuous_test_wrapper} -o {{ working_dir}} -l control-plane-test -p control-plane-test.pid ./workload_launch.sh
fi

View File

@ -0,0 +1,70 @@
#!/bin/bash
#
# Script to stop a previously started control plane testing.
# Get the pid, kill it, wait for the end of the last run and then
# check the collected results against the failure thresholds.
set -eu
# Thresholds may be overridden on the command line ($1 consecutive,
# $2 total); they default to the role variables.
max_cons_fail=${1:-{{ tu_ctl_plane_max_successive_failure }}}
max_fail=${2:-{{ tu_ctl_plane_max_failure }}}
continuous_test_wrapper={{ working_dir }}/continuous-test.sh
pid_file={{ working_dir }}/control-plane-test.pid
if [ ! -e "${pid_file}" ]; then
    echo "No pid file: ${pid_file}"
    exit 1
fi
PID=$(cat "${pid_file}")
done_file={{ working_dir }}/control-plane-test-${PID}.done
result_file={{ working_dir }}/control-plane-test-${PID}.log
# Ask the daemon to stop: it finishes the run in progress and then
# creates the done file.
kill "${PID}"
max_tries=60 # 5min
current_try=0
until [ -e "${done_file}" ]; do
    if [ "$current_try" -le "$max_tries" ]; then
        sleep 5
        current_try=$((current_try+1))
    else
        echo "Waited too long for ${PID} to finish. Aborting."
        exit 1
    fi
done
# Verify that we didn't get any workload issue.
FAILURE=""
# Number of successive failures.
# NOTE(review): this awk depends on the exact one-line record format
# written by continuous-test.sh — confirm if that format changes.
successive_failure=$(
    awk 'BEGIN{fail=0; max=0}
         NR>1 && NF>1 && $(NF-1)==prev{fail++; if (fail > max){max = fail}}
         /FAIL/{prev=$(NF-1)}
         /SUCCESS/{fail=0}
         END{print max}' "${result_file}"
)
if [ "${successive_failure}" -gt "${max_cons_fail}" ]; then
    echo "Max number of consecutive control plane failure (${max_cons_fail}) reached."
    echo "Found ${successive_failure} consecutive failures during update."
    grep FAILED "${result_file}"
    FAILURE="true"
fi
# Total number of failures.
failures=$(grep -Fc FAILED "${result_file}" ||:) # prevents exit 1 when no match
if [ "${failures}" -gt "${max_fail}" ]; then
    echo "Max number of control plane failures (${max_fail}) reached."
    echo "Found ${failures} failures during update."
    grep FAILED "${result_file}"
    FAILURE="true"
fi
if [ -n "${FAILURE}" ]; then
    echo "Concatenated files in {{ working_dir }}/control-plane-testing-detailed.log"
    # Use the absolute path: the per-run logs live under the working
    # directory and this script is not guaranteed to run from there.
    tail -n +1 {{ working_dir }}/ct-${PID}/*.log > {{ working_dir }}/control-plane-testing-detailed.log
    exit 1
else
    echo "$(date) No (or not enough) failure(s) during control plane testing"
    echo "Successive failure: ${successive_failure}/${max_cons_fail}"
    echo "Total number of failures: ${failures}/${max_fail}"
fi

View File

@ -20,6 +20,15 @@ kill -9 $( lsof -t {{ working_dir }}/fip_http_check_start.sh ) || :
bash {{ working_dir }}/fip_http_check_start.sh &
{% endif %}
{% if control_plane_check|bool %}
if [[ -e {{ working_dir }}/control_plane_test_start.sh ]]; then
bash {{ working_dir }}/control_plane_test_start.sh
# Give some time for the test to start as it need to download some
# image.
sleep 10
fi
{% endif %}
source {{ undercloud_rc }}
set +o pipefail
@ -62,3 +71,9 @@ source {{ overcloud_rc }}
kill -9 $( lsof -t {{ working_dir }}/fip_http_check_start.sh )
bash {{ working_dir }}/fip_http_check_stop.sh
{% endif %}
{% if control_plane_check|bool %}
if [[ -e {{ working_dir }}/control_plane_test_stop.sh ]]; then
bash {{ working_dir }}/control_plane_test_stop.sh 0 0
fi
{% endif %}

View File

@ -11,6 +11,7 @@
# - cleanup: clean up VM and other elements created previously
#
set -o pipefail
set -x
IN_TEARDOWN=false
FAST=""
@ -419,7 +420,7 @@ if [[ "${MODE}" == "sanity" ]]; then
fi
if [[ "${MODE}" == "sanityfast" ]]; then
FAST=10
FAST=20
trap cleanup_on_exit EXIT
SUFFIX=$(openssl rand -hex 5)
prepare_env