Add control plane testing during update.

continuous-test.sh is a light shell daemon to start "workload_launch.sh
sanityfast" in a loop.

That daemon collects the log of each individual run in a "ct-<PID>"
directory and the result in control-plane-test-<PID>.log.  The pid
can be found in control-plane-test.pid.

In tripleo-upgrade we add the usual check integration at the relevant
part of the update. The new boolean option
"--update-controlplane-check" is added to the infrared interface.

By default this test is not activated. control_plane_check variable
controls that behavior outside of infrared.

We have two definitions of failure:
- how many successive failures we have;
- how many total failures we have.

This should avoid false positives while still catching sufficiently long outages.

We also activate debug output for the workload launch script to ease
error checking.

Change-Id: Ieff860667cf67a1c1f67221b05575e952a0636fa
(cherry picked from commit 57d9c8336b)
This commit is contained in:
Sofer Athlan-Guyot 2023-02-09 11:23:27 +01:00
parent a370edabc5
commit 701e880fc1
14 changed files with 379 additions and 1 deletions

View File

@ -188,6 +188,13 @@ log_playbook: "{{ working_dir }}/collect_log.yaml"
log_playbook_script: "{{ working_dir }}/collect_log"
log_stages: true
# enable control plane testing.
control_plane_check: false
# Max number of consecutive workload launch failures
tu_ctl_plane_max_successive_failure: 2
# Max number of total failures (heuristic for an update run)
tu_ctl_plane_max_failure: 6
# enable web load test
fip_http_check: false

198
files/continuous-test.sh Normal file
View File

@ -0,0 +1,198 @@
#!/bin/bash
set -eu
## ---------------------------------------------------------------------
## NAME:
## continuous-test.sh - run a script in a loop and gather the results.
##
## SYNOPSIS
## continuous-test.sh [OPTION] [SCRIPT]
##
## DESCRIPTION
## Run SCRIPT and collect date, time and exit status.
##
## The SCRIPT will be continuously run until we get a SIGUSR1
## signal. When the signal is caught, we will wait for the last
## run to end and dump to stdout the result of all commands.
##
## The output of each command will be saved into "ct-<pid>/" under
## the current directory.
##
## A /var/run/continuous-test.pid will register the pid of the
## running process.
##
## OPTIONS
## -d Enable debug mode.
## -l <PREFIX> Prefix used for:
## - Logfile: Default to ./continuous-test-<PID>.log
## - Done file: Default to ./continuous-test-<PID>.done
##
## The logfile will hold the result of each command run and the
## done file indicate that the last run is finished when we want
## to end the continuous test.
##
## Both those files will have the <PID> added to the prefix so that
## multiple command can be run in parallel if needed.
##
## The pid can be found in the PIDFILE.
##
## -p <PIDFILE> save the PID to that file.
## Default to ./continuous-test.pid
##
## -o <DIR> Directory where to save all those files. Default to
## the directory where continuous-test.sh is.
##
## FILES
##
## ./continuous-test.pid will hold the pid of the process
## ./continuous-test-<pid>.log holds the result of each run
## ./ct-<pid>/<files> will hold the output of each command.
##
## ENVIRONMENT
## CT_SCRIPT_ARGS A string holding any argument that should
## be passed to SCRIPT.
##
## AUTHOR
## Athlan-Guyot Sofer <sathlang@redhat.com>
## ---------------------------------------------------------------------
# Name of this script: re-executed by the parent (daemonize) and by the
# child (one run per iteration).  Quote "$0" so paths containing spaces
# do not break basename.
FILE=$(basename "$0")
# Role flags: the script re-invokes itself with these toggled to select
# the parent / child / single-run code paths below.
CT_PARENT=${CT_PARENT:-true}
CT_CHILD=${CT_CHILD:-false}
# Set to true by the signal handler to end the child's main loop.
CT_STOP=false
## ---------------------------------------------------------------------
## Function definitions.
# Signal handler for the child: record the stop request so the main
# loop exits after the run in progress completes.
process_sig() {
  printf '%s: received term signal\n' "$$" >&2
  CT_STOP=true
}
# Signal handler for the parent (debug mode): relay the stop request to
# the detached child as SIGUSR1 so it can finish its current run.
process_sigterm_parent() {
  echo "$$: Parent received term signal" >&2
  if [ -z "${CT_PID}" ]; then
    # Should not happen: no child pid recorded, signal the whole group.
    echo "$$: received term signal: killing group" >&2
    kill -s USR1 0
  else
    echo "$$: received term signal: killing $CT_PID" >&2
    kill -s USR1 "$CT_PID"
  fi
}
# Daemonize the process: the parent parses the options, exports the
# configuration through the environment, re-executes this script
# detached (setsid) as the child, then exits.
if "${CT_PARENT}"; then
  export DEBUG=false
  while getopts :p:l:o:d OPT; do
    case $OPT in
      l|+l)
        CT_PREFIX="$OPTARG"
        ;;
      p|+p)
        CT_PIDFILE="$OPTARG"
        ;;
      o|+o)
        CT_DIR="$OPTARG"
        ;;
      d|+d)
        DEBUG=true
        ;;
      *)
        echo "usage: ${0##*/} [-l LOGFILE] [-p PIDFILE] [-d] SCRIPT"
        exit 2
    esac
  done
  shift $(( OPTIND - 1 ))
  OPTIND=1
  # Default the output directory to this script's own directory.  Use
  # ${CT_DIR:-}: a bare ${CT_DIR} aborts under 'set -u' when -o was not
  # given and CT_DIR is not in the environment.
  if [ -z "${CT_DIR:-}" ]; then
    CT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
  fi
  export CT_DIR
  # Debug keeps diagnostics on the current terminal; otherwise drop them.
  if $DEBUG; then
    export CT_TTY=$(tty)
  else
    export CT_TTY=/dev/null
  fi
  exec 2>"$CT_TTY"
  echo "entering parent $$ $FILE" >&2
  export CT_SCRIPT_ARGS=${CT_SCRIPT_ARGS:-""}
  export CT_SCRIPT="${@:?'SCRIPT cannot be empty.'}"
  export CT_PREFIX="${CT_PREFIX:-}"
  export CT_PIDFILE="${CT_PIDFILE:-}"
  export CT_CHILD=true
  export CT_PARENT=false
  # Start the child in its own session so it survives the caller's exit.
  setsid "${CT_DIR}/${FILE}" "$@" </dev/null >"$CT_TTY" 2>"$CT_TTY" &
  CT_PID=$!
  if $DEBUG ; then
    # In debug mode stay attached and relay SIGTERM/SIGINT as SIGUSR1.
    trap process_sigterm_parent SIGTERM SIGINT
    wait $CT_PID
    echo "leaving parent $$ after waiting for $CT_PID/$FILE" >&2
  else
    echo "leaving parent $$ $FILE" >&2
  fi
  sync
  exit 0
fi
if "${CT_CHILD}"; then
if [ -n "${CT_TTY}" ]; then
exec 2> ${CT_TTY}
exec 1> ${CT_TTY}
else
CT_TTY=/dev/null
fi
echo "entering child $$ running $FILE" >&2
if [ -z "${CT_PREFIX}" ]; then
CT_LOGFILE="${CT_DIR}/continuous-test-$$.log"
else
CT_LOGFILE="${CT_DIR}/${CT_PREFIX}-$$.log"
fi
if [ -z "${CT_PIDFILE}" ]; then
CT_PIDFILE="${CT_DIR}/continuous-test.pid"
fi
export CT_LOGFILE
export CT_PIDFILE
export CT_CMD_OUT_DIR="${CT_DIR}/ct-$$"
trap process_sig SIGTERM SIGUSR1
export CT_CHILD=false
export CT_PARENT=false
echo $$ > "${CT_PIDFILE}"
# Main loop where eventually run the script.
while ! $CT_STOP; do
setsid ${CT_DIR}/$FILE "$@" </dev/null 2>$CT_TTY
done
echo "Leaving child $$ running $FILE" >&2
if [ -z "${CT_PREFIX}" ]; then
CT_ENDFILE="${CT_DIR}/continuous-test-$$.done"
else
CT_ENDFILE="${CT_DIR}/${CT_PREFIX}-$$.done"
fi
date > $CT_ENDFILE
sync
exit 0
fi
# Single-run mode (both role flags false): execute CT_SCRIPT once and
# append a one-line record (date, duration, status) to the log file.
exec >>"$CT_LOGFILE"
mkdir -p "${CT_CMD_OUT_DIR}"
echo "entering loop $$ $CT_SCRIPT" >&2
# We cannot have two jobs in the same second, or else we would
# overwrite the per-run output file. sleep 1 prevents this.
sleep 1
start_time="$(date +%s)"
start_time_h="$(date -d@"${start_time}")"
echo -n "${start_time_h} (${start_time}) "
# Capture the script's exit code without aborting under 'set -e'.
set +e
# CT_SCRIPT_ARGS is intentionally unquoted: it may carry several
# whitespace-separated arguments.
"${CT_SCRIPT}" ${CT_SCRIPT_ARGS} &>> "${CT_CMD_OUT_DIR}/${start_time}.log"
RC="${?}"
set -e
end_time="$(date +%s)"
duration=$((end_time - start_time))
echo -n "${duration}s "
if [ $RC -eq 0 ]; then
  echo "SUCCESS (0)"
else
  echo "FAILED (${RC})"
fi
echo "leaving loop $$" >&2

View File

@ -110,6 +110,11 @@
updates_workarounds: true
when: install.updates.workarounds
# Translate the infrared --update-controlplane-check flag into the
# tripleo-upgrade control_plane_check variable consumed by the role.
- name: Set update control plane testing
  set_fact:
    control_plane_check: true
  when: install.get('update', {}).get('controlplane', {}).get('check', {})
- name: Set upgrade floating ip check
set_fact:
l3_agent_connectivity_check: true

View File

@ -223,6 +223,11 @@ subparsers:
the validations group execution.
Example: validations-extra-args: "--extra-vars min_undercloud_ram_gb=5"
default: ''
update-controlplane-check:
type: Bool
help: |
Check control plane during update.
default: false
- title: TripleO Options
options:

View File

@ -0,0 +1,5 @@
---
# Kick off the continuous control plane test via the generated start
# script in the working directory.
- name: Start control plane testing.
  shell: "{{ working_dir }}/control_plane_test_start.sh"
  when: control_plane_check|bool

View File

@ -0,0 +1,6 @@
---
# Stop the control plane test and evaluate its results.  Callers may
# override the failure budgets with max_cons_err / max_err; otherwise
# the role defaults apply.  Use the canonical default() filter: the
# original "var is defined|ternary(var, default)" form relies on
# fragile is/| precedence and evaluates the possibly-undefined
# variable eagerly.
- name: Stop control plane testing.
  shell: |
    {{ working_dir }}/control_plane_test_stop.sh \
        {{ max_cons_err | default(tu_ctl_plane_max_successive_failure) }} {{ max_err | default(tu_ctl_plane_max_failure) }}
  when: control_plane_check|bool

View File

@ -0,0 +1,20 @@
---
# Install the control plane testing tooling into the working
# directory: the generic continuous-test wrapper (static file) plus the
# start/stop helper scripts rendered from templates.  All three are
# made executable (0775).
- block:
    - name: Create control plane wrapper
      copy:
        src: "continuous-test.sh"
        dest: "{{ working_dir }}/continuous-test.sh"
        mode: 0775
    - name: Create control plane start script.
      template:
        src: "control_plane_test_start.sh.j2"
        dest: "{{ working_dir }}/control_plane_test_start.sh"
        mode: 0775
    - name: Create control plane stop script.
      template:
        src: "control_plane_test_stop.sh.j2"
        dest: "{{ working_dir }}/control_plane_test_stop.sh"
        mode: 0775
  when: control_plane_check|bool

View File

@ -47,6 +47,9 @@
- name: import tasks from l3_agent_connectivity_check_start_script
import_tasks: ../common/l3_agent_connectivity_check_start_script.yml
- name: Start control plane testing
import_tasks: ../common/control_plane_test_start.yaml
- name: Start Ceph update using cephadm
command:
cmd: >
@ -104,3 +107,8 @@
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"
- name: Stop control plane testing
import_tasks: ../common/control_plane_test_stop.yaml
vars:
max_cons_err: 0
max_err: 0

View File

@ -124,6 +124,9 @@
- name: Create L3-agent failover scripts
import_tasks: ../common/create_l3_agent_failover_check_script.yml
- name: include control plane test tasks
import_tasks: ../common/create_control_plane_test_scripts.yaml
- name: create nova actions check script
import_tasks: ../common/create_nova_actions_check_script.yml

View File

@ -10,6 +10,26 @@
- not overcloud_batch_update|bool
- log_stages|bool
# Batch update: the whole overcloud updates at once, start the test
# with the default failure budgets.
- name: Start control plane testing
  import_tasks: ../common/control_plane_test_start.yaml
  when:
    - overcloud_batch_update|bool
# NOTE(review): max_cons_err/max_err are consumed by the *stop* script;
# passing them as vars to the start tasks appears to have no effect —
# confirm intent (they may have been meant for the matching stop task).
- name: Start control plane testing (serial)
  import_tasks: ../common/control_plane_test_start.yaml
  vars:
    max_cons_err: 0
    max_err: 0
  when:
    - not overcloud_batch_update|bool
    - oc_current_role[0] != "Controller"
# Serial update of the Controller role itself: keep default budgets.
- name: Start control plane testing (serial - controller)
  import_tasks: ../common/control_plane_test_start.yaml
  when:
    - not overcloud_batch_update|bool
    - oc_current_role[0] == "Controller"
- name: run overcloud minor update in each of the roles/hostgroups
async: 25200
poll: 0
@ -40,3 +60,6 @@
import_tasks: ../common/l3_agent_connectivity_check_stop_script.yml
vars:
current_stage_error: "{{ update_loss_threshold }}"
- name: Stop control plane testing
import_tasks: ../common/control_plane_test_stop.yaml

View File

@ -0,0 +1,12 @@
#!/bin/bash
#
# Script to test control plane by creating a vm in a loop during the
# update. Start sequence.
set -eu
continuous_test_wrapper={{ working_dir }}/continuous-test.sh
# Only start the test when the wrapper has been deployed; otherwise do
# nothing (best effort, no error).
if [ -e "${continuous_test_wrapper}" ]; then
    # "sanityfast" is forwarded by the wrapper to workload_launch.sh.
    export CT_SCRIPT_ARGS=sanityfast
    # Results go to control-plane-test-<pid>.log, pid to
    # control-plane-test.pid, all under the working directory (-o).
    ${continuous_test_wrapper} -o {{ working_dir}} -l control-plane-test -p control-plane-test.pid ./workload_launch.sh
fi

View File

@ -0,0 +1,70 @@
#!/bin/bash
#
# Script to stop a previously started control plane testing.
# Get the pid, kill it, wait for the end of the last run and then
# check the collected results against the failure thresholds.
set -eu
# Thresholds may be overridden on the command line ($1 consecutive,
# $2 total); they default to the role variables.
max_cons_fail=${1:-{{ tu_ctl_plane_max_successive_failure }}}
max_fail=${2:-{{ tu_ctl_plane_max_failure }}}
continuous_test_wrapper={{ working_dir }}/continuous-test.sh
pid_file={{ working_dir }}/control-plane-test.pid
if [ ! -e "${pid_file}" ]; then
    echo "No pid file: ${pid_file}"
    exit 1
fi
PID=$(cat "${pid_file}")
done_file={{ working_dir }}/control-plane-test-${PID}.done
result_file={{ working_dir }}/control-plane-test-${PID}.log
# Ask the daemon to stop: it finishes the run in progress and then
# creates the done file.
kill "${PID}"
max_tries=60 # 5min
current_try=0
until [ -e "${done_file}" ]; do
    if [ "$current_try" -le "$max_tries" ]; then
        sleep 5
        current_try=$((current_try+1))
    else
        echo "Waited too long for ${PID} to finish. Aborting."
        exit 1
    fi
done
# Verify that we didn't get any workload issue.
FAILURE=""
# Number of successive failures.
# NOTE(review): this awk depends on the exact one-line record format
# written by continuous-test.sh — confirm if that format changes.
successive_failure=$(
    awk 'BEGIN{fail=0; max=0}
         NR>1 && NF>1 && $(NF-1)==prev{fail++; if (fail > max){max = fail}}
         /FAIL/{prev=$(NF-1)}
         /SUCCESS/{fail=0}
         END{print max}' "${result_file}"
)
if [ "${successive_failure}" -gt "${max_cons_fail}" ]; then
    echo "Max number of consecutive control plane failure (${max_cons_fail}) reached."
    echo "Found ${successive_failure} consecutive failures during update."
    grep FAILED "${result_file}"
    FAILURE="true"
fi
# Total number of failures.
failures=$(grep -Fc FAILED "${result_file}" ||:) # prevents exit 1 when no match
if [ "${failures}" -gt "${max_fail}" ]; then
    echo "Max number of control plane failures (${max_fail}) reached."
    echo "Found ${failures} failures during update."
    grep FAILED "${result_file}"
    FAILURE="true"
fi
if [ -n "${FAILURE}" ]; then
    echo "Concatenated files in {{ working_dir }}/control-plane-testing-detailed.log"
    # Use the absolute path: the per-run logs live under the working
    # directory and this script is not guaranteed to run from there.
    tail -n +1 {{ working_dir }}/ct-${PID}/*.log > {{ working_dir }}/control-plane-testing-detailed.log
    exit 1
else
    echo "$(date) No (or not enough) failure(s) during control plane testing"
    echo "Successive failure: ${successive_failure}/${max_cons_fail}"
    echo "Total number of failures: ${failures}/${max_fail}"
fi

View File

@ -20,6 +20,15 @@ kill -9 $( lsof -t {{ working_dir }}/fip_http_check_start.sh ) || :
bash {{ working_dir }}/fip_http_check_start.sh &
{% endif %}
{% if control_plane_check|bool %}
if [[ -e {{ working_dir }}/control_plane_test_start.sh ]]; then
bash {{ working_dir }}/control_plane_test_start.sh
# Give some time for the test to start as it need to download some
# image.
sleep 10
fi
{% endif %}
source {{ undercloud_rc }}
set +o pipefail
@ -62,3 +71,9 @@ source {{ overcloud_rc }}
kill -9 $( lsof -t {{ working_dir }}/fip_http_check_start.sh )
bash {{ working_dir }}/fip_http_check_stop.sh
{% endif %}
{% if control_plane_check|bool %}
if [[ -e {{ working_dir }}/control_plane_test_stop.sh ]]; then
bash {{ working_dir }}/control_plane_test_stop.sh 0 0
fi
{% endif %}

View File

@ -11,6 +11,7 @@
# - cleanup: clean up VM and other elements created previously
#
set -o pipefail
set -x
IN_TEARDOWN=false
FAST=""
@ -419,7 +420,7 @@ if [[ "${MODE}" == "sanity" ]]; then
fi
if [[ "${MODE}" == "sanityfast" ]]; then
FAST=10
FAST=20
trap cleanup_on_exit EXIT
SUFFIX=$(openssl rand -hex 5)
prepare_env