Add retry/rerun support for exec module
Add support for retries and reruns at specified intervals for divingbell-exec scripts. Also add support for script timeouts, and update osh-infra-upgrade-host to allow the gate to run. Change-Id: I5f4cd43b13a467d94f67b358f3190f515256ae66
This commit is contained in:
parent
012800d854
commit
4ed467e512
@ -16,6 +16,8 @@
|
||||
# limitations under the License.
|
||||
*/}}
|
||||
|
||||
{{- $exec_loop_sleep_interval := 60 }}
|
||||
|
||||
set -e
|
||||
|
||||
cat <<'UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381' > {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
|
||||
@ -74,19 +76,28 @@ cd "${exec_path}"
|
||||
{{- $_ := set $.Values "__blocking_policy" $keypath.blocking_policy }}
|
||||
{{- end }}
|
||||
|
||||
{{- $_ := set $.Values "__timeout" 3600 }}
|
||||
{{- $_ := set $.Values "__timeout" 1800 }}
|
||||
{{- if hasKey $keypath "timeout" }}
|
||||
{{- fail (print "NOT IMPLEMENTED: 'timeout' FOR '" $script "'") }}
|
||||
{{- if eq ($keypath.timeout | toString) "infinite" }}
|
||||
{{- fail (print "BAD 'timeout' FOR '" $script "': 'infinite' timeouts not supported.") }}
|
||||
{{- end }}
|
||||
{{- $_ := set $.Values "__timeout" $keypath.timeout }}
|
||||
{{- end }}
|
||||
|
||||
{{- $_ := set $.Values "__rerun_interval" "infinite" }}
|
||||
{{- if hasKey $keypath "rerun_interval" }}
|
||||
{{- fail (print "NOT IMPLEMENTED: 'rerun_interval' FOR '" $script "'") }}
|
||||
{{- /*
Validate an explicitly configured 'rerun_interval'.

The sentinel meaning "no interval" is the string "infinite": it is the
default assigned to __rerun_interval and the value the generated shell
compares against. The previous check tested for "infinity", so an explicit
'rerun_interval: infinite' was piped through 'int' (yielding 0) and rejected
as being shorter than the exec loop sleep interval.

Any other value is treated as a number of seconds and must:
  - be >= $exec_loop_sleep_interval, since the daemon loop cannot re-evaluate
    scripts more often than it wakes up;
  - be paired with a 'rerun_policy' of 'always'.
*/}}
{{- if not (eq ($keypath.rerun_interval | toString) "infinite") }}
  {{- if lt ($keypath.rerun_interval | int) $exec_loop_sleep_interval }}
    {{- fail (print "BAD 'rerun_interval' FOR '" $script "': Got '" $keypath.rerun_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
  {{- end }}
  {{- if not (eq $.Values.__rerun_policy "always") }}
    {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' when defining a finite 'rerun_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'rerun_interval' of '" $keypath.rerun_interval "' for '" $script "'.") }}
  {{- end }}
{{- end }}
|
||||
{{- $_ := set $.Values "__rerun_interval" $keypath.rerun_interval }}
|
||||
{{- end }}
|
||||
|
||||
{{- $_ := set $.Values "__rerun_interval_persist" "false" }}
|
||||
{{- $_ := set $.Values "__rerun_interval_persist" "true" }}
|
||||
{{- if hasKey $keypath "rerun_interval_persist" }}
|
||||
{{- fail (print "NOT IMPLEMENTED: 'rerun_interval_persist' FOR '" $script "'") }}
|
||||
{{- $_ := set $.Values "__rerun_interval_persist" $keypath.rerun_interval_persist }}
|
||||
@ -98,13 +109,20 @@ cd "${exec_path}"
|
||||
{{- $_ := set $.Values "__rerun_max_count" $keypath.rerun_max_count }}
|
||||
{{- end }}
|
||||
|
||||
{{- $_ := set $.Values "__retry_interval" $.Values.__rerun_interval }}
|
||||
{{- $_ := set $.Values "__retry_interval" (print $.Values.__rerun_interval) }}
|
||||
{{- if hasKey $keypath "retry_interval" }}
|
||||
{{- fail (print "NOT IMPLEMENTED: 'retry_interval' FOR '" $script "'") }}
|
||||
{{- /*
Validate an explicitly configured 'retry_interval'.

The sentinel meaning "no interval" is the string "infinite": the default for
__retry_interval is copied from __rerun_interval (whose default is
"infinite"), and the generated shell compares against "infinite". The
previous check tested for "infinity", so an explicit
'retry_interval: infinite' was piped through 'int' (yielding 0) and rejected
as being shorter than the exec loop sleep interval.

Any other value is treated as a number of seconds and must:
  - be >= $exec_loop_sleep_interval, since the daemon loop cannot re-evaluate
    scripts more often than it wakes up;
  - be paired with a 'rerun_policy' of 'always' or 'once_successfully'.
*/}}
{{- if not (eq ($keypath.retry_interval | toString) "infinite") }}
  {{- if lt ($keypath.retry_interval | int) $exec_loop_sleep_interval }}
    {{- fail (print "BAD 'retry_interval' FOR '" $script "': Got '" $keypath.retry_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
  {{- end }}
  {{- if and (not (eq $.Values.__rerun_policy "always")) (not (eq $.Values.__rerun_policy "once_successfully")) }}
    {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' or 'once_successfully' when defining a finite 'retry_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'retry_interval' of '" $keypath.retry_interval "' for '" $script "'.") }}
  {{- end }}
{{- end }}
|
||||
{{- $_ := set $.Values "__retry_interval" $keypath.retry_interval }}
|
||||
{{- end }}
|
||||
|
||||
{{- $_ := set $.Values "__retry_interval_persist" "false" }}
|
||||
{{- $_ := set $.Values "__retry_interval_persist" "true" }}
|
||||
{{- if hasKey $keypath "retry_interval_persist" }}
|
||||
{{- fail (print "NOT IMPLEMENTED: 'retry_interval_persist' FOR '" $script "'") }}
|
||||
{{- $_ := set $.Values "__retry_interval_persist" $keypath.retry_interval_persist }}
|
||||
@ -115,15 +133,43 @@ cd "${exec_path}"
|
||||
{{- fail (print "NOT IMPLEMENTED: 'retry_max_count' FOR '" $script "'") }}
|
||||
{{- $_ := set $.Values "__retry_max_count" $keypath.retry_max_count }}
|
||||
{{- end }}
|
||||
|
||||
cat <<'UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526' > {{ $script }}
|
||||
{{ $keypath.data }}
|
||||
UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
|
||||
chmod 700 {{ $script }}
|
||||
# check rerun policy
|
||||
hash_check=fail
|
||||
if [[ {{ $.Values.__rerun_policy }} = always ]] || \
|
||||
[[ ! -f ${hash}/exit_code ]] || \
|
||||
([[ {{ $.Values.__rerun_policy }} = once_successfully ]] && \
|
||||
[[ -f ${hash}/exit_code ]] && \
|
||||
[[ $(cat ${hash}/exit_code) != 0 ]]); then
|
||||
[[ $(cat ${hash}/exit_code) != 0 ]]); then
|
||||
hash_check=pass
|
||||
fi
|
||||
# check rerun/retry interval
|
||||
interval_check=fail
|
||||
if [[ ! -f ${hash}/last_run_timestamp ]] || [[ ! -f ${hash}/exit_code ]]; then
|
||||
interval_check=pass
|
||||
elif [[ $(cat ${hash}/exit_code) = 0 ]]; then
|
||||
if [[ {{ $.Values.__rerun_interval }} = infinite ]]; then
|
||||
interval_check=pass
|
||||
elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__rerun_interval }})) ]]; then
|
||||
interval_check=pass
|
||||
fi
|
||||
elif [[ $(cat ${hash}/exit_code) != 0 ]]; then
|
||||
if [[ {{ $.Values.__retry_interval }} = infinite ]]; then
|
||||
interval_check=pass
|
||||
elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__retry_interval }})) ]]; then
|
||||
interval_check=pass
|
||||
fi
|
||||
fi
|
||||
if [[ $hash_check = pass ]] && [[ $interval_check = pass ]]; then
|
||||
if [[ -f ${hash}/exit_code ]]; then
|
||||
# remove previous run record, in case this run is interrupted
|
||||
rm ${hash}/exit_code
|
||||
fi
|
||||
# write timestamp at beginning of execution
|
||||
echo $(date +"%s") > "${hash}/last_run_timestamp"
|
||||
{{- if hasKey $keypath "env" }}
|
||||
{{- range $env_key, $env_val := $keypath.env }}
|
||||
{{ $env_key }}={{ $env_val | squote }} \
|
||||
@ -135,7 +181,26 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
|
||||
{{ $arg | squote }} \
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
&& echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
|
||||
&
|
||||
pid=$!
|
||||
time_waited=0
|
||||
sleep_interval=5
|
||||
timeout={{ $.Values.__timeout }}
|
||||
while true; do
|
||||
if [[ $time_waited -ge $timeout ]]; then
|
||||
log.ERROR "Hit '$timeout' second timeout waiting for '{{ $script }}' - terminating."
|
||||
# ask nicely first
|
||||
kill $pid
|
||||
sleep 10
|
||||
# force kill if still running
|
||||
ps $pid > /dev/null && kill -9 $pid
|
||||
break
|
||||
fi
|
||||
ps $pid > /dev/null || break
|
||||
sleep $sleep_interval
|
||||
time_waited=$(($time_waited + $sleep_interval))
|
||||
done
|
||||
wait $pid && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
|
||||
{{- if hasKey $keypath "blocking_policy" }}
|
||||
{{- if eq $keypath.blocking_policy "foreground_halt_pod_on_failure" }}
|
||||
if [[ $(cat "${hash}/exit_code") != '0' ]]; then
|
||||
@ -144,20 +209,16 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
fi
|
||||
{{ end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
exit 0
|
||||
UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381
|
||||
|
||||
chmod 700 {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
|
||||
chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
|
||||
|
||||
sleep 1
|
||||
echo 'INFO Putting the daemon to sleep.'
|
||||
|
||||
while [ 1 ]; do
|
||||
sleep 300
|
||||
while true; do
|
||||
chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
|
||||
sleep 2
|
||||
echo 'INFO Putting the daemon to sleep.'
|
||||
sleep {{ $exec_loop_sleep_interval }}
|
||||
done
|
||||
|
||||
exit 0
|
||||
|
@ -209,6 +209,17 @@ The following set of options are fully implemented::
|
||||
If any of that info changes, so will the hash, and it will be seen as a new
|
||||
object which will be executed regardless of this setting.
|
||||
|
||||
``timeout`` may optionally be set to the number of seconds to wait for
|
||||
script completion before termination. Default value is ``1800`` (30 min).
|
||||
|
||||
``rerun_interval`` may be optionally set to the number of seconds to wait
|
||||
between rerunning a given script which ran successfully the previous time.
|
||||
Default value is ``infinite``.
|
||||
|
||||
``retry_interval`` may be optionally set to the number of seconds to wait
|
||||
between rerunning a given script which did not run successfully the previous
|
||||
time. Default behavior is to match the ``rerun_interval``.
|
||||
|
||||
The following set of options are partially implemented::
|
||||
|
||||
``blocking_policy`` may optionally be set to ``background``, ``foreground``,
|
||||
@ -223,30 +234,17 @@ The following set of options are partially implemented::
|
||||
|
||||
The following set of options are not yet implemented::
|
||||
|
||||
``script_timeout`` may optionally be set to the number of seconds to wait for
|
||||
script completion before termination. Default value is ``3600`` (1 hour).
|
||||
|
||||
``rerun_interval`` may be optionally set to the number of seconds to wait
|
||||
between rerunning a given script which ran successfully the previous time.
|
||||
Default value is ``infinite``.
|
||||
|
||||
``rerun_interval_persist`` may be optionally set to ``true`` for
|
||||
a given script. This allows a script to persist its rerun interval through a
|
||||
pod/node restart. Otherwise, the time since last successful script execution
|
||||
will not be considered on pod/node startup. Default value is ``false``.
|
||||
``rerun_interval_persist`` may be optionally set to ``false`` for a given
|
||||
script. This makes the script execute on pod/node startup regardless of the
|
||||
interval since the last successful execution. Default value is ``true``.
|
||||
|
||||
``rerun_max_count`` may be optionally set to the maximum number of times a
|
||||
succeeding script should be retried. Successful exec count does not persist
|
||||
through pod/node restart. Default value is ``infinite``.
|
||||
|
||||
``retry_interval`` may be optionally set to the number of seconds to wait
|
||||
between rerunning a given script which did not run successfully the previous
|
||||
time. Default value is set to the ``rerun_interval``.
|
||||
|
||||
``retry_interval_persist`` may be optionally set to ``true`` for
|
||||
a given script. This allows a script to persist its retry interval through a
|
||||
pod/node restart. Otherwise, the time since last failed script execution
|
||||
will not be considered on pod/node startup. Default value is ``false``.
|
||||
``retry_interval_persist`` may be optionally set to ``false`` for a given
|
||||
script. This makes the script execute on pod/node startup, regardless of the
|
||||
time since the last execution. Default value is ``true``.
|
||||
|
||||
``retry_max_count`` may be optionally set to the maximum number of times a
|
||||
failing script should be retried. Failed exec count does not persist
|
||||
|
@ -39,3 +39,15 @@
|
||||
- upgrade-host
|
||||
- start-zuul-console
|
||||
- disable-local-nameserver
|
||||
|
||||
- hosts: all
|
||||
vars_files:
|
||||
- vars.yaml
|
||||
vars:
|
||||
work_dir: "{{ zuul.project.src_dir }}/{{ zuul_osh_infra_relative_path | default('') }}"
|
||||
gather_facts: False
|
||||
become: yes
|
||||
roles:
|
||||
- deploy-apparmor
|
||||
tags:
|
||||
- deploy-apparmor
|
||||
|
@ -1178,6 +1178,123 @@ manifests:
|
||||
echo "[SUCCESS] exec test$(($i + 5)) passed successfully" >> "${TEST_RESULTS}"
|
||||
done
|
||||
|
||||
# test timeout
|
||||
local overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set17.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
011-timeout.sh:
|
||||
timeout: 11
|
||||
data: |
|
||||
#!/bin/bash
|
||||
sleep 60' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}"
|
||||
get_container_status exec
|
||||
_test_clog_msg 'timeout waiting for'
|
||||
echo '[SUCCESS] exec test17 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# Test invalid timeout
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set18.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
011-timeout.sh:
|
||||
timeout: infinite
|
||||
data: |
|
||||
#!/bin/bash
|
||||
sleep 60' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .timeout. FOR' || \
|
||||
(echo "[FAIL] exec test18 did not receive expected 'BAD .timeout. FOR' error" && exit 1)
|
||||
echo '[SUCCESS] exec test18 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# Test invalid rerun_interval (too short)
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set19.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-rerun-interval.sh:
|
||||
rerun_interval: 30
|
||||
data: |
|
||||
#!/bin/bash
|
||||
true' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .rerun_interval. FOR' || \
|
||||
(echo "[FAIL] exec test19 did not receive expected 'BAD .rerun_interval. FOR' error" && exit 1)
|
||||
echo '[SUCCESS] exec test19 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# Test invalid retry_interval (too short)
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set20.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-retry-interval.sh:
|
||||
retry_interval: 30
|
||||
data: |
|
||||
#!/bin/bash
|
||||
true' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .retry_interval. FOR' || \
|
||||
(echo "[FAIL] exec test20 did not receive expected 'BAD .retry_interval. FOR' error" && exit 1)
|
||||
echo '[SUCCESS] exec test20 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# Test invalid rerun_interval combination
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set21.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-rerun-interval.sh:
|
||||
rerun_interval: 60
|
||||
rerun_policy: once_successfully
|
||||
data: |
|
||||
#!/bin/bash
|
||||
true' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
|
||||
(echo "[FAIL] exec test21 did not receive expected 'BAD COMBINATION' error" && exit 1)
|
||||
echo '[SUCCESS] exec test21 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# Test invalid retry_interval combination
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set22.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-retry-interval.sh:
|
||||
retry_interval: 60
|
||||
rerun_policy: never
|
||||
data: |
|
||||
#!/bin/bash
|
||||
true' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
|
||||
(echo "[FAIL] exec test22 did not receive expected 'BAD COMBINATION' error" && exit 1)
|
||||
echo '[SUCCESS] exec test22 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# test rerun_interval
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set23.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-rerun-interval.sh:
|
||||
rerun_interval: 60
|
||||
data: |
|
||||
#!/bin/bash
|
||||
echo script name: ${BASH_SOURCE} >> exec_testfile' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}"
|
||||
get_container_status exec
|
||||
sleep 72
|
||||
get_container_status exec
|
||||
expected_result='script name: ./012-rerun-interval.sh
|
||||
script name: ./012-rerun-interval.sh'
|
||||
_test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test23"
|
||||
echo '[SUCCESS] exec test23 passed successfully' >> "${TEST_RESULTS}"
|
||||
|
||||
# test retry_interval
|
||||
overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set24.yaml
|
||||
echo 'conf:
|
||||
exec:
|
||||
012-retry-interval.sh:
|
||||
retry_interval: 60
|
||||
data: |
|
||||
#!/bin/bash
|
||||
echo script name: ${BASH_SOURCE} >> exec_testfile
|
||||
false' > "${overrides_yaml}"
|
||||
install_base "--values=${overrides_yaml}"
|
||||
get_container_status exec
|
||||
sleep 72
|
||||
get_container_status exec
|
||||
expected_result='script name: ./012-retry-interval.sh
|
||||
script name: ./012-retry-interval.sh'
|
||||
_test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test24"
|
||||
echo '[SUCCESS] exec test24 passed successfully' >> "${TEST_RESULTS}"
|
||||
}
|
||||
|
||||
# test daemonset value overrides for hosts and labels
|
||||
|
Loading…
Reference in New Issue
Block a user