Browse Source

Add retry/rerun support for exec module

Add support for retries and reruns at specified intervals for
divingbell-exec scripts. Also adds support for timeouts.

Also update osh-infra-upgrade-host to allow gate to run.

Change-Id: I5f4cd43b13a467d94f67b358f3190f515256ae66
Craig Anderson 4 months ago
parent
commit
4ed467e512

+ 81
- 20
divingbell/templates/bin/_exec.sh.tpl View File

@@ -16,6 +16,8 @@
16 16
 # limitations under the License.
17 17
 */}}
18 18
 
19
+{{- $exec_loop_sleep_interval := 60 }}
20
+
19 21
 set -e
20 22
 
21 23
 cat <<'UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381' > {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
@@ -74,19 +76,28 @@ cd "${exec_path}"
74 76
       {{- $_ := set $.Values "__blocking_policy" $keypath.blocking_policy }}
75 77
     {{- end }}
76 78
 
77
-    {{- $_ := set $.Values "__timeout" 3600 }}
79
+    {{- $_ := set $.Values "__timeout" 1800 }}
78 80
     {{- if hasKey $keypath "timeout" }}
79
-      {{- fail (print "NOT IMPLEMENTED: 'timeout' FOR '" $script "'") }}
81
+      {{- if eq ($keypath.timeout | toString) "infinite" }}
82
+        {{- fail (print "BAD 'timeout' FOR '" $script "': 'infinite' timeouts not supported.") }}
83
+      {{- end }}
80 84
       {{- $_ := set $.Values "__timeout" $keypath.timeout }}
81 85
     {{- end }}
82 86
 
83 87
     {{- $_ := set $.Values "__rerun_interval" "infinite" }}
84 88
     {{- if hasKey $keypath "rerun_interval" }}
85
-      {{- fail (print "NOT IMPLEMENTED: 'rerun_interval' FOR '" $script "'") }}
89
+      {{- if not (eq ($keypath.rerun_interval | toString) "infinity") }}
90
+        {{- if lt ($keypath.rerun_interval | int) $exec_loop_sleep_interval }}
91
+          {{- fail (print "BAD 'rerun_interval' FOR '" $script "': Got '" $keypath.rerun_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
92
+        {{- end }}
93
+        {{- if not (eq $.Values.__rerun_policy "always") }}
94
+          {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' when defining a finite 'rerun_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'rerun_interval' of '" $keypath.rerun_interval "' for '" $script "'.") }}
95
+        {{- end }}
96
+      {{- end }}
86 97
       {{- $_ := set $.Values "__rerun_interval" $keypath.rerun_interval }}
87 98
     {{- end }}
88 99
 
89
-    {{- $_ := set $.Values "__rerun_interval_persist" "false" }}
100
+    {{- $_ := set $.Values "__rerun_interval_persist" "true" }}
90 101
     {{- if hasKey $keypath "rerun_interval_persist" }}
91 102
       {{- fail (print "NOT IMPLEMENTED: 'rerun_interval_persist' FOR '" $script "'") }}
92 103
       {{- $_ := set $.Values "__rerun_interval_persist" $keypath.rerun_interval_persist }}
@@ -98,13 +109,20 @@ cd "${exec_path}"
98 109
       {{- $_ := set $.Values "__rerun_max_count" $keypath.rerun_max_count }}
99 110
     {{- end }}
100 111
 
101
-    {{- $_ := set $.Values "__retry_interval" $.Values.__rerun_interval }}
112
+    {{- $_ := set $.Values "__retry_interval" (print $.Values.__rerun_interval) }}
102 113
     {{- if hasKey $keypath "retry_interval" }}
103
-      {{- fail (print "NOT IMPLEMENTED: 'retry_interval' FOR '" $script "'") }}
114
+      {{- if not (eq ($keypath.retry_interval | toString) "infinity") }}
115
+        {{- if lt ($keypath.retry_interval | int) $exec_loop_sleep_interval }}
116
+          {{- fail (print "BAD 'retry_interval' FOR '" $script "': Got '" $keypath.retry_interval "', but expected >= '" $exec_loop_sleep_interval "'.") }}
117
+        {{- end }}
118
+        {{- if and (not (eq $.Values.__rerun_policy "always")) (not (eq $.Values.__rerun_policy "once_successfully")) }}
119
+          {{- fail (print "BAD COMBINATION: Must use 'rerun_policy' of 'always' or 'once_successfully' when defining a finite 'retry_interval'. Got 'rerun_policy' of '" $.Values.__rerun_policy "' and 'retry_interval' of '" $keypath.retry_interval "' for '" $script "'.") }}
120
+        {{- end }}
121
+      {{- end }}
104 122
       {{- $_ := set $.Values "__retry_interval" $keypath.retry_interval }}
105 123
     {{- end }}
106 124
 
107
-    {{- $_ := set $.Values "__retry_interval_persist" "false" }}
125
+    {{- $_ := set $.Values "__retry_interval_persist" "true" }}
108 126
     {{- if hasKey $keypath "retry_interval_persist" }}
109 127
       {{- fail (print "NOT IMPLEMENTED: 'retry_interval_persist' FOR '" $script "'") }}
110 128
       {{- $_ := set $.Values "__retry_interval_persist" $keypath.retry_interval_persist }}
@@ -115,15 +133,43 @@ cd "${exec_path}"
115 133
       {{- fail (print "NOT IMPLEMENTED: 'retry_max_count' FOR '" $script "'") }}
116 134
       {{- $_ := set $.Values "__retry_max_count" $keypath.retry_max_count }}
117 135
     {{- end }}
136
+
118 137
     cat <<'UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526' > {{ $script }}
119 138
 {{ $keypath.data }}
120 139
 UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
121 140
     chmod 700 {{ $script }}
141
+    # check rerun policy
142
+    hash_check=fail
122 143
     if  [[ {{ $.Values.__rerun_policy }} = always ]] || \
123 144
         [[ ! -f ${hash}/exit_code ]] || \
124 145
        ([[ {{ $.Values.__rerun_policy }} = once_successfully ]] && \
125
-          [[ -f ${hash}/exit_code ]] && \
126
-          [[ $(cat ${hash}/exit_code) != 0 ]]); then
146
+        [[ $(cat ${hash}/exit_code) != 0 ]]); then
147
+      hash_check=pass
148
+    fi
149
+    # check rerun/retry interval
150
+    interval_check=fail
151
+    if  [[ ! -f ${hash}/last_run_timestamp ]] || [[ ! -f ${hash}/exit_code ]]; then
152
+      interval_check=pass
153
+    elif [[ $(cat ${hash}/exit_code) = 0 ]]; then
154
+      if [[ {{ $.Values.__rerun_interval }} = infinite ]]; then
155
+        interval_check=pass
156
+      elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__rerun_interval }})) ]]; then
157
+        interval_check=pass
158
+      fi
159
+    elif [[ $(cat ${hash}/exit_code) != 0 ]]; then
160
+      if [[ {{ $.Values.__retry_interval }} = infinite ]]; then
161
+        interval_check=pass
162
+      elif [[ $(date +"%s") -ge $(($(cat ${hash}/last_run_timestamp) + {{ $.Values.__retry_interval }})) ]]; then
163
+        interval_check=pass
164
+      fi
165
+    fi
166
+    if [[ $hash_check = pass ]] && [[ $interval_check = pass ]]; then
167
+      if [[ -f ${hash}/exit_code ]]; then
168
+        # remove previous run record, in case this run is interrupted
169
+        rm ${hash}/exit_code
170
+      fi
171
+      # write timestamp at beginning of execution
172
+      echo $(date +"%s") > "${hash}/last_run_timestamp"
127 173
       {{- if hasKey $keypath "env" }}
128 174
         {{- range $env_key, $env_val := $keypath.env }}
129 175
           {{ $env_key }}={{ $env_val | squote }} \
@@ -135,7 +181,26 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
135 181
           {{ $arg | squote }} \
136 182
         {{- end }}
137 183
       {{- end }}
138
-      && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
184
+      &
185
+      pid=$!
186
+      time_waited=0
187
+      sleep_interval=5
188
+      timeout={{ $.Values.__timeout }}
189
+      while true; do
190
+        if [[ $time_waited -ge $timeout ]]; then
191
+          log.ERROR "Hit '$timeout' second timeout waiting for '{{ $script }}' - terminating."
192
+          # ask nicely first
193
+          kill $pid
194
+          sleep 10
195
+          # force kill if still running
196
+          ps $pid > /dev/null && kill -9 $pid
197
+          break
198
+        fi
199
+        ps $pid > /dev/null || break
200
+        sleep $sleep_interval
201
+        time_waited=$(($time_waited + $sleep_interval))
202
+      done
203
+      wait $pid && echo 0 > "${hash}/exit_code" || echo $? > "${hash}/exit_code"
139 204
       {{- if hasKey $keypath "blocking_policy" }}
140 205
         {{- if eq $keypath.blocking_policy "foreground_halt_pod_on_failure" }}
141 206
           if [[ $(cat "${hash}/exit_code") != '0' ]]; then
@@ -144,20 +209,16 @@ UNIQUE_EOF_1840dbd4-09e1-4725-87f5-3b6944b80526
144 209
         {{- end }}
145 210
       {{- end }}
146 211
     fi
147
-  {{ end }}
212
+  {{- end }}
148 213
 {{- end }}
149 214
 
150
-exit 0
151 215
 UNIQUE_EOF_9c341059-25a0-4725-9489-1789e255e381
152 216
 
153 217
 chmod 700 {{ .Values.conf.chroot_mnt_path | quote }}/tmp/exec_host_{{ .Chart.Version }}.sh
154
-chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
155
-
156
-sleep 1
157
-echo 'INFO Putting the daemon to sleep.'
158 218
 
159
-while [ 1 ]; do
160
-  sleep 300
219
+while true; do
220
+  chroot {{ .Values.conf.chroot_mnt_path | quote }} /tmp/exec_host_{{ .Chart.Version }}.sh
221
+  sleep 2
222
+  echo 'INFO Putting the daemon to sleep.'
223
+  sleep {{ $exec_loop_sleep_interval }}
161 224
 done
162
-
163
-exit 0

+ 17
- 19
doc/source/index.rst View File

@@ -209,6 +209,17 @@ The following set of options are fully implemented::
209 209
     If any of that info changes, so will the hash, and it will be seen as a new
210 210
     object which will be executed regardless of this setting.
211 211
 
212
+    ``timeout`` may optionally be set to the number of seconds to wait for
213
+    script completion before termination. Default value is ``1800`` (30 min).
214
+
215
+    ``rerun_interval`` may be optionally set to the number of seconds to wait
216
+    between rerunning a given script which ran successfully the previous time.
217
+    Default value is ``infinite``.
218
+
219
+    ``retry_interval`` may be optionally set to the number of seconds to wait
220
+    between rerunning a given script which did not run successfully the previous
221
+    time. Default behavior is to match the ``rerun_interval``.
222
+
212 223
 The following set of options are partially implemented::
213 224
 
214 225
     ``blocking_policy`` may optionally be set to ``background``, ``foreground``,
@@ -223,30 +234,17 @@ The following set of options are partially implemeneted::
223 234
 
224 235
 The following set of options are not yet implemented::
225 236
 
226
-    ``script_timeout`` may optionally be set to the number of seconds to wait for
227
-    script completion before termination. Default value is ``3600`` (1 hour).
228
-
229
-    ``rerun_interval`` may be optionally set to the number of seconds to wait
230
-    between rerunning a given script which ran successfully the previous time.
231
-    Default value is ``infinite``.
232
-
233
-    ``rerun_interval_persist`` may be optionally set to ``true`` for
234
-    a given script. This allows a script to persist its rerun interval through a
235
-    pod/node restart. Otherwise, the time since last successful script execution
236
-    will not be considered on pod/node startup. Default value is ``false``.
237
+    ``rerun_interval_persist`` may be optionally set to ``false`` for a given
238
+    script. This makes the script execute on pod/node startup regardless of the
239
+    interval since the last successful execution. Default value is ``true``.
237 240
 
238 241
     ``rerun_max_count`` may be optionally set to the maximum number of times a
239 242
     succeeding script should be retried. Successful exec count does not persist
240 243
     through pod/node restart. Default value is ``infinite``.
241 244
 
242
-    ``retry_interval`` may be optionally set to the number of seconds to wait
243
-    between rerunning a given script which did not run successfully the previous
244
-    time. Default value is set to the ``rerun_interval``.
245
-
246
-    ``retry_interval_persist`` may be optionally set to ``true`` for
247
-    a given script. This allows a script to persist its retry interval through a
248
-    pod/node restart. Otherwise, the time since last failed script execution
249
-    will not be considered on pod/node startup. Default value is ``false``.
245
+    ``retry_interval_persist`` may be optionally set to ``false`` for a given
246
+    script. This makes the script execute on pod/node startup, regardless of the
247
+    time since the last execution. Default value is ``true``.
250 248
 
251 249
     ``retry_max_count`` may be optionally set to the maximum number of times a
252 250
     failing script should be retried. Failed exec count does not persist

+ 12
- 0
tools/gate/playbooks/osh-infra-upgrade-host.yaml View File

@@ -39,3 +39,15 @@
39 39
     - upgrade-host
40 40
     - start-zuul-console
41 41
     - disable-local-nameserver
42
+
43
+- hosts: all
44
+  vars_files:
45
+    - vars.yaml
46
+  vars:
47
+    work_dir: "{{ zuul.project.src_dir }}/{{ zuul_osh_infra_relative_path | default('') }}"
48
+  gather_facts: False
49
+  become: yes
50
+  roles:
51
+    - deploy-apparmor
52
+  tags:
53
+    - deploy-apparmor

+ 117
- 0
tools/gate/scripts/020-test-divingbell.sh View File

@@ -1178,6 +1178,123 @@ manifests:
1178 1178
     echo "[SUCCESS] exec test$(($i + 5)) passed successfully" >> "${TEST_RESULTS}"
1179 1179
   done
1180 1180
 
1181
+  # test timeout
1182
+  local overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set17.yaml
1183
+  echo 'conf:
1184
+  exec:
1185
+    011-timeout.sh:
1186
+      timeout: 11
1187
+      data: |
1188
+        #!/bin/bash
1189
+        sleep 60' > "${overrides_yaml}"
1190
+  install_base "--values=${overrides_yaml}"
1191
+  get_container_status exec
1192
+  _test_clog_msg 'timeout waiting for'
1193
+  echo '[SUCCESS] exec test17 passed successfully' >> "${TEST_RESULTS}"
1194
+
1195
+  # Test invalid timeout
1196
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set18.yaml
1197
+  echo 'conf:
1198
+  exec:
1199
+    011-timeout.sh:
1200
+      timeout: infinite
1201
+      data: |
1202
+        #!/bin/bash
1203
+        sleep 60' > "${overrides_yaml}"
1204
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .timeout. FOR' || \
1205
+    (echo "[FAIL] exec test18 did not receive expected 'BAD .timeout. FOR' error" && exit 1)
1206
+  echo '[SUCCESS] exec test18 passed successfully' >> "${TEST_RESULTS}"
1207
+
1208
+  # Test invalid rerun_interval (too short)
1209
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set19.yaml
1210
+  echo 'conf:
1211
+  exec:
1212
+    012-rerun-interval.sh:
1213
+      rerun_interval: 30
1214
+      data: |
1215
+        #!/bin/bash
1216
+        true' > "${overrides_yaml}"
1217
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .rerun_interval. FOR' || \
1218
+    (echo "[FAIL] exec test19 did not receive expected 'BAD .rerun_interval. FOR' error" && exit 1)
1219
+  echo '[SUCCESS] exec test19 passed successfully' >> "${TEST_RESULTS}"
1220
+
1221
+  # Test invalid retry_interval (too short)
1222
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set20.yaml
1223
+  echo 'conf:
1224
+  exec:
1225
+    012-retry-interval.sh:
1226
+      retry_interval: 30
1227
+      data: |
1228
+        #!/bin/bash
1229
+        true' > "${overrides_yaml}"
1230
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD .retry_interval. FOR' || \
1231
+    (echo "[FAIL] exec test20 did not receive expected 'BAD .retry_interval. FOR' error" && exit 1)
1232
+  echo '[SUCCESS] exec test20 passed successfully' >> "${TEST_RESULTS}"
1233
+
1234
+  # Test invalid rerun_interval combination
1235
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set21.yaml
1236
+  echo 'conf:
1237
+  exec:
1238
+    012-rerun-interval.sh:
1239
+      rerun_interval: 60
1240
+      rerun_policy: once_successfully
1241
+      data: |
1242
+        #!/bin/bash
1243
+        true' > "${overrides_yaml}"
1244
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
1245
+    (echo "[FAIL] exec test21 did not receive expected 'BAD COMBINATION' error" && exit 1)
1246
+  echo '[SUCCESS] exec test21 passed successfully' >> "${TEST_RESULTS}"
1247
+
1248
+  # Test invalid retry_interval combination
1249
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set22.yaml
1250
+  echo 'conf:
1251
+  exec:
1252
+    012-retry-interval.sh:
1253
+      retry_interval: 60
1254
+      rerun_policy: never
1255
+      data: |
1256
+        #!/bin/bash
1257
+        true' > "${overrides_yaml}"
1258
+  install_base "--values=${overrides_yaml}" 2>&1 | grep 'BAD COMBINATION' || \
1259
+    (echo "[FAIL] exec test22 did not receive expected 'BAD COMBINATION' error" && exit 1)
1260
+  echo '[SUCCESS] exec test22 passed successfully' >> "${TEST_RESULTS}"
1261
+
1262
+  # test rerun_interval
1263
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set23.yaml
1264
+  echo 'conf:
1265
+  exec:
1266
+    012-rerun-interval.sh:
1267
+      rerun_interval: 60
1268
+      data: |
1269
+        #!/bin/bash
1270
+        echo script name: ${BASH_SOURCE} >> exec_testfile' > "${overrides_yaml}"
1271
+  install_base "--values=${overrides_yaml}"
1272
+  get_container_status exec
1273
+  sleep 72
1274
+  get_container_status exec
1275
+  expected_result='script name: ./012-rerun-interval.sh
1276
+script name: ./012-rerun-interval.sh'
1277
+  _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test23"
1278
+  echo '[SUCCESS] exec test23 passed successfully' >> "${TEST_RESULTS}"
1279
+
1280
+  # test retry_interval
1281
+  overrides_yaml=${LOGS_SUBDIR}/${FUNCNAME}-set24.yaml
1282
+  echo 'conf:
1283
+  exec:
1284
+    012-retry-interval.sh:
1285
+      retry_interval: 60
1286
+      data: |
1287
+        #!/bin/bash
1288
+        echo script name: ${BASH_SOURCE} >> exec_testfile
1289
+        false' > "${overrides_yaml}"
1290
+  install_base "--values=${overrides_yaml}"
1291
+  get_container_status exec
1292
+  sleep 72
1293
+  get_container_status exec
1294
+  expected_result='script name: ./012-retry-interval.sh
1295
+script name: ./012-retry-interval.sh'
1296
+  _test_exec_match "$expected_result" "${EXEC_DIR}/exec_testfile" "test24"
1297
+  echo '[SUCCESS] exec test24 passed successfully' >> "${TEST_RESULTS}"
1181 1298
 }
1182 1299
 
1183 1300
 # test daemonset value overrides for hosts and labels

Loading…
Cancel
Save