From ba02b0a23582b286b775fd129e3a09970862755b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Jeanneret?= Date: Wed, 22 Jan 2020 16:23:24 +0100 Subject: [PATCH] Make healthchecks more strict It was discovered that healthchecks aren't really reliable because they aren't strict enough. The current patch adds the "standard" options in order to ensure we actually catch errors soon enough in order to return the actual state of the checked element. It also requires a small change for the healthcheck_port() function, since the "piping" returned a 141 code instead of 0 due to SIGPIPE being sent at some point[1]. It also depends on two other changes, in order to ensure we won't get any "sudo" issues inside the checks (here again, healthcheck_port is tricky). [1] https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q http://www.tldp.org/LDP/lpg/node20.html Depends-On: https://review.opendev.org/703818 Depends-On: https://review.opendev.org/703816 Change-Id: Iada9fb49881c8edc9c6ede46a939d1853204f896 Closes-Bug: #1860556 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1794044 --- healthcheck/common.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/healthcheck/common.sh b/healthcheck/common.sh index 7d9538f16..e660bc7e0 100755 --- a/healthcheck/common.sh +++ b/healthcheck/common.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -eo pipefail : ${HEALTHCHECK_CURL_MAX_TIME:=10} : ${HEALTHCHECK_CURL_USER_AGENT:=curl-healthcheck} : ${HEALTHCHECK_CURL_WRITE_OUT:='\n%{http_code} %{remote_ip}:%{remote_port} %{time_total} seconds\n'} @@ -47,7 +48,11 @@ healthcheck_port () { # port by using "sudo -u" to get the right output. # Note: the privileged containers have the correct ss output with root # user; which is why we need to run with both users, as a best effort. 
- (ss -ntuap; sudo -u $puser ss -ntuap) | sort -u | grep -qE ":($ports).*,pid=($pids)," + # https://bugs.launchpad.net/tripleo/+bug/1860556 + # do not use "-q" option for grep, since it returns 141 for some reason with + # set -o pipefail. + # See https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q + (ss -ntuap; sudo -u $puser ss -ntuap) | sort -u | grep -E ":($ports).*,pid=($pids),">/dev/null } healthcheck_listen () {