Move away from "ss" and drop default verbose mode
This patch does three main things: - drop the ultra-verbose output (set -x), adding a new param we can use when calling the healthcheck directly - move away from the "ss" calls and the multiple pipes used to filter its output, using "lsof" and native filtering - rewrite the "ps" call in order to use ps native filters instead of piping through grep These changes should make the healthchecks more robust, avoid some weird issues due to the pipes, and lower the amount of logs while keeping the important information visible. In order to get verbose mode when running the healthcheck directly, you can do as follows: podman exec -u root -ti <container> bash HEALTHCHECK_DEBUG=1 ./openstack/healthcheck or podman exec -u root -e "HEALTHCHECK_DEBUG=1" <container> /openstack/healthcheck and enjoy a nice debug output. Change-Id: I137fe3211043b00b553db26b2f5930f98373496d
This commit is contained in:
parent
b4fa252253
commit
d03401438c
@ -1,5 +1,12 @@
|
||||
#!/bin/bash
|
||||
set -euxo pipefail
|
||||
set -euo pipefail
|
||||
: ${HEALTHCHECK_DEBUG:=0}
|
||||
if [ $HEALTHCHECK_DEBUG -ne 0 ]; then
|
||||
set -x
|
||||
exec 3>&1
|
||||
else
|
||||
exec 3>/dev/null
|
||||
fi
|
||||
: ${HEALTHCHECK_CURL_MAX_TIME:=10}
|
||||
: ${HEALTHCHECK_CURL_USER_AGENT:=curl-healthcheck}
|
||||
: ${HEALTHCHECK_CURL_WRITE_OUT:='\n%{http_code} %{remote_ip}:%{remote_port} %{time_total} seconds\n'}
|
||||
@ -8,21 +15,13 @@ set -euxo pipefail
|
||||
get_user_from_process() {
|
||||
process=$1
|
||||
|
||||
# This helps to capture the actual pids running the process
|
||||
pids=$(pgrep -d '|' -f $process)
|
||||
# This helps to capture the actual pid running the process
|
||||
pid=$(pgrep -d ',' -f $process)
|
||||
|
||||
# 'cmd' is added to help in case part of the pid is in another pid from
|
||||
# another process.
|
||||
# $ ps -eo user,pid,cmd
|
||||
# USER PID CMD
|
||||
# nova 1 dumb-init --single-child -- kolla_start
|
||||
# nova 7 /usr/bin/python2 /usr/bin/nova-conductor
|
||||
# nova 25 /usr/bin/python2 /usr/bin/nova-conductor
|
||||
# nova 26 /usr/bin/python2 /usr/bin/nova-conductor
|
||||
# root 8311 ps -eo user,pid,cmd
|
||||
# The following "ps" command will capture the user from PID 7 which
|
||||
# is safe enough to assert this is the user running the process.
|
||||
ps -eo user,pid,cmd | grep $process | grep -E $pids | awk 'NR==1{print $1}'
|
||||
# Here, we use the embedded `ps' filter capabilities, and remove the
|
||||
# output header. We ensure we get the user for the selected PIDs only.
|
||||
# In order to ensure we don't get multiple lines, we truncate it with `head'
|
||||
ps -h -q${pid} -o user | head -n1
|
||||
}
|
||||
|
||||
healthcheck_curl () {
|
||||
@ -44,19 +43,16 @@ healthcheck_port () {
|
||||
shift 1
|
||||
args=$@
|
||||
puser=$(get_user_from_process $process)
|
||||
ports=${args// /|}
|
||||
pids=$(pgrep -d '|' -f $process)
|
||||
# https://bugs.launchpad.net/tripleo/+bug/1843555
|
||||
# "ss" output is different if run as root vs as the user actually running
|
||||
# the process. So we also verify that the process is connected to the
|
||||
# port by using "sudo -u" to get the right output.
|
||||
# Note: the privileged containers have the correct ss output with root
|
||||
# user; which is why we need to run with both users, as a best effort.
|
||||
# https://bugs.launchpad.net/tripleo/+bug/1860556
|
||||
# do not use "-q" option for grep, since it returns 141 for some reason with
|
||||
# set -o pipefail.
|
||||
# See https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
|
||||
(ss -ntuap; sudo -u $puser ss -ntuap) | sort -u | grep -E ":($ports).*,pid=($pids),">/dev/null
|
||||
ports=${args// /,}
|
||||
pids=$(pgrep -d ',' -f $process)
|
||||
# First match exits - usually TCP and "sudo TCP" are enough.
|
||||
# `sudo' is needed, as in some cases even root can get a "permission denied"
|
||||
# on some file descriptors (case for heat_manager for example)
|
||||
# UDP support is needed for octavia manager (UDP:5555).
|
||||
lsof -n -w -P -a -iTCP:${ports} -p${pids} >&3 2>&1 || \
|
||||
sudo -u $puser lsof -n -w -P -a -iTCP:${ports} -p${pids} >&3 2>&1 || \
|
||||
lsof -w -P -a -iUDP:${ports} -p${pids} >&3 2>&1 || \
|
||||
sudo -u $puser lsof -n -w -P -a -iUDP:${ports} -p${pids} >&3 2>&1
|
||||
}
|
||||
|
||||
healthcheck_listen () {
|
||||
@ -64,21 +60,17 @@ healthcheck_listen () {
|
||||
|
||||
shift 1
|
||||
args=$@
|
||||
ports=${args// /|}
|
||||
pids=$(pgrep -d '|' -f $process)
|
||||
ss -lnp | grep -qE ":($ports).*,pid=($pids),"
|
||||
ports=${args// /,}
|
||||
pids=$(pgrep -d ',' -f $process)
|
||||
lsof -n -w -P -a -p${pids} -iTCP:${ports} -s TCP:LISTEN >&3 2>&1
|
||||
}
|
||||
|
||||
healthcheck_socket () {
|
||||
process=$1
|
||||
socket=$2
|
||||
pids=$(pgrep -d ',' -f $process)
|
||||
|
||||
# lsof truncate command name to 15 characters and this behaviour
|
||||
# cannot be disabled
|
||||
if [ ${#process} -gt 15 ] ; then
|
||||
process=${process:0:15}
|
||||
fi
|
||||
lsof -Fc -Ua $socket | grep -q "c$process"
|
||||
lsof -n -Fc -Ua -p${pids} $socket >&3 2>&1
|
||||
}
|
||||
|
||||
healthcheck_file_modification () {
|
||||
@ -125,7 +117,7 @@ get_url_from_vhost () {
|
||||
|
||||
check_swift_interval () {
|
||||
service=$1
|
||||
if ps -e | grep --quiet swift-$service; then
|
||||
if pgrep -f swift-${service} >&3 2>&1; then
|
||||
interval=$(get_config_val $conf $service interval 300)
|
||||
last=`grep -o "\"replication_last\": [0-9]*" $cache | cut -f 2 -d " "`
|
||||
now=`date +%s`
|
||||
|
12
releasenotes/notes/no_ss-368721c3af17b782.yaml
Normal file
12
releasenotes/notes/no_ss-368721c3af17b782.yaml
Normal file
@ -0,0 +1,12 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
This patch moves away from "ss" execs, using lsof instead. This allows us
|
||||
to drop most of the piping and subshells, making the healthchecks more robust.
|
||||
- |
|
||||
Introduce new HEALTHCHECK_DEBUG variable in order to toggle verbosity,
|
||||
defaults to 0 (no verbosity). Setting it to 1 will activate -x flag,
|
||||
among other things.
|
||||
- |
|
||||
Push some verbose output to a third file descriptor, visible only if we set
|
||||
the healthcheck to debug.
|
Loading…
Reference in New Issue
Block a user