Improve Swift healthchecks

This patch adds checks for the replicators.

It also removes some unused or invalid code from other checks. Checking
the modification time of the recon files is not enough: these files might
also be changed by other Swift processes, so they are not a good indicator
for stuck processes.

Co-Authored-By: Christian Schwede <cschwede@redhat.com>
Co-Authored-By: Emilien Macchi <emilien@redhat.com>

Change-Id: Ib15f1ec4766bf4d64a2860422c230e4d514bc224
This commit is contained in:
Christian Schwede 2019-04-09 12:57:15 +02:00 committed by Emilien Macchi
parent 07c7889ca5
commit 0862fcf98f
10 changed files with 42 additions and 71 deletions

View File

@ -109,3 +109,16 @@ get_url_from_vhost () {
fi fi
echo ${proto}://${server_name}:${bind_port}${wsgi_alias} echo ${proto}://${server_name}:${bind_port}${wsgi_alias}
} }
# Verify that a Swift replicator finished a pass within its configured interval.
#
# Globals:   conf  (read) - path of the Swift server configuration file
#            cache (read) - path of the recon cache file to inspect
# Arguments: $1 - service name without the "swift-" prefix
#                 (e.g. account-replicator); also used as the config section
# Outputs:   a message on stdout when the last replication run is too old
# Returns:   0 when the service is not running, when no "replication_last"
#            timestamp can be read from the cache, or when the timestamp is
#            recent enough; exits the calling script with status 1 otherwise.
check_swift_interval () {
    local service=$1
    local interval last now

    # Only check containers that actually run this service.
    # NOTE(review): `ps -e` lists the kernel command name, which Linux
    # truncates to 15 characters - confirm that "swift-$service" can still
    # match for long daemon names.
    if ! ps -e | grep --quiet "swift-$service"; then
        return 0
    fi

    interval=$(get_config_val "$conf" "$service" interval 300)

    # Keep only the integer part of the first timestamp found. A missing or
    # unreadable cache file simply yields an empty value.
    last=$(grep -o '"replication_last": [0-9][0-9]*' "$cache" 2>/dev/null \
        | head -n 1 \
        | cut -f 2 -d " ")

    # No completed replication pass has been recorded (yet), so there is
    # nothing to compare against. Previously this case fell through to
    # `expr`/`[` with an empty operand and only passed because of the
    # resulting syntax errors.
    if [ -z "$last" ]; then
        return 0
    fi

    now=$(date +%s)
    if [ $((now - last)) -gt "$interval" ]; then
        echo "Last replication run did not finish within interval of $interval seconds."
        exit 1
    fi
}

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift account replicator.
#
# Sources the shared healthcheck helpers and runs check_swift_interval for
# the account-replicator service. The helper directory can be overridden
# through the HEALTHCHECK_SCRIPTS environment variable.

# Quoted so that a helper directory containing whitespace or glob characters
# is still sourced as a single path.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

# Set as globals before calling check_swift_interval.
conf=/etc/swift/account-server.conf
cache=/var/cache/swift/account.recon

check_swift_interval account-replicator

View File

@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-account-server; then
bind_host="[${bind_host}]" bind_host="[${bind_host}]"
fi fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf account-auditor interval 1800)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf account-reaper interval 3600)
else
interval=$(get_config_val $conf account-replicator interval 30)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi fi

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift container replicator.
#
# Sources the shared healthcheck helpers and runs check_swift_interval for
# the container-replicator service. The helper directory can be overridden
# through the HEALTHCHECK_SCRIPTS environment variable.

# Quoted so that a helper directory containing whitespace or glob characters
# is still sourced as a single path.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

# Set as globals before calling check_swift_interval.
conf=/etc/swift/container-server.conf
cache=/var/cache/swift/container.recon

check_swift_interval container-replicator

View File

@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-container-server; then
bind_host="[${bind_host}]" bind_host="[${bind_host}]"
fi fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf container-auditor interval 1800)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf container-replicator interval 30)
else
interval=$(get_config_val $conf container-updater interval 300)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi fi

View File

@ -1,12 +0,0 @@
#!/bin/sh
# Healthcheck for the Swift object expirer (this script is removed by the
# commit shown here).
#
# Sources the shared healthcheck helpers; the helper directory can be
# overridden through the HEALTHCHECK_SCRIPTS environment variable.
. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
conf=/etc/swift/object-expirer.conf
cache=/var/cache/swift/object.recon
# Read "interval" from the object-expirer section; 300 is passed as the
# fallback value to get_config_val.
interval=$(get_config_val $conf object-expirer interval 300)
# Fail the healthcheck when the helper reports a failure for the cache file;
# per the message below, that means the file was not updated within $interval
# seconds.
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift object replicator.
#
# Sources the shared healthcheck helpers and runs check_swift_interval for
# the object-replicator service. The helper directory can be overridden
# through the HEALTHCHECK_SCRIPTS environment variable.

# Quoted so that a helper directory containing whitespace or glob characters
# is still sourced as a single path.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

# Set as globals before calling check_swift_interval.
conf=/etc/swift/object-server.conf
cache=/var/cache/swift/object.recon

check_swift_interval object-replicator

View File

@ -19,26 +19,4 @@ if ps -e | grep --quiet swift-object-server; then
bind_host="[${bind_host}]" bind_host="[${bind_host}]"
fi fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
elif ps -e | grep --quiet rsync; then
process='rsync'
args="${@:-873}"
if ! healthcheck_listen $process $args; then
ports=${args// /,}
echo "There is no $process process, listening on port(s) $ports, running in the container."
exit 1
fi
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf object-auditor interval 300)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf object-replicator interval 300)
else
interval=$(get_config_val $conf object-updater interval 300)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi fi

View File

@ -18,15 +18,4 @@ if pgrep -f swift-proxy-server; then
bind_host="[${bind_host}]" bind_host="[${bind_host}]"
fi fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
# TODO(mmagr): Remove this once swift_object_expirer container will start
# using swift-object-expirer kolla image
conf=/etc/swift/object-expirer.conf
cache=/var/cache/swift/object.recon
interval=$(get_config_val $conf object-expirer interval 300)
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi fi

View File

@ -0,0 +1,5 @@
---
features:
- |
Adds additional healthchecks for Swift to monitor account,
container and object replicators as well as the rsync process.