Improve Swift healthchecks

This patch adds checks for the replicators.

It also removes some unused or invalid code from other checks. Checking
the modification time of the recon files is not enough: these files might
also be changed by other Swift processes, so they are not a good indicator
of stuck processes.

Co-Authored-By: Christian Schwede <cschwede@redhat.com>
Co-Authored-By: Emilien Macchi <emilien@redhat.com>

Change-Id: Ib15f1ec4766bf4d64a2860422c230e4d514bc224
(cherry picked from commit 0862fcf98f)
This commit is contained in:
Christian Schwede 2019-04-09 12:57:15 +02:00 committed by Emilien Macchi
parent 234efc00b8
commit a60453eaa0
11 changed files with 42 additions and 80 deletions

View File

@ -694,15 +694,6 @@ RUN mkdir -p /openstack && \
chmod a+rx /openstack/healthcheck
{% endblock %}
# Note(mmagr): Below block is currently noop because swift-proxy-server kolla image is used
# for swift_object_expirer container, but it will allow smooth transition
# to usage of correct image for the service in future.
{% block swift_object_expirer_footer %}
RUN mkdir -p /openstack && \
ln -s /usr/share/openstack-tripleo-common/healthcheck/swift-object-expirer /openstack/healthcheck && \
chmod a+rx /openstack/healthcheck
{% endblock %}
{% block swift_proxy_server_footer %}
RUN mkdir -p /openstack && \
ln -s /usr/share/openstack-tripleo-common/healthcheck/swift-proxy /openstack/healthcheck && \

View File

@ -109,3 +109,16 @@ get_url_from_vhost () {
fi
echo ${proto}://${server_name}:${bind_port}${wsgi_alias}
}
# Fail the healthcheck when the last finished replication run is older than
# the configured interval.
#
# Globals:   conf  (read)  - path of the Swift server config file
#            cache (read)  - path of the recon cache file
#            service, interval, last, now (written; left global as before)
# Arguments: $1 - service suffix, e.g. "account-replicator"
# Outputs:   one explanatory line on stdout when the check fails
# Returns:   0 when the service is not running here, when no usable
#            "replication_last" timestamp is found, or when the timestamp is
#            recent enough; exits the calling script with status 1 otherwise.
check_swift_interval () {
    service=$1

    # `ps -e` lists only the kernel command name, which is cut to 15
    # characters, so long names such as swift-account-replicator never
    # matched. pgrep -f matches against the full command line instead.
    # NOTE(review): -f would also match this check's own shell if it is
    # launched through a path containing "swift-<service>" - confirm how the
    # script is invoked.
    if ! pgrep -f "swift-${service}" >/dev/null; then
        return 0
    fi

    interval=$(get_config_val "$conf" "$service" interval 300)

    # Integer part of the first "replication_last" value in the recon cache.
    last=$(grep -o '"replication_last": [0-9]*' "$cache" 2>/dev/null \
        | head -n 1 \
        | cut -f 2 -d ' ')

    # No (numeric) timestamp recorded: nothing to compare against. The
    # original passed here too, but only after emitting syntax errors from
    # expr and test; skip explicitly instead.
    case "$last" in
        ''|*[!0-9]*) return 0 ;;
    esac

    now=$(date +%s)
    if [ "$(( now - last ))" -gt "$interval" ]; then
        echo "Last replication run did not finish within interval of $interval seconds."
        exit 1
    fi
}

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift account replicator.
# Loads the shared healthcheck helpers, sets the conf/cache globals and runs
# check_swift_interval for the account-replicator service.

# Quoted so that a HEALTHCHECK_SCRIPTS value containing spaces or glob
# characters is still sourced as one file name.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

conf=/etc/swift/account-server.conf
cache=/var/cache/swift/account.recon

check_swift_interval account-replicator

View File

@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-account-server; then
bind_host="[${bind_host}]"
fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf account-auditor interval 1800)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf account-reaper interval 3600)
else
interval=$(get_config_val $conf account-replicator interval 30)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift container replicator.
# Loads the shared healthcheck helpers, sets the conf/cache globals and runs
# check_swift_interval for the container-replicator service.

# Quoted so that a HEALTHCHECK_SCRIPTS value containing spaces or glob
# characters is still sourced as one file name.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

conf=/etc/swift/container-server.conf
cache=/var/cache/swift/container.recon

check_swift_interval container-replicator

View File

@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-container-server; then
bind_host="[${bind_host}]"
fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf container-auditor interval 1800)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf container-replicator interval 30)
else
interval=$(get_config_val $conf container-updater interval 300)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi

View File

@ -1,12 +0,0 @@
#!/bin/sh
. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
conf=/etc/swift/object-expirer.conf
cache=/var/cache/swift/object.recon
interval=$(get_config_val $conf object-expirer interval 300)
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Healthcheck for the Swift object replicator.
# Loads the shared healthcheck helpers, sets the conf/cache globals and runs
# check_swift_interval for the object-replicator service.

# Quoted so that a HEALTHCHECK_SCRIPTS value containing spaces or glob
# characters is still sourced as one file name.
. "${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh"

conf=/etc/swift/object-server.conf
cache=/var/cache/swift/object.recon

check_swift_interval object-replicator

View File

@ -19,26 +19,4 @@ if ps -e | grep --quiet swift-object-server; then
bind_host="[${bind_host}]"
fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
elif ps -e | grep --quiet rsync; then
process='rsync'
args="${@:-873}"
if ! healthcheck_listen $process $args; then
ports=${args// /,}
echo "There is no $process process, listening on port(s) $ports, running in the container."
exit 1
fi
else
if ps -e | grep --quiet swift-account-auditor; then
interval=$(get_config_val $conf object-auditor interval 300)
elif ps -e | grep --quiet swift-account-reaper; then
interval=$(get_config_val $conf object-replicator interval 300)
else
interval=$(get_config_val $conf object-updater interval 300)
fi
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi

View File

@ -18,15 +18,4 @@ if pgrep -f swift-proxy-server; then
bind_host="[${bind_host}]"
fi
healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
else
# TODO(mmagr): Remove this once swift_object_expirer container will start
# using swift-object-expirer kolla image
conf=/etc/swift/object-expirer.conf
cache=/var/cache/swift/object.recon
interval=$(get_config_val $conf object-expirer interval 300)
if ! healthcheck_file_modification $cache $interval; then
echo "Cache file $cache was not updated within interval of $interval seconds."
exit 1
fi
fi

View File

@ -0,0 +1,5 @@
---
features:
- |
Adds additional healthchecks for Swift to monitor account,
container and object replicators as well as the rsync process.