From 575fde1a963d3941e7dbc5a457bc05faf7e9b75f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20M=C3=A1gr?=
Date: Mon, 9 Apr 2018 15:26:10 +0200
Subject: [PATCH] Add health checks for Swift services

This patch adds health checks for background processes. The check is
performed by comparing the modification time of the recon cache file
with the interval configured for the service. This is far from ideal,
because recon files are shared between services of the same type, but
it is the only way to at least partially check correct behaviour of
background processes. A cache file that is missing or cannot be read
is reported as a failed check.

In the future we should add timestamp info to each recon file for all
services using the recon file. Then we would be able to find out whether
all background services are updating their caches in a timely fashion
and stay within the border of containers.

Co-Authored-By: Thiago da Silva
Change-Id: Ib6fad8311b5a728914ce9df9122194c5f7036be7
---
 healthcheck/common.sh              | 18 ++++++++++
 healthcheck/swift-account-server   | 38 +++++++++++++++++-------
 healthcheck/swift-container-server | 38 +++++++++++++++++-------
 healthcheck/swift-object-expirer   | 12 ++++++++
 healthcheck/swift-object-server    | 47 +++++++++++++++++++++++-------
 healthcheck/swift-proxy            | 37 +++++++++++++++--------
 healthcheck/swift-rsync            | 14 +++++++++
 7 files changed, 159 insertions(+), 45 deletions(-)
 create mode 100755 healthcheck/swift-object-expirer
 create mode 100755 healthcheck/swift-rsync

diff --git a/healthcheck/common.sh b/healthcheck/common.sh
index dc83fd5ae..c05dad13e 100644
--- a/healthcheck/common.sh
+++ b/healthcheck/common.sh
@@ -52,6 +52,24 @@ healthcheck_socket () {
 
     lsof -Fc -Ua $socket | grep "c$process"
 }
+# Check that a file has been modified recently.
+#   $1 - path of the file to check
+#   $2 - maximum allowed age of the file, in seconds
+# Returns 1 if the file is older than the limit or cannot be stat-ed;
+# returns 0 otherwise.
+healthcheck_file_modification () {
+    file_path=$1
+    limit_seconds=$2
+
+    curr_time=$(date +%s)
+    # A failed stat leaves last_mod empty, which would make the test below
+    # error out and the function report success, so bail out explicitly.
+    last_mod=$(stat -c '%Y' "$file_path") || return 1
+    limit_epoch=$(( curr_time-limit_seconds ))
+    if [ "$limit_epoch" -gt "$last_mod" ]; then
+        return 1
+    fi
+}
 
 get_config_val () {
     crudini --get "$1" "$2" "$3" 2> /dev/null || echo "$4"
diff --git a/healthcheck/swift-account-server b/healthcheck/swift-account-server
index d686e6364..8762604ce 100755
--- a/healthcheck/swift-account-server
+++ b/healthcheck/swift-account-server
@@ -3,17 +3,33 @@
 . ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
 
 conf=/etc/swift/account-server.conf
+cache=/var/cache/swift/account.recon
 
-if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
-    echo "healthcheck is not available" >&2
-    exit 0
-fi
+if ps -e | grep --quiet swift-account-server; then
+    if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
+        echo "healthcheck is not available" >&2
+        exit 0
+    fi
 
-# swift-account-server is still eventlet
-bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
-bind_port=$(get_config_val $conf DEFAULT bind_port 6002)
-# Add brackets if IPv6
-if [[ $bind_host =~ ":" ]]; then
-    bind_host="[${bind_host}]"
+    # swift-account-server is still eventlet
+    bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
+    bind_port=$(get_config_val $conf DEFAULT bind_port 6002)
+    # Add brackets if IPv6
+    if [[ $bind_host =~ ":" ]]; then
+        bind_host="[${bind_host}]"
+    fi
+    healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
+else
+    if ps -e | grep --quiet swift-account-auditor; then
+        interval=$(get_config_val $conf account-auditor interval 1800)
+    elif ps -e | grep --quiet swift-account-reaper; then
+        interval=$(get_config_val $conf account-reaper interval 3600)
+    else
+        interval=$(get_config_val $conf account-replicator interval 30)
+    fi
+
+    if ! healthcheck_file_modification $cache $interval; then
+        echo "Cache file $cache was not updated within interval of $interval seconds."
+        exit 1
+    fi
 fi
-healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
diff --git a/healthcheck/swift-container-server b/healthcheck/swift-container-server
index 9bb1f5a00..c9c8cba89 100755
--- a/healthcheck/swift-container-server
+++ b/healthcheck/swift-container-server
@@ -3,17 +3,33 @@
 . ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
 
 conf=/etc/swift/container-server.conf
+cache=/var/cache/swift/container.recon
 
-if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
-    echo "healthcheck is not available" >&2
-    exit 0
-fi
+if ps -e | grep --quiet swift-container-server; then
+    if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
+        echo "healthcheck is not available" >&2
+        exit 0
+    fi
 
-# swift-container-server is still eventlet
-bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
-bind_port=$(get_config_val $conf DEFAULT bind_port 6001)
-# Add brackets if IPv6
-if [[ $bind_host =~ ":" ]]; then
-    bind_host="[${bind_host}]"
+    # swift-container-server is still eventlet
+    bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
+    bind_port=$(get_config_val $conf DEFAULT bind_port 6001)
+    # Add brackets if IPv6
+    if [[ $bind_host =~ ":" ]]; then
+        bind_host="[${bind_host}]"
+    fi
+    healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
+else
+    if ps -e | grep --quiet swift-container-auditor; then
+        interval=$(get_config_val $conf container-auditor interval 1800)
+    elif ps -e | grep --quiet swift-container-replicator; then
+        interval=$(get_config_val $conf container-replicator interval 30)
+    else
+        interval=$(get_config_val $conf container-updater interval 300)
+    fi
+
+    if ! healthcheck_file_modification $cache $interval; then
+        echo "Cache file $cache was not updated within interval of $interval seconds."
+        exit 1
+    fi
 fi
-healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
diff --git a/healthcheck/swift-object-expirer b/healthcheck/swift-object-expirer
new file mode 100755
index 000000000..7c4c61526
--- /dev/null
+++ b/healthcheck/swift-object-expirer
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
+
+conf=/etc/swift/object-expirer.conf
+cache=/var/cache/swift/object.recon
+interval=$(get_config_val $conf object-expirer interval 300)
+
+if ! healthcheck_file_modification $cache $interval; then
+    echo "Cache file $cache was not updated within interval of $interval seconds."
+    exit 1
+fi
diff --git a/healthcheck/swift-object-server b/healthcheck/swift-object-server
index 2e4a35140..b9f618a4d 100755
--- a/healthcheck/swift-object-server
+++ b/healthcheck/swift-object-server
@@ -3,17 +3,42 @@
 . ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
 
 conf=/etc/swift/object-server.conf
+cache=/var/cache/swift/object.recon
 
-if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
-    echo "healthcheck is not available" >&2
-    exit 0
-fi
+if ps -e | grep --quiet swift-object-server; then
+    if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
+        echo "healthcheck is not available" >&2
+        exit 0
+    fi
 
-# swift-object-server is still eventlet
-bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
-bind_port=$(get_config_val $conf DEFAULT bind_port 6000)
-# Add brackets if IPv6
-if [[ $bind_host =~ ":" ]]; then
-    bind_host="[${bind_host}]"
+    # swift-object-server is still eventlet
+    bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
+    bind_port=$(get_config_val $conf DEFAULT bind_port 6000)
+    # Add brackets if IPv6
+    if [[ $bind_host =~ ":" ]]; then
+        bind_host="[${bind_host}]"
+    fi
+    healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
+elif ps -e | grep --quiet rsync; then
+    process='rsync'
+    args="${@:-873}"
+
+    if ! healthcheck_listen $process $args; then
+        ports=${args// /,}
+        echo "There is no $process process, listening on port(s) $ports, running in the container."
+        exit 1
+    fi
+else
+    if ps -e | grep --quiet swift-object-auditor; then
+        interval=$(get_config_val $conf object-auditor interval 300)
+    elif ps -e | grep --quiet swift-object-replicator; then
+        interval=$(get_config_val $conf object-replicator interval 300)
+    else
+        interval=$(get_config_val $conf object-updater interval 300)
+    fi
+
+    if ! healthcheck_file_modification $cache $interval; then
+        echo "Cache file $cache was not updated within interval of $interval seconds."
+        exit 1
+    fi
 fi
-healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
diff --git a/healthcheck/swift-proxy b/healthcheck/swift-proxy
index c926dff97..6b9df0a2d 100755
--- a/healthcheck/swift-proxy
+++ b/healthcheck/swift-proxy
@@ -2,18 +2,31 @@
 
 . ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
 
-conf=/etc/swift/proxy-server.conf
+if ps -e | grep --quiet swift-proxy-server; then
+    conf=/etc/swift/proxy-server.conf
 
-if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
-    echo "healthcheck is not available" >&2
-    exit 0
-fi
+    if ! crudini --get $conf pipeline:main pipeline | grep -q healthcheck; then
+        echo "healthcheck is not available" >&2
+        exit 0
+    fi
 
-# swift-proxy is still eventlet
-bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
-bind_port=$(get_config_val $conf DEFAULT bind_port 8080)
-# Add brackets if IPv6
-if [[ $bind_host =~ ":" ]]; then
-    bind_host="[${bind_host}]"
+    # swift-proxy is still eventlet
+    bind_host=$(get_config_val $conf DEFAULT bind_ip 127.0.0.1)
+    bind_port=$(get_config_val $conf DEFAULT bind_port 8080)
+    # Add brackets if IPv6
+    if [[ $bind_host =~ ":" ]]; then
+        bind_host="[${bind_host}]"
+    fi
+    healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
+else
+    # TODO(mmagr): Remove this once swift_object_expirer container will start
+    #              using swift-object-expirer kolla image
+    conf=/etc/swift/object-expirer.conf
+    cache=/var/cache/swift/object.recon
+    interval=$(get_config_val $conf object-expirer interval 300)
+
+    if ! healthcheck_file_modification $cache $interval; then
+        echo "Cache file $cache was not updated within interval of $interval seconds."
+        exit 1
+    fi
 fi
-healthcheck_curl http://${bind_host}:${bind_port}/healthcheck
diff --git a/healthcheck/swift-rsync b/healthcheck/swift-rsync
new file mode 100755
index 000000000..70174d597
--- /dev/null
+++ b/healthcheck/swift-rsync
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh
+
+process='rsync'
+args="${@:-873}"
+
+if healthcheck_listen $process $args; then
+    exit 0
+else
+    ports=${args// /,}
+    echo "There is no $process process, listening on port(s) $ports, running in the container."
+    exit 1
+fi