From 0862fcf98fab43cce64979a1d4a9849498e496fb Mon Sep 17 00:00:00 2001 From: Christian Schwede Date: Tue, 9 Apr 2019 12:57:15 +0200 Subject: [PATCH] Improve Swift healthchecks This patch adds checks for the replicators. It also removes some unused or invalid code from other checks. Checking modification time of the recon files is not enough, these might also be changed by other Swift processes and are not a good indicator for stuck processes. Co-Authored-By: Christian Schwede Co-Authored-By: Emilien Macchi Change-Id: Ib15f1ec4766bf4d64a2860422c230e4d514bc224 --- healthcheck/common.sh | 13 +++++++++++ healthcheck/swift-account-replicator | 8 +++++++ healthcheck/swift-account-server | 13 ----------- healthcheck/swift-container-replicator | 8 +++++++ healthcheck/swift-container-server | 13 ----------- healthcheck/swift-object-expirer | 12 ---------- healthcheck/swift-object-replicator | 8 +++++++ healthcheck/swift-object-server | 22 ------------------- healthcheck/swift-proxy | 11 ---------- ...ditional-healtchecks-ab8cd9c7562654f3.yaml | 5 +++++ 10 files changed, 42 insertions(+), 71 deletions(-) create mode 100755 healthcheck/swift-account-replicator create mode 100755 healthcheck/swift-container-replicator delete mode 100755 healthcheck/swift-object-expirer create mode 100755 healthcheck/swift-object-replicator create mode 100644 releasenotes/notes/swift-additional-healtchecks-ab8cd9c7562654f3.yaml diff --git a/healthcheck/common.sh b/healthcheck/common.sh index aa9c2c52d..7d9538f16 100755 --- a/healthcheck/common.sh +++ b/healthcheck/common.sh @@ -109,3 +109,16 @@ get_url_from_vhost () { fi echo ${proto}://${server_name}:${bind_port}${wsgi_alias} } + +check_swift_interval () { + service=$1 + if ps -e | grep --quiet swift-$service; then + interval=$(get_config_val $conf $service interval 300) + last=`grep -o "\"replication_last\": [0-9]*" $cache | cut -f 2 -d " "` + now=`date +%s` + if [ `expr $now - $last` -gt $interval ]; then + echo "Last replication run did not 
finish within interval of $interval seconds." + exit 1 + fi + fi +} diff --git a/healthcheck/swift-account-replicator b/healthcheck/swift-account-replicator new file mode 100755 index 000000000..1e69cc1c8 --- /dev/null +++ b/healthcheck/swift-account-replicator @@ -0,0 +1,8 @@ +#!/bin/bash + +. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh + +conf=/etc/swift/account-server.conf +cache=/var/cache/swift/account.recon + +check_swift_interval account-replicator diff --git a/healthcheck/swift-account-server b/healthcheck/swift-account-server index 1ef7f05ec..9afcd1a08 100755 --- a/healthcheck/swift-account-server +++ b/healthcheck/swift-account-server @@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-account-server; then bind_host="[${bind_host}]" fi healthcheck_curl http://${bind_host}:${bind_port}/healthcheck -else - if ps -e | grep --quiet swift-account-auditor; then - interval=$(get_config_val $conf account-auditor interval 1800) - elif ps -e | grep --quiet swift-account-reaper; then - interval=$(get_config_val $conf account-reaper interval 3600) - else - interval=$(get_config_val $conf account-replicator interval 30) - fi - - if ! healthcheck_file_modification $cache $interval; then - echo "Cache file $cache was not updated within interval of $interval seconds." - exit 1 - fi fi diff --git a/healthcheck/swift-container-replicator b/healthcheck/swift-container-replicator new file mode 100755 index 000000000..34ef269ee --- /dev/null +++ b/healthcheck/swift-container-replicator @@ -0,0 +1,8 @@ +#!/bin/bash + +. 
${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh + +conf=/etc/swift/container-server.conf +cache=/var/cache/swift/container.recon + +check_swift_interval container-replicator diff --git a/healthcheck/swift-container-server b/healthcheck/swift-container-server index a1fdcb952..c2f66cde4 100755 --- a/healthcheck/swift-container-server +++ b/healthcheck/swift-container-server @@ -19,17 +19,4 @@ if ps -e | grep --quiet swift-container-server; then bind_host="[${bind_host}]" fi healthcheck_curl http://${bind_host}:${bind_port}/healthcheck -else - if ps -e | grep --quiet swift-account-auditor; then - interval=$(get_config_val $conf container-auditor interval 1800) - elif ps -e | grep --quiet swift-account-reaper; then - interval=$(get_config_val $conf container-replicator interval 30) - else - interval=$(get_config_val $conf container-updater interval 300) - fi - - if ! healthcheck_file_modification $cache $interval; then - echo "Cache file $cache was not updated within interval of $interval seconds." - exit 1 - fi fi diff --git a/healthcheck/swift-object-expirer b/healthcheck/swift-object-expirer deleted file mode 100755 index 7c4c61526..000000000 --- a/healthcheck/swift-object-expirer +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -. ${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh - -conf=/etc/swift/object-expirer.conf -cache=/var/cache/swift/object.recon -interval=$(get_config_val $conf object-expirer interval 300) - -if ! healthcheck_file_modification $cache $interval; then - echo "Cache file $cache was not updated within interval of $interval seconds." - exit 1 -fi diff --git a/healthcheck/swift-object-replicator b/healthcheck/swift-object-replicator new file mode 100755 index 000000000..c165a373b --- /dev/null +++ b/healthcheck/swift-object-replicator @@ -0,0 +1,8 @@ +#!/bin/bash + +. 
${HEALTHCHECK_SCRIPTS:-/usr/share/openstack-tripleo-common/healthcheck}/common.sh + +conf=/etc/swift/object-server.conf +cache=/var/cache/swift/object.recon + +check_swift_interval object-replicator diff --git a/healthcheck/swift-object-server b/healthcheck/swift-object-server index e0cee668d..b9a56c348 100755 --- a/healthcheck/swift-object-server +++ b/healthcheck/swift-object-server @@ -19,26 +19,4 @@ if ps -e | grep --quiet swift-object-server; then bind_host="[${bind_host}]" fi healthcheck_curl http://${bind_host}:${bind_port}/healthcheck -elif ps -e | grep --quiet rsync; then - process='rsync' - args="${@:-873}" - - if ! healthcheck_listen $process $args; then - ports=${args// /,} - echo "There is no $process process, listening on port(s) $ports, running in the container." - exit 1 - fi -else - if ps -e | grep --quiet swift-account-auditor; then - interval=$(get_config_val $conf object-auditor interval 300) - elif ps -e | grep --quiet swift-account-reaper; then - interval=$(get_config_val $conf object-replicator interval 300) - else - interval=$(get_config_val $conf object-updater interval 300) - fi - - if ! healthcheck_file_modification $cache $interval; then - echo "Cache file $cache was not updated within interval of $interval seconds." - exit 1 - fi fi diff --git a/healthcheck/swift-proxy b/healthcheck/swift-proxy index 00528ebae..d849253eb 100755 --- a/healthcheck/swift-proxy +++ b/healthcheck/swift-proxy @@ -18,15 +18,4 @@ if pgrep -f swift-proxy-server; then bind_host="[${bind_host}]" fi healthcheck_curl http://${bind_host}:${bind_port}/healthcheck -else - # TODO(mmagr): Remove this once swift_object_expirer container will start - # using swift-object-expirer kolla image - conf=/etc/swift/object-expirer.conf - cache=/var/cache/swift/object.recon - interval=$(get_config_val $conf object-expirer interval 300) - - if ! healthcheck_file_modification $cache $interval; then - echo "Cache file $cache was not updated within interval of $interval seconds." 
- exit 1 - fi fi diff --git a/releasenotes/notes/swift-additional-healtchecks-ab8cd9c7562654f3.yaml b/releasenotes/notes/swift-additional-healtchecks-ab8cd9c7562654f3.yaml new file mode 100644 index 000000000..129a2c9cd --- /dev/null +++ b/releasenotes/notes/swift-additional-healtchecks-ab8cd9c7562654f3.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Adds additional healthchecks for Swift to monitor account, + container and object replicators as well as the rsync process.