#!/bin/bash
#
# Restart a clustered resource in a coordinated way across the cluster.
#
# The script acquires a named restart lock (via pacemaker_resource_lock.sh),
# restarts the service locally (via pacemaker_restart_bundle.sh) and then
# releases the lock.
#
# Positional arguments:
#   $1  action: --lock or --help
#   $2  tripleo service name (also used to build the lock name)
#   $3  pacemaker resource name
#   $4  pacemaker bundle name
#   $5  role to wait for locally after the restart
#   $6  role to wait for anywhere in the cluster (optional, defaults to "_")
#
# Examples:
#   pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
#   pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master

set -u

readonly LOCK_SCRIPT=/var/lib/container-config-scripts/pacemaker_resource_lock.sh
readonly RESTART_SCRIPT=/var/lib/container-config-scripts/pacemaker_restart_bundle.sh

# Print a short usage summary on stdout.
usage() {
  echo "Restart a clustered resource in a coordinated way across the cluster"
  echo "Usage:"
  echo "   $0 --lock <tripleo-service> <resource-name> <bundle-name> <target-state-local> [<target-state-cluster>]"
  echo
}

# Print a UTC-timestamped message on stdout.
#   $1 - message
log() {
  echo "$(date -u): $1"
}

# Print a UTC-timestamped message on stderr and terminate the script with
# exit status 1.
#   $1 - message
error() {
  echo "$(date -u): $1" 1>&2
  exit 1
}

# Return 0 when the installed pacemaker schemas know the "Promoted" role name.
pacemaker_supports_promoted() {
  # The Promoted token is only matched in recent pacemaker versions
  grep -wq "Promoted" /usr/share/pacemaker/resources-*.rng
}

# ${1:-} keeps "set -u" from aborting with a bare "unbound variable" message
# when the script is called without any argument.
ACTION=${1:-}
case "$ACTION" in
  --help)
    usage
    exit 0
    ;;
  --lock)
    ;;
  *)
    error "Unknown action '$ACTION'"
    ;;
esac

# --lock needs the service, resource, bundle and local target role; only the
# cluster-wide target role ($6) is optional.
if (( $# < 5 )); then
  usage 1>&2
  error "Missing arguments for action '$ACTION'"
fi

TRIPLEO_SERVICE=$2
LOCK_NAME=${TRIPLEO_SERVICE}-restart-lock
LOCK_OWNER=$(crm_node -n 2>/dev/null)
rc=$?
if (( rc != 0 )); then
  if (( rc == 102 )); then
    log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
    exit 0
  else
    error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
  fi
fi

RESOURCE_NAME=$3
BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
WAIT_TARGET_ANYWHERE=${6:-_}

# Translate the legacy role names into the names used by pacemaker versions
# that support "Promoted" (first occurrence only, like the sed expression
# this replaces).
if pacemaker_supports_promoted; then
  WAIT_TARGET_LOCAL=${WAIT_TARGET_LOCAL/Master/Promoted}
  WAIT_TARGET_LOCAL=${WAIT_TARGET_LOCAL/Slave/Unpromoted}
  WAIT_TARGET_ANYWHERE=${WAIT_TARGET_ANYWHERE/Master/Promoted}
  WAIT_TARGET_ANYWHERE=${WAIT_TARGET_ANYWHERE/Slave/Unpromoted}
  promoted_role="Promoted"
else
  promoted_role="Master"
fi

# The lock TTL should accommodate for the resource start/promote timeout
if [[ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]]; then
  if [[ "$WAIT_TARGET_LOCAL" == "$promoted_role" || "$WAIT_TARGET_ANYWHERE" == "$promoted_role" ]]; then
    rsc_op="promote"
  else
    rsc_op="start"
  fi
  # Read the timeout attribute of the resource's start/promote operation from
  # the CIB and strip a trailing "s" so it can be used in arithmetic.
  PCMK_TTL=$(cibadmin -Q | xmllint -xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" - | sed 's/s$//')
  # Anything other than plain digits (or nothing at all) cannot be added to;
  # fail with a clear message instead of a shell arithmetic error.
  if [[ ! "$PCMK_TTL" =~ ^[0-9]*$ ]]; then
    error "Unexpected ${rsc_op} timeout '${PCMK_TTL}' for resource ${RESOURCE_NAME}, bailing out"
  fi
  # An empty timeout counts as 0; 10# prevents a leading zero from being
  # interpreted as octal.
  LOCK_TTL=$(( 10#${PCMK_TTL:-0} + 30 ))
else
  # The podman RA's default start timeout
  LOCK_TTL=90
fi

log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get
# it eventually
while true; do
  "$LOCK_SCRIPT" --acquire "$LOCK_NAME" "$LOCK_OWNER" "$LOCK_TTL"
  rc=$?
  if (( rc == 0 )); then
    break
  fi
  # rc 1 means the lock is currently unavailable and is retried; any other
  # non-zero status is treated as fatal.
  if (( rc > 1 )); then
    error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
  fi
  log "Could not acquire lock, retrying"
  sleep 10
done

log "Restart the service $TRIPLEO_SERVICE locally"
# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
# Its exit status is deliberately not checked so that the lock release below
# always runs.
TRIPLEO_MINOR_UPDATE=true "$RESTART_SCRIPT" "$TRIPLEO_SERVICE" "$RESOURCE_NAME" "$BUNDLE_NAME" "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE"

# If we reached this point, always try to release the lock
log "Release the restart lock for service $TRIPLEO_SERVICE"
"$LOCK_SCRIPT" --release "$LOCK_NAME" "$LOCK_OWNER"
rc=$?
# Exit statuses 0 and 1 from the release are both tolerated; anything else is
# reported as an error.
if (( rc != 0 && rc != 1 )); then
  error "Could not release held lock (rc: $rc)"
fi