Damien Ciabrini 128c2bcc25 CentOS 9: support restart of HA resources
Pacemaker 2.1 changed the naming conventions for multi-state
resources and OCF resource names. Adapt our resource restart
scripts so that they parse the proper data from the CIB.

Change-Id: Ieade3444e44e305f507c057991e02048ab5f3b3a
Closes-Bug: #1942771
2021-09-06 14:26:55 +02:00

104 lines
3.3 KiB
Bash
Executable File

#!/bin/bash
# Restart a clustered resource in a coordinated way across the cluster:
# acquire a restart lock through pacemaker_resource_lock.sh, restart the
# resource locally through pacemaker_restart_bundle.sh, then release the lock.
#
# Example invocations:
# pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
# pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master
#
# Abort on any use of an unset variable. Exit-on-error (-e) is not enabled:
# the script inspects $? explicitly after the commands whose status matters.
set -u
# Print the command-line help on stdout: a one-line description, the
# invocation synopsis (using $0 as the program name) and a trailing blank line.
usage() {
    printf '%s\n' \
        "Restart a clustered resource in a coordinated way across the cluster" \
        "Usage:" \
        " $0 --lock <tripleo-service> <pcmk-resource> <pcmk-bundle> <target-state-local> <target-state-cluster>" \
        ""
}
# Write a message to stdout, prefixed with the current UTC date.
#   $1 - message text
log() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1"
}
# Write a message to stderr, prefixed with the current UTC date, then
# terminate the script with exit status 1.
#   $1 - message text
error() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1" >&2
    exit 1
}
# Succeed when one of the installed pacemaker resource schemas contains the
# Promoted value as a whole word. Only recent pacemaker versions ship it, so
# the exit status tells which role naming the local pacemaker understands.
pacemaker_supports_promoted() {
    local rng_files=(/usr/share/pacemaker/resources-*.rng)
    grep -q -w "<value>Promoted</value>" "${rng_files[@]}"
}
# The first argument selects the action. It defaults to an empty string so
# that a missing argument is reported through usage/error instead of
# aborting with a raw "unbound variable" message because of 'set -u'.
ACTION=${1:-}
case "$ACTION" in
    --help)
        usage
        exit 0
        ;;
    --lock)
        # The only supported action; handled by the rest of the script
        ;;
    '')
        usage 1>&2
        error "Missing action"
        ;;
    *)
        error "Unknown action '$ACTION'"
        ;;
esac
# Service to restart, and the name of the restart lock derived from it
TRIPLEO_SERVICE=$2
LOCK_NAME="${TRIPLEO_SERVICE}-restart-lock"

# The lock owner is the node name reported by crm_node. Its exit status also
# tells whether the cluster can be reached from this node.
LOCK_OWNER=$(crm_node -n 2>/dev/null)
rc=$?
case $rc in
    0)
        ;;
    102)
        # Treated as "no cluster running here": nothing to restart, not an error
        log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
        exit 0
        ;;
    *)
        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
        ;;
esac
# Remaining positional arguments: the pacemaker resource, its bundle, and the
# roles to wait for after the restart (on this node, and anywhere in the
# cluster; the latter defaults to '_').
RESOURCE_NAME=$3
BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
WAIT_TARGET_ANYWHERE=${6:-_}

# Print the given role with the legacy names rewritten to the ones used by
# pacemaker versions that know the Promoted token: Master -> Promoted,
# Slave -> Unpromoted. Only the first occurrence of each name is replaced;
# any other value is printed unchanged.
#   $1 - role name
modern_role_name() {
    local role=$1
    role=${role/Master/Promoted}
    role=${role/Slave/Unpromoted}
    printf '%s\n' "$role"
}

if pacemaker_supports_promoted; then
    WAIT_TARGET_LOCAL=$(modern_role_name "$WAIT_TARGET_LOCAL")
    WAIT_TARGET_ANYWHERE=$(modern_role_name "$WAIT_TARGET_ANYWHERE")
    promoted_role="Promoted"
else
    promoted_role="Master"
fi
# Convert a pacemaker duration such as "300s", "5min" or "300" into whole
# seconds, printed on stdout. A bare number is taken as seconds and
# sub-second values are rounded up. Only lower-case units are recognised.
# Returns 1 (printing nothing) when the value cannot be parsed.
#   $1 - duration as found in the CIB
pcmk_duration_to_seconds() {
    local duration=$1
    local pattern='^([0-9]+)[[:space:]]*([a-z]*)$'
    local amount
    [[ $duration =~ $pattern ]] || return 1
    # Force base 10 so that a leading zero is not read as an octal prefix
    amount=$((10#${BASH_REMATCH[1]}))
    case "${BASH_REMATCH[2]}" in
        ''|s|sec) echo "$amount";;
        m|min) echo $((amount * 60));;
        h|hr) echo $((amount * 3600));;
        ms|msec) echo $(((amount + 999) / 1000));;
        *) return 1;;
    esac
}

# The lock TTL should accommodate the resource start/promote timeout
if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then
    # Waiting for a promoted role means the promote operation is the one
    # whose timeout matters; otherwise it is the start operation.
    if [ "$WAIT_TARGET_LOCAL" = "$promoted_role" ] || [ "$WAIT_TARGET_ANYWHERE" = "$promoted_role" ]; then
        rsc_op="promote"
    else
        rsc_op="start"
    fi
    # Read the timeout of that operation from the CIB, e.g.:
    # <op id="galera-promote-interval-0s" interval="0s" name="promote" on-fail="block" timeout="300s"/>
    pcmk_timeout=$(cibadmin -Q | xmllint --xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" -)
    if [ -z "$pcmk_timeout" ]; then
        # No timeout found in the CIB: only the 30s margin below applies
        PCMK_TTL=0
    elif ! PCMK_TTL=$(pcmk_duration_to_seconds "$pcmk_timeout"); then
        error "Unsupported ${rsc_op} timeout '${pcmk_timeout}' for resource ${RESOURCE_NAME}"
    fi
    LOCK_TTL=$((PCMK_TTL + 30))
else
    # The podman RA's default start timeout
    LOCK_TTL=90
fi
log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
while true; do
    # Arguments are quoted so each one reaches the lock script as a single word
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire "$LOCK_NAME" "$LOCK_OWNER" "$LOCK_TTL"
    rc=$?
    case $rc in
        0)
            # Lock acquired
            break
            ;;
        1)
            # Lock not obtained this time: wait and try again
            log "Could not acquire lock, retrying"
            sleep 10
            ;;
        *)
            error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
            ;;
    esac
done
log "Restart the service $TRIPLEO_SERVICE locally"
# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
TRIPLEO_MINOR_UPDATE=true /var/lib/container-config-scripts/pacemaker_restart_bundle.sh "$TRIPLEO_SERVICE" "$RESOURCE_NAME" "$BUNDLE_NAME" "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE"
restart_rc=$?
if [ $restart_rc -ne 0 ]; then
    # Report the failure rather than dropping it silently. The script still
    # carries on: the lock must be released, and the exit status seen by
    # existing callers is left unchanged.
    # NOTE(review): consider propagating this status once callers are checked.
    log "Restart of service $TRIPLEO_SERVICE failed (rc: $restart_rc)" 1>&2
fi
# If we reached this point, always try to release the lock
log "Release the restart lock for service $TRIPLEO_SERVICE"
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release "$LOCK_NAME" "$LOCK_OWNER"
rc=$?
# Return codes 0 and 1 are both accepted here; presumably 1 means the lock
# was no longer held by this node -- confirm against pacemaker_resource_lock.sh
if [ $rc -ne 0 ] && [ $rc -ne 1 ]; then
    error "Could not release held lock (rc: $rc)"
fi