128c2bcc25
Pacemaker 2.1 changed the naming convention for multi-state resources and for OCF resource names. Adapt our resource restart scripts so that they parse the proper data from the CIB. Change-Id: Ieade3444e44e305f507c057991e02048ab5f3b3a Closes-Bug: #1942771
104 lines
3.3 KiB
Bash
Executable File
104 lines
3.3 KiB
Bash
Executable File
#!/bin/bash

# Restart a clustered resource while holding a per-service restart lock.
#
# Examples:
#   pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
#   pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master

# Abort on any reference to an unset variable. `set -e` is deliberately not
# enabled: the script reads the exit status of failing commands itself.
set -u

# Print a one-line description of the script and its command-line syntax on
# stdout, followed by an empty line. Takes no arguments and does not exit;
# the caller decides the exit status.
usage() {
    printf '%s\n' \
        "Restart a clustered resource in a coordinated way across the cluster" \
        "Usage:" \
        " $0 --lock <tripleo-service> <pcmk-resource> <pcmk-bundle> <target-state-local> <target-state-cluster>" \
        ""
}

# Print a message on stdout, prefixed with the current UTC date.
# Arguments: $1 - message text; when omitted an empty message is logged
#                 instead of tripping the `set -u` unbound-variable abort
log() {
    printf '%s: %s\n' "$(date -u)" "${1:-}"
}

# Print a message on stderr, prefixed with the current UTC date, then
# terminate the script with exit status 1.
# Arguments: $1 - message text; when omitted an empty message is reported
#                 instead of tripping the `set -u` unbound-variable abort
error() {
    printf '%s: %s\n' "$(date -u)" "${1:-}" 1>&2
    exit 1
}

# Tell whether the pacemaker resource schemas list the "Promoted" role.
# Arguments: $1 - (optional) directory containing the resources-*.rng
#                 schema files; defaults to /usr/share/pacemaker
# Returns:   0 when a schema file contains <value>Promoted</value>,
#            non-zero otherwise (including when no schema file is found)
pacemaker_supports_promoted() {
    local schema_dir=${1:-/usr/share/pacemaker}
    # The Promoted token is only matched in recent pacemaker versions.
    # -s keeps grep silent when the glob does not match any file.
    grep -wqs "<value>Promoted</value>" "$schema_dir"/resources-*.rng
}

# The first positional argument selects the action. It defaults to the empty
# string so that a missing argument is reported with the usage text rather
# than through the "unbound variable" abort caused by `set -u`.
ACTION=${1:-}
case "$ACTION" in
    --help)
        usage
        exit 0
        ;;
    --lock)
        # Handled by the remainder of the script
        ;;
    "")
        usage 1>&2
        error "No action given"
        ;;
    *)
        error "Unknown action '$ACTION'"
        ;;
esac

# Second positional argument: the service name, from which the lock name is
# derived. Checked explicitly so that a missing value yields a readable
# error instead of the `set -u` unbound-variable abort.
TRIPLEO_SERVICE=${2:-}
if [ -z "$TRIPLEO_SERVICE" ]; then
    usage 1>&2
    error "Missing <tripleo-service> argument"
fi
LOCK_NAME=${TRIPLEO_SERVICE}-restart-lock

# The name reported by crm_node is used as the lock owner. Its stderr is
# discarded; only the exit status is inspected below.
LOCK_OWNER=$(crm_node -n 2>/dev/null)
rc=$?
case $rc in
    0)
        ;;
    102)
        # Status 102 is treated as "no cluster on this node": nothing to restart
        log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
        exit 0
        ;;
    *)
        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
        ;;
esac

# Positional arguments 3 to 5 are mandatory; verify them up front so that a
# short command line is reported clearly instead of aborting on an unbound
# variable under `set -u`.
if [ $# -lt 5 ]; then
    usage 1>&2
    error "Missing arguments: expected <pcmk-resource> <pcmk-bundle> <target-state-local>"
fi

RESOURCE_NAME=$3
BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
# The cluster-wide target state is optional and defaults to "_"
WAIT_TARGET_ANYWHERE=${6:-_}

if pacemaker_supports_promoted; then
    # Translate the legacy role names given on the command line into the
    # names used by this pacemaker (first occurrence only, as sed did)
    WAIT_TARGET_LOCAL=${WAIT_TARGET_LOCAL/Master/Promoted}
    WAIT_TARGET_LOCAL=${WAIT_TARGET_LOCAL/Slave/Unpromoted}
    WAIT_TARGET_ANYWHERE=${WAIT_TARGET_ANYWHERE/Master/Promoted}
    WAIT_TARGET_ANYWHERE=${WAIT_TARGET_ANYWHERE/Slave/Unpromoted}
    promoted_role="Promoted"
else
    promoted_role="Master"
fi

# Convert a pacemaker time specification into whole seconds (rounded up),
# printed on stdout.
# Arguments: $1 - a non-negative integer with an optional unit among
#                 s/sec, ms/msec, us/usec, m/min, h/hr; no unit means seconds
# Returns:   0 on success, 1 when the specification is not understood
pcmk_timeout_to_seconds() {
    local spec=${1:-}
    local spec_re='^([0-9]+)[[:space:]]*([a-z]*)$'
    local amount unit

    if ! [[ $spec =~ $spec_re ]]; then
        return 1
    fi
    # Force base 10 so that a leading zero is not read as octal
    amount=$((10#${BASH_REMATCH[1]}))
    unit=${BASH_REMATCH[2]}

    case $unit in
        "" | s | sec) echo "$amount" ;;
        ms | msec) echo $(((amount + 999) / 1000)) ;;
        us | usec) echo $(((amount + 999999) / 1000000)) ;;
        m | min) echo $((amount * 60)) ;;
        h | hr) echo $((amount * 3600)) ;;
        *) return 1 ;;
    esac
}

# The lock TTL should accommodate the resource start/promote timeout
if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then
    # Waiting for a promoted role means the promote operation bounds the wait
    if [ "$WAIT_TARGET_LOCAL" = "$promoted_role" ] || [ "$WAIT_TARGET_ANYWHERE" = "$promoted_role" ]; then
        rsc_op="promote"
    else
        rsc_op="start"
    fi
    # <op id="galera-promote-interval-0s" interval="0s" name="promote" on-fail="block" timeout="300s"/>
    op_timeout=$(cibadmin -Q | xmllint -xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" -)
    if [ -z "$op_timeout" ]; then
        # No timeout found in the CIB: count it as 0s, as the script always did
        PCMK_TTL=0
    else
        PCMK_TTL=$(pcmk_timeout_to_seconds "$op_timeout") \
            || error "Cannot parse the ${rsc_op} timeout '${op_timeout}' of resource ${RESOURCE_NAME}"
    fi
    # Keep a 30s margin on top of the operation timeout
    LOCK_TTL=$((PCMK_TTL + 30))
else
    # The podman RA's default start timeout
    LOCK_TTL=90
fi

log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
while true; do
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire "$LOCK_NAME" "$LOCK_OWNER" "$LOCK_TTL"
    rc=$?
    if [ "$rc" -eq 0 ]; then
        # Lock is ours
        break
    elif [ "$rc" -gt 1 ]; then
        # Anything above 1 is treated as fatal; error() exits the script
        error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
    fi
    # Status 1: the lock was not obtained this time, try again later
    log "Could not acquire lock, retrying"
    sleep 10
done

log "Restart the service $TRIPLEO_SERVICE locally"
# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
TRIPLEO_MINOR_UPDATE=true /var/lib/container-config-scripts/pacemaker_restart_bundle.sh "$TRIPLEO_SERVICE" "$RESOURCE_NAME" "$BUNDLE_NAME" "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE"
restart_rc=$?
if [ "$restart_rc" -ne 0 ]; then
    # Make the failure visible, but carry on: the lock must still be
    # released and the exit status of this script is left unchanged.
    log "Restart script for service $TRIPLEO_SERVICE returned an error (rc: $restart_rc)"
fi

# If we reached this point, always try to release the lock
log "Release the restart lock for service $TRIPLEO_SERVICE"
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release "$LOCK_NAME" "$LOCK_OWNER"
rc=$?
# Statuses 0 and 1 are both accepted here; NOTE(review): 1 presumably means
# the lock was no longer held by us - confirm against pacemaker_resource_lock.sh
if [ "$rc" -ne 0 ] && [ "$rc" -ne 1 ]; then
    error "Could not release held lock (rc: $rc)"
fi