CentOS 9: support restart of HA resources

Pacemaker 2.1 changed naming convention around multi-state
resources and ocf resource name. Adapt our resource restart
scripts so that they parse the proper data from the CIB.

Change-Id: Ieade3444e44e305f507c057991e02048ab5f3b3a
Closes-Bug: #1942771
This commit is contained in:
Damien Ciabrini 2021-09-06 14:22:39 +02:00
parent ab8666fafe
commit 128c2bcc25
2 changed files with 35 additions and 9 deletions

View File

@ -21,6 +21,11 @@ error() {
exit 1
}
pacemaker_supports_promoted() {
# The Promoted token is only matched in recent pacemaker versions
grep -wq "<value>Promoted</value>" /usr/share/pacemaker/resources-*.rng
}
ACTION=$1
case $ACTION in
--help) usage; exit 0;;
@ -46,9 +51,17 @@ BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
WAIT_TARGET_ANYWHERE=${6:-_}
if pacemaker_supports_promoted; then
WAIT_TARGET_LOCAL=$(echo "$5" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
WAIT_TARGET_ANYWHERE=$(echo "${6:-_}" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
promoted_role="Promoted"
else
promoted_role="Master"
fi
# The lock TTL should accomodate for the resource start/promote timeout
if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then
if [ "$WAIT_TARGET_LOCAL" = "Master" ] || [ "$WAIT_TARGET_ANYWHERE" = "Master" ]; then
if [ "$WAIT_TARGET_LOCAL" = "$promoted_role" ] || [ "$WAIT_TARGET_ANYWHERE" = "$promoted_role" ]; then
rsc_op="promote"
else
rsc_op="start"

View File

@ -40,6 +40,10 @@ usage() {
exit 1
}
pacemaker_supports_promoted() {
# The Promoted token is only matched in recent pacemaker versions
grep -wq "<value>Promoted</value>" /usr/share/pacemaker/resources-*.rng
}
#
# Utility functions to detect stuck resources
@ -69,7 +73,7 @@ bundle_running_globally() {
local engine=$BUNDLE_CONTAINER_ENGINE
# return the number of running bundles replica, i.e. the number of
# docker/podman resource replicas currently running in the cluster
crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='ocf::heartbeat:${engine}']/node)" -
crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:heartbeat:${engine}']/node)" -
}
ocf_failures_globally() {
@ -91,7 +95,7 @@ did_resource_failed_locally() {
# pacemaker_remote rather that on the real host, and the
# failcounts are thus associated to the pcmk remote. Replace
# the host's name with the pcmk remote's name.
remotehost=$(crm_mon --as-xml | xmllint --xpath "string(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource/node[@name='${HOST}']/../../resource[@resource_agent='ocf::pacemaker:remote']/@id)" -)
remotehost=$(crm_mon --as-xml | xmllint --xpath "string(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource/node[@name='${HOST}']/../../resource[@resource_agent='${OCF}:pacemaker:remote']/@id)" -)
if [ -n "${remotehost}" ]; then
crm_failcount -q -G -r $NAME -N $remotehost | grep -q -w INFINITY
return $?
@ -118,7 +122,7 @@ did_resource_failed_globally() {
if [ "${NAME}" != "${BUNDLE_NAME}" ]; then
# we check the state of an ocf resource only if the
# pcmkremotes are started
remotecount=$(crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='ocf::pacemaker:remote']/node)" -)
remotecount=$(crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node)" -)
if [ "${remotecount}" = "0" ]; then
# no pcmkremote is running, so check the bundle state
# instead of checking the ocf resource
@ -170,17 +174,26 @@ if [ -z "${ROLE_LOCAL}" ]; then
exit 1
fi
else
if !(echo "${ROLE_LOCAL}" | grep -q -x -E "(Started|Slave|Master)"); then
echo 2>&1 "Error: argument ROLE_LOCAL must be either 'Started' 'Slave' or 'Master'"
if !(echo "${ROLE_LOCAL}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"); then
echo 2>&1 "Error: argument ROLE_LOCAL must be either 'Started' 'Slave' 'Master' 'Unpromoted' or 'Promoted'"
exit 1
fi
fi
if [ -n "${ROLE_ANYWHERE}" ] && !(echo "${ROLE_ANYWHERE}" | grep -q -x -E "(Started|Slave|Master)"); then
echo 2>&1 "Error: argument ROLE_ANYWHERE must be either 'Started' 'Slave' or 'Master'"
if [ -n "${ROLE_ANYWHERE}" ] && !(echo "${ROLE_ANYWHERE}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"); then
echo 2>&1 "Error: argument ROLE_ANYWHERE must be either 'Started' 'Slave' 'Master' 'Unpromoted' or 'Promoted'"
exit 1
fi
# Ensure compatibility with pacemaker 2.1
if pacemaker_supports_promoted; then
ROLE_LOCAL=$(echo "$ROLE_LOCAL" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
ROLE_ANYWHERE=$(echo "$ROLE_ANYWHERE" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
OCF="ocf"
else
OCF="ocf:"
fi
HOST=${5:-$(facter hostname)}
TIMEOUT=${6:-__PCMKTIMEOUT__}
@ -194,7 +207,7 @@ TIMEOUT=${6:-__PCMKTIMEOUT__}
if [ "${BUNDLE_NAME}" != "${NAME}" ]; then
# ocf resource
local_resource_xpath="//bundle/replica/resource[@resource_agent='ocf::pacemaker:remote']/node[@name='${HOST}']/../../resource[@id='${NAME}']"
local_resource_xpath="//bundle/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node[@name='${HOST}']/../../resource[@id='${NAME}']"
any_resource_xpath="//bundle//resource[@id='${NAME}']"
replicas_xpath="//bundle/primitive[@id='${BUNDLE_NAME}']/../*[boolean(@image) and boolean(@replicas)]"
else