From 128c2bcc25c057dc8a880b266869369f02f4f504 Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Mon, 6 Sep 2021 14:22:39 +0200 Subject: [PATCH] CentOS 9: support restart of HA resources Pacemaker 2.1 changed the naming conventions for multi-state resource roles (Master/Slave became Promoted/Unpromoted) and for OCF resource agent names as reported by crm_mon (ocf::provider:agent became ocf:provider:agent). Adapt our resource restart scripts so that they parse the proper data from the CIB. Change-Id: Ieade3444e44e305f507c057991e02048ab5f3b3a Closes-Bug: #1942771 --- .../pacemaker_mutex_restart_bundle.sh | 15 +++++++++- .../pacemaker_wait_bundle.sh | 29 ++++++++++++++----- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/container_config_scripts/pacemaker_mutex_restart_bundle.sh b/container_config_scripts/pacemaker_mutex_restart_bundle.sh index d7b1e83f52..ab76ce11c2 100755 --- a/container_config_scripts/pacemaker_mutex_restart_bundle.sh +++ b/container_config_scripts/pacemaker_mutex_restart_bundle.sh @@ -21,6 +21,11 @@ error() { exit 1 } +pacemaker_supports_promoted() { + # The Promoted token is only matched in recent pacemaker versions + grep -wq "Promoted" /usr/share/pacemaker/resources-*.rng +} + ACTION=$1 case $ACTION in --help) usage; exit 0;; @@ -46,9 +51,17 @@ BUNDLE_NAME=$4 WAIT_TARGET_LOCAL=$5 WAIT_TARGET_ANYWHERE=${6:-_} +if pacemaker_supports_promoted; then + WAIT_TARGET_LOCAL=$(echo "$5" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/') + WAIT_TARGET_ANYWHERE=$(echo "${6:-_}" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/') + promoted_role="Promoted" +else + promoted_role="Master" +fi + # The lock TTL should accomodate for the resource start/promote timeout if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then - if [ "$WAIT_TARGET_LOCAL" = "Master" ] || [ "$WAIT_TARGET_ANYWHERE" = "Master" ]; then + if [ "$WAIT_TARGET_LOCAL" = "$promoted_role" ] || [ "$WAIT_TARGET_ANYWHERE" = "$promoted_role" ]; then rsc_op="promote" else rsc_op="start" diff --git a/container_config_scripts/pacemaker_wait_bundle.sh b/container_config_scripts/pacemaker_wait_bundle.sh index
20701fdfc2..c47de1bcb2 100755 --- a/container_config_scripts/pacemaker_wait_bundle.sh +++ b/container_config_scripts/pacemaker_wait_bundle.sh @@ -40,6 +40,10 @@ usage() { exit 1 } +pacemaker_supports_promoted() { + # The Promoted token is only matched in recent pacemaker versions + grep -wq "Promoted" /usr/share/pacemaker/resources-*.rng +} # # Utility functions to detect stuck resources @@ -69,7 +73,7 @@ bundle_running_globally() { local engine=$BUNDLE_CONTAINER_ENGINE # return the number of running bundles replica, i.e. the number of # docker/podman resource replicas currently running in the cluster - crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='ocf::heartbeat:${engine}']/node)" - + crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:heartbeat:${engine}']/node)" - } ocf_failures_globally() { @@ -91,7 +95,7 @@ did_resource_failed_locally() { # pacemaker_remote rather that on the real host, and the # failcounts are thus associated to the pcmk remote. Replace # the host's name with the pcmk remote's name. - remotehost=$(crm_mon --as-xml | xmllint --xpath "string(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource/node[@name='${HOST}']/../../resource[@resource_agent='ocf::pacemaker:remote']/@id)" -) + remotehost=$(crm_mon --as-xml | xmllint --xpath "string(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource/node[@name='${HOST}']/../../resource[@resource_agent='${OCF}:pacemaker:remote']/@id)" -) if [ -n "${remotehost}" ]; then crm_failcount -q -G -r $NAME -N $remotehost | grep -q -w INFINITY return $? 
@@ -118,7 +122,7 @@ did_resource_failed_globally() { if [ "${NAME}" != "${BUNDLE_NAME}" ]; then # we check the state of an ocf resource only if the # pcmkremotes are started - remotecount=$(crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='ocf::pacemaker:remote']/node)" -) + remotecount=$(crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node)" -) if [ "${remotecount}" = "0" ]; then # no pcmkremote is running, so check the bundle state # instead of checking the ocf resource @@ -170,17 +174,26 @@ if [ -z "${ROLE_LOCAL}" ]; then exit 1 fi else - if !(echo "${ROLE_LOCAL}" | grep -q -x -E "(Started|Slave|Master)"); then - echo 2>&1 "Error: argument ROLE_LOCAL must be either 'Started' 'Slave' or 'Master'" + if !(echo "${ROLE_LOCAL}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"); then + echo 2>&1 "Error: argument ROLE_LOCAL must be either 'Started' 'Slave' 'Master' 'Unpromoted' or 'Promoted'" exit 1 fi fi -if [ -n "${ROLE_ANYWHERE}" ] && !(echo "${ROLE_ANYWHERE}" | grep -q -x -E "(Started|Slave|Master)"); then - echo 2>&1 "Error: argument ROLE_ANYWHERE must be either 'Started' 'Slave' or 'Master'" +if [ -n "${ROLE_ANYWHERE}" ] && !(echo "${ROLE_ANYWHERE}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"); then + echo 2>&1 "Error: argument ROLE_ANYWHERE must be either 'Started' 'Slave' 'Master' 'Unpromoted' or 'Promoted'" exit 1 fi +# Ensure compatibility with pacemaker 2.1 +if pacemaker_supports_promoted; then + ROLE_LOCAL=$(echo "$ROLE_LOCAL" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/') + ROLE_ANYWHERE=$(echo "$ROLE_ANYWHERE" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/') + OCF="ocf" +else + OCF="ocf:" +fi + HOST=${5:-$(facter hostname)} TIMEOUT=${6:-__PCMKTIMEOUT__} @@ -194,7 +207,7 @@ TIMEOUT=${6:-__PCMKTIMEOUT__} if [ "${BUNDLE_NAME}" != "${NAME}" ]; then # ocf 
resource -local_resource_xpath="//bundle/replica/resource[@resource_agent='ocf::pacemaker:remote']/node[@name='${HOST}']/../../resource[@id='${NAME}']" +local_resource_xpath="//bundle/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node[@name='${HOST}']/../../resource[@id='${NAME}']" any_resource_xpath="//bundle//resource[@id='${NAME}']" replicas_xpath="//bundle/primitive[@id='${BUNDLE_NAME}']/../*[boolean(@image) and boolean(@replicas)]" else