Serialize shutdown of pacemaker nodes

When running a minor update in a composable HA environment, different roles could run ansible tasks concurrently. However, there is currently a race when pacemaker nodes are stopped in parallel [1,2], that could cause nodes to incorrectly stop themselves once they reconnect to the cluster. To prevent concurrent shutdown, use a cluster-wide lock to signal that one node is about to shut down, and block the others until the node disconnects from the cluster. Tested the minor update in a composable HA environment: . when run with "openstack update run", every role is updated sequentially, and the shutdown lock doesn't interfere. . when running multiple ansible tasks in parallel "openstack update run --limit role<X>", pacemaker nodes are correctly stopped sequentially thanks to the shutdown lock. . when updating an existing overcloud, the new locking script used in the review is correctly injected on the overcloud, thanks to [3]. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1791841 [2] https://bugzilla.redhat.com/show_bug.cgi?id=1872404 [3] I2ac6bb98e1d4183327e888240fc8d5a70e0d6fcb Closes-Bug: #1904193 Change-Id: I0e041c6a95a7f53019967f9263df2326b1408c6f
2020-10-22 12:23:54 +02:00 · 2020-10-22 12:23:54 +02:00 · cb55cc8ce5
commit cb55cc8ce5
parent 93a6f9d4cf
4 changed files with 161 additions and 2 deletions
--- a/container_config_scripts/pacemaker_mutex_shutdown.sh
+++ b/container_config_scripts/pacemaker_mutex_shutdown.sh
@ -0,0 +1,120 @@
+#!/bin/bash
+
+# pacemaker_mutex_shutdown.sh --acquire
+# pacemaker_mutex_shutdown.sh --release
+
+set -u
+
+# Print a short description of the script and of its two actions on stdout,
+# followed by an empty line.
+usage() {
+    printf '%s\n' \
+        "Shutdown a cluster node in a coordinated way across the cluster" \
+        "Usage:" \
+        "   $0 --acquire # prevent other node from shutting down until we hold the lock" \
+        "   $0 --release # release the lock, other node can compete for the shutdown lock" \
+        ""
+}
+
+# Write a message on stdout, prefixed with the current UTC date.
+#   $1 - message to print
+log() {
+    local now
+    now=$(date -u)
+    printf '%s: %s\n' "$now" "$1"
+}
+
+# Write a message on stderr, prefixed with the current UTC date, then
+# terminate the script with exit status 1.
+#   $1 - message to print
+error() {
+    local now
+    now=$(date -u)
+    printf '%s: %s\n' "$now" "$1" >&2
+    exit 1
+}
+
+# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
+shutdown_lock_acquire() {
+    local lockname=$1
+    local requester=$2
+    local ttl=$3
+    local rc=1
+    local current_owner
+    local owner_stopped
+    local owner_rc
+
+    log "Acquiring the shutdown lock"
+    while [ $rc -ne 0 ]; do
+        /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire-once $lockname $requester $ttl
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            if [ $rc -eq 2 ]; then
+                error "Could not acquire the shutdown lock due to unrecoverable error (rc: $rc), bailing out"
+            else
+                # The lock is held by another node.
+                current_owner=$(/var/lib/container-config-scripts/pacemaker_resource_lock.sh --owner $lockname)
+                owner_rc=$?
+                if [ $owner_rc -eq 2 ]; then
+                    error "Could not get the shutdown lock owner due to unrecoverable error (rc: $owner_rc), bailing out"
+                fi
+                if [ $owner_rc -eq 0 ]; then
+                    # If the owner is marked as offline, that means it has shutdown and
+                    # we can clean the lock preemptively and try to acquire it.
+                    owner_stopped=$(crm_mon -1X | xmllint --xpath 'count(//nodes/node[@name="'${current_owner}'" and @online="false" and @unclean="false"])' -)
+                    if [ "${owner_stopped}" = "1" ]; then
+                        log "Shutdown lock held by stopped node '${current_owner}', lock can be released"
+                        /var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $current_owner
+                        continue
+                    fi
+                fi
+                log "Shutdown lock held by another node (rc: $rc), retrying"
+                sleep 10
+            fi
+        fi
+    done
+    log "Shutdown lock acquired"
+    return 0
+}
+
+
+# Release the lock if we still own it. Not owning it anymore is not fatal
+shutdown_lock_release() {
+    local lockname=$1
+    local requester=$2
+    local rc
+
+    log "Releasing the shutdown lock"
+    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $requester
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        if [ $rc -gt 1 ]; then
+            error "Could not release the shutdown lock due to unrecoverable error (rc: $rc), bailing out"
+        else
+            log "Shutdown lock no longer held, nothing to do"
+        fi
+    else
+        log "Shutdown lock released"
+    fi
+    return 0
+}
+
+
+# Default to an empty action: with "set -u" a bare $1 would abort the script
+# with an "unbound variable" error before the check below could report it.
+ACTION=${1:-}
+if [ -z "$ACTION" ]; then
+    error "Action must be specified"
+fi
+
+# Printing the usage does not need a running cluster, so handle it before
+# probing the cluster.
+if [ "$ACTION" = "--help" ]; then
+    usage
+    exit 0
+fi
+
+LOCK_NAME=tripleo-shutdown-lock
+# The lock owner is the local node name, as printed by crm_node
+LOCK_OWNER=$(crm_node -n 2>/dev/null)
+rc=$?
+if [ "$rc" -ne 0 ]; then
+    if [ "$rc" -eq 102 ]; then
+        log "Cluster is not running locally, no need to aquire the shutdown lock"
+        exit 0
+    else
+        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
+    fi
+fi
+
+# We start with a very high TTL, that is long enough to accommodate a cluster stop.
+# As soon as the node goes offline, the other competing nodes will be entitled
+# to steal the lock, so they should never wait that long in practice.
+LOCK_TTL=600
+
+
+case "$ACTION" in
+    --acquire|-a) shutdown_lock_acquire "${LOCK_NAME}" "${LOCK_OWNER}" "${LOCK_TTL}";;
+    --release|-r) shutdown_lock_release "${LOCK_NAME}" "${LOCK_OWNER}";;
+    *) error "Invalid action";;
+esac
+exit $?
--- a/container_config_scripts/pacemaker_resource_lock.sh
+++ b/container_config_scripts/pacemaker_resource_lock.sh
@ -213,6 +213,29 @@ lock_release() {
 }


+# Retrieve the owner of a lock from the CIB
+# this is a read-only operation, so no need to log debug info
+#
+# Arguments:
+#   $1 - name of the lock
+# Returns:
+#   2 when lock_get fails with a status other than CIB_ENOTFOUND
+#   1 when lock_get produced no output
+#   0 otherwise, after calling lock_owner on the lock
+lock_get_owner() {
+    local lockname=$1
+    local rc
+    local lock
+    local owner
+
+    lock=$(lock_get $lockname)
+    rc=$?
+
+    # Any lock_get failure other than "not found" is reported as rc 2
+    if [ $rc -ne 0 ] && [ $rc -ne $CIB_ENOTFOUND ]; then
+        return 2
+    fi
+
+    # No lock content: report rc 1
+    if [ -z "$lock" ]; then
+        return 1
+    fi
+
+    lock_owner $lock
+    return 0
+}
+
+
 ACTION=$1
 LOCKNAME=$2
 REQUESTER=$3
@ -223,8 +246,13 @@ if [ -z "$ACTION" ]; then
 fi

 if [ $ACTION != "--help" ]; then
-    if [ -z "$LOCKNAME" ] || [ -z "$REQUESTER" ]; then
-        error "You must specific a lock name and a requester"
+    if [ -z "$LOCKNAME" ]; then
+        error "You must specific a lock name"
+    fi
+    if [ $ACTION != "--owner" ] && [ $ACTION != "-o" ]; then
+        if [ -z "$REQUESTER" ]; then
+            error "You must specific a lock requester"
+        fi
    fi
 fi

@ -232,6 +260,8 @@ case $ACTION in
    --help) usage; exit 0;;
    --acquire|-a) try_action lock_acquire $LOCKNAME $REQUESTER $TTL;;
    --release|-r) try_action lock_release $LOCKNAME $REQUESTER;;
+    --acquire-once|-A) lock_acquire $LOCKNAME $REQUESTER $TTL;;
+    --owner|-o) lock_get_owner $LOCKNAME;;
    *) error "Invalid action";;
 esac
 exit $?
--- a/deployment/containers-common.yaml
+++ b/deployment/containers-common.yaml
@ -127,6 +127,9 @@ outputs:
      pacemaker_mutex_restart_bundle.sh:
        mode: "0755"
        content: { get_file: ../container_config_scripts/pacemaker_mutex_restart_bundle.sh }
+      pacemaker_mutex_shutdown.sh:
+        mode: "0755"
+        content: { get_file: ../container_config_scripts/pacemaker_mutex_shutdown.sh }

  volumes_base:
    description: Base volume list
--- a/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
+++ b/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
@ -370,9 +370,15 @@ outputs:
                    echo "Could not retrieve and clear location constraint for VIP $v" 2>&1
                fi
            done
+        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
+          when: step|int == 1
+          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
+        - name: Release the cluster shutdown lock
+          when: step|int == 4
+          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release