From cb55cc8ce538f3e075754b0328872ec075f4cde9 Mon Sep 17 00:00:00 2001
From: Damien Ciabrini
Date: Thu, 22 Oct 2020 12:23:54 +0200
Subject: [PATCH] Serialize shutdown of pacemaker nodes

When running a minor update in a composable HA deployment, different
roles can run their ansible tasks concurrently. However, there is
currently a race when pacemaker nodes are stopped in parallel [1,2],
which can cause nodes to incorrectly stop themselves once they
reconnect to the cluster.

To prevent concurrent shutdown, use a cluster-wide lock to signal
that one node is about to shut down, and block the others until that
node disconnects from the cluster.

Tested the minor update in a composable HA environment:
  . when run with "openstack update run", every role is updated
    sequentially, and the shutdown lock doesn't interfere.
  . when running multiple ansible tasks in parallel with
    "openstack update run --limit role", pacemaker nodes are correctly
    stopped sequentially thanks to the shutdown lock.
  . when updating an existing overcloud, the new locking script used
    in the review is correctly injected on the overcloud, thanks
    to [3].

[1] https://bugzilla.redhat.com/show_bug.cgi?id=1791841
[2] https://bugzilla.redhat.com/show_bug.cgi?id=1872404
[3] I2ac6bb98e1d4183327e888240fc8d5a70e0d6fcb

Closes-Bug: #1904193
Change-Id: I0e041c6a95a7f53019967f9263df2326b1408c6f
---
 .../pacemaker_mutex_shutdown.sh               | 120 ++++++++++++++++++
 .../pacemaker_resource_lock.sh                |  34 ++++-
 deployment/containers-common.yaml             |   3 +
 .../pacemaker/pacemaker-baremetal-puppet.yaml |   6 +
 4 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100755 container_config_scripts/pacemaker_mutex_shutdown.sh

diff --git a/container_config_scripts/pacemaker_mutex_shutdown.sh b/container_config_scripts/pacemaker_mutex_shutdown.sh
new file mode 100755
index 0000000000..9de8f3a90c
--- /dev/null
+++ b/container_config_scripts/pacemaker_mutex_shutdown.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+# pacemaker_mutex_shutdown.sh --acquire
+# pacemaker_mutex_shutdown.sh --release
+
+set -u
+
+usage() {
+    echo "Shutdown a cluster node in a coordinated way across the cluster"
+    echo "Usage:"
+    echo "   $0 --acquire   # prevent other nodes from shutting down until we hold the lock"
+    echo "   $0 --release   # release the lock; other nodes can then compete for the shutdown lock"
+    echo
+}
+
+log() {
+    echo "$(date -u): $1"
+}
+
+error() {
+    echo "$(date -u): $1" 1>&2
+    exit 1
+}
+
+# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
+shutdown_lock_acquire() {
+    local lockname=$1
+    local requester=$2
+    local ttl=$3
+    local rc=1
+    local current_owner
+    local owner_stopped
+    local owner_rc
+
+    log "Acquiring the shutdown lock"
+    while [ $rc -ne 0 ]; do
+        /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire-once $lockname $requester $ttl
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            if [ $rc -eq 2 ]; then
+                error "Could not acquire the shutdown lock due to unrecoverable error (rc: $rc), bailing out"
+            else
+                # The lock is held by another node.
+                current_owner=$(/var/lib/container-config-scripts/pacemaker_resource_lock.sh --owner $lockname)
+                owner_rc=$?
+                if [ $owner_rc -eq 2 ]; then
+                    error "Could not get the shutdown lock owner due to unrecoverable error (rc: $owner_rc), bailing out"
+                fi
+                if [ $owner_rc -eq 0 ]; then
+                    # If the owner is marked as offline, that means it has shut down and
+                    # we can clean the lock preemptively and try to acquire it.
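+                    # crm_mon -1X dumps the cluster status as XML; the XPath count() below is 1
+                    # only when the owner node is reported cleanly offline (online="false", unclean="false").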
+                    owner_stopped=$(crm_mon -1X | xmllint --xpath 'count(//nodes/node[@name="'${current_owner}'" and @online="false" and @unclean="false"])' -)
+                    if [ "${owner_stopped}" = "1" ]; then
+                        log "Shutdown lock held by stopped node '${current_owner}', lock can be released"
+                        /var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $current_owner
+                        continue
+                    fi
+                fi
+                log "Shutdown lock held by another node (rc: $rc), retrying"
+                sleep 10
+            fi
+        fi
+    done
+    log "Shutdown lock acquired"
+    return 0
+}
+
+
+# Release the lock if we still own it. Not owning it anymore is not fatal
+shutdown_lock_release() {
+    local lockname=$1
+    local requester=$2
+    local rc
+
+    log "Releasing the shutdown lock"
+    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $lockname $requester
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        if [ $rc -gt 1 ]; then
+            error "Could not release the shutdown lock due to unrecoverable error (rc: $rc), bailing out"
+        else
+            log "Shutdown lock no longer held, nothing to do"
+        fi
+    else
+        log "Shutdown lock released"
+    fi
+    return 0
+}
+
+
+ACTION=${1:-}
+if [ -z "$ACTION" ]; then
+    error "Action must be specified"
+fi
+
+LOCK_NAME=tripleo-shutdown-lock
+LOCK_OWNER=$(crm_node -n 2>/dev/null)
+rc=$?
+if [ $rc -ne 0 ]; then
+    if [ $rc -eq 102 ]; then
+        log "Cluster is not running locally, no need to acquire the shutdown lock"
+        exit 0
+    else
+        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
+    fi
+fi
+
+# We start with a very high TTL, long enough to accommodate a cluster stop.
+# As soon as the node goes offline, the other competing nodes are entitled
+# to steal the lock, so they should never wait that long in practice.
+LOCK_TTL=600
+
+
+case $ACTION in
+    --help) usage; exit 0;;
+    --acquire|-a) shutdown_lock_acquire ${LOCK_NAME} ${LOCK_OWNER} ${LOCK_TTL};;
+    --release|-r) shutdown_lock_release ${LOCK_NAME} ${LOCK_OWNER};;
+    *) error "Invalid action";;
+esac
+exit $?
diff --git a/container_config_scripts/pacemaker_resource_lock.sh b/container_config_scripts/pacemaker_resource_lock.sh
index eb2b36162b..fc4bf91250 100755
--- a/container_config_scripts/pacemaker_resource_lock.sh
+++ b/container_config_scripts/pacemaker_resource_lock.sh
@@ -213,6 +213,29 @@ lock_release() {
 }
 
 
+# Retrieve the owner of a lock from the CIB
+# this is a read-only operation, so no need to log debug info
+lock_get_owner() {
+    local lockname=$1
+    local rc
+    local lock
+    local owner
+
+    lock=$(lock_get $lockname)
+    rc=$?
+    if [ $rc -ne 0 ] && [ $rc -ne $CIB_ENOTFOUND ]; then
+        return 2
+    fi
+
+    if [ -z "$lock" ]; then
+        return 1
+    else
+        lock_owner $lock
+        return 0
+    fi
+}
+
+
 ACTION=$1
 LOCKNAME=$2
 REQUESTER=$3
@@ -223,8 +246,13 @@ if [ -z "$ACTION" ]; then
 fi
 
 if [ $ACTION != "--help" ]; then
-    if [ -z "$LOCKNAME" ] || [ -z "$REQUESTER" ]; then
-        error "You must specific a lock name and a requester"
+    if [ -z "$LOCKNAME" ]; then
+        error "You must specify a lock name"
+    fi
+    if [ $ACTION != "--owner" ] && [ $ACTION != "-o" ]; then
+        if [ -z "$REQUESTER" ]; then
+            error "You must specify a lock requester"
+        fi
     fi
 fi
 
@@ -232,6 +260,8 @@ case $ACTION in
     --help) usage; exit 0;;
     --acquire|-a) try_action lock_acquire $LOCKNAME $REQUESTER $TTL;;
     --release|-r) try_action lock_release $LOCKNAME $REQUESTER;;
+    --acquire-once|-A) lock_acquire $LOCKNAME $REQUESTER $TTL;;
+    --owner|-o) lock_get_owner $LOCKNAME;;
     *) error "Invalid action";;
 esac
 exit $?
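
Note for reviewers: a minimal sketch of the return-code contract that
pacemaker_mutex_shutdown.sh relies on when driving the lock helper above
(0 = lock acquired/released, 2 = unrecoverable CIB error, any other
non-zero = lock currently held or not found). The controller-0/controller-1
node names are illustrative only:

    # on controller-0: take the lock once, without retrying internally
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire-once tripleo-shutdown-lock controller-0 600
    echo $?    # 0: lock acquired with a 600s TTL

    # on controller-1: the same call now returns non-zero, so the caller asks who owns the lock
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire-once tripleo-shutdown-lock controller-1 600
    echo $?    # non-zero (and not 2): lock already held, retry later
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --owner tripleo-shutdown-lock
    # prints "controller-0"; rc 1 means no lock exists, rc 2 means a CIB error and the caller bails out
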
diff --git a/deployment/containers-common.yaml b/deployment/containers-common.yaml
index 8b119b8eb4..cb3ae68286 100644
--- a/deployment/containers-common.yaml
+++ b/deployment/containers-common.yaml
@@ -127,6 +127,9 @@ outputs:
       pacemaker_mutex_restart_bundle.sh:
         mode: "0755"
         content: { get_file: ../container_config_scripts/pacemaker_mutex_restart_bundle.sh }
+      pacemaker_mutex_shutdown.sh:
+        mode: "0755"
+        content: { get_file: ../container_config_scripts/pacemaker_mutex_shutdown.sh }
 
   volumes_base:
     description: Base volume list
diff --git a/deployment/pacemaker/pacemaker-baremetal-puppet.yaml b/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
index b7c5624cd1..447706960e 100644
--- a/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
+++ b/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
@@ -370,9 +370,15 @@ outputs:
                 echo "Could not retrieve and clear location constraint for VIP $v" 2>&1
               fi
             done
+        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
+          when: step|int == 1
+          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
         - name: Stop pacemaker cluster
           when: step|int == 1
           pacemaker_cluster: state=offline
         - name: Start pacemaker cluster
           when: step|int == 4
           pacemaker_cluster: state=online
+        - name: Release the cluster shutdown lock
+          when: step|int == 4
+          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release
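
For context, a rough sketch of the sequence each pacemaker node now follows
during a minor update, assuming the scripts are deployed under
/var/lib/container-config-scripts on the overcloud nodes. The pcs commands
stand in for what the pacemaker_cluster Ansible module does; they are
illustrative, not part of this change:

    # step 1 update_tasks: block until this node owns the cluster-wide shutdown
    # lock, then leave the cluster; other nodes stay blocked in --acquire until
    # this node is seen offline by the cluster
    systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
    pcs cluster stop     # equivalent of pacemaker_cluster: state=offline

    # ... intermediate steps: packages and containers are updated while the node
    # is out of the cluster ...

    # step 4 update_tasks: rejoin the cluster, then let the next node compete for the lock
    pcs cluster start    # equivalent of pacemaker_cluster: state=online
    systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release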