Rolling certificate update for HA services

(manually squashed the subsequent fix [1] into a single commit) (also manually squashed [2] because of #1906505) There are certain HA clustered services (e.g. galera) that don't have the ability natively to reload their TLS certificate without being restarted. If too many replicas are restarted concurrently this might result in full service disruption. To ensure service availability, provide a means to ensure that only one service replica is restarted at a time in the cluster. This works by using pacemaker's CIB to implement a cluster-wide restart lock for a service. The lock has a TTL so it's guaranteed to be eventually released without requiring complex contingency cleanup in case of failures. Tested locally by running the following: 1. force recreate certificate on all nodes at once for galera (ipa-cert resubmit -i mysql), and verify that the resources restart one after the other 2. create a lock manually in pacemaker, recreate certificate for galera on all nodes, and verify that no resource is restarted before the manually created lock expires. 3. create a lock manually, let it expires, recreate a certificate, and verify that the resource is restarted appropriately and the lock gets cleaned up from pacemaker once the restart finished. [1] Id10f026c8b31cad7b7313ac9427a99b3e6744788 [2] I17f1364932e43b8487515084e41b525e186888db Related-Bug: #1904193 Closes-Bug: #1885113 Change-Id: Ib2b62e33b34cf72edfdae6299cf432259bf960a2 (cherry picked from commit 0f54889408) (cherry picked from commit c8f5fdfc36) (cherry picked from commit 8b16911cc2) (cherry picked from commit 8968c7efd6)
2020-06-23 11:22:45 +02:00 · 2020-06-23 11:22:45 +02:00 · 916b9385c6
parent a4b690800c
commit 916b9385c6
4 changed files with 342 additions and 0 deletions
--- a/container_config_scripts/pacemaker_mutex_restart_bundle.sh
+++ b/container_config_scripts/pacemaker_mutex_restart_bundle.sh
@ -0,0 +1,90 @@
+#!/bin/bash
+
+# pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
+# pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master
+
+set -u
+
+usage() {
+    echo "Restart a clustered resource in a coordinated way across the cluster"
+    echo "Usage:"
+    echo "   $0 --lock <tripleo-service> <pcmk-resource> <pcmk-bundle> <target-state-local> <target-state-cluster>"
+    echo
+}
+
+log() {
+    echo "$(date -u): $1"
+}
+
+error() {
+    echo "$(date -u): $1" 1>&2
+    exit 1
+}
+
+ACTION=$1
+case $ACTION in
+    --help) usage; exit 0;;
+    --lock) ;;
+    *) error "Unknown action '$ACTION'";;
+esac
+
+TRIPLEO_SERVICE=$2
+LOCK_NAME=${TRIPLEO_SERVICE}-restart-lock
+LOCK_OWNER=$(crm_node -n 2>/dev/null)
+rc=$?
+if [ $rc -ne 0 ]; then
+    if [ $rc -eq 102 ]; then
+        log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
+        exit 0
+    else
+        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
+    fi
+fi
+
+RESOURCE_NAME=$3
+BUNDLE_NAME=$4
+WAIT_TARGET_LOCAL=$5
+WAIT_TARGET_ANYWHERE=${6:-_}
+
+# The lock TTL should accomodate for the resource start/promote timeout
+if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then
+    if [ "$WAIT_TARGET_LOCAL" = "Master" ] || [ "$WAIT_TARGET_ANYWHERE" = "Master" ]; then
+        rsc_op="promote"
+    else
+        rsc_op="start"
+    fi
+    # <op id="galera-promote-interval-0s" interval="0s" name="promote" on-fail="block" timeout="300s"/>
+    PCMK_TTL=$(cibadmin -Q | xmllint -xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" - | sed 's/s$//')
+    LOCK_TTL=$((PCMK_TTL + 30))
+else
+    # The podman RA's default start timeout
+    LOCK_TTL=90
+fi
+
+log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
+# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
+rc=1
+while [ $rc -ne 0 ]; do
+    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire $LOCK_NAME $LOCK_OWNER $LOCK_TTL
+    rc=$?
+    if [ $rc != 0 ]; then
+        if [ $rc -gt 1 ]; then
+            error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
+        else
+            log "Could not acquire lock, retrying"
+            sleep 10
+        fi
+    fi
+done
+
+log "Restart the service $TRIPLEO_SERVICE locally"
+# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
+TRIPLEO_MINOR_UPDATE=true /var/lib/container-config-scripts/pacemaker_restart_bundle.sh $TRIPLEO_SERVICE $RESOURCE_NAME $BUNDLE_NAME $WAIT_TARGET_LOCAL $WAIT_TARGET_ANYWHERE
+
+# If we reached this point, always try to release the lock
+log "Release the restart lock for service $TRIPLEO_SERVICE"
+/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release $LOCK_NAME $LOCK_OWNER
+rc=$?
+if [ $rc -ne 0 ] && [ $rc -ne 1 ]; then
+    error "Could not release held lock (rc: $rc)"
+fi
--- a/container_config_scripts/pacemaker_resource_lock.sh
+++ b/container_config_scripts/pacemaker_resource_lock.sh
@ -0,0 +1,237 @@
+#!/bin/bash
+
+MAX_RETRIES=10
+CIB_ENOTFOUND=105
+
+usage() {
+   echo "Set a global property in the cluster with a validity timestamp."
+   echo "Usage:"
+   echo "   $0 --acquire <lock_name> <lock_owner> <lock_ttl_in_seconds>"
+   echo "   $0 --release <lock_name> <lock_owner>"
+   echo
+}
+
+log() {
+    echo "$(date -u): $1" 1>&2
+}
+
+error() {
+    echo "$(date -u): $1" 1>&2
+    exit 1
+}
+
+lock_create() {
+    local name=$1
+    local data=$2
+    # cibadmin won't overwrite a key if someone else succeeded to create it concurrently
+    cibadmin --sync-call --scope crm_config --create --xml-text "<cluster_property_set id='${name}'><nvpair id='${name}-pair' name='${name}' value='${data}'/></cluster_property_set>" &>/dev/null
+    return $?
+}
+
+lock_update() {
+    local name=$1
+    local expected_data=$2
+    local new_data=$3
+    # we only update the lock we expect to see, so we can't update someone else's lock
+    cibadmin --sync-call --scope crm_config --modify --xpath "//cluster_property_set/nvpair[@name='${name}' and @value='${expected_data}']/.." --xml-text "<nvpair id='${name}-pair' name='${name}' value='${new_data}'/>" &>/dev/null
+    return $?
+}
+
+lock_delete() {
+    local name=$1
+    local expected_data=$2
+    # we only delete the lock we expect to see, so we can't delete someone else's lock
+    cibadmin --sync-call --scope crm_config --delete --xpath "//cluster_property_set/nvpair[@name='${name}' and @value='${expected_data}']/.." &>/dev/null
+    return $?
+}
+
+lock_get() {
+    local lockname=$1
+    local res
+    local rc
+    res=$(cibadmin --query --scope crm_config --xpath "//cluster_property_set/nvpair[@name='$lockname']" 2>/dev/null)
+    rc=$?
+    if [ $rc -eq 0 ]; then
+        echo "$res" | sed -n 's/.*value="\([^"]*\)".*/\1/p'
+    fi
+    return $rc
+}
+
+lock_owner() {
+    local lock=$1
+    echo "$lock" | cut -d':' -f1
+}
+
+lock_has_expired() {
+    local lock=$1
+    local expiry=$(echo "$lock" | cut -d':' -f2)
+    local now=$(date +%s)
+    test $now -ge $expiry
+}
+
+
+# Perform a lock action and restart if the CIB has been modified before
+# committing the lock action
+try_action() {
+    local fun=$1
+    local lock=$2
+    local requester=$3
+    local args=${4:-}
+    local tries=$MAX_RETRIES
+    local rc=1
+    if [ "$fun" = "lock_acquire" ] || [ "$fun" = "lock_release" ]; then
+        log "Try running $fun"
+    else
+        return 2
+    fi
+    while [ $rc -ne 0 ]; do
+        $fun $lock $requester $args
+        rc=$?
+        if [ $rc -eq 0 ]; then
+            log "Operation $1 succeeded"
+            return 0
+        elif [ $rc -eq 3 ]; then
+            # rc == 3 -> CIB changed before push
+            if [ $tries -eq 0 ]; then
+                log "Failed to commit after $MAX_RETRIES retries. Bailing out."
+                return 2
+            else
+                log "Failed to commit. Retrying operation."
+                tries=$(($tries - 1))
+            fi
+        elif [ $rc -eq 2 ]; then
+            # rc == 2 -> unrecoverable cib error (e.g. pacemaker down)
+            log "Unexpected failure. Bailing out"
+            return $rc
+        else
+            # rc == 1 -> lock error (not owner, lock doesn't exists)
+            return $rc
+        fi
+    done
+}
+
+# The lock mechanism uses cibadmin's atomic creation so cluster-wide
+# state coherency is guaranteed by pacemaker
+lock_acquire() {
+    local lockname=$1
+    local requester=$2
+    local ttl=$3
+    local rc
+    local lock
+    local expiry
+    local owner
+
+    log "Check whether the lock is already held in the CIB"
+    lock=$(lock_get $lockname)
+    rc=$?
+    if [ $rc -ne 0 ] && [ $rc -ne $CIB_ENOTFOUND ]; then
+        log "Could not retrieve info from the CIB"
+        return 2
+    fi
+
+    if [ -n "$lock" ]; then
+        lock_has_expired $lock
+        rc=$?
+        if [ $rc -eq 0 ]; then
+            log "Lock has expired, now available for being held"
+        else
+            # lock is still held. check whether we're the owner
+            owner=$(lock_owner $lock)
+            if [ "$owner" = "$requester" ];then
+                log "Requester already owns the lock, acquiring attempt will just reconfigure the TTL"
+            else
+                log "Lock is held by someone else ($owner)"
+                return 1
+            fi
+        fi
+    else
+        log "Lock is not held yet"
+    fi
+
+    # prepare the lock info
+    expiry=$(($(date +%s) + $ttl))
+
+    if [ -n "$lock" ]; then
+        log "Attempting to update the lock"
+        lock_update $lockname "$lock" "$requester:$expiry"
+        rc=$?
+    else
+        log "Attempting to acquire the lock"
+        lock_create $lockname "$requester:$expiry"
+        rc=$?
+    fi
+
+    if [ $rc -eq 0 ]; then
+        log "Lock '$lockname' acquired by '$requester', valid until $(date -d @$expiry)"
+        return 0
+    else
+        log "CIB changed, lock cannot be acquired"
+        return 3
+    fi
+}
+
+
+# The lock mechanism uses the CIB's num_updates tag to implement
+# a conditional store. Cluster-wide locking is guaranteed by pacemaker
+lock_release() {
+    local lockname=$1
+    local requester=$2
+    local rc
+    local lock
+    local owner
+
+    log "Check whether the lock is already held in the CIB"
+    lock=$(lock_get $lockname)
+    rc=$?
+    if [ $rc -ne 0 ] && [ $rc -ne $CIB_ENOTFOUND ]; then
+        log "Could not retrieve info from the CIB"
+        return 2
+    fi
+
+    if [ -z "$lock" ]; then
+        log "Lock doesn't exist. Nothing to release"
+        return 0
+    else
+        log "Lock exists, check whether we're the owner"
+        owner=$(lock_owner $lock)
+        if [ "$owner" != "$requester" ];then
+            log "Lock is held by someone else ($owner), will not unlock"
+            return 1
+        fi
+    fi
+
+    lock_delete $lockname "$lock"
+    rc=$?
+
+    if [ $rc -eq 0 ]; then
+        log "Lock '$lockname' released by '$requester'"
+        return 0
+    else
+        log "CIB deletion error, lock cannot be released"
+        return 3
+    fi
+}
+
+
+ACTION=$1
+LOCKNAME=$2
+REQUESTER=$3
+TTL=${4:-60}
+
+if [ -z "$ACTION" ]; then
+    error "Action must be specified"
+fi
+
+if [ $ACTION != "--help" ]; then
+    if [ -z "$LOCKNAME" ] || [ -z "$REQUESTER" ]; then
+        error "You must specific a lock name and a requester"
+    fi
+fi
+
+case $ACTION in
+    --help) usage; exit 0;;
+    --acquire|-a) try_action lock_acquire $LOCKNAME $REQUESTER $TTL;;
+    --release|-r) try_action lock_release $LOCKNAME $REQUESTER;;
+    *) error "Invalid action";;
+esac
+exit $?
--- a/deployment/certs/certmonger-user-baremetal-puppet.yaml
+++ b/deployment/certs/certmonger-user-baremetal-puppet.yaml
@ -72,3 +72,12 @@ outputs:
            - {}
      step_config: |
        include ::tripleo::profile::base::certmonger_user
+      host_prep_tasks:
+        - name: create certificate rotation script for HA services
+          copy:
+            dest: /usr/bin/certmonger-ha-resource-refresh.sh
+            setype: certmonger_unconfined_exec_t
+            mode: "0700"
+            content: |
+              #!/bin/bash
+              /var/lib/container-config-scripts/pacemaker_mutex_restart_bundle.sh --lock $* 2>&1 | logger -t certmonger
--- a/deployment/containers-common.yaml
+++ b/deployment/containers-common.yaml
@ -121,6 +121,12 @@ outputs:
      wait-port-and-run.sh:
        mode: "0755"
        content: { get_file: ../container_config_scripts/wait-port-and-run.sh }
+      pacemaker_resource_lock.sh:
+        mode: "0755"
+        content: { get_file: ../container_config_scripts/pacemaker_resource_lock.sh }
+      pacemaker_mutex_restart_bundle.sh:
+        mode: "0755"
+        content: { get_file: ../container_config_scripts/pacemaker_mutex_restart_bundle.sh }

  volumes_base:
    description: Base volume list