Merge "Rolling certificate update for HA services" into stable/train
This commit is contained in:
commit
c769a503b1
90
container_config_scripts/pacemaker_mutex_restart_bundle.sh
Executable file
90
container_config_scripts/pacemaker_mutex_restart_bundle.sh
Executable file
@ -0,0 +1,90 @@
|
||||
#!/bin/bash
|
||||
|
||||
# pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
|
||||
# pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master
|
||||
|
||||
set -u
|
||||
|
||||
# Print a one-line description of the script followed by its command-line
# synopsis and a trailing blank line. Everything goes to stdout.
usage() {
    printf '%s\n' \
        "Restart a clustered resource in a coordinated way across the cluster" \
        "Usage:" \
        " $0 --lock <tripleo-service> <pcmk-resource> <pcmk-bundle> <target-state-local> <target-state-cluster>" \
        ""
}
|
||||
|
||||
# Write an informational message to stdout, prefixed with the current UTC date.
# Arguments: $1 - message text
log() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1"
}
|
||||
|
||||
# Write a message to stderr, prefixed with the current UTC date, then
# terminate the script with exit status 1.
# Arguments: $1 - message text
error() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1" >&2
    exit 1
}
|
||||
|
||||
# Main flow: validate the arguments, take the cluster-wide restart lock for
# the service, restart the resource locally, then release the lock.

# `set -u` is active: read positionals through ${N:-} so that a missing
# argument yields a usable message instead of bash's "unbound variable" abort.
ACTION=${1:-}
case "$ACTION" in
    --help) usage; exit 0;;
    --lock) ;;
    *) error "Unknown action '$ACTION'";;
esac

TRIPLEO_SERVICE=${2:-}
if [ -z "$TRIPLEO_SERVICE" ]; then
    usage 1>&2
    error "Missing <tripleo-service> argument"
fi
LOCK_NAME=${TRIPLEO_SERVICE}-restart-lock
# The name reported by crm_node for the local node is used as the lock owner
LOCK_OWNER=$(crm_node -n 2>/dev/null)
rc=$?
if [ "$rc" -ne 0 ]; then
    if [ "$rc" -eq 102 ]; then
        log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
        exit 0
    else
        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
    fi
fi

# The remaining arguments are only needed once we know the cluster is up
if [ $# -lt 5 ]; then
    usage 1>&2
    error "Missing arguments: <pcmk-resource> <pcmk-bundle> <target-state-local> are required"
fi
RESOURCE_NAME=$3
BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
WAIT_TARGET_ANYWHERE=${6:-_}

# The lock TTL should accommodate for the resource start/promote timeout
if [ "$RESOURCE_NAME" != "$BUNDLE_NAME" ]; then
    if [ "$WAIT_TARGET_LOCAL" = "Master" ] || [ "$WAIT_TARGET_ANYWHERE" = "Master" ]; then
        rsc_op="promote"
    else
        rsc_op="start"
    fi
    # <op id="galera-promote-interval-0s" interval="0s" name="promote" on-fail="block" timeout="300s"/>
    PCMK_TTL=$(cibadmin -Q | xmllint -xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" - | sed 's/s$//')
    case "$PCMK_TTL" in
        '')
            # No timeout found in the CIB: keep the historical behaviour
            # (only the 30s margin is used), but make it visible in the logs
            log "No ${rsc_op} timeout found for resource $RESOURCE_NAME, using the 30s margin only"
            PCMK_TTL=0
            ;;
        *[!0-9]*)
            # Anything but plain seconds would break the arithmetic below
            error "Unsupported ${rsc_op} timeout '$PCMK_TTL' for resource $RESOURCE_NAME"
            ;;
    esac
    # 10# forces base 10 so a zero-padded timeout is not parsed as octal
    LOCK_TTL=$((10#$PCMK_TTL + 30))
else
    # The podman RA's default start timeout
    LOCK_TTL=90
fi

log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
# Loop until we hold the lock. The lock has a TTL, so we're guaranteed to get it eventually
rc=1
while [ "$rc" -ne 0 ]; do
    /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire "$LOCK_NAME" "$LOCK_OWNER" "$LOCK_TTL"
    rc=$?
    if [ "$rc" -gt 1 ]; then
        error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
    elif [ "$rc" -eq 1 ]; then
        log "Could not acquire lock, retrying"
        sleep 10
    fi
done

log "Restart the service $TRIPLEO_SERVICE locally"
# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
TRIPLEO_MINOR_UPDATE=true /var/lib/container-config-scripts/pacemaker_restart_bundle.sh "$TRIPLEO_SERVICE" "$RESOURCE_NAME" "$BUNDLE_NAME" "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE"
# Remember the outcome: the lock is released first, the failure is reported last
restart_rc=$?

# If we reached this point, always try to release the lock
log "Release the restart lock for service $TRIPLEO_SERVICE"
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release "$LOCK_NAME" "$LOCK_OWNER"
rc=$?
# rc 1 is tolerated here (only statuses other than 0 and 1 are fatal)
if [ "$rc" -ne 0 ] && [ "$rc" -ne 1 ]; then
    error "Could not release held lock (rc: $rc)"
fi

# Do not report success when the restart itself failed
if [ "$restart_rc" -ne 0 ]; then
    error "Restart of service $TRIPLEO_SERVICE failed (rc: $restart_rc)"
fi
|
237
container_config_scripts/pacemaker_resource_lock.sh
Executable file
237
container_config_scripts/pacemaker_resource_lock.sh
Executable file
@ -0,0 +1,237 @@
|
||||
#!/bin/bash
|
||||
|
||||
MAX_RETRIES=10
|
||||
CIB_ENOTFOUND=105
|
||||
|
||||
# Print the purpose of the script and the synopsis of its two actions,
# followed by a blank line. Everything goes to stdout.
usage() {
    cat <<EOF
Set a global property in the cluster with a validity timestamp.
Usage:
 $0 --acquire <lock_name> <lock_owner> <lock_ttl_in_seconds>
 $0 --release <lock_name> <lock_owner>

EOF
}
|
||||
|
||||
# Write a message to stderr, prefixed with the current UTC date.
# stdout is left untouched so that functions can return data through it.
# Arguments: $1 - message text
log() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1" >&2
}
|
||||
|
||||
# Write a message to stderr, prefixed with the current UTC date, then
# terminate the script with exit status 1.
# Arguments: $1 - message text
error() {
    local stamp
    stamp=$(date -u)
    printf '%s: %s\n' "$stamp" "$1" >&2
    exit 1
}
|
||||
|
||||
# Create the lock as a new cluster_property_set in the crm_config section
# of the CIB.
# Arguments:
#   $1 - lock name (used as property set id and nvpair name)
#   $2 - lock data stored as the nvpair value
# Returns: the exit status of cibadmin; all cibadmin output is discarded.
lock_create() {
    local name=$1
    local data=$2
    local payload="<cluster_property_set id='${name}'><nvpair id='${name}-pair' name='${name}' value='${data}'/></cluster_property_set>"

    # cibadmin won't overwrite a key if someone else succeeded to create it concurrently
    cibadmin --sync-call --scope crm_config --create --xml-text "$payload" &>/dev/null
}
|
||||
|
||||
# Replace the value of an existing lock, but only if the lock currently
# holds the value the caller expects.
# Arguments:
#   $1 - lock name
#   $2 - value the lock is expected to hold right now
#   $3 - new value to store
# Returns: the exit status of cibadmin; all cibadmin output is discarded.
lock_update() {
    local name=$1
    local expected_data=$2
    local new_data=$3
    # we only update the lock we expect to see, so we can't update someone else's lock
    local selector="//cluster_property_set/nvpair[@name='${name}' and @value='${expected_data}']/.."
    local replacement="<nvpair id='${name}-pair' name='${name}' value='${new_data}'/>"

    cibadmin --sync-call --scope crm_config --modify --xpath "$selector" --xml-text "$replacement" &>/dev/null
}
|
||||
|
||||
# Delete a lock from the CIB, but only if the lock currently holds the
# value the caller expects.
# Arguments:
#   $1 - lock name
#   $2 - value the lock is expected to hold right now
# Returns: the exit status of cibadmin; all cibadmin output is discarded.
lock_delete() {
    local name=$1
    local expected_data=$2
    # we only delete the lock we expect to see, so we can't delete someone else's lock
    local selector="//cluster_property_set/nvpair[@name='${name}' and @value='${expected_data}']/.."

    cibadmin --sync-call --scope crm_config --delete --xpath "$selector" &>/dev/null
}
|
||||
|
||||
# Look up a lock in the crm_config section of the CIB.
# Arguments: $1 - lock name
# Outputs:   the content of the nvpair's value="..." attribute on stdout
#            when the query succeeds
# Returns:   0 when the query succeeds, otherwise the exit status of
#            cibadmin (nothing is printed in that case)
lock_get() {
    local lockname=$1
    local xml

    # On failure, hand cibadmin's status back to the caller untouched
    xml=$(cibadmin --query --scope crm_config --xpath "//cluster_property_set/nvpair[@name='$lockname']" 2>/dev/null) || return

    echo "$xml" | sed -n 's/.*value="\([^"]*\)".*/\1/p'
    return 0
}
|
||||
|
||||
# Extract the owner from a lock value of the form "<owner>:<expiry>".
# Arguments: $1 - lock value
# Outputs:   everything before the first ':' (the whole value when it
#            contains no ':'), followed by a newline
lock_owner() {
    local lock=$1
    # Parameter expansion instead of `echo | cut`: no fork, and no risk of
    # echo interpreting the value as one of its own options
    printf '%s\n' "${lock%%:*}"
}
|
||||
|
||||
# Tell whether a lock value of the form "<owner>:<expiry>" has expired.
# <expiry> is compared against the current time in seconds since the epoch.
# Arguments: $1 - lock value
# Returns:   0 if the lock has expired, non-zero if it is still valid.
#            A lock whose expiry is missing or not a plain number carries no
#            usable TTL and is reported as expired, so that a malformed lock
#            cannot stay held forever.
lock_has_expired() {
    local lock=$1
    local expiry
    local now

    # Second ':'-separated field (same field `cut -d: -f2` would select)
    expiry=${lock#*:}
    expiry=${expiry%%:*}

    case "$expiry" in
        ''|*[!0-9]*) return 0;;
    esac

    now=$(date +%s)
    [ "$now" -ge "$expiry" ]
}
|
||||
|
||||
|
||||
# Perform a lock action and restart it if the CIB has been modified before
# the lock action could be committed.
#
# Arguments:
#   $1 - action to run: lock_acquire or lock_release
#   $2 - lock name
#   $3 - requester (the prospective or current lock owner)
#   $4 - optional extra argument forwarded to the action (the TTL for
#        lock_acquire)
# Returns:
#   0 - the action succeeded
#   1 - lock error reported by the action (not owner, lock doesn't exist)
#   2 - unknown action, unrecoverable CIB error, or retries exhausted
#   any other status returned by the action is passed through unchanged
try_action() {
    local fun=$1
    local lock=$2
    local requester=$3
    local args=${4:-}
    local max_retries=${MAX_RETRIES:-10}
    local tries=$max_retries
    local rc

    case "$fun" in
        lock_acquire|lock_release)
            log "Try running $fun"
            ;;
        *)
            log "Unknown lock operation '$fun'"
            return 2
            ;;
    esac

    while true; do
        # Every argument is quoted so that it reaches the action as a single
        # word. The optional argument is only forwarded when it is set, so
        # lock_release keeps receiving exactly two parameters.
        "$fun" "$lock" "$requester" ${args:+"$args"}
        rc=$?
        case "$rc" in
            0)
                log "Operation $fun succeeded"
                return 0
                ;;
            3)
                # rc == 3 -> CIB changed before push
                if [ "$tries" -eq 0 ]; then
                    log "Failed to commit after $max_retries retries. Bailing out."
                    return 2
                fi
                log "Failed to commit. Retrying operation."
                tries=$((tries - 1))
                ;;
            2)
                # rc == 2 -> unrecoverable cib error (e.g. pacemaker down)
                log "Unexpected failure. Bailing out"
                return 2
                ;;
            *)
                # rc == 1 -> lock error (not owner, lock doesn't exist)
                return "$rc"
                ;;
        esac
    done
}
|
||||
|
||||
# The lock mechanism uses cibadmin's atomic creation so cluster-wide
# state coherency is guaranteed by pacemaker
#
# Try to take (or refresh) a lock for a requester. The lock value is stored
# as "<requester>:<expiry>", where <expiry> is the current time in seconds
# since the epoch plus the TTL.
#
# Arguments:
#   $1 - lock name
#   $2 - requester
#   $3 - TTL in seconds
# Returns:
#   0 - lock acquired (or its TTL refreshed when the requester already owns it)
#   1 - lock is held by someone else and has not expired
#   2 - the CIB could not be queried, or the TTL is not a number
#   3 - the CIB write was rejected (reported as "CIB changed")
lock_acquire() {
    local lockname=$1
    local requester=$2
    local ttl=$3
    local rc
    local lock
    local expiry
    local owner

    # Reject a TTL that would break the expiry arithmetic below
    case "$ttl" in
        ''|*[!0-9]*)
            log "Invalid TTL '$ttl', a number of seconds is expected"
            return 2
            ;;
    esac

    log "Check whether the lock is already held in the CIB"
    lock=$(lock_get "$lockname")
    rc=$?
    # A "not found" status only means that nobody holds the lock yet
    if [ "$rc" -ne 0 ] && [ "$rc" -ne "${CIB_ENOTFOUND:-105}" ]; then
        log "Could not retrieve info from the CIB"
        return 2
    fi

    if [ -n "$lock" ]; then
        if lock_has_expired "$lock"; then
            log "Lock has expired, now available for being held"
        else
            # lock is still held. check whether we're the owner
            owner=$(lock_owner "$lock")
            if [ "$owner" = "$requester" ]; then
                log "Requester already owns the lock, acquiring attempt will just reconfigure the TTL"
            else
                log "Lock is held by someone else ($owner)"
                return 1
            fi
        fi
    else
        log "Lock is not held yet"
    fi

    # prepare the lock info (10# forces base 10 for a zero-padded TTL)
    expiry=$(($(date +%s) + 10#$ttl))

    if [ -n "$lock" ]; then
        log "Attempting to update the lock"
        # The value read above is passed along as the expected current value
        lock_update "$lockname" "$lock" "$requester:$expiry"
        rc=$?
    else
        log "Attempting to acquire the lock"
        lock_create "$lockname" "$requester:$expiry"
        rc=$?
    fi

    if [ "$rc" -eq 0 ]; then
        log "Lock '$lockname' acquired by '$requester', valid until $(date -d "@$expiry")"
        return 0
    else
        log "CIB changed, lock cannot be acquired"
        return 3
    fi
}
|
||||
|
||||
|
||||
# Release a lock on behalf of a requester. The lock is only deleted when it
# is owned by the requester, and the value read here is handed to
# lock_delete as the value the lock is expected to hold.
#
# Arguments:
#   $1 - lock name
#   $2 - requester
# Returns:
#   0 - lock released, or no lock to release
#   1 - lock is held by someone else
#   2 - the CIB could not be queried
#   3 - the CIB deletion failed
lock_release() {
    local lockname=$1
    local requester=$2
    local rc
    local lock
    local owner

    log "Check whether the lock is already held in the CIB"
    lock=$(lock_get "$lockname")
    rc=$?
    # A "not found" status only means that there is no lock to release
    if [ "$rc" -ne 0 ] && [ "$rc" -ne "${CIB_ENOTFOUND:-105}" ]; then
        log "Could not retrieve info from the CIB"
        return 2
    fi

    if [ -z "$lock" ]; then
        log "Lock doesn't exist. Nothing to release"
        return 0
    fi

    log "Lock exists, check whether we're the owner"
    owner=$(lock_owner "$lock")
    if [ "$owner" != "$requester" ]; then
        log "Lock is held by someone else ($owner), will not unlock"
        return 1
    fi

    lock_delete "$lockname" "$lock"
    rc=$?

    if [ "$rc" -eq 0 ]; then
        log "Lock '$lockname' released by '$requester'"
        return 0
    else
        log "CIB deletion error, lock cannot be released"
        return 3
    fi
}
|
||||
|
||||
|
||||
# Command-line entry point: validate the arguments and dispatch to the
# requested lock action. The script exits with the status of that action.
ACTION=${1:-}
LOCKNAME=${2:-}
REQUESTER=${3:-}
# TTL is only used by --acquire and defaults to 60 seconds
TTL=${4:-60}

if [ -z "$ACTION" ]; then
    error "Action must be specified"
fi

# Quoted so that an action containing whitespace or glob characters cannot
# break the test and bypass the argument check
if [ "$ACTION" != "--help" ]; then
    if [ -z "$LOCKNAME" ] || [ -z "$REQUESTER" ]; then
        error "You must specify a lock name and a requester"
    fi
fi

case "$ACTION" in
    --help) usage; exit 0;;
    --acquire|-a) try_action lock_acquire "$LOCKNAME" "$REQUESTER" "$TTL";;
    --release|-r) try_action lock_release "$LOCKNAME" "$REQUESTER";;
    *) error "Invalid action";;
esac
exit $?
|
@ -72,3 +72,12 @@ outputs:
|
||||
- {}
|
||||
step_config: |
|
||||
include ::tripleo::profile::base::certmonger_user
|
||||
host_prep_tasks:
|
||||
- name: create certificate rotation script for HA services
|
||||
copy:
|
||||
dest: /usr/bin/certmonger-ha-resource-refresh.sh
|
||||
setype: certmonger_unconfined_exec_t
|
||||
mode: "0700"
|
||||
content: |
|
||||
#!/bin/bash
# Forward the arguments untouched ("$@" keeps each one as a single word,
# whereas $* would re-split them) and send all output to syslog.
/var/lib/container-config-scripts/pacemaker_mutex_restart_bundle.sh --lock "$@" 2>&1 | logger -t certmonger
|
||||
|
@ -121,6 +121,12 @@ outputs:
|
||||
wait-port-and-run.sh:
|
||||
mode: "0755"
|
||||
content: { get_file: ../container_config_scripts/wait-port-and-run.sh }
|
||||
pacemaker_resource_lock.sh:
|
||||
mode: "0755"
|
||||
content: { get_file: ../container_config_scripts/pacemaker_resource_lock.sh }
|
||||
pacemaker_mutex_restart_bundle.sh:
|
||||
mode: "0755"
|
||||
content: { get_file: ../container_config_scripts/pacemaker_mutex_restart_bundle.sh }
|
||||
|
||||
volumes_base:
|
||||
description: Base volume list
|
||||
|
Loading…
Reference in New Issue
Block a user