Browse Source

Merge "Rolling certificate update for HA services"

changes/83/745483/4
Zuul 1 year ago
committed by Gerrit Code Review
parent
commit
d13d010693
  1. 90
      container_config_scripts/pacemaker_mutex_restart_bundle.sh
  2. 237
      container_config_scripts/pacemaker_resource_lock.sh
  3. 9
      deployment/certs/certmonger-user-baremetal-puppet.yaml
  4. 6
      deployment/containers-common.yaml
  5. 2
      deployment/database/mysql-pacemaker-puppet.yaml
  6. 2
      deployment/ovn/ovn-dbs-pacemaker-puppet.yaml

90
container_config_scripts/pacemaker_mutex_restart_bundle.sh

@ -0,0 +1,90 @@
#!/bin/bash
# Restart a pacemaker resource on this node while holding a restart lock
# obtained through pacemaker_resource_lock.sh.
#
# Example invocations:
#   pacemaker_mutex_restart_bundle.sh --lock mysql galera galera-bundle Master _
#   pacemaker_mutex_restart_bundle.sh --lock ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master

# Treat any reference to an unset variable as a fatal error.
set -o nounset
# Print the command synopsis on stdout, followed by a blank line.
usage() {
    printf '%s\n' \
        "Restart a clustered resource in a coordinated way across the cluster" \
        "Usage:" \
        " $0 --lock <tripleo-service> <pcmk-resource> <pcmk-bundle> <target-state-local> <target-state-cluster>" \
        ""
}
# Write a message to stdout, prefixed with the current UTC date.
#   $1 - message text
log() {
    printf '%s: %s\n' "$(date -u)" "$1"
}
# Write a UTC-dated message to stderr and terminate the script with status 1.
#   $1 - message text
error() {
    printf '%s: %s\n' "$(date -u)" "$1" >&2
    exit 1
}
# The first argument selects the action: --help prints the synopsis and
# stops, --lock continues with the restart, anything else is rejected.
ACTION=$1
if [ "$ACTION" = "--help" ]; then
    usage
    exit 0
elif [ "$ACTION" != "--lock" ]; then
    error "Unknown action '$ACTION'"
fi
# The lock is named after the service; its owner is whatever `crm_node -n`
# prints for this host.
TRIPLEO_SERVICE=$2
LOCK_NAME=${TRIPLEO_SERVICE}-restart-lock
LOCK_OWNER=$(crm_node -n 2>/dev/null)
rc=$?
case $rc in
    0)
        ;;
    102)
        # Handled as "no cluster on this host": nothing to restart, not an error.
        log "Cluster is not running locally, no need to restart resource $TRIPLEO_SERVICE"
        exit 0
        ;;
    *)
        error "Unexpected error while connecting to the cluster (rc: $rc), bailing out"
        ;;
esac
RESOURCE_NAME=$3
BUNDLE_NAME=$4
WAIT_TARGET_LOCAL=$5
# The cluster-wide target state is optional and defaults to "_".
WAIT_TARGET_ANYWHERE=${6:-_}

# The lock TTL should accommodate the resource start/promote timeout.
if [ "$RESOURCE_NAME" = "$BUNDLE_NAME" ]; then
    # The podman RA's default start timeout
    LOCK_TTL=90
else
    # Waiting for a Master state means the relevant operation is the promote.
    rsc_op="start"
    if [ "$WAIT_TARGET_LOCAL" = "Master" ] || [ "$WAIT_TARGET_ANYWHERE" = "Master" ]; then
        rsc_op="promote"
    fi
    # Read the operation timeout from the live CIB and drop a trailing "s", e.g.
    # <op id="galera-promote-interval-0s" interval="0s" name="promote" on-fail="block" timeout="300s"/>
    PCMK_TTL=$(cibadmin -Q | xmllint -xpath "string(//primitive[@id='${RESOURCE_NAME}']/operations/op[@name='${rsc_op}']/@timeout)" - | sed 's/s$//')
    # Keep a 30 second margin on top of the operation timeout.
    LOCK_TTL=$((PCMK_TTL + 30))
fi
log "Acquire a ${LOCK_TTL}s restart lock for service $TRIPLEO_SERVICE before restarting it"
# Keep asking for the lock until it is granted. The lock has a TTL, so a
# holder that never releases it cannot block us forever.
# Exit status 1 from the lock script is retried after a pause; anything
# greater than 1 is treated as unrecoverable.
until /var/lib/container-config-scripts/pacemaker_resource_lock.sh --acquire $LOCK_NAME $LOCK_OWNER $LOCK_TTL; do
    rc=$?
    if [ $rc -le 1 ]; then
        log "Could not acquire lock, retrying"
        sleep 10
    else
        error "Could not acquire lock due to unrecoverable error (rc: $rc), bailing out"
    fi
done
rc=0
log "Restart the service $TRIPLEO_SERVICE locally"
# Reuse the local restart script in t-h-t (driven by env var TRIPLEO_MINOR_UPDATE)
TRIPLEO_MINOR_UPDATE=true /var/lib/container-config-scripts/pacemaker_restart_bundle.sh "$TRIPLEO_SERVICE" "$RESOURCE_NAME" "$BUNDLE_NAME" "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE"
# Remember the restart outcome: it must not be lost, but the lock still has
# to be released before it is reported.
restart_rc=$?
if [ "$restart_rc" -ne 0 ]; then
    log "Restart of service $TRIPLEO_SERVICE failed (rc: $restart_rc)"
fi

# If we reached this point, always try to release the lock
log "Release the restart lock for service $TRIPLEO_SERVICE"
/var/lib/container-config-scripts/pacemaker_resource_lock.sh --release "$LOCK_NAME" "$LOCK_OWNER"
rc=$?
# Exit status 1 from the release is tolerated; any other failure is fatal.
if [ "$rc" -ne 0 ] && [ "$rc" -ne 1 ]; then
    error "Could not release held lock (rc: $rc)"
fi

# Report the restart outcome instead of unconditionally returning success.
exit "$restart_rc"

237
container_config_scripts/pacemaker_resource_lock.sh

@ -0,0 +1,237 @@
#!/bin/bash
# Number of retries allowed when a lock operation fails to commit.
MAX_RETRIES=10
# Scratch file holding the CIB snapshot that lock operations edit and push.
TMP_CIB=$(mktemp -p /var/lib/pacemaker/cib -t tmpcib.XXXXXXXX)

# Delete the scratch CIB file; runs on every exit path via the EXIT trap.
finish() {
    if [ -n "$TMP_CIB" ]; then
        rm -f "$TMP_CIB"
    fi
}
trap finish EXIT
# Turn INT/TERM into a regular exit so the EXIT trap still fires.
trap exit INT TERM
# Print the command synopsis on stdout, followed by a blank line.
usage() {
    printf '%s\n' \
        "Set a global property in the cluster with a validity timestamp." \
        "Usage:" \
        " $0 --acquire <lock_name> <lock_owner> <lock_ttl_in_seconds>" \
        " $0 --release <lock_name> <lock_owner>" \
        ""
}
# Write a UTC-dated message to stderr, keeping stdout free for the values
# that functions hand back to their callers.
#   $1 - message text
log() {
    >&2 printf '%s: %s\n' "$(date -u)" "$1"
}
# Write a UTC-dated message to stderr and terminate the script with status 1.
#   $1 - message text
error() {
    >&2 printf '%s: %s\n' "$(date -u)" "$1"
    exit 1
}
# Look up a lock property in a CIB file and print its value on stdout.
#   $1 - path of the CIB file to query
#   $2 - name of the lock property
# Returns the exit status of the pcs query. On success the output is the
# third space-separated field of the line mentioning the lock name, or
# nothing when no such line exists.
lock_get() {
    local cib_copy=$1
    local lockname=$2
    local listing

    listing=$(pcs -f $cib_copy property show "$lockname") || return $?
    echo "$listing" | grep -w "$lockname" | cut -d' ' -f3
    return 0
}
# Print the owner part of a lock value, i.e. everything before the first ':'.
#   $1 - lock value in "<owner>:<expiry>" form
lock_owner() {
    local lock=$1
    printf '%s\n' "${lock%%:*}"
}
# Tell whether a lock value has outlived its expiry timestamp.
#   $1 - lock value in "<owner>:<expiry>" form, expiry in seconds since epoch
# Returns 0 when the lock has expired, 1 when it is still valid.
# A lock whose expiry is missing or not a plain number is reported as
# expired. Otherwise the comparison would fail and the malformed lock would
# look permanently held to every requester other than its owner.
lock_has_expired() {
    local lock=$1
    local owner
    local expiry
    local extra
    local now

    # Second ':'-separated field; empty when the value contains no ':'.
    IFS=':' read -r owner expiry extra <<<"$lock"
    case $expiry in
        ''|*[!0-9]*)
            return 0
            ;;
    esac

    now=$(date +%s)
    [ "$now" -ge "$expiry" ]
}
# Perform a lock action and restart it if the CIB has been modified before
# the action could be committed.
#   $1 - action to run: lock_acquire or lock_release
#   $2 - lock name
#   $3 - requester (lock owner)
#   $4 - optional extra argument handed to the action (the TTL for acquire)
# Returns:
#   0 - the action succeeded
#   1 - lock error (not owner, lock doesn't exist)
#   2 - unknown action, unrecoverable CIB error, or retries exhausted
# Any other status reported by the action is passed through unchanged.
try_action() {
    local fun=$1
    local lock=$2
    local requester=$3
    local args=${4:-}
    local tries=$MAX_RETRIES
    local rc

    case "$fun" in
        lock_acquire|lock_release)
            log "Try running $fun"
            ;;
        *)
            return 2
            ;;
    esac

    while true; do
        # Every argument is passed as a single word. The optional one is
        # omitted entirely when empty, so lock_release still gets exactly
        # two arguments.
        "$fun" "$lock" "$requester" ${args:+"$args"}
        rc=$?
        case $rc in
            0)
                log "Operation $fun succeeded"
                return 0
                ;;
            3)
                # rc == 3 -> CIB changed before push
                if [ "$tries" -eq 0 ]; then
                    log "Failed to commit after $MAX_RETRIES retries. Bailing out."
                    return 2
                fi
                log "Failed to commit. Retrying operation."
                tries=$((tries - 1))
                ;;
            2)
                # rc == 2 -> unrecoverable cib error (e.g. pacemaker down)
                log "Unexpected failure. Bailing out"
                return 2
                ;;
            *)
                # rc == 1 -> lock error (not owner, lock doesn't exist)
                return "$rc"
                ;;
        esac
    done
}
# Try to take (or refresh) a lock stored as a cluster property.
# The lock mechanism uses the CIB's num_updates tag to implement
# a conditional store. Cluster-wide locking is guaranteed by pacemaker.
#   $1 - lock name
#   $2 - requester (lock owner)
#   $3 - lock TTL in seconds
# Returns:
#   0 - lock acquired, or its TTL refreshed when the requester already owns it
#   1 - lock is currently held by someone else
#   2 - the CIB could not be snapshotted, queried or edited
#   3 - the push was rejected (CIB changed since the snapshot)
lock_acquire() {
    local lockname=$1
    local requester=$2
    local ttl=$3
    local lock
    local expiry
    local owner

    log "Snapshot the current CIB"
    if ! pcs cluster cib > "$TMP_CIB"; then
        log "Could not snapshot the CIB"
        return 2
    fi

    log "Check whether the lock is already held in the CIB"
    if ! lock=$(lock_get "$TMP_CIB" "$lockname"); then
        log "Could not retrieve info from snapshot CIB"
        return 2
    fi

    if [ -z "$lock" ]; then
        log "Lock is not held yet"
    else
        log "Lock exists, check whether it has expired"
        if lock_has_expired "$lock"; then
            log "Lock has expired, now available for being held"
        else
            # lock is still held. check whether we're the owner
            owner=$(lock_owner "$lock")
            if [ "$owner" = "$requester" ]; then
                log "Already own the lock, acquiring attempt will just reconfigure the TTL"
            else
                log "Lock is held by someone else ($owner)"
                return 1
            fi
        fi
    fi

    log "Prepare the snapshot CIB to acquire the lock"
    expiry=$(( $(date +%s) + ttl ))
    # If the edit fails the snapshot is unchanged. Pushing it could still
    # succeed and would wrongly report the lock as acquired, so stop here.
    if ! pcs -f "$TMP_CIB" property set "$lockname"="$requester:$expiry" --force; then
        log "Could not store the lock in the snapshot CIB"
        return 2
    fi

    # Store Conditional: only works if no update has been pushed in the meantime
    log "Try to push the CIB to signal lock is acquired"
    if pcs cluster cib-push "$TMP_CIB"; then
        log "Lock '$lockname' acquired by '$requester', valid until $(date -d @$expiry)"
        return 0
    fi
    log "CIB changed since snapshot, lock cannot be acquired"
    return 3
}
# Release a lock stored as a cluster property, provided the requester owns it.
# The lock mechanism uses the CIB's num_updates tag to implement
# a conditional store. Cluster-wide locking is guaranteed by pacemaker.
#   $1 - lock name
#   $2 - requester (lock owner)
# Returns:
#   0 - lock released, or there was no lock to release
#   1 - lock is held by someone else and was left untouched
#   2 - the CIB could not be snapshotted, queried or edited
#   3 - the push was rejected (CIB changed since the snapshot)
lock_release() {
    local lockname=$1
    local requester=$2
    local lock
    local owner

    log "Snapshot the current CIB"
    if ! pcs cluster cib > "$TMP_CIB"; then
        log "Could not snapshot the CIB"
        return 2
    fi

    log "Check whether the lock is already held in the CIB"
    if ! lock=$(lock_get "$TMP_CIB" "$lockname"); then
        log "Could not retrieve info from snapshot CIB"
        return 2
    fi

    if [ -z "$lock" ]; then
        log "Lock doesn't exist. Nothing to release"
        return 0
    fi

    log "Lock exists, check whether we're the owner"
    owner=$(lock_owner "$lock")
    if [ "$owner" != "$requester" ]; then
        log "Lock is held by someone else ($owner), will not unlock"
        return 1
    fi

    log "Prepare the snapshot CIB to release the lock"
    # If the edit fails the snapshot still contains the lock. Pushing it
    # could succeed and would wrongly report the lock as released.
    if ! pcs -f "$TMP_CIB" property set "$lockname"=""; then
        log "Could not remove the lock from the snapshot CIB"
        return 2
    fi

    # Store Conditional: only works if no update has been pushed in the meantime
    log "Try to push the CIB to signal lock is released"
    if pcs cluster cib-push "$TMP_CIB"; then
        log "Lock '$lockname' released by '$requester'"
        return 0
    fi
    log "CIB changed since snapshot, lock cannot be released"
    return 3
}
# Command line: <action> <lock_name> <lock_owner> [<lock_ttl_in_seconds>]
ACTION=$1
LOCKNAME=$2
REQUESTER=$3
# TTL is only used by --acquire and defaults to 60 seconds.
TTL=${4:-60}

if [ -z "$ACTION" ]; then
    error "Action must be specified"
fi

# Every action except --help needs both a lock name and a requester.
if [ "$ACTION" != "--help" ]; then
    if [ -z "$LOCKNAME" ] || [ -z "$REQUESTER" ]; then
        error "You must specify a lock name and a requester"
    fi
fi

case "$ACTION" in
    --help) usage; exit 0;;
    --acquire|-a) try_action lock_acquire "$LOCKNAME" "$REQUESTER" "$TTL";;
    --release|-r) try_action lock_release "$LOCKNAME" "$REQUESTER";;
    *) error "Invalid action";;
esac
# Exit with the status of the action that was run.
exit $?

9
deployment/certs/certmonger-user-baremetal-puppet.yaml

@ -72,3 +72,12 @@ outputs:
- {}
step_config: |
include tripleo::profile::base::certmonger_user
host_prep_tasks:
- name: create certificate rotation script for HA services
copy:
dest: /usr/bin/certmonger-ha-resource-refresh.sh
setype: certmonger_unconfined_exec_t
mode: "0700"
content: |
#!/bin/bash
# Forward every argument unchanged to the mutex restart script and send its
# combined stdout/stderr to syslog under the "certmonger" tag.
/var/lib/container-config-scripts/pacemaker_mutex_restart_bundle.sh --lock "$@" 2>&1 | logger -t certmonger

6
deployment/containers-common.yaml

@ -121,6 +121,12 @@ outputs:
wait-port-and-run.sh:
mode: "0755"
content: { get_file: ../container_config_scripts/wait-port-and-run.sh }
pacemaker_resource_lock.sh:
mode: "0755"
content: { get_file: ../container_config_scripts/pacemaker_resource_lock.sh }
pacemaker_mutex_restart_bundle.sh:
mode: "0755"
content: { get_file: ../container_config_scripts/pacemaker_mutex_restart_bundle.sh }
volumes_base:
description: Base volume list

2
deployment/database/mysql-pacemaker-puppet.yaml

@ -163,6 +163,8 @@ outputs:
if:
- internal_tls_enabled
-
tripleo::certmonger::mysql::postsave_cmd:
/usr/bin/certmonger-ha-resource-refresh.sh mysql galera galera-bundle Master
tripleo::profile::pacemaker::database::mysql_bundle::ca_file:
get_param: InternalTLSCAFile
- {}

2
deployment/ovn/ovn-dbs-pacemaker-puppet.yaml

@ -159,6 +159,8 @@ outputs:
- if:
- internal_tls_enabled
- generate_service_certificates: true
tripleo::certmonger::ovn_dbs::postsave_cmd:
/usr/bin/certmonger-ha-resource-refresh.sh ovn_dbs ovndb_servers ovn-dbs-bundle Slave Master
tripleo::profile::pacemaker::ovn_dbs_bundle::ca_file:
get_param: InternalTLSCAFile
tripleo::profile::base::neutron::agents::ovn::protocol: 'ssl'

Loading…
Cancel
Save