Ensure rabbit node uptime is reset in the CIB for OCF resource
* Add ocf_run wrappers and info log messages for CIB attribute events.
* Move "fast" CIB attribute updates before "heavy" operations like start/stop/wait, so that the CIB stays consistent even if those operations exceed their timeouts.
* Delete the master and start time attributes from the CIB on action_start, to ensure correct evaluation of rabbit node uptime in new master elections for the corresponding pacemaker resources.
* For the post-demote notify and action_demote(), delete the master attribute from the CIB as well.
* For the post-start notify, update the start time in the CIB even when the node is already clustered. Otherwise the node would remain running in the cluster without a registered start time, which badly affects new master elections.

Upstream RR https://github.com/rabbitmq/rabbitmq-server/pull/524
Closes-bug: #1530150
Change-Id: I9db3c819031cef620377b4fee08ea92e90b11c70
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
parent
56e8090cbf
commit
d833b3ac71
@ -493,10 +493,12 @@ now() {
|
||||
}
|
||||
|
||||
# Update the master score for this resource by running
# `crm_master -l reboot -v <score>` through ocf_run.
#
# Globals:   LL (read, log prefix), OCF_SUCCESS, OCF_ERR_GENERIC (read)
# Arguments: $1 - score to set; 0 is used when omitted or empty
# Returns:   $OCF_SUCCESS when crm_master succeeds,
#            $OCF_ERR_GENERIC when it fails
master_score() {
    local LH="${LL} master_score():"
    # Default here instead of via an unquoted `[ -z $score ]` test, which only
    # worked by accident for empty values and broke on values with whitespace.
    local score="${1:-0}"

    ocf_log info "${LH} Updating master score attribute with ${score}"
    # Quote the score so it always reaches crm_master as exactly one argument.
    ocf_run crm_master -l reboot -v "${score}" || return $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}
|
||||
@ -820,6 +822,7 @@ join_to_cluster() {
|
||||
local rmq_node
|
||||
local rc=$OCF_ERR_GENERIC
|
||||
local LH="${LL} join_to_cluster():"
|
||||
local nowtime
|
||||
|
||||
ocf_log info "${LH} start."
|
||||
|
||||
@ -853,8 +856,9 @@ join_to_cluster() {
|
||||
action_stop
|
||||
return $OCF_ERR_GENERIC
|
||||
else
|
||||
ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with $(now)"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
|
||||
nowtime="$(now)"
|
||||
ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
|
||||
ocf_log info "${LH} Joined to cluster succesfully."
|
||||
fi
|
||||
|
||||
@ -1602,6 +1606,14 @@ action_start() {
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
|
||||
ocf_log info "${LH} Deleting start time attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
ocf_log info "${LH} Deleting master attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
|
||||
ocf_log info "${LH} RMQ going to start."
|
||||
start_rmq_server_app
|
||||
rc=$?
|
||||
@ -1609,10 +1621,6 @@ action_start() {
|
||||
ocf_log info "${LH} RMQ prepared for start succesfully."
|
||||
fi
|
||||
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
|
||||
|
||||
ocf_log info "${LH} action end."
|
||||
return $rc
|
||||
}
|
||||
@ -1631,17 +1639,16 @@ action_stop() {
|
||||
|
||||
ocf_log info "${LH} action begin."
|
||||
|
||||
ocf_log info "${LH} Deleting master attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
master_score 0
|
||||
ocf_log info "${LH} Deleting start time attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
|
||||
# Wait for synced state first
|
||||
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
|
||||
wait_sync $((OCF_RESKEY_stop_time/2))
|
||||
|
||||
# remove master flag
|
||||
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
# remove master score
|
||||
master_score 0
|
||||
# remove rmq-server start timestamp
|
||||
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
|
||||
ocf_log info "${LH} RMQ-runtime (beam) going to down."
|
||||
stop_server_process
|
||||
# Fail early without additional rabbitmqctl invocations
|
||||
@ -1699,6 +1706,7 @@ action_notify() {
|
||||
local rc2=$OCF_ERR_GENERIC
|
||||
local LH="${LL} notify:"
|
||||
local nodelist
|
||||
local nowtime
|
||||
|
||||
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
|
||||
d=`date '+%Y%m%d %H:%M:%S'`
|
||||
@ -1718,7 +1726,8 @@ action_notify() {
|
||||
nodelist=$(get_all_pacemaker_nodes)
|
||||
for i in $nodelist
|
||||
do
|
||||
crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
|
||||
ocf_log info "${LH} Deleting master attribute for node ${i}"
|
||||
ocf_run crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
|
||||
done
|
||||
ocf_log info "${LH} pre-promote end."
|
||||
fi
|
||||
@ -1786,6 +1795,9 @@ action_notify() {
|
||||
ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}"
|
||||
if try_to_start_rmq_app; then
|
||||
rc2=$OCF_SUCCESS
|
||||
nowtime="$(now)"
|
||||
ocf_log info "${LH} Updating start time attribute with ${nowtime}"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
|
||||
else
|
||||
rc2=$OCF_ERR_GENERIC
|
||||
fi
|
||||
@ -1854,10 +1866,13 @@ action_notify() {
|
||||
# On the nodes being demoted, reset the master score
|
||||
ocf_log info "${LH} resetting the master score."
|
||||
master_score 0
|
||||
ocf_log info "${LH} Deleting start time attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
ocf_log info "${LH} Deleting master attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
ocf_log info "${LH} master was demoted. stopping RabbitMQ app."
|
||||
stop_rmq_server_app
|
||||
rc2=$?
|
||||
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
if [ $rc2 -ne $OCF_SUCCESS ] ; then
|
||||
ocf_log err "${LH} RMQ-server app can't be stopped on post-demote. Master resource is failed"
|
||||
ocf_log info "${LH} post-demote end."
|
||||
@ -1877,6 +1892,7 @@ action_notify() {
|
||||
action_promote() {
|
||||
local rc=$OCF_ERR_GENERIC
|
||||
local LH="${LL} promote:"
|
||||
local nowtime
|
||||
|
||||
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
|
||||
d=$(date '+%Y%m%d %H:%M:%S')
|
||||
@ -1916,8 +1932,9 @@ action_promote() {
|
||||
[ -f $set_policy_path ] && . $set_policy_path
|
||||
|
||||
# create timestamp file
|
||||
ocf_log info "${LH} Updating start timestamp"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
|
||||
nowtime="$(now)"
|
||||
ocf_log info "${LH} Updating start timestamp with ${nowtime}"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
|
||||
ocf_log info "${LH} Checking master status"
|
||||
get_monitor
|
||||
rc=$?
|
||||
@ -1990,6 +2007,10 @@ action_demote() {
|
||||
"$OCF_RUNNING_MASTER")
|
||||
# Running as master. Normal, expected behavior.
|
||||
ocf_log warn "${LH} Resource is currently running as Master"
|
||||
ocf_log info "${LH} Deleting master attribute"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
ocf_log info "${LH} Deleting start timestamp"
|
||||
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
|
||||
# Wait for synced state first
|
||||
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
|
||||
@ -1997,8 +2018,6 @@ action_demote() {
|
||||
|
||||
stop_rmq_server_app
|
||||
rc=$?
|
||||
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
|
||||
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
|
||||
;;
|
||||
"$OCF_SUCCESS")
|
||||
# Alread running as slave. Nothing to do.
|
||||
|
Loading…
Reference in New Issue
Block a user