Ensure rabbit node uptime is reset in the CIB for OCF resource

* Add ocf_run wrappers and info log messages for CIB attribute events
* Move "fast" CIB attribute updates before "heavy" operations like
  start/stop/wait to ensure the CIB stays consistent even if the
  timeouts are exceeded for those ops
* Delete master and start time attributes from CIB on action_start
  to ensure the correct rabbit nodes uptime evaluation for new
  master elections for corresponding pacemaker resources
* For post-demote notify and action_demote() delete the master
  attribute from CIB as well.
* For post-start notify, update the start time in the CIB even when
  the node is already clustered. Otherwise it would remain running
  in the cluster without a registered start time, which badly affects
  new master elections.

Upstream RR https://github.com/rabbitmq/rabbitmq-server/pull/524
Closes-bug: #1530150

Change-Id: I9db3c819031cef620377b4fee08ea92e90b11c70
Signed-off-by: Bogdan Dobrelya <bdobrelia@mirantis.com>
This commit is contained in:
Bogdan Dobrelya 2015-12-30 18:08:46 +01:00
parent 56e8090cbf
commit d833b3ac71

View File

@ -493,10 +493,12 @@ now() {
}
# Set the master score attribute via crm_master.
#
# Arguments: $1 - score to set; falls back to 0 when omitted or empty
# Returns:   $OCF_SUCCESS when the crm_master call succeeded,
#            $OCF_ERR_GENERIC when it failed
master_score() {
    local LH="${LL} master_score():"
    # Default to 0 without relying on an unquoted test expression.
    local score="${1:-0}"

    ocf_log info "${LH} Updating master score attribute with ${score}"
    # Quote the score so it always reaches crm_master as one argument,
    # with no word splitting or pathname expansion applied to it.
    ocf_run crm_master -l reboot -v "${score}" || return $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}
@ -820,6 +822,7 @@ join_to_cluster() {
local rmq_node
local rc=$OCF_ERR_GENERIC
local LH="${LL} join_to_cluster():"
local nowtime
ocf_log info "${LH} start."
@ -853,8 +856,9 @@ join_to_cluster() {
action_stop
return $OCF_ERR_GENERIC
else
ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with $(now)"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
nowtime="$(now)"
ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
ocf_log info "${LH} Joined to cluster succesfully."
fi
@ -1602,6 +1606,14 @@ action_start() {
return $OCF_SUCCESS
fi
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} Deleting master attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
ocf_log info "${LH} RMQ going to start."
start_rmq_server_app
rc=$?
@ -1609,10 +1621,6 @@ action_start() {
ocf_log info "${LH} RMQ prepared for start succesfully."
fi
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_channels_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_get_alarms_timeouts' --update '0'
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit_list_queues_timeouts' --update '0'
ocf_log info "${LH} action end."
return $rc
}
@ -1631,17 +1639,16 @@ action_stop() {
ocf_log info "${LH} action begin."
ocf_log info "${LH} Deleting master attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
master_score 0
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
# Wait for synced state first
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
wait_sync $((OCF_RESKEY_stop_time/2))
# remove master flag
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
# remove master score
master_score 0
# remove rmq-server start timestamp
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} RMQ-runtime (beam) going to down."
stop_server_process
# Fail early without additional rabbitmqctl invocations
@ -1699,6 +1706,7 @@ action_notify() {
local rc2=$OCF_ERR_GENERIC
local LH="${LL} notify:"
local nodelist
local nowtime
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
d=`date '+%Y%m%d %H:%M:%S'`
@ -1718,7 +1726,8 @@ action_notify() {
nodelist=$(get_all_pacemaker_nodes)
for i in $nodelist
do
crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
ocf_log info "${LH} Deleting master attribute for node ${i}"
ocf_run crm_attribute -N $i -l reboot --name 'rabbit-master' --delete
done
ocf_log info "${LH} pre-promote end."
fi
@ -1786,6 +1795,9 @@ action_notify() {
ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}"
if try_to_start_rmq_app; then
rc2=$OCF_SUCCESS
nowtime="$(now)"
ocf_log info "${LH} Updating start time attribute with ${nowtime}"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
else
rc2=$OCF_ERR_GENERIC
fi
@ -1854,10 +1866,13 @@ action_notify() {
# On the nodes being demoted, reset the master score
ocf_log info "${LH} resetting the master score."
master_score 0
ocf_log info "${LH} Deleting start time attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
ocf_log info "${LH} Deleting master attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
ocf_log info "${LH} master was demoted. stopping RabbitMQ app."
stop_rmq_server_app
rc2=$?
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
if [ $rc2 -ne $OCF_SUCCESS ] ; then
ocf_log err "${LH} RMQ-server app can't be stopped on post-demote. Master resource is failed"
ocf_log info "${LH} post-demote end."
@ -1877,6 +1892,7 @@ action_notify() {
action_promote() {
local rc=$OCF_ERR_GENERIC
local LH="${LL} promote:"
local nowtime
if [ "${OCF_RESKEY_debug}" = 'true' ] ; then
d=$(date '+%Y%m%d %H:%M:%S')
@ -1916,8 +1932,9 @@ action_promote() {
[ -f $set_policy_path ] && . $set_policy_path
# create timestamp file
ocf_log info "${LH} Updating start timestamp"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update $(now)
nowtime="$(now)"
ocf_log info "${LH} Updating start timestamp with ${nowtime}"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}"
ocf_log info "${LH} Checking master status"
get_monitor
rc=$?
@ -1990,6 +2007,10 @@ action_demote() {
"$OCF_RUNNING_MASTER")
# Running as master. Normal, expected behavior.
ocf_log warn "${LH} Resource is currently running as Master"
ocf_log info "${LH} Deleting master attribute"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
ocf_log info "${LH} Deleting start timestamp"
ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
# Wait for synced state first
ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
@ -1997,8 +2018,6 @@ action_demote() {
stop_rmq_server_app
rc=$?
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
;;
"$OCF_SUCCESS")
# Alread running as slave. Nothing to do.