RabbitMQ: always allow promotion on HA queue during failover
When RabbitMQ experiences a rolling restart of its peers, the master of an HA queue fails over from one replica to another. If messages are sent to the HA queue while some rabbit nodes are restarting, those nodes will reconnect as unsynchronized slaves. It can happen that during a rolling restart, all rabbit nodes reconnect as unsynchronized, which prevents RabbitMQ from automatically electing a new master for failover. This has other side effects on fanout queues and may prevent OpenStack notifications from being consumed properly. Change the HA policy to always allow a promotion even when all replicas are unsynchronized. When such a rare condition happens, rely on OpenStack clients to retry RPC if they need to. Closes-Bug: #1823305 Co-Authored-By: Damien Ciabrini <dciabrin@redhat.com> Change-Id: Id9bdd36aa0ee81424212e3a89185311817a15aee
This commit is contained in:
parent
81d7714744
commit
610c8d8d41
@ -66,6 +66,10 @@
|
|||||||
# (Optional) Whether TLS in the internal network is enabled or not.
|
# (Optional) Whether TLS in the internal network is enabled or not.
|
||||||
# Defaults to hiera('enable_internal_tls', false)
|
# Defaults to hiera('enable_internal_tls', false)
|
||||||
#
|
#
|
||||||
|
# [*rabbitmq_extra_policies*]
|
||||||
|
# (Optional) Hash of extra policies for the HA queues
|
||||||
|
# Defaults to hiera('rabbitmq_extra_policies', {'ha-promote-on-shutdown' => 'always'})
|
||||||
|
#
|
||||||
# [*pcs_tries*]
|
# [*pcs_tries*]
|
||||||
# (Optional) The number of times pcs commands should be retried.
|
# (Optional) The number of times pcs commands should be retried.
|
||||||
# Defaults to hiera('pcs_tries', 20)
|
# Defaults to hiera('pcs_tries', 20)
|
||||||
@ -91,6 +95,7 @@ class tripleo::profile::pacemaker::rabbitmq_bundle (
|
|||||||
$notify_bootstrap_node = hiera('oslo_messaging_notify_short_bootstrap_node_name'),
|
$notify_bootstrap_node = hiera('oslo_messaging_notify_short_bootstrap_node_name'),
|
||||||
$notify_nodes = hiera('oslo_messaging_notify_node_names', []),
|
$notify_nodes = hiera('oslo_messaging_notify_node_names', []),
|
||||||
$enable_internal_tls = hiera('enable_internal_tls', false),
|
$enable_internal_tls = hiera('enable_internal_tls', false),
|
||||||
|
$rabbitmq_extra_policies = hiera('rabbitmq_extra_policies', {'ha-promote-on-shutdown' => 'always'}),
|
||||||
$pcs_tries = hiera('pcs_tries', 20),
|
$pcs_tries = hiera('pcs_tries', 20),
|
||||||
$step = Integer(hiera('step')),
|
$step = Integer(hiera('step')),
|
||||||
$container_backend = 'docker',
|
$container_backend = 'docker',
|
||||||
@ -264,16 +269,19 @@ class tripleo::profile::pacemaker::rabbitmq_bundle (
|
|||||||
if $user_ha_queues == 0 {
|
if $user_ha_queues == 0 {
|
||||||
$nr_rabbit_nodes = size($rabbit_nodes)
|
$nr_rabbit_nodes = size($rabbit_nodes)
|
||||||
$nr_ha_queues = $nr_rabbit_nodes / 2 + ($nr_rabbit_nodes % 2)
|
$nr_ha_queues = $nr_rabbit_nodes / 2 + ($nr_rabbit_nodes % 2)
|
||||||
$params = "set_policy='ha-all ^(?!amq\\.).* {\"ha-mode\":\"exactly\",\"ha-params\":${nr_ha_queues}}'"
|
$ha_queues_policy = { 'ha-mode' => 'exactly', 'ha-params' => $nr_ha_queues }
|
||||||
} elsif $user_ha_queues == -1 {
|
} elsif $user_ha_queues == -1 {
|
||||||
$params = 'set_policy=\'ha-all ^(?!amq\.).* {"ha-mode":"all"}\''
|
$ha_queues_policy = { 'ha-mode' => 'all' }
|
||||||
} else {
|
} else {
|
||||||
$nr_ha_queues = $user_ha_queues
|
$nr_ha_queues = $user_ha_queues
|
||||||
$params = "set_policy='ha-all ^(?!amq\\.).* {\"ha-mode\":\"exactly\",\"ha-params\":${nr_ha_queues}}'"
|
$ha_queues_policy = { 'ha-mode' => 'exactly', 'ha-params' => $nr_ha_queues }
|
||||||
}
|
}
|
||||||
|
$ha_policy = merge($ha_queues_policy, $rabbitmq_extra_policies)
|
||||||
|
$ocf_params = "set_policy='ha-all ^(?!amq\\.).* ${to_json($ha_policy)}'"
|
||||||
|
|
||||||
pacemaker::resource::ocf { 'rabbitmq':
|
pacemaker::resource::ocf { 'rabbitmq':
|
||||||
ocf_agent_name => 'heartbeat:rabbitmq-cluster',
|
ocf_agent_name => 'heartbeat:rabbitmq-cluster',
|
||||||
resource_params => $params,
|
resource_params => $ocf_params,
|
||||||
meta_params => 'notify=true container-attribute-target=host',
|
meta_params => 'notify=true container-attribute-target=host',
|
||||||
op_params => 'start timeout=200s stop timeout=200s',
|
op_params => 'start timeout=200s stop timeout=200s',
|
||||||
tries => $pcs_tries,
|
tries => $pcs_tries,
|
||||||
@ -306,6 +314,18 @@ class tripleo::profile::pacemaker::rabbitmq_bundle (
|
|||||||
try_sleep => 10,
|
try_sleep => 10,
|
||||||
tag => 'rabbitmq_ready',
|
tag => 'rabbitmq_ready',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Set the HA queue policy here, because the rabbitmq resource
|
||||||
|
# agent does so very early in the bootstrap process, and it
|
||||||
|
# doesn't seem to work reliably.
|
||||||
|
# Note: rabbitmq_policy expects all the hash values passed
|
||||||
|
# to 'definition' to be strings
|
||||||
|
rabbitmq_policy { 'ha-all@/':
|
||||||
|
applyto => 'queues',
|
||||||
|
pattern => '^(?!amq\.).*',
|
||||||
|
definition => hash($ha_policy.map |$k, $v| {[$k, "${v}"]}),
|
||||||
|
}
|
||||||
|
|
||||||
# Make sure that if we create rabbitmq users at the same step it happens
|
# Make sure that if we create rabbitmq users at the same step it happens
|
||||||
# after the cluster is up
|
# after the cluster is up
|
||||||
Exec['rabbitmq-ready'] -> Rabbitmq_user<||>
|
Exec['rabbitmq-ready'] -> Rabbitmq_user<||>
|
||||||
|
@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
issues:
|
||||||
|
- |
|
||||||
|
Allow a hiera key to add an additional rabbitmq policy in the resource
|
||||||
|
agent.
|
Loading…
x
Reference in New Issue
Block a user