Fix issue where charms aren't clustered but RMQ is
Due to the @cache decorator in the code, it was possible to get the charm into a state where RMQ is clustered, but the charm doesn't record it. The charm 'thinks' it is clustered when it has set the 'clustered' key on the 'cluster' relation. Unfortunately, due to the @cached decorator it's possible in the 'cluster-relation-changed' hook to have a situation where the RMQ instance clusters during the hook execution and then, later, when it's supposed to write the 'clustered' key, it reads the previous cached value from when it wasn't clustered and therefore doesn't set the 'clustered' key. This hook is just about the only opportunity to set the key, and so the charm ends up being locked. The fix was to clear the @cache values so that the nodes would be re-read, which allows the charm to then write the 'clustered' key. Change-Id: I12be41a83323d150ba1cbaeef64041f0bb5e32ce Closes-Bug: #1975605
This commit is contained in:
parent
09ade6b5ee
commit
81f08ab769
|
@ -69,6 +69,7 @@ from charmhelpers.core.hookenv import (
|
|||
service_name,
|
||||
status_set,
|
||||
cached,
|
||||
flush,
|
||||
relation_set,
|
||||
relation_get,
|
||||
application_version_set,
|
||||
|
@ -972,6 +973,9 @@ def clustered_with_leader():
|
|||
|
||||
def update_peer_cluster_status():
|
||||
"""Inform peers that this unit is clustered if it is."""
|
||||
# clear the nodes cache so that the rabbit nodes are re-evaluated if the
|
||||
# cluster has formed during the hook execution.
|
||||
clear_nodes_cache()
|
||||
# check the leader and try to cluster with it
|
||||
if clustered_with_leader():
|
||||
log('Host already clustered with %s.' % leader_node())
|
||||
|
@ -986,9 +990,14 @@ def update_peer_cluster_status():
|
|||
# NOTE(freyes): this node needs to be marked as clustered, it's
|
||||
# part of the cluster according to 'rabbitmqctl cluster_status'
|
||||
# (LP: #1691510)
|
||||
log("Setting 'clustered' on 'cluster' relation", level=DEBUG)
|
||||
relation_set(relation_id=cluster_rid,
|
||||
clustered=get_unit_hostname(),
|
||||
timestamp=time.time())
|
||||
else:
|
||||
log("Already set 'clustered' on cluster relation", level=DEBUG)
|
||||
else:
|
||||
log('Host not clustered with leader', level=DEBUG)
|
||||
|
||||
|
||||
def join_leader():
|
||||
|
@ -1005,6 +1014,10 @@ def join_leader():
|
|||
log("Attempting to cluster with leader", "INFO")
|
||||
try:
|
||||
node = leader_node()
|
||||
if node is None:
|
||||
log("Couldn't identify leader", "INFO")
|
||||
status_set('blocked', "Leader not yet identified")
|
||||
return
|
||||
join_cluster(node)
|
||||
# NOTE: toggle the cluster relation to ensure that any peers
|
||||
# already clustered re-assess status correctly
|
||||
|
@ -1320,7 +1333,9 @@ def is_partitioned():
|
|||
@cached
|
||||
def running_nodes():
|
||||
''' Determine the current set of running nodes in the RabbitMQ cluster '''
|
||||
return nodes(get_running=True)
|
||||
_nodes = nodes(get_running=True)
|
||||
log("running_nodes: {}".format(_nodes), DEBUG)
|
||||
return _nodes
|
||||
|
||||
|
||||
@cached
|
||||
|
@ -1335,13 +1350,25 @@ def leader_node():
|
|||
leader_node_hostname = peer_retrieve('leader_node_hostname')
|
||||
except ValueError:
|
||||
# This is a single unit
|
||||
log("leader_node: None", DEBUG)
|
||||
return None
|
||||
if leader_node_hostname:
|
||||
log("leader_node: rabbit@{}".format(leader_node_hostname), DEBUG)
|
||||
return "rabbit@" + leader_node_hostname
|
||||
else:
|
||||
log("leader_node: None", DEBUG)
|
||||
return None
|
||||
|
||||
|
||||
def clear_nodes_cache():
|
||||
"""Clear the running_nodes() and nodes() and leader_node() cache"""
|
||||
log("Clearing the cache for nodes.", DEBUG)
|
||||
flush('leader_node')
|
||||
flush('running_nodes')
|
||||
flush('nodes')
|
||||
flush('clustered')
|
||||
|
||||
|
||||
def get_node_hostname(ip_addr):
|
||||
''' Resolve IP address to hostname '''
|
||||
try:
|
||||
|
@ -1390,6 +1417,10 @@ def assess_cluster_status(*args):
|
|||
if not clustered():
|
||||
return 'waiting', 'Unit has peers, but RabbitMQ not clustered'
|
||||
|
||||
# See if all the rabbitmq charms think they are clustered (e.g. that the
|
||||
# 'clustered' attribute is set on the 'cluster' relationship
|
||||
if not cluster_ready():
|
||||
return 'waiting', 'RabbitMQ is clustered, but not all charms are ready'
|
||||
# Departed nodes
|
||||
departed_node = check_cluster_memberships()
|
||||
if departed_node:
|
||||
|
@ -1660,7 +1691,7 @@ def cluster_ready():
|
|||
|
||||
@returns boolean
|
||||
"""
|
||||
min_size = config('min-cluster-size')
|
||||
min_size = int(config('min-cluster-size') or 0)
|
||||
units = 1
|
||||
for rid in relation_ids('cluster'):
|
||||
units += len(related_units(rid))
|
||||
|
@ -1668,9 +1699,12 @@ def cluster_ready():
|
|||
min_size = units
|
||||
|
||||
if not is_sufficient_peers():
|
||||
log("In cluster_ready: not sufficient peers", level=DEBUG)
|
||||
return False
|
||||
elif min_size > 1:
|
||||
if not clustered():
|
||||
log("This unit is not detected as clustered, but should be at "
|
||||
"this stage", WARNING)
|
||||
return False
|
||||
clustered_units = 1
|
||||
for rid in relation_ids('cluster'):
|
||||
|
|
|
@ -328,13 +328,19 @@ def update_clients(check_deferred_restarts=True):
|
|||
if check_deferred_restarts and get_deferred_restarts():
|
||||
log("Not sendinfg client update as a restart is pending.", INFO)
|
||||
return
|
||||
if rabbit.leader_node_is_ready() or rabbit.client_node_is_ready():
|
||||
_leader_node_is_ready = rabbit.leader_node_is_ready()
|
||||
_client_node_is_ready = rabbit.client_node_is_ready()
|
||||
if _leader_node_is_ready or _client_node_is_ready:
|
||||
for rid in relation_ids('amqp'):
|
||||
for unit in related_units(rid):
|
||||
amqp_changed(
|
||||
relation_id=rid,
|
||||
remote_unit=unit,
|
||||
check_deferred_restarts=check_deferred_restarts)
|
||||
else:
|
||||
log("Not updating clients: leader node is ready:{}, "
|
||||
"client node is ready:{}".format(
|
||||
_leader_node_is_ready, _client_node_is_ready), DEBUG)
|
||||
|
||||
|
||||
@hooks.hook('dashboards-relation-joined')
|
||||
|
|
|
@ -480,6 +480,7 @@ class UtilsTests(CharmTestCase):
|
|||
mock_running_nodes.return_value = ['node12', 'node42', 'node27']
|
||||
self.assertTrue(rabbit_utils.clustered_with_leader())
|
||||
|
||||
@mock.patch('rabbit_utils.clear_nodes_cache')
|
||||
@mock.patch('rabbit_utils.get_unit_hostname')
|
||||
@mock.patch('rabbit_utils.time.time')
|
||||
@mock.patch.object(rabbit_utils, 'clustered_with_leader')
|
||||
|
@ -493,7 +494,8 @@ class UtilsTests(CharmTestCase):
|
|||
mock_relation_get,
|
||||
mock_clustered_with_leader,
|
||||
mock_time,
|
||||
mock_get_unit_hostname):
|
||||
mock_get_unit_hostname,
|
||||
mock_clear_nodes_cache):
|
||||
mock_get_unit_hostname.return_value = 'host1'
|
||||
mock_time.return_value = '12:30'
|
||||
mock_leader_node.return_value = 'node42'
|
||||
|
@ -504,6 +506,7 @@ class UtilsTests(CharmTestCase):
|
|||
mock_relation_id.return_value = 'rid1'
|
||||
mock_relation_get.return_value = 'True'
|
||||
rabbit_utils.update_peer_cluster_status()
|
||||
mock_clear_nodes_cache.assert_called_once_with()
|
||||
self.assertFalse(mock_relation_set.called)
|
||||
|
||||
mock_clustered_with_leader.return_value = True
|
||||
|
@ -1068,6 +1071,7 @@ class UtilsTests(CharmTestCase):
|
|||
'{"expires":23000}'
|
||||
)
|
||||
|
||||
@mock.patch.object(rabbit_utils, 'cluster_ready')
|
||||
@mock.patch.object(rabbit_utils, 'leader_get')
|
||||
@mock.patch.object(rabbit_utils, 'is_partitioned')
|
||||
@mock.patch.object(rabbit_utils, 'wait_app')
|
||||
|
@ -1079,7 +1083,7 @@ class UtilsTests(CharmTestCase):
|
|||
def test_assess_cluster_status(
|
||||
self, rabbitmq_is_installed, is_unit_paused_set,
|
||||
is_sufficient_peers, clustered, check_cluster_memberships,
|
||||
wait_app, is_partitioned, leader_get):
|
||||
wait_app, is_partitioned, leader_get, cluster_ready):
|
||||
is_partitioned.return_value = False
|
||||
self.relation_ids.return_value = ["cluster:1"]
|
||||
self.related_units.return_value = ["rabbitmq-server/1"]
|
||||
|
@ -1088,6 +1092,7 @@ class UtilsTests(CharmTestCase):
|
|||
'min-cluster-size': _min,
|
||||
'cluster-partition-handling': 'autoheal'}
|
||||
self.config.side_effect = lambda key: _config.get(key)
|
||||
cluster_ready.return_value = False
|
||||
|
||||
# Paused
|
||||
is_unit_paused_set.return_value = True
|
||||
|
@ -1116,8 +1121,15 @@ class UtilsTests(CharmTestCase):
|
|||
"Unit has peers, but RabbitMQ not clustered")
|
||||
self.assertEqual(_expected, rabbit_utils.assess_cluster_status())
|
||||
|
||||
# Departed node
|
||||
# rabbitmq-server is clustered, but the charms haven't yet caught up.
|
||||
clustered.return_value = True
|
||||
_expected = (
|
||||
"waiting",
|
||||
"RabbitMQ is clustered, but not all charms are ready")
|
||||
self.assertEqual(_expected, rabbit_utils.assess_cluster_status())
|
||||
|
||||
# Departed node
|
||||
cluster_ready.return_value = True
|
||||
_departed_node = "rabbit@hostname"
|
||||
check_cluster_memberships.return_value = _departed_node
|
||||
_expected = (
|
||||
|
|
Loading…
Reference in New Issue