Fix issue where charms aren't clustered but RMQ is

Due to the @cached decorator in the code, it was possible to get the
charm into a state where RMQ is clustered, but the charm doesn't record
it.  The charm 'thinks' it is clustered when it has set the 'clustered'
key on the 'cluster' relation.  Unfortunately, due to the @cached
decorator, it's possible in the 'cluster-relation-changed' hook for the
RMQ instance to cluster during the hook execution and then, later, when
the hook is supposed to write the 'clustered' key, to read the stale
cached value from before the cluster formed and therefore not set the
'clustered' key.  This hook is just about the only opportunity to set
the key, so the charm ends up locked in that state.
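
To illustrate the mechanism, here is a minimal, runnable sketch.  The
cached() below only follows the general shape of
charmhelpers.core.hookenv.cached (a module-level dict that lives for
the duration of one hook run); cluster_members and running_nodes() are
hypothetical stand-ins for the real charm code and 'rabbitmqctl
cluster_status':

    import functools

    _cache = {}  # like hookenv's cache: lives for one hook execution

    def cached(func):
        # Memoise func for the rest of the hook run.
        @functools.wraps(func)
        def wrapper(*args):
            key = (func.__name__, args)
            if key not in _cache:
                _cache[key] = func(*args)
            return _cache[key]
        return wrapper

    cluster_members = []  # stand-in for 'rabbitmqctl cluster_status'

    @cached
    def running_nodes():
        return list(cluster_members)

    print(running_nodes())                   # [] is computed and cached
    cluster_members.append('rabbit@host1')   # RMQ clusters mid-hook...
    print(running_nodes())                   # still [] - stale cached value,
                                             # so 'clustered' never gets set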

The fix is to clear the @cached values so that the nodes are re-read,
which then allows the charm to write the 'clustered' key.
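
Continuing the sketch above, flushing the matching cache entries (which
is what the new clear_nodes_cache() helper does via hookenv's flush())
lets the next call re-read the real state; the flush() below assumes
hookenv's behaviour of dropping every entry whose key contains the
given fragment:

    def flush(fragment):
        # Drop every cached entry whose key mentions the fragment.
        for key in [k for k in _cache if fragment in str(k)]:
            del _cache[key]

    flush('running_nodes')  # what clear_nodes_cache() does per cached name
    print(running_nodes())  # ['rabbit@host1'] - cluster is now visible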

Change-Id: I12be41a83323d150ba1cbaeef64041f0bb5e32ce
Closes-Bug: #1975605
Alex Kavanagh 2022-12-23 13:20:34 +00:00
parent 09ade6b5ee
commit 81f08ab769
3 changed files with 58 additions and 6 deletions


@@ -69,6 +69,7 @@ from charmhelpers.core.hookenv import (
     service_name,
     status_set,
     cached,
+    flush,
     relation_set,
     relation_get,
     application_version_set,
@@ -972,6 +973,9 @@ def clustered_with_leader():

 def update_peer_cluster_status():
     """Inform peers that this unit is clustered if it is."""
+    # clear the nodes cache so that the rabbit nodes are re-evaluated if the
+    # cluster has formed during the hook execution.
+    clear_nodes_cache()
     # check the leader and try to cluster with it
     if clustered_with_leader():
         log('Host already clustered with %s.' % leader_node())
@@ -986,9 +990,14 @@ def update_peer_cluster_status():
             # NOTE(freyes): this node needs to be marked as clustered, it's
             # part of the cluster according to 'rabbitmqctl cluster_status'
             # (LP: #1691510)
+            log("Setting 'clustered' on 'cluster' relation", level=DEBUG)
             relation_set(relation_id=cluster_rid,
                          clustered=get_unit_hostname(),
                          timestamp=time.time())
+        else:
+            log("Already set 'clustered' on cluster relation", level=DEBUG)
+    else:
+        log('Host not clustered with leader', level=DEBUG)


 def join_leader():
@@ -1005,6 +1014,10 @@ def join_leader():
     log("Attempting to cluster with leader", "INFO")
     try:
         node = leader_node()
+        if node is None:
+            log("Couldn't identify leader", "INFO")
+            status_set('blocked', "Leader not yet identified")
+            return
         join_cluster(node)
         # NOTE: toggle the cluster relation to ensure that any peers
         # already clustered re-assess status correctly
@@ -1320,7 +1333,9 @@ def is_partitioned():
 @cached
 def running_nodes():
     ''' Determine the current set of running nodes in the RabbitMQ cluster '''
-    return nodes(get_running=True)
+    _nodes = nodes(get_running=True)
+    log("running_nodes: {}".format(_nodes), DEBUG)
+    return _nodes


 @cached
@@ -1335,13 +1350,25 @@ def leader_node():
         leader_node_hostname = peer_retrieve('leader_node_hostname')
     except ValueError:
         # This is a single unit
+        log("leader_node: None", DEBUG)
         return None
     if leader_node_hostname:
+        log("leader_node: rabbit@{}".format(leader_node_hostname), DEBUG)
         return "rabbit@" + leader_node_hostname
     else:
+        log("leader_node: None", DEBUG)
         return None


+def clear_nodes_cache():
+    """Clear the nodes(), running_nodes(), leader_node() and clustered() caches"""
+    log("Clearing the cache for nodes.", DEBUG)
+    flush('leader_node')
+    flush('running_nodes')
+    flush('nodes')
+    flush('clustered')
+
+
 def get_node_hostname(ip_addr):
     ''' Resolve IP address to hostname '''
     try:
@@ -1390,6 +1417,10 @@ def assess_cluster_status(*args):
     if not clustered():
         return 'waiting', 'Unit has peers, but RabbitMQ not clustered'
+    # See if all the rabbitmq charms think they are clustered (e.g. that the
+    # 'clustered' attribute is set on the 'cluster' relation).
+    if not cluster_ready():
+        return 'waiting', 'RabbitMQ is clustered, but not all charms are ready'

     # Departed nodes
     departed_node = check_cluster_memberships()
     if departed_node:
@@ -1660,7 +1691,7 @@ def cluster_ready():
     @returns boolean
     """
-    min_size = config('min-cluster-size')
+    min_size = int(config('min-cluster-size') or 0)
     units = 1
     for rid in relation_ids('cluster'):
         units += len(related_units(rid))
@@ -1668,9 +1699,12 @@ def cluster_ready():
         min_size = units

     if not is_sufficient_peers():
+        log("In cluster_ready: not sufficient peers", level=DEBUG)
         return False
     elif min_size > 1:
         if not clustered():
+            log("This unit is not detected as clustered, but should be at "
+                "this stage", WARNING)
             return False
         clustered_units = 1
         for rid in relation_ids('cluster'):


@@ -328,13 +328,19 @@ def update_clients(check_deferred_restarts=True):
     if check_deferred_restarts and get_deferred_restarts():
         log("Not sending client update as a restart is pending.", INFO)
         return
-    if rabbit.leader_node_is_ready() or rabbit.client_node_is_ready():
+    _leader_node_is_ready = rabbit.leader_node_is_ready()
+    _client_node_is_ready = rabbit.client_node_is_ready()
+    if _leader_node_is_ready or _client_node_is_ready:
         for rid in relation_ids('amqp'):
             for unit in related_units(rid):
                 amqp_changed(
                     relation_id=rid,
                     remote_unit=unit,
                     check_deferred_restarts=check_deferred_restarts)
+    else:
+        log("Not updating clients: leader node is ready:{}, "
+            "client node is ready:{}".format(
+                _leader_node_is_ready, _client_node_is_ready), DEBUG)


 @hooks.hook('dashboards-relation-joined')


@@ -480,6 +480,7 @@ class UtilsTests(CharmTestCase):
         mock_running_nodes.return_value = ['node12', 'node42', 'node27']
         self.assertTrue(rabbit_utils.clustered_with_leader())

+    @mock.patch('rabbit_utils.clear_nodes_cache')
     @mock.patch('rabbit_utils.get_unit_hostname')
     @mock.patch('rabbit_utils.time.time')
     @mock.patch.object(rabbit_utils, 'clustered_with_leader')
@@ -493,7 +494,8 @@ class UtilsTests(CharmTestCase):
             mock_relation_get,
             mock_clustered_with_leader,
             mock_time,
-            mock_get_unit_hostname):
+            mock_get_unit_hostname,
+            mock_clear_nodes_cache):
         mock_get_unit_hostname.return_value = 'host1'
         mock_time.return_value = '12:30'
         mock_leader_node.return_value = 'node42'
@@ -504,6 +506,7 @@ class UtilsTests(CharmTestCase):
         mock_relation_id.return_value = 'rid1'
         mock_relation_get.return_value = 'True'
         rabbit_utils.update_peer_cluster_status()
+        mock_clear_nodes_cache.assert_called_once_with()
         self.assertFalse(mock_relation_set.called)

         mock_clustered_with_leader.return_value = True
@@ -1068,6 +1071,7 @@ class UtilsTests(CharmTestCase):
             '{"expires":23000}'
         )

+    @mock.patch.object(rabbit_utils, 'cluster_ready')
     @mock.patch.object(rabbit_utils, 'leader_get')
     @mock.patch.object(rabbit_utils, 'is_partitioned')
     @mock.patch.object(rabbit_utils, 'wait_app')
@@ -1079,7 +1083,7 @@ class UtilsTests(CharmTestCase):
     def test_assess_cluster_status(
             self, rabbitmq_is_installed, is_unit_paused_set,
             is_sufficient_peers, clustered, check_cluster_memberships,
-            wait_app, is_partitioned, leader_get):
+            wait_app, is_partitioned, leader_get, cluster_ready):
         is_partitioned.return_value = False
         self.relation_ids.return_value = ["cluster:1"]
         self.related_units.return_value = ["rabbitmq-server/1"]
@@ -1088,6 +1092,7 @@ class UtilsTests(CharmTestCase):
             'min-cluster-size': _min,
             'cluster-partition-handling': 'autoheal'}
         self.config.side_effect = lambda key: _config.get(key)
+        cluster_ready.return_value = False

         # Paused
         is_unit_paused_set.return_value = True
@@ -1116,8 +1121,15 @@ class UtilsTests(CharmTestCase):
             "Unit has peers, but RabbitMQ not clustered")
         self.assertEqual(_expected, rabbit_utils.assess_cluster_status())

-        # Departed node
+        # rabbitmq-server is clustered, but the charms haven't yet caught up.
         clustered.return_value = True
+        _expected = (
+            "waiting",
+            "RabbitMQ is clustered, but not all charms are ready")
+        self.assertEqual(_expected, rabbit_utils.assess_cluster_status())
+
+        # Departed node
+        cluster_ready.return_value = True
         _departed_node = "rabbit@hostname"
         check_cluster_memberships.return_value = _departed_node
         _expected = (