From 81f08ab7695ab507ade076c33d0cc168a03be221 Mon Sep 17 00:00:00 2001
From: Alex Kavanagh
Date: Fri, 23 Dec 2022 13:20:34 +0000
Subject: [PATCH] Fix issue where charms aren't clustered but RMQ is

Due to the @cached decorator in the code, it was possible to get the
charm into a state where RMQ is clustered, but the charm doesn't record
it. The charm 'thinks' it is clustered when it has set the 'clustered'
key on the 'cluster' relation. Unfortunately, due to the @cached
decorator, it's possible in the 'cluster-relation-changed' hook for the
RMQ instance to cluster during the hook execution and then, later, when
the hook is supposed to write the 'clustered' key, to read the stale
cached value from before the cluster formed and therefore not set the
'clustered' key. That hook is just about the only opportunity to set
the key, so the charm ends up stuck in that state.

The fix was to clear the @cached values so that the nodes are re-read,
which allows the charm to then write the 'clustered' key.
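
To illustrate the failure mode, here is a minimal, self-contained
sketch of the caching behaviour. The cached/flush pair is a hand-rolled
stand-in for the charmhelpers hookenv helpers, and STATE/clustered()
are illustrative names only, not the charm's own code:

    CACHE = {}
    STATE = {'clustered': False}   # stands in for live rabbitmqctl state

    def cached(func):
        # Memoise for the rest of the process (i.e. the hook run): the
        # first call stores the result, every later call replays it.
        def wrapper(*args):
            key = (func.__name__,) + args
            if key not in CACHE:
                CACHE[key] = func(*args)
            return CACHE[key]
        return wrapper

    def flush(name):
        # Drop every cached entry whose key mentions 'name' so that the
        # next call re-runs the real function.
        for key in [k for k in CACHE if name in str(k)]:
            del CACHE[key]

    @cached
    def clustered():
        return STATE['clustered']

    assert clustered() is False    # cached early in the hook
    STATE['clustered'] = True      # RMQ clusters mid-hook
    assert clustered() is False    # stale: the cached value wins
    flush('clustered')             # the fix: clear the cache ...
    assert clustered() is True     # ... so the state is re-read

update_peer_cluster_status() runs into exactly this pattern via the
charm's @cached helpers, hence the clear_nodes_cache() call added in
this patch.
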
Change-Id: I12be41a83323d150ba1cbaeef64041f0bb5e32ce
Closes-Bug: #1975605
---
 hooks/rabbit_utils.py              | 38 ++++++++++++++++++++++++++++--
 hooks/rabbitmq_server_relations.py |  8 ++++++-
 unit_tests/test_rabbit_utils.py    | 18 +++++++++++---
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/hooks/rabbit_utils.py b/hooks/rabbit_utils.py
index fd7b7194..99af07a9 100644
--- a/hooks/rabbit_utils.py
+++ b/hooks/rabbit_utils.py
@@ -69,6 +69,7 @@ from charmhelpers.core.hookenv import (
     service_name,
     status_set,
     cached,
+    flush,
     relation_set,
     relation_get,
     application_version_set,
@@ -972,6 +973,9 @@ def clustered_with_leader():
 
 def update_peer_cluster_status():
     """Inform peers that this unit is clustered if it is."""
+    # clear the nodes cache so that the rabbit nodes are re-evaluated if the
+    # cluster has formed during the hook execution.
+    clear_nodes_cache()
     # check the leader and try to cluster with it
     if clustered_with_leader():
         log('Host already clustered with %s.' % leader_node())
@@ -986,9 +990,14 @@ def update_peer_cluster_status():
             # NOTE(freyes): this node needs to be marked as clustered, it's
             # part of the cluster according to 'rabbitmqctl cluster_status'
             # (LP: #1691510)
+            log("Setting 'clustered' on 'cluster' relation", level=DEBUG)
             relation_set(relation_id=cluster_rid,
                          clustered=get_unit_hostname(),
                          timestamp=time.time())
+        else:
+            log("Already set 'clustered' on cluster relation", level=DEBUG)
+    else:
+        log('Host not clustered with leader', level=DEBUG)
 
 
 def join_leader():
@@ -1005,6 +1014,10 @@ def join_leader():
     log("Attempting to cluster with leader", "INFO")
     try:
         node = leader_node()
+        if node is None:
+            log("Couldn't identify leader", "INFO")
+            status_set('blocked', "Leader not yet identified")
+            return
         join_cluster(node)
         # NOTE: toggle the cluster relation to ensure that any peers
         # already clustered re-assess status correctly
@@ -1320,7 +1333,9 @@ def is_partitioned():
 @cached
 def running_nodes():
     ''' Determine the current set of running nodes in the RabbitMQ cluster '''
-    return nodes(get_running=True)
+    _nodes = nodes(get_running=True)
+    log("running_nodes: {}".format(_nodes), DEBUG)
+    return _nodes
 
 
 @cached
@@ -1335,13 +1350,25 @@ def leader_node():
         leader_node_hostname = peer_retrieve('leader_node_hostname')
     except ValueError:
         # This is a single unit
+        log("leader_node: None", DEBUG)
         return None
     if leader_node_hostname:
+        log("leader_node: rabbit@{}".format(leader_node_hostname), DEBUG)
         return "rabbit@" + leader_node_hostname
     else:
+        log("leader_node: None", DEBUG)
         return None
 
 
+def clear_nodes_cache():
+    """Clear cached nodes(), running_nodes(), leader_node(), clustered()."""
+    log("Clearing the cache for nodes.", DEBUG)
+    flush('leader_node')
+    flush('running_nodes')
+    flush('nodes')
+    flush('clustered')
+
+
 def get_node_hostname(ip_addr):
     ''' Resolve IP address to hostname '''
     try:
@@ -1390,6 +1417,10 @@ def assess_cluster_status(*args):
 
     if not clustered():
         return 'waiting', 'Unit has peers, but RabbitMQ not clustered'
+    # See if all the rabbitmq charms think they are clustered (i.e. that
+    # the 'clustered' attribute is set on the 'cluster' relation)
+    if not cluster_ready():
+        return 'waiting', 'RabbitMQ is clustered, but not all charms are ready'
     # Departed nodes
     departed_node = check_cluster_memberships()
     if departed_node:
@@ -1660,7 +1691,7 @@ def cluster_ready():
 
     @returns boolean
     """
-    min_size = config('min-cluster-size')
+    min_size = int(config('min-cluster-size') or 0)
     units = 1
     for rid in relation_ids('cluster'):
         units += len(related_units(rid))
@@ -1668,9 +1699,12 @@ def cluster_ready():
         min_size = units
 
     if not is_sufficient_peers():
+        log("In cluster_ready: not sufficient peers", level=DEBUG)
         return False
     elif min_size > 1:
         if not clustered():
+            log("This unit is not detected as clustered, but should be at "
+                "this stage", WARNING)
             return False
         clustered_units = 1
         for rid in relation_ids('cluster'):
diff --git a/hooks/rabbitmq_server_relations.py b/hooks/rabbitmq_server_relations.py
index 90357921..67bd4ccb 100755
--- a/hooks/rabbitmq_server_relations.py
+++ b/hooks/rabbitmq_server_relations.py
@@ -328,13 +328,19 @@ def update_clients(check_deferred_restarts=True):
 
     if check_deferred_restarts and get_deferred_restarts():
         log("Not sending client update as a restart is pending.", INFO)
         return
-    if rabbit.leader_node_is_ready() or rabbit.client_node_is_ready():
+    _leader_node_is_ready = rabbit.leader_node_is_ready()
+    _client_node_is_ready = rabbit.client_node_is_ready()
+    if _leader_node_is_ready or _client_node_is_ready:
         for rid in relation_ids('amqp'):
             for unit in related_units(rid):
                 amqp_changed(
                     relation_id=rid, remote_unit=unit,
                     check_deferred_restarts=check_deferred_restarts)
+    else:
+        log("Not updating clients: leader node is ready:{}, "
+            "client node is ready:{}".format(
+                _leader_node_is_ready, _client_node_is_ready), DEBUG)
 
 
 @hooks.hook('dashboards-relation-joined')
diff --git a/unit_tests/test_rabbit_utils.py b/unit_tests/test_rabbit_utils.py
index 680b5874..232976a6 100644
--- a/unit_tests/test_rabbit_utils.py
+++ b/unit_tests/test_rabbit_utils.py
@@ -480,6 +480,7 @@ class UtilsTests(CharmTestCase):
         mock_running_nodes.return_value = ['node12', 'node42', 'node27']
         self.assertTrue(rabbit_utils.clustered_with_leader())
 
+    @mock.patch('rabbit_utils.clear_nodes_cache')
     @mock.patch('rabbit_utils.get_unit_hostname')
     @mock.patch('rabbit_utils.time.time')
     @mock.patch.object(rabbit_utils, 'clustered_with_leader')
@@ -493,7 +494,8 @@ class UtilsTests(CharmTestCase):
             mock_relation_get,
             mock_clustered_with_leader,
             mock_time,
-            mock_get_unit_hostname):
+            mock_get_unit_hostname,
+            mock_clear_nodes_cache):
         mock_get_unit_hostname.return_value = 'host1'
         mock_time.return_value = '12:30'
         mock_leader_node.return_value = 'node42'
@@ -504,6 +506,7 @@ class UtilsTests(CharmTestCase):
         mock_relation_id.return_value = 'rid1'
         mock_relation_get.return_value = 'True'
         rabbit_utils.update_peer_cluster_status()
+        mock_clear_nodes_cache.assert_called_once_with()
         self.assertFalse(mock_relation_set.called)
 
         mock_clustered_with_leader.return_value = True
@@ -1068,6 +1071,7 @@ class UtilsTests(CharmTestCase):
             '{"expires":23000}'
         )
 
+    @mock.patch.object(rabbit_utils, 'cluster_ready')
     @mock.patch.object(rabbit_utils, 'leader_get')
     @mock.patch.object(rabbit_utils, 'is_partitioned')
     @mock.patch.object(rabbit_utils, 'wait_app')
@@ -1079,7 +1083,7 @@ class UtilsTests(CharmTestCase):
     def test_assess_cluster_status(
             self, rabbitmq_is_installed, is_unit_paused_set,
             is_sufficient_peers, clustered, check_cluster_memberships,
-            wait_app, is_partitioned, leader_get):
+            wait_app, is_partitioned, leader_get, cluster_ready):
         is_partitioned.return_value = False
         self.relation_ids.return_value = ["cluster:1"]
         self.related_units.return_value = ["rabbitmq-server/1"]
@@ -1088,6 +1092,7 @@ class UtilsTests(CharmTestCase):
             'min-cluster-size': _min,
             'cluster-partition-handling': 'autoheal'}
         self.config.side_effect = lambda key: _config.get(key)
+        cluster_ready.return_value = False
 
         # Paused
         is_unit_paused_set.return_value = True
@@ -1116,8 +1121,15 @@ class UtilsTests(CharmTestCase):
             "Unit has peers, but RabbitMQ not clustered")
         self.assertEqual(_expected, rabbit_utils.assess_cluster_status())
 
-        # Departed node
+        # rabbitmq-server is clustered, but the charms haven't yet caught up.
         clustered.return_value = True
+        _expected = (
+            "waiting",
+            "RabbitMQ is clustered, but not all charms are ready")
+        self.assertEqual(_expected, rabbit_utils.assess_cluster_status())
+
+        # Departed node
+        cluster_ready.return_value = True
         _departed_node = "rabbit@hostname"
         check_cluster_memberships.return_value = _departed_node
         _expected = (