Safely delete node from ring

Provide the delete-node-from-ring action to safely remove a known node
from the corosync ring.

Update the less safe update-ring action to avoid LP Bug #1933223 and
provide warnings in actions.yaml on its use.

Change-Id: I56cf2360ac41b12fc0a508881897ba63a5e89dbd
Closes-Bug: #1933223
This commit is contained in:
David Ames 2021-06-24 13:14:58 -07:00
parent 3b872ff4d2
commit 102d463aa3
6 changed files with 149 additions and 32 deletions

View File

@ -22,11 +22,26 @@ status:
type: boolean
description: Show cluster status history
update-ring:
description: Trigger corosync node members cleanup
description: |
Trigger corosync node members cleanup.
WARNING This action updates the corosync cluster members by adding or
removing nodes, which may lead to a loss of quorum and other unexpected
side-effects. It is strongly encouraged to manually remove nodes
individually using the delete-node-from-ring action.
params:
i-really-mean-it:
type: boolean
description: |
This must be toggled to enable actually performing this action
This must be toggled to enable actually performing this action.
required:
- i-really-mean-it
delete-node-from-ring:
description: Delete a node from the corosync ring. Must be run on the hacluster leader node.
params:
node:
type: string
description: |
Node name to be removed. i.e. hostname of the node
required:
- node

View File

@ -110,23 +110,7 @@ def cleanup(args):
"'{}'".format(resource_name))
def update_ring(args):
"""Update corosync.conf list of nodes (generally after unit removal)."""
if not action_get('i-really-mean-it'):
action_fail('i-really-mean-it is a required parameter')
return
if not is_leader():
action_fail('only the Juju leader can run this action')
return
diff_nodes = update_node_list()
if not diff_nodes:
# No differences between discovered Pacemaker nodes and
# Juju nodes (ie. no node removal)
action_set({'result': 'noop'})
return
def _trigger_corosync_update():
# Trigger emit_corosync_conf() and corosync-cfgtool -R
# for all the hanode peer units to run
relid = relation_ids('hanode')
@ -144,11 +128,66 @@ def update_ring(args):
emit_corosync_conf()):
cmd = 'corosync-cfgtool -R'
pcmk.commit(cmd)
action_set({'result': 'success'})
def update_ring(args):
"""Update corosync.conf list of nodes (generally after unit removal)."""
if not function_get('i-really-mean-it'):
function_fail('i-really-mean-it is a required parameter')
return
if not is_leader():
function_fail('only the Juju leader can run this action')
return
diff_nodes = update_node_list()
log("Unexpected node(s) found and removed: {}"
.format(",".join(list(diff_nodes))))
if not diff_nodes:
# No differences between discovered Pacemaker nodes and
# Juju nodes (ie. no node removal)
function_set({'result': 'No changes required.'})
return
# Notify the cluster
_trigger_corosync_update()
function_set(
{"result":
"Nodes removed: {}"
.format(" ".join(list(diff_nodes)))})
def delete_node_from_ring(args):
"""Delete a node from the corosync ring."""
node = function_get('node')
if not node:
function_fail('node is a required parameter')
return
if not is_leader():
function_fail('only the Juju leader can run this action')
return
# Delete the node from the live corosync env
try:
pcmk.set_node_status_to_maintenance(node)
pcmk.delete_node(node, failure_is_fatal=True)
except subprocess.CalledProcessError as e:
function_fail(
"Removing {} from the cluster failed. {} output={}"
.format(node, e, e.output))
# Notify the cluster
_trigger_corosync_update()
function_set({'result': 'success'})
ACTIONS = {"pause": pause, "resume": resume,
"status": status, "cleanup": cleanup,
"delete-node-from-ring": delete_node_from_ring,
"update-ring": update_ring}

View File

@ -0,0 +1 @@
actions.py

View File

@ -18,7 +18,6 @@ import copy
import glob
import os
import shutil
import socket
import sys
import traceback
@ -598,11 +597,11 @@ def ha_relation_changed():
@hooks.hook()
def stop():
# NOTE(lourot): This seems to always fail with
# 'ERROR: node <node_name> not found in the CIB', which means that the node
# has already been removed from the cluster. Thus failure_is_fatal=False.
# We might consider getting rid of this call.
pcmk.delete_node(socket.gethostname(), failure_is_fatal=False)
# The pcmk.delete_node handles several known failure modes so
# failure_is_fatal=True actually helps as it causes retries.
node = get_hostname()
pcmk.set_node_status_to_maintenance(node)
pcmk.delete_node(node, failure_is_fatal=True)
apt_purge(['corosync', 'pacemaker'], fatal=True)

View File

@ -1553,19 +1553,32 @@ def is_stonith_configured():
return bool_from_string(configured)
def get_hanode_hostnames():
"""Hostnames of nodes in the hanode relation.
:returns: List of hostnames of nodes in the hanode relation.
:rtype: List
"""
hanode_hostnames = [get_hostname()]
for relid in relation_ids('hanode'):
for unit in related_units(relid):
hostname = relation_get('hostname', rid=relid, unit=unit)
if hostname:
hanode_hostnames.append(hostname)
hanode_hostnames.sort()
return hanode_hostnames
def update_node_list():
"""Delete a node from the corosync ring when a Juju unit is removed.
"""Determine and delete unexpected nodes from the corosync ring.
:returns: Set of pcmk nodes not part of Juju hanode relation
:rtype: Set[str]
:raises: RemoveCorosyncNodeFailed
"""
pcmk_nodes = set(pcmk.list_nodes())
juju_nodes = {socket.gethostname()}
juju_hanode_rel = get_ha_nodes()
for corosync_id, addr in juju_hanode_rel.items():
peer_node_name = utils.get_hostname(addr, fqdn=False)
juju_nodes.add(peer_node_name)
juju_nodes = set(get_hanode_hostnames())
diff_nodes = pcmk_nodes.difference(juju_nodes)
log("pcmk_nodes[{}], juju_nodes[{}], diff[{}]"

View File

@ -1317,3 +1317,53 @@ class UtilsTestCase(unittest.TestCase):
'hanode:0',
),
)
@mock.patch.object(utils, 'get_hostname')
@mock.patch.object(utils, 'relation_get')
@mock.patch.object(utils, 'related_units')
@mock.patch.object(utils, 'relation_ids')
def test_get_hanode_hostnames(
self, _relation_ids, _related_units, _relation_get, _get_hostname):
_data = {
"node/1": {"hostname": "beta"},
"node/3": {"hostname": "alpha"},
}
def _rel_data(_param, unit, rid):
return _data[unit][_param]
_relation_ids.return_value = ["hanode:1"]
_related_units.return_value = ["node/1", "node/3"]
_relation_get.side_effect = _rel_data
_get_hostname.return_value = "gamma"
_expected = ["alpha", "beta", "gamma"]
self.assertEqual(
utils.get_hanode_hostnames(),
_expected)
@mock.patch.object(utils, 'get_hanode_hostnames')
@mock.patch.object(utils, 'pcmk')
def test_update_node_list(self, _pcmk, _get_hanode_hostnames):
_pcmk.list_nodes.return_value = ["zeta", "beta", "psi"]
_get_hanode_hostnames.return_value = ["alpha", "beta", "gamma"]
_expected = set(["zeta", "psi"])
self.assertEqual(
utils.update_node_list(),
_expected)
_pcmk.set_node_status_to_maintenance.assert_has_calls(
[mock.call('zeta'),
mock.call('psi')],
any_order=True)
_pcmk.delete_node.assert_has_calls(
[mock.call('zeta'),
mock.call('psi')],
any_order=True)
# Raise RemoveCorosyncNodeFailed
_pcmk.delete_node.side_effect = subprocess.CalledProcessError(
127, "fake crm command")
with self.assertRaises(utils.RemoveCorosyncNodeFailed):
utils.update_node_list()