Safely delete node from ring
Provide the delete-node-from-ring action to safely remove a known node from the corosync ring. Update the less safe update-ring action to avoid LP Bug #1933223 and provide warnings in actions.yaml on its use. Change-Id: I56cf2360ac41b12fc0a508881897ba63a5e89dbd Closes-Bug: #1933223
This commit is contained in:
parent
3b872ff4d2
commit
102d463aa3
19
actions.yaml
19
actions.yaml
@ -22,11 +22,26 @@ status:
|
||||
type: boolean
|
||||
description: Show cluster status history
|
||||
update-ring:
|
||||
description: Trigger corosync node members cleanup
|
||||
description: |
|
||||
Trigger corosync node members cleanup.
|
||||
|
||||
WARNING This action updates the corosync cluster members by adding or
|
||||
removing nodes, which may lead to a loss of quorum and other unexpected
|
||||
side-effects. It is strongly encouraged to manually remove nodes
|
||||
individually using the delete-node-from-ring action.
|
||||
params:
|
||||
i-really-mean-it:
|
||||
type: boolean
|
||||
description: |
|
||||
This must be toggled to enable actually performing this action
|
||||
This must be toggled to enable actually performing this action.
|
||||
required:
|
||||
- i-really-mean-it
|
||||
delete-node-from-ring:
|
||||
description: Delete a node from the corosync ring. Must be run on the hacluster leader node.
|
||||
params:
|
||||
node:
|
||||
type: string
|
||||
description: |
|
||||
Node name to be removed. i.e. hostname of the node
|
||||
required:
|
||||
- node
|
||||
|
@ -110,23 +110,7 @@ def cleanup(args):
|
||||
"'{}'".format(resource_name))
|
||||
|
||||
|
||||
def update_ring(args):
|
||||
"""Update corosync.conf list of nodes (generally after unit removal)."""
|
||||
if not action_get('i-really-mean-it'):
|
||||
action_fail('i-really-mean-it is a required parameter')
|
||||
return
|
||||
|
||||
if not is_leader():
|
||||
action_fail('only the Juju leader can run this action')
|
||||
return
|
||||
|
||||
diff_nodes = update_node_list()
|
||||
if not diff_nodes:
|
||||
# No differences between discovered Pacemaker nodes and
|
||||
# Juju nodes (ie. no node removal)
|
||||
action_set({'result': 'noop'})
|
||||
return
|
||||
|
||||
def _trigger_corosync_update():
|
||||
# Trigger emit_corosync_conf() and corosync-cfgtool -R
|
||||
# for all the hanode peer units to run
|
||||
relid = relation_ids('hanode')
|
||||
@ -144,11 +128,66 @@ def update_ring(args):
|
||||
emit_corosync_conf()):
|
||||
cmd = 'corosync-cfgtool -R'
|
||||
pcmk.commit(cmd)
|
||||
action_set({'result': 'success'})
|
||||
|
||||
|
||||
def update_ring(args):
|
||||
"""Update corosync.conf list of nodes (generally after unit removal)."""
|
||||
if not function_get('i-really-mean-it'):
|
||||
function_fail('i-really-mean-it is a required parameter')
|
||||
return
|
||||
|
||||
if not is_leader():
|
||||
function_fail('only the Juju leader can run this action')
|
||||
return
|
||||
|
||||
diff_nodes = update_node_list()
|
||||
log("Unexpected node(s) found and removed: {}"
|
||||
.format(",".join(list(diff_nodes))))
|
||||
if not diff_nodes:
|
||||
# No differences between discovered Pacemaker nodes and
|
||||
# Juju nodes (ie. no node removal)
|
||||
function_set({'result': 'No changes required.'})
|
||||
return
|
||||
|
||||
# Notify the cluster
|
||||
_trigger_corosync_update()
|
||||
|
||||
function_set(
|
||||
{"result":
|
||||
"Nodes removed: {}"
|
||||
.format(" ".join(list(diff_nodes)))})
|
||||
|
||||
|
||||
def delete_node_from_ring(args):
|
||||
"""Delete a node from the corosync ring."""
|
||||
|
||||
node = function_get('node')
|
||||
if not node:
|
||||
function_fail('node is a required parameter')
|
||||
return
|
||||
|
||||
if not is_leader():
|
||||
function_fail('only the Juju leader can run this action')
|
||||
return
|
||||
|
||||
# Delete the node from the live corosync env
|
||||
try:
|
||||
pcmk.set_node_status_to_maintenance(node)
|
||||
pcmk.delete_node(node, failure_is_fatal=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
function_fail(
|
||||
"Removing {} from the cluster failed. {} output={}"
|
||||
.format(node, e, e.output))
|
||||
|
||||
# Notify the cluster
|
||||
_trigger_corosync_update()
|
||||
|
||||
function_set({'result': 'success'})
|
||||
|
||||
|
||||
ACTIONS = {"pause": pause, "resume": resume,
|
||||
"status": status, "cleanup": cleanup,
|
||||
"delete-node-from-ring": delete_node_from_ring,
|
||||
"update-ring": update_ring}
|
||||
|
||||
|
||||
|
1
actions/delete-node-from-ring
Symbolic link
1
actions/delete-node-from-ring
Symbolic link
@ -0,0 +1 @@
|
||||
actions.py
|
@ -18,7 +18,6 @@ import copy
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
import socket
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
@ -598,11 +597,11 @@ def ha_relation_changed():
|
||||
|
||||
@hooks.hook()
|
||||
def stop():
|
||||
# NOTE(lourot): This seems to always fail with
|
||||
# 'ERROR: node <node_name> not found in the CIB', which means that the node
|
||||
# has already been removed from the cluster. Thus failure_is_fatal=False.
|
||||
# We might consider getting rid of this call.
|
||||
pcmk.delete_node(socket.gethostname(), failure_is_fatal=False)
|
||||
# The pcmk.delete_node handles several known failure modes so
|
||||
# failure_is_fatal=True actually helps as it causes retries.
|
||||
node = get_hostname()
|
||||
pcmk.set_node_status_to_maintenance(node)
|
||||
pcmk.delete_node(node, failure_is_fatal=True)
|
||||
|
||||
apt_purge(['corosync', 'pacemaker'], fatal=True)
|
||||
|
||||
|
@ -1553,19 +1553,32 @@ def is_stonith_configured():
|
||||
return bool_from_string(configured)
|
||||
|
||||
|
||||
def get_hanode_hostnames():
|
||||
"""Hostnames of nodes in the hanode relation.
|
||||
|
||||
:returns: List of hostnames of nodes in the hanode relation.
|
||||
:rtype: List
|
||||
"""
|
||||
hanode_hostnames = [get_hostname()]
|
||||
for relid in relation_ids('hanode'):
|
||||
for unit in related_units(relid):
|
||||
hostname = relation_get('hostname', rid=relid, unit=unit)
|
||||
if hostname:
|
||||
hanode_hostnames.append(hostname)
|
||||
|
||||
hanode_hostnames.sort()
|
||||
return hanode_hostnames
|
||||
|
||||
|
||||
def update_node_list():
|
||||
"""Delete a node from the corosync ring when a Juju unit is removed.
|
||||
"""Determine and delete unexpected nodes from the corosync ring.
|
||||
|
||||
:returns: Set of pcmk nodes not part of Juju hanode relation
|
||||
:rtype: Set[str]
|
||||
:raises: RemoveCorosyncNodeFailed
|
||||
"""
|
||||
pcmk_nodes = set(pcmk.list_nodes())
|
||||
juju_nodes = {socket.gethostname()}
|
||||
juju_hanode_rel = get_ha_nodes()
|
||||
for corosync_id, addr in juju_hanode_rel.items():
|
||||
peer_node_name = utils.get_hostname(addr, fqdn=False)
|
||||
juju_nodes.add(peer_node_name)
|
||||
juju_nodes = set(get_hanode_hostnames())
|
||||
|
||||
diff_nodes = pcmk_nodes.difference(juju_nodes)
|
||||
log("pcmk_nodes[{}], juju_nodes[{}], diff[{}]"
|
||||
|
@ -1317,3 +1317,53 @@ class UtilsTestCase(unittest.TestCase):
|
||||
'hanode:0',
|
||||
),
|
||||
)
|
||||
|
||||
@mock.patch.object(utils, 'get_hostname')
|
||||
@mock.patch.object(utils, 'relation_get')
|
||||
@mock.patch.object(utils, 'related_units')
|
||||
@mock.patch.object(utils, 'relation_ids')
|
||||
def test_get_hanode_hostnames(
|
||||
self, _relation_ids, _related_units, _relation_get, _get_hostname):
|
||||
_data = {
|
||||
"node/1": {"hostname": "beta"},
|
||||
"node/3": {"hostname": "alpha"},
|
||||
}
|
||||
|
||||
def _rel_data(_param, unit, rid):
|
||||
return _data[unit][_param]
|
||||
|
||||
_relation_ids.return_value = ["hanode:1"]
|
||||
_related_units.return_value = ["node/1", "node/3"]
|
||||
_relation_get.side_effect = _rel_data
|
||||
_get_hostname.return_value = "gamma"
|
||||
_expected = ["alpha", "beta", "gamma"]
|
||||
|
||||
self.assertEqual(
|
||||
utils.get_hanode_hostnames(),
|
||||
_expected)
|
||||
|
||||
@mock.patch.object(utils, 'get_hanode_hostnames')
|
||||
@mock.patch.object(utils, 'pcmk')
|
||||
def test_update_node_list(self, _pcmk, _get_hanode_hostnames):
|
||||
_pcmk.list_nodes.return_value = ["zeta", "beta", "psi"]
|
||||
_get_hanode_hostnames.return_value = ["alpha", "beta", "gamma"]
|
||||
_expected = set(["zeta", "psi"])
|
||||
|
||||
self.assertEqual(
|
||||
utils.update_node_list(),
|
||||
_expected)
|
||||
|
||||
_pcmk.set_node_status_to_maintenance.assert_has_calls(
|
||||
[mock.call('zeta'),
|
||||
mock.call('psi')],
|
||||
any_order=True)
|
||||
_pcmk.delete_node.assert_has_calls(
|
||||
[mock.call('zeta'),
|
||||
mock.call('psi')],
|
||||
any_order=True)
|
||||
|
||||
# Raise RemoveCorosyncNodeFailed
|
||||
_pcmk.delete_node.side_effect = subprocess.CalledProcessError(
|
||||
127, "fake crm command")
|
||||
with self.assertRaises(utils.RemoveCorosyncNodeFailed):
|
||||
utils.update_node_list()
|
||||
|
Loading…
Reference in New Issue
Block a user