Make the workload status more robust

The current charm does not indicate to the end user when a specific
resource is not running. Neither does it indicate when a node is offline
or stopped.

Validate that configured resources are actually running and let the end
user know if they are not.

Closes-Bug: #1834263

Change-Id: I1171e71ae3b015b4b838b7ecf0de18eb10d7c8f2
This commit is contained in:
David Ames 2019-06-21 23:08:49 +00:00
parent 4d391e8107
commit 9364440075
3 changed files with 97 additions and 1 deletions

View File

@ -113,6 +113,37 @@ def crm_res_running(opt_name):
return False
def crm_res_running_on_node(resource, node):
    """Determine if the resource is running on the given node.

    Runs ``crm resource status <resource>`` and inspects its output.

    If the output is a single line the resource is treated as active/passive
    (e.g. a VIP): it only has to be running somewhere, not necessarily on
    the given node.
    If the output spans several lines the resource is treated as
    active/active (a clone list, e.g. haproxy): it has to be reported as
    running on the given node.

    :param resource: str name of resource
    :param node: str name of node
    :returns: boolean
    """
    (_, output) = subprocess.getstatusoutput(
        "crm resource status {}".format(resource))
    running_prefix = "resource {} is running".format(resource)
    lines = output.split("\n")

    if len(lines) == 1:
        # Single line is for active/passive like a VIP, may not be on this
        # node but check it is running somewhere
        return output.startswith(running_prefix)

    # Multi line is a clone list like haproxy and should run on all nodes,
    # check if it is running on this node.
    fqdn_prefix = node + "."
    for line in lines:
        # "resource X is NOT running" lines do not carry this prefix.
        if not line.startswith(running_prefix):
            continue
        # Compare whole hostname tokens rather than substrings so that
        # "node1" is not mistaken for "node10". A fully qualified name that
        # extends the short node name ("node1.example.com") still matches.
        for token in line.replace(":", " ").split():
            token = token.strip(",;()[]")
            if token == node or token.startswith(fqdn_prefix):
                return True
    return False
def list_nodes():
"""List member nodes."""
cmd = ['crm', 'node', 'status']

View File

@ -1157,7 +1157,8 @@ def pause_unit():
if has_resources:
messages.append("Resources still running on unit")
status, message = assess_status_helper()
if status != 'active':
# New status message will indicate the resource is not running
if status != 'active' and 'not running' not in message:
messages.append(message)
if messages and not is_unit_upgrading_set():
raise Exception("Couldn't pause: {}".format("; ".join(messages)))
@ -1210,6 +1211,14 @@ def assess_status_helper():
status = 'maintenance'
message = 'Pacemaker in maintenance mode'
for resource in get_resources().keys():
if not pcmk.is_resource_present(resource):
return ("waiting",
"Resource: {} not yet configured".format(resource))
if not pcmk.crm_res_running_on_node(resource, get_hostname()):
return ("blocked",
"Resource: {} not running".format(resource))
return status, message
@ -1266,3 +1275,15 @@ def maintenance_mode(enable):
pcmk.set_property('maintenance-mode', str(enable).lower())
else:
log('Desired value for maintenance-mode is already set', level=DEBUG)
def get_resources():
    """Get resources from the HA relation

    Walks every related unit of every "ha" relation and merges the
    'resources' data each of them provides. Merging, rather than keeping only
    the value from the last unit visited, means a unit that has not provided
    any resources does not discard what an earlier unit provided.

    :returns: dict of resources
    """
    resources = {}
    for rid in relation_ids("ha"):
        for unit in related_units(rid):
            unit_resources = parse_data(rid, unit, 'resources')
            # NOTE(review): assumes parse_data returns a dict, or a falsy
            # value when the unit has provided nothing - confirm.
            if unit_resources:
                resources.update(unit_resources)
    return resources

View File

@ -107,6 +107,50 @@ class TestPcmk(unittest.TestCase):
getstatusoutput.return_value = (1, "foobar")
self.assertFalse(pcmk.crm_res_running('res_nova_consoleauth'))
@mock.patch('subprocess.getstatusoutput')
def test_crm_res_running_on_node(self, getstatusoutput):
    """Check crm_res_running_on_node against mocked crm status output.

    Each scenario pairs the (status, output) tuple returned by the mocked
    subprocess.getstatusoutput with the boolean the function is expected
    to return when asked about this node.
    """
    resource = "res_nova_consoleauth"
    here = "node1"
    elsewhere = "node5"

    running_on = "resource {} is running: {}".format
    not_running = "resource {} is NOT running".format(resource)

    scenarios = [
        # Not running
        ((1, "foobar"), False),
        # Running active/passive on some other node
        ((0, running_on(resource, elsewhere)), True),
        # Running active/passive on this node
        ((0, running_on(resource, here)), True),
        # Running on some but not this node
        ((0, "\n".join([running_on(resource, elsewhere), not_running])),
         False),
        # Running on this node and not others
        ((0, "\n".join([running_on(resource, here), not_running])),
         True),
        # Running on more than one and this node
        ((0, "\n".join([running_on(resource, elsewhere),
                        running_on(resource, here)])),
         True),
    ]

    for mocked_result, expected in scenarios:
        getstatusoutput.return_value = mocked_result
        outcome = pcmk.crm_res_running_on_node(resource, here)
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)
@mock.patch('socket.gethostname')
@mock.patch('subprocess.getstatusoutput')
def test_wait_for_pcmk(self, getstatusoutput, gethostname):