Improved action to display the cluster status

The `status` action now provides details about the health of the cluster.
It takes two boolean parameters: `resources` (true by default) to include the
cluster resources and `history` (false by default) to include the cluster
status history.

Closes-Bug: #1717831
Change-Id: Iaf6e4a75a36491eab8e6802a6f437e5f410ed29e
This commit is contained in:
Robert Gildein 2020-12-09 13:26:21 +01:00
parent 457f88eda6
commit 64e696ae74
6 changed files with 505 additions and 22 deletions

View File

@ -98,6 +98,47 @@ initial step (pause a unit) can be skipped. Unit removal may also be replaced
by `juju remove-machine N --force`, where N is the Juju machine ID where the
unit to be removed runs.
## Presenting status information
Here are a few examples of how to present useful information with the `status`
action and the [jq][jq] utility.
* Querying for `online` and `standby` parameter values:
juju run-action --wait hacluster/leader status \
--format json | jq '.[] | {(.UnitId):.results.result | fromjson \
| .nodes | .[] | {unit_name: .name, online: .online, standby: .standby}}'
output example
{
"hacluster/0": {
"unit_name": "juju-a37bc0-3",
"online": "true",
"standby": "false"
}
}
{
"hacluster/0": {
"unit_name": "juju-a37bc0-4",
"online": "true",
"standby": "false"
}
}
{
"hacluster/0": {
"unit_name": "juju-a37bc0-5",
"online": "true",
"standby": "false"
}
}
* Displaying cluster resource information:
juju run-action --wait hacluster/leader status \
--format json | jq '.[] | {(.UnitId):.results.result | fromjson \
| .resources.groups}'
# Bugs
Please report bugs on [Launchpad][lp-bugs-charm-hacluster].
@ -113,3 +154,4 @@ For general charm questions refer to the [OpenStack Charm Guide][cg].
[upstream-maas]: https://maas.io
[charms-requires-hacluster]: https://jaas.ai/search?requires=hacluster
[cdg]: https://docs.openstack.org/project-deploy-guide/charm-deployment-guide
[jq]: https://stedolan.github.io/jq/

View File

@ -3,8 +3,6 @@ pause:
from this unit to another unit in the hacluster
resume:
  # NOTE: the key used to be misspelled as `descrpition`, which left the
  # action without a description.
  description: Take hacluster unit out of standby mode
status:
description: Return cluster and resource status
cleanup:
description: Trigger cluster resource cleanup
params:
@ -12,6 +10,17 @@ cleanup:
default: "all"
type: string
description: Resource name to cleanup
# `status` action: shows the cluster status. Both parameters are optional
# booleans; `resources` defaults to true and `history` defaults to false.
status:
description: Show cluster status
params:
# include the cluster resources in the reported status
resources:
default: true
type: boolean
description: Show cluster resources
# include the cluster status history in the reported status
history:
default: false
type: boolean
description: Show cluster status history
update-ring:
description: Trigger corosync node members cleanup
params:

View File

@ -14,6 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import subprocess
import sys
@ -40,6 +41,9 @@ from charmhelpers.core.hookenv import (
action_fail,
action_get,
action_set,
function_fail,
function_get,
function_set,
is_leader,
log,
relation_ids,
@ -70,19 +74,18 @@ def resume(args):
def status(args):
    """Show hacluster status.

    Reads the `resources` and `history` action parameters, collects the
    cluster status through `pcmk.cluster_status` and publishes it as a JSON
    string under the `result` key. When the underlying command fails, the
    error is logged, `result` is set to "failure" and the action is marked
    as failed.

    :param args: action arguments (not used by this action)
    """
    try:
        health_status = pcmk.cluster_status(
            resources=bool(function_get("resources")),
            history=bool(function_get("history")))
        function_set({"result": json.dumps(health_status)})
    except subprocess.CalledProcessError as error:
        log("ERROR: Failed call to crm status. output: {}. return-code: {}"
            "".format(error.output, error.returncode))
        log(traceback.format_exc())
        function_set({"result": "failure"})
        function_fail("failed to get cluster health")
def cleanup(args):

View File

@ -100,6 +100,23 @@ def is_resource_present(resource):
return True
def parse_version(cmd_output):
    """Parse version from cmd output.

    The first `X.Y.Z` sequence found anywhere in the output is used, so
    multi-digit components (e.g. `11.1.10`) are kept intact and the version
    does not have to be on the first line of the output.

    :params cmd_output: output from command line
    :type cmd_output: str
    :returns: parsed version
    :rtype: distutils.version.StrictVersion
    :raises: ValueError version could not be parsed
    """
    # NOTE: `search` replaces `match` with a leading greedy `.*`. The greedy
    # prefix swallowed the leading digits of the first component ("11.1.10"
    # was parsed as "1.1.10") and, because `.` does not match a newline, a
    # version on any line but the first one was never found.
    matched = re.search(r"\d+\.\d+\.\d+", cmd_output)
    if not matched:
        raise ValueError("error parsing version: {}".format(cmd_output))

    return StrictVersion(matched.group(0))
def crm_opt_exists(opt_name):
output = subprocess.getstatusoutput("crm configure show")[1]
if opt_name in output:
@ -282,17 +299,18 @@ def set_property(name, value):
def crm_version():
    """Get `crm` version.

    Parses the output of `crm --version`.

    :returns: crm version
    :rtype: distutils.version.StrictVersion
    :raises: ValueError version could not be parsed
    :raises: subprocess.CalledProcessError if the check_output fails
    """
    ver = subprocess.check_output(["crm", "--version"],
                                  universal_newlines=True)
    # the version extraction is shared with `crm_mon_version`
    return parse_version(ver)
def _crm_update_object(update_template, update_ctxt, hash_keys, unitdata_key,
@ -443,3 +461,126 @@ def resource_checksum(res_name, res_type, res_params=None):
if res_params is not None:
data.append(res_params)
return generate_checksum(data)
def get_tag(element, name):
    """Return the first child of `element` with the given tag name.

    When no such child exists, a new empty element with that tag is returned
    instead of None, so callers can iterate or search it unconditionally.

    :param element: parent element
    :type element: etree.Element
    :param name: name of tag
    :type name: str
    :returns: element with tag name
    :rtype: etree.Element
    """
    found = element.find(name)
    # NOTE: compare with None explicitly, an element without children is
    #       falsy.
    return etree.Element(name) if found is None else found
def add_key(dictionary, key, value):
    """Set `key` to `value` in the dictionary and return the dictionary.

    The dictionary is modified in place. A warning is logged when the key is
    already present, because its previous value gets overwritten.

    :param dictionary: dictionary
    :type dictionary: Dict[Union[str, bytes], Union[str, bytes]]
    :param key: new key to be inserted
    :type key: str
    :param value: new value to be inserted
    :type value: Any
    :returns: updated dictionary
    :rtype: Dict[Union[str, bytes], Any]
    """
    overwriting = key in dictionary
    if overwriting:
        log('key already exists and will be rewrite: {}'.format(key), WARNING)

    dictionary[key] = value
    return dictionary
def crm_mon_version():
    """Return the version reported by `crm_mon --version`.

    :returns: crm_mon version
    :rtype: distutils.version.StrictVersion
    :raises: ValueError version could not be parsed
    :raises: subprocess.CalledProcessError if the check_output fails
    """
    command = ["crm_mon", "--version"]
    output = subprocess.check_output(command, universal_newlines=True)
    return parse_version(output)
def cluster_status(resources=True, history=False):
    """Collect the cluster status reported by `crm_mon` as a dictionary.

    `crm_mon` is asked for an XML summary of the cluster's current state
    (with the `--inactive` option) and the XML is converted to dictionaries
    and lists.

    :param resources: include resource groups and clones in the result,
                      default is True
    :type resources: boolean
    :param history: include the operation history of each node in the
                    result, default is False
    :type history: boolean
    :returns: cluster status with the `crm_mon_version`, `summary` and
              `nodes` keys and, when requested, `resources` and `history`
    :rtype: Dict[str, Any]
    :raises: ValueError if the crm_mon version could not be parsed
    :raises: subprocess.CalledProcessError if a crm_mon call fails
    """
    def _with_nodes(resource):
        # extend the resource attributes with its child <node> elements
        resource_nodes = [node.attrib for node in resource.findall("node")]
        return add_key(resource.attrib, "nodes", resource_nodes)

    version = crm_mon_version()
    if version >= StrictVersion("2.0.0"):
        output_option = "--output-as=xml"
    else:
        # NOTE (rgildein): The `--as-xml` option is deprecated.
        output_option = "--as-xml"

    raw_xml = subprocess.check_output(
        ["crm_mon", output_option, "--inactive"]).decode('utf-8')
    root = etree.fromstring(raw_xml)

    # version
    result = {"crm_mon_version": str(version)}

    # summary
    result["summary"] = {
        child.tag: child.attrib for child in get_tag(root, "summary")}

    # nodes
    result["nodes"] = {}
    for node in get_tag(root, "nodes").findall("node"):
        result["nodes"][node.get("name")] = node.attrib

    # resources
    if resources:
        section = get_tag(root, "resources")

        groups = {}
        for group in section.findall("group"):
            groups[group.get("id")] = [
                _with_nodes(member) for member in group.findall("resource")]

        clones = {}
        for clone in section.findall("clone"):
            members = [
                _with_nodes(member) for member in clone.findall("resource")]
            clones[clone.get("id")] = add_key(
                clone.attrib, "resources", members)

        result["resources"] = {"groups": groups, "clones": clones}

    # history
    if history:
        per_node = {}
        for node in get_tag(root, "node_history").findall("node"):
            per_node[node.get("name")] = {
                entry.get("id"): [
                    operation.attrib
                    for operation in entry.findall("operation_history")]
                for entry in node.findall("resource_history")}
        result["history"] = per_node

    return result

115
unit_tests/test_action.py Normal file
View File

@ -0,0 +1,115 @@
# Copyright 2020 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import mock
import subprocess
import sys
mock_apt = mock.MagicMock()
sys.modules["apt_pkg"] = mock_apt
import actions
import test_utils
class ClusterStatusTestCase(test_utils.CharmTestCase):
    """Tests of the `status` action with `pcmk` and the hookenv calls mocked.
    """

    TO_PATCH = [
        "function_fail",
        "function_get",
        "function_set",
        "pcmk",
        "log",
    ]

    health_status = {
        "crm_mon_version": "2.0.3",
        "summary": {
            "last_update": {"time": "Fri Dec 4 14:52:26 2020"},
            "last_change": {"time": "Fri Dec 4 14:08:26 2020"},
            "nodes_configured": {"number": "3"}},
        "nodes": {
            "juju-d07fb7-3": {"online": "true", "type": "member"},
            "juju-d07fb7-4": {"online": "true", "type": "member"},
            "juju-d07fb7-5": {"online": "true", "type": "member"}},
        "resources": {},
        "history": {
            "juju-d07fb7-3": {
                "res_ks_36385de_vip": [{"call": "11", "task": "start"},
                                       {"call": "13", "task": "monitor"}],
                "res_ks_haproxy": [{"call": "10", "task": "probe"},
                                   {"call": "12", "task": "monitor"}]}}
    }

    def setUp(self):
        super(ClusterStatusTestCase, self).setUp(actions, self.TO_PATCH)

        def _cluster_status(resources=True, history=False):
            # stand-in for pcmk.cluster_status: sections which were not
            # requested are left out of the returned status
            status = self.health_status.copy()
            if not resources:
                del status["resources"]
            if not history:
                del status["history"]

            return status

        # NOTE: a `side_effect` takes precedence over `return_value`, so the
        #       tests steer the fake only through the action parameters.
        self.pcmk.cluster_status.side_effect = _cluster_status
        self._function_get = {"history": True, "resources": True}
        self.function_get.side_effect = self._function_get.get

    def _check_reported(self, resources, history):
        """Verify that the flags were forwarded and the status was set.

        :param resources: expected value of the `resources` flag
        :type resources: bool
        :param history: expected value of the `history` flag
        :type history: bool
        """
        expected = self.health_status.copy()
        if not resources:
            del expected["resources"]
        if not history:
            del expected["history"]

        self.function_get.assert_has_calls([
            mock.call("resources"), mock.call("history")])
        self.pcmk.cluster_status.assert_called_once_with(
            resources=resources, history=history)
        self.function_set.assert_called_once_with(
            {"result": json.dumps(expected)})

    def test_status_without_resources(self):
        """test getting cluster status without resources"""
        self._function_get["resources"] = False

        actions.status([])

        self._check_reported(resources=False, history=True)

    def test_status_without_history(self):
        """test getting cluster status without history"""
        self._function_get["history"] = False

        actions.status([])

        self._check_reported(resources=True, history=False)

    def test_status_with_history(self):
        """test getting cluster status with history"""
        actions.status([])

        self._check_reported(resources=True, history=True)

    def test_status_raise_error(self):
        """test failing the action if the cluster status cannot be read"""
        self.pcmk.cluster_status.side_effect = subprocess.CalledProcessError(
            returncode=1, cmd=["crm", "status", "xml", "--inactive"])

        actions.status([])

        self.function_get.assert_has_calls([
            mock.call("resources"), mock.call("history")])
        self.pcmk.cluster_status.assert_called_once_with(
            resources=True, history=True)
        self.function_set.assert_called_once_with({"result": "failure"})
        self.function_fail.assert_called_once_with(
            "failed to get cluster health")

View File

@ -18,6 +18,7 @@ import os
import tempfile
import test_utils
import unittest
import xml.etree.ElementTree as etree
from distutils.version import StrictVersion
@ -82,6 +83,73 @@ CRM_NODE_STATUS_XML = b'''
</nodes>
'''
CRM_STATUS_XML = b"""
<pacemaker-result api-version="2.0" request="crm_mon --output-as=xml --inactive">
<summary>
<stack type="corosync"/>
<current_dc present="true" version="2.0.3-4b1f869f0f" name="juju-424dd5-3" id="1001" with_quorum="true"/>
<last_update time="Tue Jan 5 09:55:10 2021"/>
<last_change time="Tue Jan 5 09:05:49 2021" user="hacluster" client="crmd" origin="juju-424dd5-3"/>
<nodes_configured number="4"/>
<resources_configured number="5" disabled="0" blocked="0"/>
<cluster_options stonith-enabled="false" symmetric-cluster="true" no-quorum-policy="stop" maintenance-mode="false"/>
</summary>
<nodes>
<node name="juju-424dd5-3" id="1001" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="true" resources_running="2" type="member"/>
<node name="juju-424dd5-4" id="1000" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="1" type="member"/>
<node name="juju-424dd5-5" id="1002" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="1" type="member"/>
<node name="node1" id="1" online="false" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="false" is_dc="false" resources_running="0" type="member"/>
</nodes>
<resources>
<group id="grp_ks_vips" number_resources="1">
<resource id="res_ks_3cb88eb_vip" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1">
<node name="juju-424dd5-3" id="1001" cached="true"/>
</resource>
</group>
<clone id="cl_ks_haproxy" multi_state="false" unique="false" managed="true" failed="false" failure_ignored="false">
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1">
<node name="juju-424dd5-3" id="1001" cached="true"/>
</resource>
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1">
<node name="juju-424dd5-5" id="1002" cached="true"/>
</resource>
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1">
<node name="juju-424dd5-4" id="1000" cached="true"/>
</resource>
<resource id="res_ks_haproxy" resource_agent="lsb:haproxy" role="Stopped" active="false" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="0"/>
</clone>
</resources>
<node_history>
<node name="juju-424dd5-3">
<resource_history id="res_ks_3cb88eb_vip" orphan="false" migration-threshold="1000000">
<operation_history call="10" task="start" last-rc-change="Tue Jan 5 09:03:52 2021" last-run="Tue Jan 5 09:03:52 2021" exec-time="57ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="11" task="monitor" interval="10000ms" last-rc-change="Tue Jan 5 09:03:52 2021" exec-time="57ms" queue-time="1ms" rc="0" rc_text="ok"/>
</resource_history>
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="36" task="probe" last-rc-change="Tue Jan 5 09:05:50 2021" last-run="Tue Jan 5 09:05:50 2021" exec-time="44ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="36" task="probe" last-rc-change="Tue Jan 5 09:05:50 2021" last-run="Tue Jan 5 09:05:50 2021" exec-time="44ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="37" task="monitor" interval="5000ms" last-rc-change="Tue Jan 5 09:05:50 2021" exec-time="43ms" queue-time="0ms" rc="0" rc_text="ok"/>
</resource_history>
</node>
<node name="juju-424dd5-5">
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="10" task="probe" last-rc-change="Tue Jan 5 09:03:52 2021" last-run="Tue Jan 5 09:03:52 2021" exec-time="54ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="10" task="probe" last-rc-change="Tue Jan 5 09:03:52 2021" last-run="Tue Jan 5 09:03:52 2021" exec-time="54ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="11" task="monitor" interval="5000ms" last-rc-change="Tue Jan 5 09:03:52 2021" exec-time="49ms" queue-time="0ms" rc="0" rc_text="ok"/>
</resource_history>
</node>
<node name="juju-424dd5-4">
<resource_history id="res_ks_haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="10" task="probe" last-rc-change="Tue Jan 5 09:04:11 2021" last-run="Tue Jan 5 09:04:11 2021" exec-time="32ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="10" task="probe" last-rc-change="Tue Jan 5 09:04:11 2021" last-run="Tue Jan 5 09:04:11 2021" exec-time="32ms" queue-time="0ms" rc="0" rc_text="ok"/>
<operation_history call="11" task="monitor" interval="5000ms" last-rc-change="Tue Jan 5 09:04:11 2021" exec-time="27ms" queue-time="0ms" rc="0" rc_text="ok"/>
</resource_history>
</node>
</node_history>
<status code="0" message="OK"/>
</pacemaker-result>
""" # noqa
class TestPcmk(unittest.TestCase):
def setUp(self):
@ -298,3 +366,108 @@ class TestPcmk(unittest.TestCase):
'juju-982848-zaza-ce47c58f6c88-11',
'juju-982848-zaza-ce47c58f6c88-9'])
mock_check_output.assert_called_once_with(['crm', 'node', 'status'])
def test_get_tag(self):
    """Test get element by tag if exists else empty element."""
    main = etree.Element("test")
    main.append(etree.Element("child_1", {"id": "t1", "class": "test"}))
    main.append(etree.Element("child_2", {"id": "t2", "class": "test"}))

    # NOTE: unittest assertions are used instead of bare `assert`
    #       statements, which are skipped when Python runs with `-O`.
    self.assertEqual(pcmk.get_tag(main, "child_1").get("id"), "t1")
    self.assertEqual(pcmk.get_tag(main, "child_2").get("id"), "t2")

    # a missing tag yields an empty element with the requested tag name
    missing = pcmk.get_tag(main, "child_3")
    self.assertEqual(missing.tag, "child_3")
    self.assertIsNone(missing.get("id"))
def test_add_key(self):
    """Test that add_key inserts a new key and overwrites an existing one.
    """
    expected = {"a": 1, "b": [1, 2, 3]}

    for initial in ({"a": 1}, {"a": 1, "b": 2}):
        updated = pcmk.add_key(initial, "b", [1, 2, 3])
        self.assertDictEqual(updated, expected)
@mock.patch('subprocess.check_output')
def test_crm_mon_version(self, mock_check_output):
    """Test parsing the `crm_mon --version` output of different releases.
    """
    banners = [
        # trusty
        ("Pacemaker 1.1.10\nWritten by Andrew Beekhof", "1.1.10"),
        # focal
        ("Pacemaker 2.0.3\nWritten by Andrew Beekhof", "2.0.3"),
    ]

    for banner, expected in banners:
        mock_check_output.return_value = banner

        ret = pcmk.crm_mon_version()

        self.assertEqual(StrictVersion(expected), ret)
        mock_check_output.assert_called_with(["crm_mon", "--version"],
                                             universal_newlines=True)
@mock.patch("subprocess.check_output", return_value=CRM_STATUS_XML)
@mock.patch.object(pcmk, "crm_mon_version")
def test_cluster_status(self, mock_crm_mon_version, mock_check_output):
    """Test parse cluster status from the `crm_mon` XML output."""
    mock_crm_mon_version.return_value = StrictVersion("2.0.3")  # Focal

    # NOTE: the status is only inspected in memory; the test must not leave
    #       any files (e.g. a dumped `status.json`) in the working directory.
    status = pcmk.cluster_status(resources=True, history=True)

    mock_check_output.assert_called_with(
        ["crm_mon", "--output-as=xml", "--inactive"])

    # version, summary and nodes
    self.assertEqual(status["crm_mon_version"], "2.0.3")
    self.assertEqual(status["summary"]["last_update"]["time"],
                     "Tue Jan 5 09:55:10 2021")
    self.assertEqual(status["summary"]["nodes_configured"]["number"], "4")
    self.assertListEqual(
        sorted(status["nodes"].keys()),
        sorted(["node1", "juju-424dd5-3", "juju-424dd5-4",
                "juju-424dd5-5"]))

    # resources
    self.assertEqual(status["resources"]["groups"]["grp_ks_vips"][0]["id"],
                     "res_ks_3cb88eb_vip")
    self.assertDictEqual(
        status["resources"]["groups"]["grp_ks_vips"][0]["nodes"][0],
        {"name": "juju-424dd5-3", "id": "1001", "cached": "true"})
    self.assertEqual(
        status["resources"]["clones"]["cl_ks_haproxy"]["resources"][0]
        ["id"],
        "res_ks_haproxy")
    self.assertDictEqual(
        status["resources"]["clones"]["cl_ks_haproxy"]["resources"][0]
        ["nodes"][0],
        {"name": "juju-424dd5-3", "id": "1001", "cached": "true"})

    # history
    self.assertEqual(
        status["history"]["juju-424dd5-3"]["res_ks_haproxy"][0]["call"],
        "36"
    )
    self.assertEqual(
        status["history"]["juju-424dd5-4"]["res_ks_haproxy"][2]["call"],
        "11"
    )
    self.assertEqual(
        status["history"]["juju-424dd5-4"]["res_ks_haproxy"][2]
        ["last-rc-change"],
        "Tue Jan 5 09:04:11 2021"
    )
def test_parse_version(self):
    """Test parsing versions from valid and invalid command outputs."""
    valid_outputs = {
        "Pacemaker 1.1.10": "1.1.10",
        "Test 2.2.2\nnewline\nnewline": "2.2.2",
        "2.2.2": "2.2.2",
    }
    for cmd_output, version in valid_outputs.items():
        self.assertEqual(pcmk.parse_version(cmd_output),
                         StrictVersion(version))

    # outputs without a full X.Y.Z version are rejected
    for cmd_output in ("test 1.1", "test 1.a.1", "output failed"):
        with self.assertRaises(ValueError):
            pcmk.parse_version(cmd_output)