Use crm_mon for pacemaker-remote deployments

As described in bug #1728527, cibadmin does not expose the state of
pacemaker-remote nodes, which means hostmonitor cannot track
them. This change switches to crm_mon to check the status of
remote nodes when the new config option host.restrict_to_remotes
is set to True. When it is set, hostmonitor uses crm_mon
to monitor nodes and only monitors nodes that are marked
as remotes (not members).

Change-Id: I3f2026805413504c875ea5f39eb036d44b26dd43
Depends-On: Iaa2251708616e9c69817bf5b346d795ea7a4d21b
Closes-Bug: #1728527
This commit is contained in:
Liam Young 2019-03-19 20:05:22 +00:00
parent ae3ab24f9a
commit dc9b777724
5 changed files with 380 additions and 2 deletions

View File

@ -40,6 +40,10 @@ Possible values:
If ipmi RA is not set in pacemaker, this value should be set True.
'''),
cfg.BoolOpt('restrict_to_remotes',
default=False,
help='Only monitor pacemaker-remotes, ignore the status of'
' full cluster members.'),
cfg.IntOpt('ipmi_timeout',
default=5,
help='Timeout value(in seconds) of the ipmitool command.'),

View File

@ -23,6 +23,7 @@ from masakarimonitors.ha import masakari
import masakarimonitors.hostmonitor.host_handler.driver as driver
from masakarimonitors.hostmonitor.host_handler import hold_host_status
from masakarimonitors.hostmonitor.host_handler import parse_cib_xml
from masakarimonitors.hostmonitor.host_handler import parse_crmmon_xml
from masakarimonitors.objects import event_constants as ec
from masakarimonitors import utils
@ -30,6 +31,18 @@ LOG = oslo_logging.getLogger(__name__)
CONF = masakarimonitors.conf.CONF
class CibSchemaCompliantTag(dict):
    """Dict exposing a crm_mon node entry under cib node tag attributes.

    The 'name' value of the crm_mon entry is stored under 'uname', and its
    'online' value is translated into a 'crmd' value of either 'online'
    or 'offline'.
    """

    def __init__(self, crmon_entry):
        """Populate the dict from a crm_mon node entry.

        :params crmon_entry: object providing ``get(key)``; its 'name' and
                             'online' values are read
        """
        node_name = crmon_entry.get('name')
        # Only the literal string 'true' counts as online; anything else
        # (including a missing attribute) is reported as offline.
        is_online = crmon_entry.get('online') == 'true'
        super(CibSchemaCompliantTag, self).__init__(
            uname=node_name,
            crmd='online' if is_online else 'offline')
class HandleHost(driver.DriverBase):
"""Handle hosts.
@ -40,6 +53,7 @@ class HandleHost(driver.DriverBase):
super(HandleHost, self).__init__()
self.my_hostname = socket.gethostname()
self.xml_parser = parse_cib_xml.ParseCibXml()
self.crmmon_xml_parser = parse_crmmon_xml.ParseCrmMonXml()
self.status_holder = hold_host_status.HostHoldStatus()
self.notifier = masakari.SendNotification()
@ -168,6 +182,22 @@ class HandleHost(driver.DriverBase):
return out
def _get_crmmon_xml(self):
    """Run ``crm_mon -X`` as root and return what it wrote to stdout.

    :returns: stdout of the crm_mon command, or None when the command
              raised or wrote anything to stderr (a warning is logged in
              both cases)
    """
    try:
        stdout, stderr = utils.execute('crm_mon', '-X', run_as_root=True)
        if stderr:
            # Any stderr output is treated as a failure and funnelled
            # through the same warning path as an execution error.
            raise Exception(("crmmon command output stderr: %s") % stderr)
    except Exception as exc:
        LOG.warning("Exception caught: %s", exc)
        return None

    return stdout
def _is_poweroff(self, hostname):
ipmi_values = self.xml_parser.get_stonith_ipmi_params(hostname)
if ipmi_values is None:
@ -298,6 +328,31 @@ class HandleHost(driver.DriverBase):
# Update host status.
self.status_holder.set_host_status(node_state_tag)
def _check_host_status_by_crm_mon(self):
    """Check the status of remote nodes from the crm_mon xml.

    Only node entries whose 'type' attribute is 'remote' are passed on to
    the status-change check.

    :returns: 0 when the check was performed, 1 when the crm_mon xml
              could not be obtained
    :raises Exception: when the crm_mon xml yields no node entries
    """
    xml_text = self._get_crmmon_xml()
    if xml_text is None:
        # crm_mon command failure.
        return 1

    # Hand the xml to the ParseCrmMonXml object and read the nodes back.
    parser = self.crmmon_xml_parser
    parser.set_crmmon_xml(xml_text)
    all_node_tags = parser.get_node_state_tag_list()
    if len(all_node_tags) == 0:
        # A crm_mon xml without any node entry is an unexpected result.
        raise Exception(
            "Failed to get nodes tag from crm_mon xml.")

    # Keep the remotes only, converted to cib-style dicts.
    remote_tags = []
    for node_tag in all_node_tags:
        if node_tag.get('type') == 'remote':
            remote_tags.append(CibSchemaCompliantTag(node_tag))

    # Check if status changed.
    self._check_if_status_changed(remote_tags)

    return 0
def _check_host_status_by_cibadmin(self):
# Get xml of cib info.
cib_xml = self._get_cib_xml()
@ -362,8 +417,13 @@ class HandleHost(driver.DriverBase):
CONF.host.monitoring_interval)
continue
# Check the host status is online or offline by cibadmin.
if self._check_host_status_by_cibadmin() != 0:
# Check the host status is online or offline.
if CONF.host.restrict_to_remotes:
status_func = self._check_host_status_by_crm_mon
else:
status_func = self._check_host_status_by_cibadmin
if status_func() != 0:
LOG.warning("hostmonitor skips monitoring hosts.")
eventlet.greenthread.sleep(CONF.host.monitoring_interval)
continue

View File

@ -0,0 +1,81 @@
# Copyright(c) 2019 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from xml.etree import ElementTree
from oslo_log import log as oslo_logging
LOG = oslo_logging.getLogger(__name__)
class ParseCrmMonXml(object):
    """ParseCrmMonXml class

    This class parses the crmmon xml and exposes the ``node`` tags found
    under its ``nodes`` tag.
    """

    def __init__(self):
        # Root element of the crmmon xml; None until set_crmmon_xml() is
        # called.
        self.crmmon_tag = None

    def set_crmmon_xml(self, crmmon_xml):
        """Set xml.etree.ElementTree.Element object.

        This method receives string of crmmon xml, and convert it
        to xml.etree.ElementTree.Element object.

        :params crmmon_xml: String of crmmon xml
        """
        # Convert xml.etree.ElementTree.Element object.
        self.crmmon_tag = ElementTree.fromstring(crmmon_xml)

    def _get_nodes(self):
        """Return the first ``nodes`` child of the root tag, or None."""
        if self.crmmon_tag is None:
            return None

        # Element.getchildren() was removed in Python 3.9. find() with a
        # plain tag name returns the first matching direct child, or None.
        return self.crmmon_tag.find('nodes')

    def _get_node_states(self, nodes_tag):
        """Return the ``node`` children of nodes_tag as a list.

        :params nodes_tag: Element of the nodes tag
        :returns: list of node Elements in document order
        """
        # findall() with a plain tag name only matches direct children.
        return nodes_tag.findall('node')

    def get_node_state_tag_list(self):
        """Get node_state tag list.

        This method gets the list of node tags from crmmon xml.

        :returns: list of node tags; empty when no xml has been set, when
                  the xml has no nodes tag, or when the nodes tag has no
                  node tag (an error is logged in each case)
        """
        # Get nodes tag.
        nodes_tag = self._get_nodes()
        if nodes_tag is None:
            LOG.error("crm_mon xml doesn't have nodes tag.")
            return []

        # Get node tag list.
        node_state_tag_list = self._get_node_states(nodes_tag)
        if not node_state_tag_list:
            LOG.error("crm_mon xml doesn't have node tag.")

        return node_state_tag_list

View File

@ -25,6 +25,7 @@ from masakarimonitors.ha import masakari
from masakarimonitors.hostmonitor.host_handler import handle_host
from masakarimonitors.hostmonitor.host_handler import hold_host_status
from masakarimonitors.hostmonitor.host_handler import parse_cib_xml
from masakarimonitors.hostmonitor.host_handler import parse_crmmon_xml
from masakarimonitors.objects import event_constants as ec
from masakarimonitors import utils
@ -48,6 +49,52 @@ STATUS_TAG_XML = ' <status>' \
' <test foo="foo"/>' \
' </node_state>' \
' </status>'
# crm_mon <nodes> fixture: three nodes with type="member" (member1-3) and
# three with type="remote" (remote1-3); every node has online="true".
CRMMON_NODES_TAG_XML = """
<nodes>
<node name="member1" id="1002" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="true" is_dc="false"
resources_running="2" type="member" />
<node name="member2" id="1001" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="true" is_dc="true"
resources_running="1" type="member" />
<node name="remote1" id="remotehostname1" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="false"
is_dc="false" resources_running="0" type="remote" />
<node name="remote2" id="remotehostname2" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="false" is_dc="false"
resources_running="0" type="remote" />
<node name="remote3" id="remotehostname3" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="false" is_dc="false"
resources_running="0" type="remote" />
<node name="member3" id="1000" online="true" standby="false"
standby_onfail="false" maintenance="false" pending="false"
unclean="false" shutdown="false" expected_up="true" is_dc="false"
resources_running="4" type="member" />
</nodes>
"""
class TestCibSchemaCompliantTag(testtools.TestCase):
    # Covers the crm_mon -> cib attribute mapping done by
    # handle_host.CibSchemaCompliantTag.

    def setUp(self):
        super(TestCibSchemaCompliantTag, self).setUp()

    def test_init_offline(self):
        # online='false' must be reported as crmd 'offline'.
        entry = {'name': 'test1', 'online': 'false'}
        result = handle_host.CibSchemaCompliantTag(entry)

        self.assertEqual(result['uname'], 'test1')
        self.assertEqual(result['crmd'], 'offline')

    def test_init_online(self):
        # online='true' must be reported as crmd 'online'.
        entry = {'name': 'test1', 'online': 'true'}
        result = handle_host.CibSchemaCompliantTag(entry)

        self.assertEqual(result['uname'], 'test1')
        self.assertEqual(result['crmd'], 'online')
class TestHandleHost(testtools.TestCase):
@ -309,6 +356,28 @@ class TestHandleHost(testtools.TestCase):
mock_execute.assert_called_once_with(
'cibadmin', '--query', run_as_root=True)
@mock.patch.object(utils, 'execute')
def test_get_crmmon_xml(self, mock_execute):
    # With an empty stderr the stdout of crm_mon is returned unchanged.
    mock_execute.return_value = ('test_stdout', '')

    handler = handle_host.HandleHost()
    result = handler._get_crmmon_xml()

    mock_execute.assert_called_once_with(
        'crm_mon', '-X', run_as_root=True)
    self.assertEqual('test_stdout', result)
@mock.patch.object(utils, 'execute')
def test_get_crmmon_xml_stderr(self, mock_execute):
    # Non-empty stderr makes _get_crmmon_xml discard stdout and
    # return None.
    mock_execute.return_value = ('test_stdout', 'test_stderr')

    handler = handle_host.HandleHost()
    result = handler._get_crmmon_xml()

    mock_execute.assert_called_once_with(
        'crm_mon', '-X', run_as_root=True)
    self.assertIsNone(result)
@mock.patch.object(utils, 'execute')
@mock.patch.object(parse_cib_xml.ParseCibXml, 'get_stonith_ipmi_params')
def test_is_poweroff(self, mock_get_stonith_ipmi_params, mock_execute):
@ -570,6 +639,65 @@ class TestHandleHost(testtools.TestCase):
mock_send_notification.assert_called_once_with(
CONF.host.api_retry_max, CONF.host.api_retry_interval, test_event)
@mock.patch.object(handle_host.HandleHost, '_check_if_status_changed')
@mock.patch.object(parse_crmmon_xml.ParseCrmMonXml,
                   'get_node_state_tag_list')
@mock.patch.object(parse_crmmon_xml.ParseCrmMonXml, 'set_crmmon_xml')
@mock.patch.object(handle_host.HandleHost, '_get_crmmon_xml')
def test_check_host_status_by_crm_mon(
        self, mock_get_crmmon_xml, mock_set_crmmon_xml,
        mock_get_node_state_tag_list, mock_check_if_status_changed):
    # Only the nodes with type="remote" in the fixture may reach
    # _check_if_status_changed, converted to cib-style dicts.
    mock_get_crmmon_xml.return_value = CRMMON_NODES_TAG_XML
    mock_set_crmmon_xml.return_value = None
    status_tag = ElementTree.fromstring(CRMMON_NODES_TAG_XML)
    # list(element) yields the direct children; Element.getchildren()
    # was removed in Python 3.9.
    node_state_tag_list = list(status_tag)
    mock_get_node_state_tag_list.return_value = node_state_tag_list
    mock_check_if_status_changed.return_value = None

    obj = handle_host.HandleHost()

    ret = obj._check_host_status_by_crm_mon()
    self.assertEqual(0, ret)
    # Each collaborator is expected to be used exactly once.
    mock_get_crmmon_xml.assert_called_once_with()
    mock_set_crmmon_xml.assert_called_once_with(CRMMON_NODES_TAG_XML)
    mock_get_node_state_tag_list.assert_called_once_with()
    mock_check_if_status_changed.assert_called_once_with(
        [
            {'uname': 'remote1', 'crmd': 'online'},
            {'uname': 'remote2', 'crmd': 'online'},
            {'uname': 'remote3', 'crmd': 'online'}])
@mock.patch.object(parse_crmmon_xml.ParseCrmMonXml,
                   'get_node_state_tag_list')
@mock.patch.object(parse_crmmon_xml.ParseCrmMonXml, 'set_crmmon_xml')
@mock.patch.object(handle_host.HandleHost, '_get_crmmon_xml')
def test_check_host_status_by_crm_mon_not_have_node_state_tag(
        self, mock_get_crmmon_xml, mock_set_crmmon_xml,
        mock_get_node_state_tag_list):
    # An empty node list from the parser must make the check raise.
    mock_get_crmmon_xml.return_value = CRMMON_NODES_TAG_XML
    mock_set_crmmon_xml.return_value = None
    mock_get_node_state_tag_list.return_value = []

    obj = handle_host.HandleHost()

    # assertRaisesRegexp is a deprecated alias that was removed in
    # Python 3.12; assertRaisesRegex is the supported name.
    self.assertRaisesRegex(
        Exception, "Failed to get nodes tag from crm_mon xml.",
        obj._check_host_status_by_crm_mon)
    mock_get_crmmon_xml.assert_called_once_with()
    mock_set_crmmon_xml.assert_called_once_with(CRMMON_NODES_TAG_XML)
    mock_get_node_state_tag_list.assert_called_once_with()
@mock.patch.object(handle_host.HandleHost, '_get_crmmon_xml')
def test_check_host_status_by_crm_mon_xml_is_None(
        self, mock_get_crmmon_xml):
    # When no crm_mon xml can be obtained the check reports 1.
    mock_get_crmmon_xml.return_value = None

    handler = handle_host.HandleHost()
    result = handler._check_host_status_by_crm_mon()

    mock_get_crmmon_xml.assert_called_once_with()
    self.assertEqual(1, result)
@mock.patch.object(handle_host.HandleHost, '_check_if_status_changed')
@mock.patch.object(parse_cib_xml.ParseCibXml, 'get_node_state_tag_list')
@mock.patch.object(parse_cib_xml.ParseCibXml, 'have_quorum')
@ -693,3 +821,30 @@ class TestHandleHost(testtools.TestCase):
mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
self.assertEqual(2, mock_check_host_status_by_cibadmin.call_count)
self.assertEqual(2, mock_check_host_status_by_crmadmin.call_count)
@mock.patch.object(eventlet.greenthread, 'sleep')
@mock.patch.object(handle_host.HandleHost,
                   '_check_host_status_by_crm_mon')
@mock.patch.object(handle_host.HandleHost, '_check_pacemaker_services')
@mock.patch.object(handle_host.HandleHost, '_check_hb_line')
def test_monitor_hosts_remotes_only(self,
                                    mock_check_hb_line,
                                    mock_check_pacemaker_services,
                                    mock_check_host_status_by_crm_mon,
                                    mock_sleep):
    # With restrict_to_remotes enabled, monitor_hosts is expected to use
    # the crm_mon based status check.
    # NOTE(review): the option is assigned directly and is not restored
    # in this method - confirm it cannot leak into other tests.
    CONF.host.restrict_to_remotes = True

    mock_sleep.return_value = None
    mock_check_pacemaker_services.return_value = True
    mock_check_hb_line.side_effect = \
        [0, Exception("Test exception.")]
    # NOTE(review): this sets side_effect (not return_value) to the
    # int 0. Presumably return_value = 0 was intended; verify against
    # monitor_hosts, since the call counts asserted below may depend on
    # how the mock behaves when called.
    mock_check_host_status_by_crm_mon.side_effect = 0

    handler = handle_host.HandleHost()
    handler.monitor_hosts()

    self.assertEqual(1, mock_check_hb_line.call_count)
    self.assertEqual(1, mock_check_pacemaker_services.call_count)
    mock_check_pacemaker_services.assert_called_with('pacemaker_remote')
    self.assertEqual(1, mock_check_host_status_by_crm_mon.call_count)
    mock_check_host_status_by_crm_mon.assert_called_once_with()

View File

@ -0,0 +1,78 @@
# Copyright(c) 2019 Canonical Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import testtools
from masakarimonitors.hostmonitor.host_handler import parse_crmmon_xml
# crm_mon document with three node tags: node-1 and node-3 have
# online="true", node-2 has online="false".
CRMMON_XML = (
    '<?xml version="1.0"?>'
    '<crm_mon version="1.1.18">'
    ' <nodes>'
    ' <node name="node-1" id="1001" online="true" />'
    ' <node name="node-2" id="1002" online="false" />'
    ' <node name="node-3" id="1003" online="true" />'
    ' </nodes>'
    '</crm_mon>'
)

# crm_mon document whose nodes tag contains no node tag.
CRMMON_NONODES_XML = (
    '<?xml version="1.0"?>'
    '<crm_mon version="1.1.18">'
    ' <nodes>'
    ' </nodes>'
    '</crm_mon>'
)

# crm_mon document without a nodes tag.
CRMMON_NONODES_TAG_XML = (
    '<?xml version="1.0"?>'
    '<crm_mon version="1.1.18">'
    '</crm_mon>'
)
class TestParseCrmMonXml(testtools.TestCase):
    # Unit tests for parse_crmmon_xml.ParseCrmMonXml.

    def setUp(self):
        super(TestParseCrmMonXml, self).setUp()

    def test_set_crmmon_xml(self):
        obj = parse_crmmon_xml.ParseCrmMonXml()
        obj.set_crmmon_xml(CRMMON_XML)
        # The parsed root element must be stored on the parser.
        self.assertEqual('crm_mon', obj.crmmon_tag.tag)

    def test_get_node_state_tag_list(self):
        obj = parse_crmmon_xml.ParseCrmMonXml()
        obj.set_crmmon_xml(CRMMON_XML)
        node_state_tag_list = obj.get_node_state_tag_list()
        expected = {
            'node-1': 'true',
            'node-2': 'false',
            'node-3': 'true'}
        # Compare the whole result at once: asserting inside a loop over
        # the returned list would pass vacuously if the list were empty.
        self.assertEqual(3, len(node_state_tag_list))
        actual = {
            node_state_tag.get('name'): node_state_tag.get('online')
            for node_state_tag in node_state_tag_list}
        self.assertEqual(expected, actual)

    def test_get_node_state_tag_list_unset(self):
        # No xml has been set on the parser.
        obj = parse_crmmon_xml.ParseCrmMonXml()
        self.assertEqual(obj.get_node_state_tag_list(), [])

    def test_get_node_state_tag_list_nonodes(self):
        # The nodes tag exists but holds no node tag.
        obj = parse_crmmon_xml.ParseCrmMonXml()
        obj.set_crmmon_xml(CRMMON_NONODES_XML)
        self.assertEqual(obj.get_node_state_tag_list(), [])

    def test_get_node_state_tag_list_nonodes_tag(self):
        # The xml has no nodes tag at all.
        obj = parse_crmmon_xml.ParseCrmMonXml()
        obj.set_crmmon_xml(CRMMON_NONODES_TAG_XML)
        self.assertEqual(obj.get_node_state_tag_list(), [])