Fuel plugin to collect Logging Monitoring and Alerting metrics
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

collectd_pacemaker.py 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. #!/usr/bin/python
  2. # Copyright 2016 Mirantis, Inc.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import collectd
  16. from collections import Counter
  17. from collections import defaultdict
  18. from sets import Set
  19. import socket
  20. import xml.etree.ElementTree as ET
  21. import collectd_base as base
  22. NAME = 'pacemaker'
  23. CRM_MON_BINARY = '/usr/sbin/crm_mon'
  24. # Node status
  25. OFFLINE_STATUS = 0
  26. MAINTENANCE_STATUS = 1
  27. ONLINE_STATUS = 2
  28. class CrmMonitorPlugin(base.Base):
  29. def __init__(self, *args, **kwargs):
  30. super(CrmMonitorPlugin, self).__init__(*args, **kwargs)
  31. self.plugin = NAME
  32. self.crm_mon_binary = CRM_MON_BINARY
  33. self.hostname = socket.getfqdn()
  34. self.notify_resource = None
  35. self.resources = {}
  36. self.history = {}
  37. def config_callback(self, conf):
  38. super(CrmMonitorPlugin, self).config_callback(conf)
  39. for node in conf.children:
  40. if node.key == 'Hostname':
  41. self.hostname = node.values[0]
  42. elif node.key == 'CrmMonBinary':
  43. self.crm_mon_binary = node.values[0]
  44. elif node.key == 'Resource':
  45. self.resources[node.values[0]] = node.values[-1]
  46. elif node.key == 'NotifyResource':
  47. self.notify_resource = node.values[0]
  48. def itermetrics(self):
  49. def str_to_bool(v):
  50. return str(v).lower() == 'true'
  51. def str_to_boolint(v):
  52. if str_to_bool(v):
  53. return 1
  54. else:
  55. return 0
  56. def shorten_hostname(v):
  57. return v.split('.')[0]
  58. def same_hostname(v):
  59. if v is not None and v.get('name') == self.hostname:
  60. return 1
  61. return 0
  62. retcode, out, err = self.execute(
  63. [self.crm_mon_binary, '--as-xml', '-r', '-f'], shell=False)
  64. if retcode != 0:
  65. raise base.CheckException(
  66. "Failed to execute crm_mon '{}'".format(err))
  67. try:
  68. root = ET.fromstring(out)
  69. except ET.ParseError:
  70. raise base.CheckException(
  71. "Failed to parse XML '{}'".format(out[:64]))
  72. if self.notify_resource:
  73. # Notify the other collectd plugins whether the resource runs
  74. # locally or not
  75. node = root.find('resources/resource[@id="{}"]/node'.format(
  76. self.notify_resource))
  77. self.collectd.Notification(
  78. type='gauge',
  79. message='{{"resource":"{}","value":{}}}'.format(
  80. self.notify_resource, same_hostname(node)),
  81. severity=self.collectd.NOTIF_OKAY
  82. ).dispatch()
  83. # The metric needs to be emitted too for the Lua plugins executed
  84. # by the metric_collector service
  85. yield {
  86. 'type_instance': 'local_resource_active',
  87. 'values': same_hostname(node),
  88. 'meta': {'resource': self.notify_resource,
  89. 'host': shorten_hostname(self.hostname)}
  90. }
  91. summary = root.find('summary')
  92. current_dc = summary.find('current_dc')
  93. # The metric needs to be emitted for the alarms that leverage the other
  94. # metrics emitted by the plugin
  95. yield {
  96. 'type_instance': 'local_dc_active',
  97. 'values': same_hostname(current_dc),
  98. 'meta': {'host': shorten_hostname(self.hostname)}
  99. }
  100. if current_dc.get('name') != self.hostname:
  101. # The other metrics are only collected from the cluster's DC
  102. return
  103. # Report global cluster metrics
  104. yield {
  105. 'type_instance': 'dc',
  106. 'values': str_to_boolint(current_dc.get('present', 'false'))
  107. }
  108. yield {
  109. 'type_instance': 'quorum_status',
  110. 'values': str_to_boolint(current_dc.get('with_quorum', 'false'))
  111. }
  112. yield {
  113. 'type_instance': 'configured_nodes',
  114. 'values': int(summary.find('nodes_configured').get('number'))
  115. }
  116. yield {
  117. 'type_instance': 'configured_resources',
  118. 'values': int(summary.find('resources_configured').get('number'))
  119. }
  120. # Report node status metrics
  121. cluster_nodes = []
  122. aggregated_nodes_status = {'online': 0, 'offline': 0, 'maintenance': 0}
  123. nodes_total = 0
  124. for node in root.find('nodes').iter('node'):
  125. nodes_total += 1
  126. hostname = shorten_hostname(node.get('name'))
  127. cluster_nodes.append(node.get('name'))
  128. if str_to_bool(node.get('online')):
  129. if str_to_bool(node.get('maintenance')):
  130. aggregated_nodes_status['maintenance'] += 1
  131. yield {
  132. 'type_instance': 'node_status',
  133. 'values': MAINTENANCE_STATUS,
  134. 'meta': {'status': 'maintenance', 'host': hostname}
  135. }
  136. else:
  137. aggregated_nodes_status['online'] += 1
  138. yield {
  139. 'type_instance': 'node_status',
  140. 'values': ONLINE_STATUS,
  141. 'meta': {'status': 'online', 'host': hostname}
  142. }
  143. else:
  144. aggregated_nodes_status['offline'] += 1
  145. yield {
  146. 'type_instance': 'node_status',
  147. 'values': OFFLINE_STATUS,
  148. 'meta': {'status': 'offline', 'host': hostname}
  149. }
  150. for status, cnt in aggregated_nodes_status.items():
  151. yield {
  152. 'type_instance': 'nodes_count',
  153. 'values': cnt,
  154. 'meta': {'status': status}
  155. }
  156. yield {
  157. 'type_instance': 'nodes_percent',
  158. 'values': 100.0 * cnt / nodes_total,
  159. 'meta': {'status': status}
  160. }
  161. # Report the number of resources per status
  162. # Clone resources can run on multipe nodes while "simple" resources run
  163. # only one node at the same time
  164. aggregated_resources = defaultdict(Counter)
  165. resources = root.find('resources')
  166. for resource_id, resource_name in self.resources.iteritems():
  167. resource_elts = []
  168. simple_resource = None
  169. clone_resource = resources.find(
  170. 'clone/resource[@id="{}"]/..'.format(resource_id))
  171. if not clone_resource:
  172. simple_resource = resources.find('resource[@id="{}"]'.format(
  173. resource_id))
  174. if simple_resource:
  175. resource_elts = [simple_resource]
  176. else:
  177. resource_elts = clone_resource.findall('resource')
  178. if not resource_elts:
  179. self.logger.error("{}: Couldn't find resource '{}'".format(
  180. self.plugin, resource_id))
  181. continue
  182. total = 0
  183. for item in resource_elts:
  184. total += 1
  185. if (item.get('role') in ('Slave', 'Master') and
  186. not str_to_bool(item.get('failed'))):
  187. # Multi-master resource
  188. aggregated_resources[resource_name]['up'] += 1
  189. elif item.get('role') == 'Started':
  190. aggregated_resources[resource_name]['up'] += 1
  191. else:
  192. aggregated_resources[resource_name]['down'] += 1
  193. if simple_resource:
  194. # Report on which node the "simple" resource is running
  195. for node in cluster_nodes:
  196. yield {
  197. 'type_instance': 'local_resource_active',
  198. 'values': str_to_boolint(
  199. node == simple_resource.find('node').get('name')),
  200. 'meta': {'resource': resource_name,
  201. 'host': shorten_hostname(node)}
  202. }
  203. for status in ('up', 'down'):
  204. cnt = aggregated_resources[resource_name][status]
  205. yield {
  206. 'type_instance': 'resource_count',
  207. 'values': cnt,
  208. 'meta': {'status': status, 'resource': resource_name}
  209. }
  210. yield {
  211. 'type_instance': 'resource_percent',
  212. 'values': 100.0 * cnt / total,
  213. 'meta': {'status': status, 'resource': resource_name}
  214. }
  215. # Collect operations' history metrics for the monitored resources
  216. #
  217. # The reported count for the resource's operations is an approximate
  218. # value because crm_mon doesn't provide the exact number. To estimate
  219. # the number of operations applied to a resource, the plugin keeps a
  220. # copy of call_ids and compares it with the current value.
  221. for node in root.find('node_history').iter('node'):
  222. hostname = shorten_hostname(node.get('name'))
  223. if hostname not in self.history:
  224. self.history[hostname] = {}
  225. for resource_id, resource_name in self.resources.iteritems():
  226. if resource_id not in self.history[hostname]:
  227. self.history[hostname][resource_id] = {
  228. 'fail_count': 0,
  229. 'ops_count': 0,
  230. 'call_ids': Set([])
  231. }
  232. v = self.history[hostname][resource_id]
  233. res_history = node.find('resource_history[@id="{}"]'.format(
  234. resource_id))
  235. if res_history:
  236. # For simple resources, the resource_history element only
  237. # exists for the node that runs the resource
  238. v['fail_count'] += int(res_history.get('fail-count', 0))
  239. call_ids = Set([
  240. i.get('call') for i in res_history.findall(
  241. 'operation_history')])
  242. if call_ids:
  243. v['ops_count'] += len(call_ids - v['call_ids'])
  244. v['call_ids'] = call_ids
  245. yield {
  246. 'type_instance': 'resource_failures',
  247. 'values': v['fail_count'],
  248. 'meta': {'resource': resource_name, 'host': hostname}
  249. }
  250. yield {
  251. 'type_instance': 'resource_operations',
  252. 'values': v['ops_count'],
  253. 'meta': {'resource': resource_name, 'host': hostname}
  254. }
  255. plugin = CrmMonitorPlugin(collectd)
  256. def init_callback():
  257. plugin.restore_sigchld()
  258. def config_callback(conf):
  259. plugin.config_callback(conf)
  260. def read_callback():
  261. plugin.read_callback()
  262. collectd.register_config(config_callback)
  263. collectd.register_read(read_callback)