Fix the GSE filter with regard to Pacemaker metrics

With the recent refactoring [1] of the Pacemaker collectd plugin, the
GSE filter may receive Pacemaker metrics from the other nodes of the
cluster. The Heka filter needs to be updated to discard these messages;
otherwise, the GSE filter flaps between the active and inactive states.

[1] I8b5b987704f69c6a60b13e8ea982f27924f488d1

Change-Id: I6047da6ec5d28f22d309f1858bfbf5d3558cfcb4
Closes-Bug: #1616860
This commit is contained in:
Simon Pasquier
2016-08-25 16:45:53 +02:00
parent 16b4b02d9e
commit eb9f36fa63
4 changed files with 21 additions and 28 deletions

View File

@@ -286,6 +286,7 @@ lma_collector:
- name: 'rabbitmq-pacemaker-down'
description: 'The RabbitMQ cluster is down because less than half of the nodes are up'
severity: 'down'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
logical_operator: 'and'
@@ -299,15 +300,10 @@ lma_collector:
window: 60
periods: 0
function: last
- metric: pacemaker_local_dc_active
relational_operator: '=='
threshold: 1
window: 60
periods: 0
function: last
- name: 'rabbitmq-pacemaker-warning'
description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
severity: 'warning'
no_data_policy: 'skip' # the metric is only collected from the DC node
enabled: 'true'
trigger:
logical_operator: 'and'
@@ -321,12 +317,6 @@ lma_collector:
window: 60
periods: 0
function: last
- metric: pacemaker_local_dc_active
relational_operator: '=='
threshold: 1
window: 60
periods: 0
function: last
- name: 'apache-warning'
description: 'There is no Apache idle workers available'
severity: 'warning'

View File

@@ -150,24 +150,21 @@ class CrmMonitorPlugin(base.Base):
yield {
'type_instance': 'node_status',
'values': MAINTENANCE_STATUS,
'hostname': hostname,
'meta': {'status': 'maintenance'}
'meta': {'status': 'maintenance', 'host': hostname}
}
else:
aggregated_nodes_status['online'] += 1
yield {
'type_instance': 'node_status',
'values': ONLINE_STATUS,
'hostname': hostname,
'meta': {'status': 'online'}
'meta': {'status': 'online', 'host': hostname}
}
else:
aggregated_nodes_status['offline'] += 1
yield {
'type_instance': 'node_status',
'values': OFFLINE_STATUS,
'hostname': hostname,
'meta': {'status': 'offline'}
'meta': {'status': 'offline', 'host': hostname}
}
for status, cnt in aggregated_nodes_status.items():
@@ -224,8 +221,8 @@ class CrmMonitorPlugin(base.Base):
'type_instance': 'local_resource_active',
'values': str_to_boolint(
node == simple_resource.find('node').get('name')),
'hostname': shorten_hostname(node),
'meta': {'resource': resource_name}
'meta': {'resource': resource_name,
'host': shorten_hostname(node)}
}
for status in ('up', 'down'):
@@ -277,14 +274,12 @@ class CrmMonitorPlugin(base.Base):
yield {
'type_instance': 'resource_failures',
'values': v['fail_count'],
'hostname': hostname,
'meta': {'resource': resource_name}
'meta': {'resource': resource_name, 'host': hostname}
}
yield {
'type_instance': 'resource_operations',
'values': v['ops_count'],
'hostname': hostname,
'meta': {'resource': resource_name}
'meta': {'resource': resource_name, 'host': hostname}
}

View File

@@ -340,6 +340,10 @@ function process_message ()
if #t > 0 then
msg['Fields']['tag_fields'] = t
end
if sample['meta'] and sample['meta']['host'] then
msg['Fields']['hostname'] = sample['meta']['host']
end
elseif metric_source == 'users' then
-- 'users' is a reserved name for InfluxDB v0.9
msg['Fields']['name'] = 'logged_users'

View File

@@ -50,10 +50,14 @@ function process_message()
local name = read_message('Fields[name]')
local hostname = read_message('Fields[hostname]')
if name and name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__management' then
if read_message('Fields[value]') == 1 then
is_active = true
else
is_active = false
-- Skip pacemaker_local_resource_active metrics that don't
-- concern the local node
if read_message('Hostname') == hostname then
if read_message('Fields[value]') == 1 then
is_active = true
else
is_active = false
end
end
return 0
end