Fix the GSE filter with respect to Pacemaker metrics
With the recent refactoring [1] of the Pacemaker collectd plugin, the GSE filter may receive Pacemaker metrics from the other nodes of the cluster. The Heka filter needs to be updated to discard these messages; otherwise, the GSE filter flaps between the active and inactive states. [1] I8b5b987704f69c6a60b13e8ea982f27924f488d1 Change-Id: I6047da6ec5d28f22d309f1858bfbf5d3558cfcb4 Closes-Bug: #1616860
This commit is contained in:
@@ -286,6 +286,7 @@ lma_collector:
|
||||
- name: 'rabbitmq-pacemaker-down'
|
||||
description: 'The RabbitMQ cluster is down because less than half of the nodes are up'
|
||||
severity: 'down'
|
||||
no_data_policy: 'skip' # the metric is only collected from the DC node
|
||||
enabled: 'true'
|
||||
trigger:
|
||||
logical_operator: 'and'
|
||||
@@ -299,15 +300,10 @@ lma_collector:
|
||||
window: 60
|
||||
periods: 0
|
||||
function: last
|
||||
- metric: pacemaker_local_dc_active
|
||||
relational_operator: '=='
|
||||
threshold: 1
|
||||
window: 60
|
||||
periods: 0
|
||||
function: last
|
||||
- name: 'rabbitmq-pacemaker-warning'
|
||||
description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
|
||||
severity: 'warning'
|
||||
no_data_policy: 'skip' # the metric is only collected from the DC node
|
||||
enabled: 'true'
|
||||
trigger:
|
||||
logical_operator: 'and'
|
||||
@@ -321,12 +317,6 @@ lma_collector:
|
||||
window: 60
|
||||
periods: 0
|
||||
function: last
|
||||
- metric: pacemaker_local_dc_active
|
||||
relational_operator: '=='
|
||||
threshold: 1
|
||||
window: 60
|
||||
periods: 0
|
||||
function: last
|
||||
- name: 'apache-warning'
|
||||
description: 'There is no Apache idle workers available'
|
||||
severity: 'warning'
|
||||
|
||||
@@ -150,24 +150,21 @@ class CrmMonitorPlugin(base.Base):
|
||||
yield {
|
||||
'type_instance': 'node_status',
|
||||
'values': MAINTENANCE_STATUS,
|
||||
'hostname': hostname,
|
||||
'meta': {'status': 'maintenance'}
|
||||
'meta': {'status': 'maintenance', 'host': hostname}
|
||||
}
|
||||
else:
|
||||
aggregated_nodes_status['online'] += 1
|
||||
yield {
|
||||
'type_instance': 'node_status',
|
||||
'values': ONLINE_STATUS,
|
||||
'hostname': hostname,
|
||||
'meta': {'status': 'online'}
|
||||
'meta': {'status': 'online', 'host': hostname}
|
||||
}
|
||||
else:
|
||||
aggregated_nodes_status['offline'] += 1
|
||||
yield {
|
||||
'type_instance': 'node_status',
|
||||
'values': OFFLINE_STATUS,
|
||||
'hostname': hostname,
|
||||
'meta': {'status': 'offline'}
|
||||
'meta': {'status': 'offline', 'host': hostname}
|
||||
}
|
||||
|
||||
for status, cnt in aggregated_nodes_status.items():
|
||||
@@ -224,8 +221,8 @@ class CrmMonitorPlugin(base.Base):
|
||||
'type_instance': 'local_resource_active',
|
||||
'values': str_to_boolint(
|
||||
node == simple_resource.find('node').get('name')),
|
||||
'hostname': shorten_hostname(node),
|
||||
'meta': {'resource': resource_name}
|
||||
'meta': {'resource': resource_name,
|
||||
'host': shorten_hostname(node)}
|
||||
}
|
||||
|
||||
for status in ('up', 'down'):
|
||||
@@ -277,14 +274,12 @@ class CrmMonitorPlugin(base.Base):
|
||||
yield {
|
||||
'type_instance': 'resource_failures',
|
||||
'values': v['fail_count'],
|
||||
'hostname': hostname,
|
||||
'meta': {'resource': resource_name}
|
||||
'meta': {'resource': resource_name, 'host': hostname}
|
||||
}
|
||||
yield {
|
||||
'type_instance': 'resource_operations',
|
||||
'values': v['ops_count'],
|
||||
'hostname': hostname,
|
||||
'meta': {'resource': resource_name}
|
||||
'meta': {'resource': resource_name, 'host': hostname}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -340,6 +340,10 @@ function process_message ()
|
||||
if #t > 0 then
|
||||
msg['Fields']['tag_fields'] = t
|
||||
end
|
||||
|
||||
if sample['meta'] and sample['meta']['host'] then
|
||||
msg['Fields']['hostname'] = sample['meta']['host']
|
||||
end
|
||||
elseif metric_source == 'users' then
|
||||
-- 'users' is a reserved name for InfluxDB v0.9
|
||||
msg['Fields']['name'] = 'logged_users'
|
||||
|
||||
@@ -50,10 +50,14 @@ function process_message()
|
||||
local name = read_message('Fields[name]')
|
||||
local hostname = read_message('Fields[hostname]')
|
||||
if name and name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__management' then
|
||||
if read_message('Fields[value]') == 1 then
|
||||
is_active = true
|
||||
else
|
||||
is_active = false
|
||||
-- Skip pacemaker_local_resource_active metrics that don't
|
||||
-- concern the local node
|
||||
if read_message('Hostname') == hostname then
|
||||
if read_message('Fields[value]') == 1 then
|
||||
is_active = true
|
||||
else
|
||||
is_active = false
|
||||
end
|
||||
end
|
||||
return 0
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user