Fix the GSE filter wrt Pacemaker metrics

With the recent refactoring [1] of the Pacemaker collectd plugin, the GSE filter may receive Pacemaker metrics from the other nodes of the cluster. The Heka filter needs to be updated to discard these messages; otherwise, the GSE filter flaps between the active and inactive states.

[1] I8b5b987704f69c6a60b13e8ea982f27924f488d1

Change-Id: I6047da6ec5d28f22d309f1858bfbf5d3558cfcb4
Closes-Bug: #1616860
2016-08-25 16:45:53 +02:00
parent 16b4b02d9e
commit eb9f36fa63
4 changed files with 21 additions and 28 deletions
--- a/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
+++ b/deployment_scripts/puppet/modules/fuel_lma_collector/templates/alarming.yaml.erb
@@ -286,6 +286,7 @@ lma_collector:
    - name: 'rabbitmq-pacemaker-down'
      description: 'The RabbitMQ cluster is down because less than half of the nodes are up'
      severity: 'down'
+      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        logical_operator: 'and'
@@ -299,15 +300,10 @@ lma_collector:
            window: 60
            periods: 0
            function: last
-          - metric: pacemaker_local_dc_active
-            relational_operator: '=='
-            threshold: 1
-            window: 60
-            periods: 0
-            function: last
    - name: 'rabbitmq-pacemaker-warning'
      description: 'The RabbitMQ cluster is degraded because some RabbitMQ nodes are missing'
      severity: 'warning'
+      no_data_policy: 'skip' # the metric is only collected from the DC node
      enabled: 'true'
      trigger:
        logical_operator: 'and'
@@ -321,12 +317,6 @@ lma_collector:
            window: 60
            periods: 0
            function: last
-          - metric: pacemaker_local_dc_active
-            relational_operator: '=='
-            threshold: 1
-            window: 60
-            periods: 0
-            function: last
    - name: 'apache-warning'
      description: 'There is no Apache idle workers available'
      severity: 'warning'
--- a/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py
+++ b/deployment_scripts/puppet/modules/lma_collector/files/collectd/collectd_pacemaker.py
@@ -150,24 +150,21 @@ class CrmMonitorPlugin(base.Base):
                    yield {
                        'type_instance': 'node_status',
                        'values': MAINTENANCE_STATUS,
-                        'hostname': hostname,
-                        'meta': {'status': 'maintenance'}
+                        'meta': {'status': 'maintenance', 'host': hostname}
                    }
                else:
                    aggregated_nodes_status['online'] += 1
                    yield {
                        'type_instance': 'node_status',
                        'values': ONLINE_STATUS,
-                        'hostname': hostname,
-                        'meta': {'status': 'online'}
+                        'meta': {'status': 'online', 'host': hostname}
                    }
            else:
                aggregated_nodes_status['offline'] += 1
                yield {
                    'type_instance': 'node_status',
                    'values': OFFLINE_STATUS,
-                    'hostname': hostname,
-                    'meta': {'status': 'offline'}
+                    'meta': {'status': 'offline', 'host': hostname}
                }

        for status, cnt in aggregated_nodes_status.items():
@@ -224,8 +221,8 @@ class CrmMonitorPlugin(base.Base):
                        'type_instance': 'local_resource_active',
                        'values': str_to_boolint(
                            node == simple_resource.find('node').get('name')),
-                        'hostname': shorten_hostname(node),
-                        'meta': {'resource': resource_name}
+                        'meta': {'resource': resource_name,
+                                 'host': shorten_hostname(node)}
                    }

            for status in ('up', 'down'):
@@ -277,14 +274,12 @@ class CrmMonitorPlugin(base.Base):
                yield {
                    'type_instance': 'resource_failures',
                    'values': v['fail_count'],
-                    'hostname': hostname,
-                    'meta': {'resource': resource_name}
+                    'meta': {'resource': resource_name, 'host': hostname}
                }
                yield {
                    'type_instance': 'resource_operations',
                    'values': v['ops_count'],
-                    'hostname': hostname,
-                    'meta': {'resource': resource_name}
+                    'meta': {'resource': resource_name, 'host': hostname}
                }


--- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua
+++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/decoders/collectd.lua
@@ -340,6 +340,10 @@ function process_message ()
                if #t > 0 then
                    msg['Fields']['tag_fields'] = t
                end
+
+                if sample['meta'] and sample['meta']['host'] then
+                    msg['Fields']['hostname'] = sample['meta']['host']
+                end
            elseif metric_source ==  'users' then
                -- 'users' is a reserved name for InfluxDB v0.9
                msg['Fields']['name'] = 'logged_users'
--- a/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/gse_cluster_filter.lua
+++ b/deployment_scripts/puppet/modules/lma_collector/files/plugins/filters/gse_cluster_filter.lua
@@ -50,10 +50,14 @@ function process_message()
    local name = read_message('Fields[name]')
    local hostname = read_message('Fields[hostname]')
    if name and name == 'pacemaker_local_resource_active' and read_message("Fields[resource]") == 'vip__management' then
-        if read_message('Fields[value]') == 1 then
-            is_active = true
-        else
-            is_active = false
+        -- Skip pacemaker_local_resource_active metrics that don't
+        -- concern the local node
+        if read_message('Hostname') == hostname then
+            if read_message('Fields[value]') == 1 then
+                is_active = true
+            else
+                is_active = false
+            end
        end
        return 0
    end