---
lma_collector:
# The GSE policies are applied to clusters and describe how the GSE
# cluster filter computes the cluster's status. The LMA toolchain is
# pre-configured with a default set of policies but everything can be
# customized: thresholds can be modified, new policies can be defined, and so on.
#
# A policy consists of a list of rules which are evaluated against the
# current status of the cluster's members. When one of the rules matches, the
# cluster's status takes the value associated with that rule and the
# evaluation stops. The last rule of the list is usually a catch-all rule
# that defines the default status when no other rule matches.
#
# The declaration of a policy rule is similar to an alarm rule except that:
# - there are no 'metric', 'window' and 'period' parameters.
# - there are only two supported functions:
#   - 'count', which returns the number of members that match the passed value(s).
#   - 'percent', which returns the percentage of members that match the passed value(s).
#
# The following rule definition reads as "the cluster's status is critical if
# more than 50% of its members are either down or critical":
#
# - status: critical
# trigger:
# logical_operator: or
# rules:
# - function: percent
# arguments: [ down, critical ]
# relational_operator: '>'
# threshold: 50
#
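# For comparison, a rule using the 'count' function (such as the first rule
# of the 'highest_severity' policy below) reads as "the cluster's status is
# down if at least one member is down":
#
# - status: down
#   trigger:
#     logical_operator: or
#     rules:
#       - function: count
#         arguments: [ down ]
#         relational_operator: '>'
#         threshold: 0
#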
gse_policies:
# A policy defining that the cluster's status depends on the member with the
# highest severity, typically used for a cluster of services.
highest_severity:
- status: down
trigger:
logical_operator: or
rules:
- function: count
arguments: [ down ]
relational_operator: '>'
threshold: 0
- status: critical
trigger:
logical_operator: or
rules:
- function: count
arguments: [ critical ]
relational_operator: '>'
threshold: 0
- status: warning
trigger:
logical_operator: or
rules:
- function: count
arguments: [ warning ]
relational_operator: '>'
threshold: 0
- status: okay
trigger:
logical_operator: or
rules:
- function: count
arguments: [ okay ]
relational_operator: '>'
threshold: 0
- status: unknown
# A policy which is typically used for clusters managed by Pacemaker
# with the no-quorum-policy set to 'stop'.
majority_of_members:
- status: down
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ down ]
relational_operator: '>'
threshold: 50
- status: critical
trigger:
logical_operator: and
rules:
- function: percent
arguments: [ down, critical ]
relational_operator: '>'
threshold: 20
- function: percent
arguments: [ okay ]
relational_operator: '<'
threshold: 50
- status: warning
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '<'
threshold: 50
- status: okay
# A policy that is used to derive a cluster status based
# on the okay or down status of its members.
availability_of_members:
- status: down
trigger:
logical_operator: or
rules:
- function: count
arguments: [ okay ]
relational_operator: '=='
threshold: 0
- status: critical
trigger:
logical_operator: and
rules:
- function: count
arguments: [ okay ]
relational_operator: '=='
threshold: 1
- function: count
arguments: [ critical, down ]
relational_operator: '>'
threshold: 1
- status: warning
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '<'
threshold: 100
- status: okay
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '=='
threshold: 100
- status: unknown
# A policy that is used to derive a cluster status based
# on the health status of its members.
status_of_members:
- status: down
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ down ]
relational_operator: '=='
threshold: 100
- status: critical
trigger:
logical_operator: and
rules:
- function: count
arguments: [ okay, warning ]
relational_operator: '<='
threshold: 1
- function: count
arguments: [ critical, down, unknown ]
relational_operator: '>'
threshold: 0
- status: warning
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '!='
threshold: 100
- status: okay
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ okay ]
relational_operator: '=='
threshold: 100
- status: unknown
# A policy that is typically used for storage or compute clusters.
majority_of_node_members:
- status: down
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ down ]
relational_operator: '=='
threshold: 100
- status: critical
trigger:
logical_operator: and
rules:
- function: percent
arguments: [ down, critical ]
relational_operator: '>='
threshold: 50
- status: warning
trigger:
logical_operator: or
rules:
- function: percent
arguments: [ down, critical, warning, unknown ]
relational_operator: '>'
threshold: 0
- status: okay
gse_cluster_service:
input_message_types:
- afd_service_metric
aggregator_flag: true
# the field in the input messages to identify the cluster
cluster_field: service
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_service_cluster_metric
output_metric_name: cluster_service_status
interval: 10
warm_up_period: 20
alerting: enabled
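# For instance, with the settings above, an 'afd_service_metric' message
# whose 'service' field is 'nova-api' and whose 'source' field is 'backends'
# is routed to the 'nova-api' cluster defined below and updates the status of
# its 'backends' member.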
clusters:
nova-logs:
policy: highest_severity
group_by: hostname
members:
- error
nova-logs-compute:
policy: highest_severity
group_by: hostname
members:
- error
heat-logs:
policy: highest_severity
group_by: hostname
members:
- error
cinder-logs:
policy: highest_severity
group_by: hostname
members:
- error
glance-logs:
policy: highest_severity
group_by: hostname
members:
- error
neutron-logs:
policy: highest_severity
group_by: hostname
members:
- error
neutron-logs-compute:
policy: highest_severity
group_by: hostname
members:
- error
keystone-logs:
policy: highest_severity
group_by: hostname
members:
- error
mysqld-tcp:
policy: highest_severity
group_by: member
members:
- backends
mysql:
policy: majority_of_members
group_by: hostname
members:
- node-status
- check
haproxy-openstack:
policy: availability_of_members
group_by: hostname
members:
- check
apache:
policy: availability_of_members
group_by: hostname
members:
- worker
- check
memcached-service:
policy: availability_of_members
group_by: hostname
members:
- check
rabbitmq-cluster:
policy: highest_severity
group_by: member
members:
- memory
- disk
- queue
- pacemaker
rabbitmq-service:
# A check failure on a single node doesn't mean that the whole cluster
# is down; this is why a 'hostname' group_by and the 'majority_of_members'
# policy are used here.
policy: majority_of_members
group_by: hostname
members:
- check
nova-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
nova-api-check:
policy: highest_severity
group_by: member
members:
- vip
nova-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
nova-novncproxy-websocket:
policy: highest_severity
group_by: member
members:
- backends
nova-metadata-api:
policy: highest_severity
group_by: member
members:
- backends
nova-scheduler:
policy: highest_severity
group_by: member
members:
- workers
nova-cert:
policy: highest_severity
group_by: member
members:
- workers
nova-consoleauth:
policy: highest_severity
group_by: member
members:
- workers
nova-compute:
policy: highest_severity
group_by: member
members:
- workers
nova-conductor:
policy: highest_severity
group_by: member
members:
- workers
# The AFDs are emitted by all the controller nodes because all of them
# collect the openstack_nova_instance_creation_time metric. Hence the
# service is considered not okay when a majority of controllers report a
# not-okay status.
nova-instances:
policy: majority_of_members
group_by: hostname
members:
- creation-time
cinder-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
cinder-api-check:
policy: highest_severity
group_by: member
members:
- vip
cinder-v2-api-check:
policy: highest_severity
group_by: member
members:
# Cinder V2 backends are in fact the same as the Cinder backends
- vip
cinder-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
cinder-scheduler:
policy: highest_severity
group_by: member
members:
- workers
cinder-volume:
policy: highest_severity
group_by: member
members:
- workers
neutron-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
neutron-api-check:
policy: highest_severity
group_by: member
members:
- vip
neutron-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
neutron-l3:
policy: highest_severity
group_by: member
members:
- workers
neutron-dhcp:
policy: highest_severity
group_by: member
members:
- workers
neutron-metadata:
policy: highest_severity
group_by: member
members:
- workers
<% if not @contrail_plugin then -%>
neutron-openvswitch:
policy: highest_severity
group_by: member
members:
- workers
<% end -%>
keystone-response-time:
policy: highest_severity
group_by: member
members:
- duration
keystone-public-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
keystone-public-api-check:
policy: highest_severity
group_by: member
members:
- vip
keystone-public-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
keystone-admin-api:
policy: highest_severity
group_by: member
members:
# TODO(pasquier-s): add a metric reporting the status of the keystone-admin-api vip
- backends
- http_errors
glance-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
glance-api-check:
policy: highest_severity
group_by: member
members:
- vip
glance-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
glance-registry-api:
policy: highest_severity
group_by: member
members:
- backends
heat-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
heat-api-check:
policy: highest_severity
group_by: member
members:
- vip
heat-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
heat-cfn-api:
policy: highest_severity
group_by: member
members:
- backends
heat-cfn-api-check:
policy: highest_severity
group_by: member
members:
- vip
heat-cfn-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
heat-cloudwatch-api:
policy: highest_severity
group_by: member
members:
- backends
<% if @tls_enabled then -%>
horizon-https:
<% else -%>
horizon-web:
<% end -%>
policy: highest_severity
group_by: member
members:
- backends
- http_errors
<% if not @storage_options["objects_ceph"] then -%>
swift-api:
policy: highest_severity
group_by: member
members:
- backends
- http_errors
swift-api-check:
policy: highest_severity
group_by: member
members:
- vip
swift-s3-api-check:
policy: highest_severity
group_by: member
members:
# Swift S3 backends are in fact the same as the Swift backends
- vip
swift-api-endpoint:
policy: availability_of_members
group_by: hostname
members:
- endpoint
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer-api:
policy: highest_severity
group_by: member
members:
- backends
ceilometer-api-check:
policy: highest_severity
group_by: member
members:
- vip
<% end -%>
<% if @storage_options["volumes_ceph"] then -%>
ceph-mon-cluster:
policy: highest_severity
group_by: member
members:
- health
- capacity
ceph-mon-service:
policy: availability_of_members
group_by: hostname
members:
- check
ceph-osd-service:
policy: availability_of_members
group_by: hostname
members:
- check
<% end -%>
<% if @monitor_elasticsearch -%>
elasticsearch-cluster:
policy: highest_severity
group_by: member
members:
- health
elasticsearch-service:
policy: availability_of_members
group_by: hostname
members:
- check
<% end -%>
<% if @monitor_influxdb -%>
influxdb-api-check:
policy: highest_severity
group_by: member
members:
- vip
influxdb-service:
policy: availability_of_members
group_by: hostname
members:
- check
<% end -%>
pacemaker-service:
policy: availability_of_members
group_by: hostname
members:
- check
libvirt-service:
policy: majority_of_node_members
group_by: hostname
members:
- check
nova-free-vcpu:
policy: highest_severity
group_by: member
members:
- nova-free-vcpu
nova-free-memory:
policy: highest_severity
group_by: member
members:
- nova-free-memory
gse_cluster_node:
input_message_types:
- afd_node_metric
aggregator_flag: true
# the field in the input messages to identify the cluster
cluster_field: node_role
# the field in the input messages to identify the cluster's member
member_field: source
output_message_type: gse_node_cluster_metric
output_metric_name: cluster_node_status
interval: 10
warm_up_period: 80
alerting: enabled
clusters:
controller:
policy: status_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- log-fs
- other-fs
- swap
- hdd-errors
<% if @detach_rabbitmq_enabled -%>
rabbitmq-nodes:
policy: status_of_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- other-fs
- swap
- hdd-errors
<% end -%>
mysql-nodes:
policy: status_of_members
group_by: hostname
members:
<% if @detach_database_enabled -%>
- cpu
- network-rx
- network-tx
- root-fs
- other-fs
- swap
- hdd-errors
<% end -%>
- mysql-fs
compute:
policy: majority_of_node_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- nova-fs
- other-fs
- swap
- hdd-errors
storage:
policy: majority_of_node_members
group_by: hostname
members:
- cpu
- network-rx
- network-tx
- root-fs
- other-fs
- swap
- hdd-errors
<% if @storage_options["volumes_ceph"] then -%>
- osd-disk
<% end -%>
<% if @monitor_elasticsearch -%>
elasticsearch-nodes:
policy: status_of_members
group_by: hostname
members:
- data-fs
- root-fs
- cpu
- network-rx
- network-tx
- swap
- hdd-errors
<% end -%>
<% if @monitor_influxdb -%>
influxdb-nodes:
policy: status_of_members
group_by: hostname
members:
- data-fs
- root-fs
- cpu
- network-rx
- network-tx
- swap
- hdd-errors
<% end -%>
gse_cluster_global:
input_message_types:
- gse_service_cluster_metric
- gse_node_cluster_metric
aggregator_flag: false
# the field in the input messages to identify the cluster's member
member_field: cluster_name
output_message_type: gse_cluster_metric
output_metric_name: cluster_status
interval: 10
warm_up_period: 30
alerting: enabled_with_notification
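# Note that there is no 'cluster_field' at this level: an input message is
# associated with the cluster(s) that list its 'cluster_name' value among
# their members. For instance, a 'gse_service_cluster_metric' message whose
# 'cluster_name' field is 'nova-api' contributes to the 'nova-control-plane'
# cluster defined below.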
clusters:
mysql:
policy: highest_severity
group_by: member
members:
- mysqld-tcp
- mysql
haproxy-openstack:
policy: highest_severity
group_by: member
members:
- haproxy-openstack
apache:
policy: highest_severity
group_by: member
members:
- apache
memcached:
policy: highest_severity
group_by: member
members:
- memcached-service
rabbitmq:
policy: highest_severity
group_by: member
members:
- rabbitmq-cluster
- rabbitmq-service
nova-control-plane:
policy: highest_severity
group_by: member
members:
- nova-logs
- nova-api # backends and http errors
- nova-api-check # vip check
- nova-api-endpoint # local check
- nova-metadata-api # backends
- nova-scheduler # worker
- nova-conductor # worker
- nova-cert # worker
- nova-consoleauth # worker
- nova-novncproxy-websocket # backends
hints:
- cinder-control-plane
- glance
- keystone
- neutron-control-plane
- mysql
- rabbitmq
nova-data-plane:
policy: highest_severity
group_by: member
members:
- nova-logs-compute
- nova-compute
- nova-instances
- libvirt-service
- nova-free-vcpu
- nova-free-memory
hints:
- neutron-data-plane
cinder-control-plane:
policy: highest_severity
group_by: member
members:
- cinder-logs
- cinder-api
- cinder-api-check
- cinder-v2-api-check
- cinder-api-endpoint
- cinder-scheduler
hints:
- keystone
- mysql
- rabbitmq
<% if not @storage_options["volumes_ceph"] then -%>
cinder-data-plane:
policy: highest_severity
group_by: member
members:
- cinder-volume-logs
- cinder-volume
<% end -%>
neutron-control-plane:
policy: highest_severity
group_by: member
members:
- neutron-logs
- neutron-api
- neutron-api-check
- neutron-api-endpoint
<% if not @contrail_plugin then -%>
- neutron-l3
- neutron-dhcp
- neutron-metadata
- neutron-openvswitch
<% end -%>
hints:
- keystone
- mysql
- rabbitmq
neutron-data-plane:
policy: highest_severity
group_by: member
members:
- neutron-logs-compute
<% if not @contrail_plugin then -%>
- neutron-openvswitch
<% end -%>
keystone:
policy: highest_severity
group_by: member
members:
- keystone-logs
- keystone-response-time
- keystone-public-api
- keystone-public-api-check
- keystone-public-api-endpoint
- keystone-admin-api
hints:
- mysql
- apache
glance:
policy: highest_severity
group_by: member
members:
- glance-logs
- glance-api
- glance-api-check
- glance-api-endpoint
- glance-registry-api
hints:
- keystone
- mysql
heat:
policy: highest_severity
group_by: member
members:
- heat-logs
- heat-api
- heat-api-check
- heat-api-endpoint
- heat-cfn-api
- heat-cfn-api-check
- heat-cfn-api-endpoint
- heat-cloudwatch-api
hints:
- cinder-control-plane
- glance
- keystone
- neutron-control-plane
- nova-control-plane
- mysql
- rabbitmq
horizon:
policy: highest_severity
group_by: member
members:
<% if @tls_enabled then -%>
- horizon-https
<% else -%>
- horizon-web
<% end -%>
hints:
- keystone
- apache
<% if not @storage_options["objects_ceph"] then -%>
swift:
policy: highest_severity
group_by: member
members:
- swift-api
- swift-api-check
- swift-s3-api-check
- swift-api-endpoint
hints:
- keystone
<% end -%>
<% if @ceilometer_enabled -%>
ceilometer:
policy: highest_severity
group_by: member
members:
- ceilometer-api
- ceilometer-api-check
hints:
- keystone
- rabbitmq
<% end -%>
<% if @storage_options["volumes_ceph"] then -%>
ceph:
policy: highest_severity
group_by: member
members:
- ceph-mon-cluster
- ceph-mon-service
- ceph-osd-service
<% end -%>
<% if @monitor_elasticsearch -%>
elasticsearch:
policy: highest_severity
group_by: member
members:
- elasticsearch-cluster
- elasticsearch-service
<% end -%>
<% if @monitor_influxdb -%>
influxdb:
policy: highest_severity
group_by: member
members:
- influxdb-api-check
- influxdb-service
<% end -%>
pacemaker:
policy: highest_severity
group_by: member
members:
- pacemaker-service