vitrage/vitrage/datasources/prometheus/driver.py

427 lines
15 KiB
Python

# Copyright 2018 - Nokia
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import json
import socket
from collections import namedtuple
from ipaddress import ip_address
from oslo_config import cfg
from oslo_log import log
import requests
from urllib import parse as urlparse
from vitrage.common.constants import DatasourceAction
from vitrage.common.constants import DatasourceOpts as DSOpts
from vitrage.common.constants import DatasourceProperties as DSProps
from vitrage.common.constants import EventProperties as EProps
from vitrage.datasources.alarm_driver_base import AlarmDriverBase
from vitrage.datasources.prometheus import PROMETHEUS_DATASOURCE
from vitrage.datasources.prometheus.properties import get_alarm_update_time
from vitrage.datasources.prometheus.properties import get_label
from vitrage.datasources.prometheus.properties import PrometheusAlertLabels \
as PAlertLabels
from vitrage.datasources.prometheus.properties \
import PrometheusAlertProperties as PAlertProps
from vitrage.datasources.prometheus.properties import PrometheusAlertStatus \
as PAlertStatus
from vitrage.datasources.prometheus.properties \
import PrometheusConfigFileProperties as PCFProps
from vitrage.datasources.prometheus.properties \
import PrometheusDatasourceProperties as PDProps
from vitrage.datasources.prometheus.properties \
import PrometheusGetAllProperties as PGAProps
from vitrage.datasources.prometheus.properties \
import PrometheusProperties as PProps
from vitrage import os_clients
from vitrage.utils import file as file_utils
# Global oslo.config object; this module reads its options from the
# ``prometheus`` group (e.g. ``CONF.prometheus.alertmanager_url``).
CONF = cfg.CONF
# Module-level logger.
LOG = log.getLogger(__name__)
# Event type advertised by ``PrometheusDriver.get_event_types()`` and attached
# to the alerts that are fetched from Alertmanager.
PROMETHEUS_EVENT_TYPE = 'prometheus.alarm'
class PrometheusDriver(AlarmDriverBase):
    r"""Handle Prometheus events.

    Prometheus driver uses a configuration file that maps
    the Prometheus alert labels to a corresponding Vitrage resource
    with specific properties (id or other unique properties).
    The mapping will most likely be defined by the alert name and other
    fields.

    Prometheus configuration file structure:
    The configuration file contains a list of alerts.
    Each alert contains key and resource.
    The key contains labels which uniquely identify each alert.
    The resource specifies how to identify in Vitrage the resource that
    the alert is on. It contains one or more Vitrage property names and
    corresponding Prometheus alert labels.

    Example:
    ^^^^^^^^

    Prometheus event's details:
    ---------------------------

    {
      "status": "firing",
      "version": "4",
      "groupLabels": {
        "alertname": "HighCpuOnVmAlert"
      },
      "commonAnnotations": {
        "description": "Test alert to test libvirt exporter.\n",
        "title": "High cpu usage on vm"
      },
      "groupKey": "{}:{alertname=\"HighCpuOnVmAlert\"}",
      "receiver": "vitrage",
      "externalURL": "http://vitrage.is.the.best:9093",
      "alerts": [
        {
          "status": "firing",
          "labels": {
            "instance": "1.1.1.1:9999",
            "domain": "instance-00000004",
            "job": "libvirt",
            "alertname": "HighCpuOnVmAlert",
            "severity": "critical"
          },
          "endsAt": "2019-01-16T12:26:05.91446215Z",
          "generatorURL": "http://seriously.vitrage.is.the.best",
          "startsAt": "2019-01-16T12:11:50.91446215Z",
          "annotations": {
            "description": "Test alert to test libvirt exporter.\n",
            "title": "High cpu usage on vm"
          }
        }
      ],
      "commonLabels": {
        "instance": "1.1.1.1:9999",
        "job": "libvirt",
        "severity": "critical",
        "alertname": "HighCpuOnVmAlert"
      }
    }

    prometheus_conf.yaml:
    ---------------------

    alerts:
    - key:
        alertname: HighCpuOnVmAlert
        job: libvirt
      resource:
        instance_name: domain
        host_id: instance

    `enrich_event` functions are explained based on the example above.
    """

    # Cache key of an alarm: the alert name plus the string form of the
    # alert's resource label values (see ``_alarm_key``).
    AlarmKey = namedtuple('AlarmKey', [PAlertLabels.ALERT_NAME,
                                       PCFProps.RESOURCE])
    # Class-level default; replaced per instance in ``__init__`` by the
    # content loaded from the Prometheus configuration file.
    conf_map = {}

    def __init__(self):
        """Create the driver and load the alert-to-resource mapping."""
        super(PrometheusDriver, self).__init__()
        self._client = None
        self._nova_client = None
        self.conf_map = self._configuration_mapping()

    @property
    def nova_client(self):
        """Return the Nova client, creating it on first access."""
        if not self._nova_client:
            self._nova_client = os_clients.nova_client()
        return self._nova_client

    def _vitrage_type(self):
        """Return the name of this datasource."""
        return PROMETHEUS_DATASOURCE

    def _alarm_key(self, alert):
        """Build the ``AlarmKey`` that identifies the given alert.

        :param alert: Prometheus alert
        :type alert: dict
        :return: key made of the alert name and the stringified values of
                 the alert labels that the conf file maps to a resource
        :rtype: AlarmKey
        """
        return self.AlarmKey(
            alertname=get_label(alert, PAlertLabels.ALERT_NAME),
            resource=str(self._get_resource_alert_values(alert)))

    def _is_erroneous(self, alert):
        """Tell whether the alert is currently firing.

        :param alert: Prometheus alert (may be empty or None)
        :return: True only for a non-empty alert whose status is firing
        :rtype: bool
        """
        # bool() so that an empty/None alert yields False rather than the
        # falsy alert object itself.
        return bool(alert) and \
            PAlertStatus.FIRING == alert.get(PAlertProps.STATUS)

    def _is_valid(self, alert):
        """Tell whether the alert is non-empty and carries a status.

        :param alert: Prometheus alert (may be empty or None)
        :rtype: bool
        """
        return bool(alert) and PAlertProps.STATUS in alert

    def _status_changed(self, new_alarm, old_alarm):
        """Tell whether the status differs between the two alarms.

        :rtype: bool
        """
        return new_alarm.get(PAlertProps.STATUS) != \
            old_alarm.get(PAlertProps.STATUS)

    def _get_all_alarms(self):
        """Fetch the active alerts of the configured receiver.

        Queries the configured Alertmanager url, normalizes the status of
        the returned alerts and enriches them.

        :return: the enriched alerts, or an empty list when the url or the
                 receiver is not configured or when Alertmanager does not
                 answer with an OK status code
        :rtype: list
        """
        alertmanager_url = CONF.prometheus.alertmanager_url
        receiver = CONF.prometheus.receiver
        if not alertmanager_url:
            LOG.warning('Alertmanager url is not defined')
            return []
        if not receiver:
            LOG.warning('Receiver is not defined')
            return []

        payload = {PGAProps.ACTIVE: 'true',
                   PGAProps.RECEIVER: receiver}
        # The context manager closes the session (and its pooled
        # connections) once the response has been received.
        # NOTE(review): no timeout is passed, so an unresponsive
        # Alertmanager blocks this call - consider adding one.
        with requests.Session() as session:
            response = session.get(alertmanager_url, params=payload)

        if response.status_code != requests.codes.ok:
            LOG.error('Failed to get Alertmanager data. Response code: %s',
                      response.status_code)
            return []

        body = json.loads(response.text)
        # For a 'v1' url the alerts are read from the data entry of the
        # response; otherwise the response body itself is the alert list.
        if 'v1' in alertmanager_url:
            alerts = body[PGAProps.DATA]
        else:
            alerts = body
        self._modify_alert_status(alerts)
        return self._enrich_alerts(alerts, PROMETHEUS_EVENT_TYPE)

    @staticmethod
    def _modify_alert_status(alerts):
        """Replace, in place, an 'active' status mapping by the firing status.

        Alerts whose status is missing or is not a mapping are left
        untouched.

        :param alerts: alerts as returned by Alertmanager
        :type alerts: list of dict
        """
        for alert in alerts:
            status = alert.get(PAlertProps.STATUS)
            # Guard against a missing status or one that is already a
            # plain string; both used to raise AttributeError here.
            if isinstance(status, dict) and \
                    status.get(PGAProps.STATE) == PGAProps.ACTIVE:
                alert[PAlertProps.STATUS] = PAlertStatus.FIRING

    def _get_changed_alarms(self):
        """Return an empty list; changed alarms are not polled."""
        return []

    @staticmethod
    def _configuration_mapping():
        """Load the alerts section of the Prometheus configuration file.

        :return: the value stored under the alerts key of the file, or an
                 empty dict when the file cannot be loaded or has no such
                 key (the failure is logged)
        """
        prometheus_config_file = CONF.prometheus[DSOpts.CONFIG_FILE]
        try:
            prometheus_config = \
                file_utils.load_yaml_file(prometheus_config_file)
            return prometheus_config[PCFProps.ALERTS]
        except Exception:
            LOG.exception('Failed in init the configuration file: %s',
                          prometheus_config_file)
            return {}

    def enrich_event(self, event, event_type):
        r"""Get an alert event from Prometheus and create a list of alert events

        :param event: Prometheus event.
        :param event_type: The type of the event, expected to be
                           'prometheus.alarm'.
        :return: a list of alarms, one per Prometheus alert

        For the example above. The function returns:

        {
          "status": "firing",
          "labels": {
            "instance": "1.1.1.1:9999",
            "domain": "instance-00000004",
            "job": "libvirt",
            "alertname": "HighCpuOnVmAlert",
            "severity": "critical"
          },
          "vitrage_entity_type": "prometheus",
          "endsAt": "2019-01-16T12:39:50.91446215Z",
          "generatorURL": "http://seriously.vitrage.is.the.best",
          "vitrage_datasource_name": "prometheus",
          "startsAt": "2019-01-16T12:11:50.91446215Z",
          "vitrage_datasource_action": "update",
          "vitrage_entity_unique_props": {
            "instance_name": "instance-00000004",
            "host_id": "my-host-name"
          },
          "vitrage_sample_date": "2019-01-16T13:10:33Z",
          "vitrage_event_type": "prometheus.alarm",
          "annotations": {
            "description": "Test alert to test libvirt exporter.\n",
            "title": "High cpu usage on vm"
          }
        }
        """
        LOG.debug('Going to enrich event: %s', event)

        alarms = []
        details = event.get(EProps.DETAILS)
        if details:
            alarms = self._enrich_alerts(details.get(PProps.ALERTS, []),
                                         event_type)

        LOG.debug('Enriched event. Created alert events: %s', alarms)

        return self.make_pickleable(alarms, PROMETHEUS_DATASOURCE,
                                    DatasourceAction.UPDATE)

    def _enrich_alerts(self, alerts, event_type):
        """Enrich every alert of the list, see ``_enrich_alert``.

        :return: one result of ``_enrich_alert`` per given alert
        :rtype: list
        """
        return [self._enrich_alert(alert, event_type) for alert in alerts]

    def _enrich_alert(self, alert, event_type):
        """Enrich prometheus alert.

        Adding fields to prometheus alert in order to map it to vitrage
        entity. The alert is modified in place and then passed through
        ``_filter_and_cache_alarm``.

        :param alert: Prometheus alert
        :param event_type: The type of the event, expected to be
                           'prometheus.alarm'.
        :return: the value returned by ``_filter_and_cache_alarm`` for the
                 enriched alert
        """
        alert[DSProps.EVENT_TYPE] = event_type
        alert[PDProps.ENTITY_UNIQUE_PROPS] = \
            self._calculate_vitrage_entity_unique_props(alert)

        old_alarm = self._old_alarm(alert)
        return self._filter_and_cache_alarm(
            alert, old_alarm,
            self._filter_get_erroneous,
            get_alarm_update_time(alert))

    def _calculate_vitrage_entity_unique_props(self, alert):
        """Build a vitrage entity unique props.

        The unique props are based on the alert and the conf file.

        :param alert: Prometheus alert
        :type alert: dict
        :return: Unique properties of vitrage entity
        :rtype: dict

        For the example above. The function returns:
        {'instance_name': 'instance-00000004',
         'host_id': 'my-host-name'}
        """
        resource_labels = self._get_conf_resource(alert)
        vitrage_entity_unique_props = {}
        for vitrage_label, prometheus_label in resource_labels.items():
            # NOTE(review): str() presumably turns a label that is missing
            # from the alert into the string 'None' - confirm against
            # get_label() whether that is intended.
            label_value = str(get_label(alert, prometheus_label))
            vitrage_entity_unique_props[vitrage_label] = \
                self._adjust_label_value(label_value)
        return vitrage_entity_unique_props

    def _adjust_label_value(self, label_value):
        """Adjust the given value of the alert's label

        First check if the value is ip.
        Then, get its hostname if it has one.
        If not, fetch the instance id from nova by its ip.
        Otherwise, leave the label value as is.

        :param label_value: Value of alert's label
        :type label_value: str
        :return: Adjusted label's value of the alert as described.
        :rtype: str

        For the example above. The function returns:
        - label_value='instance-00000004' it returns:'instance-00000004'
        - label_value='1.1.1.1:9999' it returns:'my-host-name'
        """
        if label_value is not None:
            try:
                # Check if the value is ip
                ip = str(self._validate_ip(label_value))
                try:
                    # Get hostname of the ip
                    entity_hostname = socket.gethostbyaddr(ip)
                    label_value = entity_hostname[0]
                except socket.error:
                    # If not ip of a host
                    nova_instance = self.nova_client.servers.list(
                        search_opts={'all_tenants': 1, 'ip': ip})
                    if nova_instance:
                        label_value = nova_instance[0].id
                    else:
                        # Unknown to nova as well: keep the bare ip,
                        # i.e. the value without its port.
                        label_value = ip
            except ValueError:
                # If not ip value, leave it as is
                pass
        return label_value

    def _get_resource_alert_values(self, alert):
        """Get values of the alert labels from alert's resource in config file.

        :param alert: Prometheus alert
        :type alert: dict
        :return: alert label name -> alert label value, for the labels
                 that the matching conf resource refers to
        :rtype: dict

        For the example above. The function returns:
        {'instance': '1.1.1.1:9999', 'domain': 'instance-00000004'}
        """
        resource_alert_labels = self._get_conf_resource(alert).values()
        return {label: get_label(alert, label)
                for label in resource_alert_labels}

    def _get_conf_resource(self, alert):
        """Get resource from conf file that matches the alert.

        Matching a resource from conf file to alert is done by
        alert's key in the conf file.
        The alert's key in conf file contains alert's labels and
        their value as in Prometheus alert.
        The first conf entry whose key labels are all present in the
        alert, with the same values, wins.

        :param alert: Prometheus alert
        :type alert: dict
        :return: Resource that matches the alert, or an empty dict when
                 no conf entry matches
        :rtype: dict

        Resource is a dict, where the keys are vitrage entity fields
        and its values are the corresponding alert labels.

        For the example above. The function returns:
        {'instance_name': 'domain', 'host_id': 'instance'}
        """
        if not self.conf_map:
            return {}

        # Built once, outside the loop; an alert without labels is treated
        # as having no labels instead of raising KeyError.
        alert_labels = set((alert.get(PAlertProps.LABELS) or {}).items())
        for conf_alert in self.conf_map:
            alert_key = set(conf_alert[PCFProps.KEY].items())
            if alert_key.issubset(alert_labels):
                return conf_alert[PCFProps.RESOURCE]
        return {}

    @staticmethod
    def _validate_ip(value):
        """Check if the value is ip address.

        If the value is in ip:port form, separate it and validate just the
        ip.

        :param value: String value
        :return: An IPv4Address or IPv6Address object
        :raises ValueError: if the *value* passed isn't either a v4 or a v6
                            address
        """
        # check if the value is ip
        try:
            ip = ip_address(str(value))
        except ValueError:
            # Not a bare address: parse it as the network location of a
            # url so that a trailing port is split off the host part.
            parsed = urlparse.urlparse('//{}'.format(value))
            ip = ip_address(str(parsed.hostname))
        return ip

    @staticmethod
    def get_event_types():
        """Return the event types handled by this driver."""
        return [PROMETHEUS_EVENT_TYPE]

    @staticmethod
    def should_delete_outdated_entities():
        """Return True, the value this driver reports for this setting."""
        return True