5b02232aaa
This is improvement of functional test of prometheus plugin. By this fix, the created prometheus rule files are checked by promtool. If the syntax of them are invalid, the test fails. This check is disabled by default and enabled when CONF.prometheus_plugin.test_rule_with_promtool is true. Implements: blueprint support-auto-lcm Change-Id: Ica331befe225156c607d5c8267462b7281669c91
918 lines
39 KiB
Python
918 lines
39 KiB
Python
# Copyright (C) 2022 Fujitsu
|
|
# All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
# not use this file except in compliance with the License. You may obtain
|
|
# a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
# License for the specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import datetime
|
|
import itertools
|
|
import json
|
|
import os
|
|
import paramiko
|
|
import re
|
|
import tempfile
|
|
|
|
from keystoneauth1 import exceptions as ks_exc
|
|
from oslo_log import log as logging
|
|
from oslo_utils import uuidutils
|
|
from tacker.sol_refactored.api import prometheus_plugin_validator as validator
|
|
from tacker.sol_refactored.api.schemas import prometheus_plugin_schemas
|
|
from tacker.sol_refactored.common import config as cfg
|
|
from tacker.sol_refactored.common import exceptions as sol_ex
|
|
from tacker.sol_refactored.common import fm_alarm_utils
|
|
from tacker.sol_refactored.common import http_client
|
|
from tacker.sol_refactored.common import monitoring_plugin_base as mon_base
|
|
from tacker.sol_refactored.common import pm_job_utils
|
|
from tacker.sol_refactored.common import vnf_instance_utils as inst_utils
|
|
from tacker.sol_refactored.conductor import conductor_rpc_v2 as rpc
|
|
from tacker.sol_refactored import objects
|
|
|
|
|
|
LOG = logging.getLogger(__name__)
|
|
logging.getLogger("paramiko").setLevel(logging.WARNING)
|
|
|
|
CONF = cfg.CONF
|
|
|
|
|
|
class PrometheusPlugin():
|
|
def __init__(self):
|
|
self.rpc = rpc.PrometheusPluginConductor()
|
|
|
|
def parse_datetime(self, isodate):
|
|
t = (isodate if isinstance(isodate, datetime.datetime)
|
|
else datetime.datetime.fromisoformat(
|
|
isodate.replace('Z', '+00:00')))
|
|
return t if t.tzinfo else t.astimezone()
|
|
|
|
|
|
class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
|
|
_instance = None
|
|
|
|
@staticmethod
|
|
def instance():
|
|
if PrometheusPluginPm._instance is None:
|
|
if not CONF.prometheus_plugin.performance_management:
|
|
stub = mon_base.MonitoringPluginStub.instance()
|
|
PrometheusPluginPm._instance = stub
|
|
else:
|
|
PrometheusPluginPm()
|
|
return PrometheusPluginPm._instance
|
|
|
|
def __init__(self):
|
|
if PrometheusPluginPm._instance:
|
|
raise SystemError(
|
|
"Not constructor but instance() should be used.")
|
|
super(PrometheusPluginPm, self).__init__()
|
|
self.notification_callback = None
|
|
auth_handle = http_client.NoAuthHandle()
|
|
self.client = http_client.HttpClient(auth_handle)
|
|
self.reporting_period_margin = (
|
|
CONF.prometheus_plugin.reporting_period_margin)
|
|
self.notification_callback = self.default_callback
|
|
# Pod name can be specified but container name can not.
|
|
# And some prometheus metrics need container name. Therefore, max
|
|
# statement of PromQL is alternatively used in some measurements to
|
|
# measure without container name. That means it provids only most
|
|
# impacted value among the containers.
|
|
self.sol_exprs = {
|
|
'VCpuUsageMeanVnf':
|
|
'avg(sum(rate(pod_cpu_usage_seconds_total'
|
|
'{{pod=~"{pod}"}}[{reporting_period}s])))',
|
|
'VCpuUsagePeakVnf':
|
|
'max(sum(rate(pod_cpu_usage_seconds_total'
|
|
'{{pod=~"{pod}"}}[{reporting_period}s])))',
|
|
'VMemoryUsageMeanVnf':
|
|
'avg(pod_memory_working_set_bytes{{pod=~"{pod}"}} / '
|
|
'on(pod) (kube_node_status_capacity{{resource="memory"}} * '
|
|
'on(node) group_right kube_pod_info))',
|
|
'VMemoryUsagePeakVnf':
|
|
'max(pod_memory_working_set_bytes{{pod=~"{pod}"}} / '
|
|
'on(pod) (kube_node_status_capacity{{resource="memory"}} * '
|
|
'on(node) group_right kube_pod_info))',
|
|
'VDiskUsageMeanVnf':
|
|
'avg(max(container_fs_usage_bytes{{pod=~"{pod}"}}/'
|
|
'container_fs_limit_bytes{{pod=~"{pod}"}}))',
|
|
'VDiskUsagePeakVnf':
|
|
'max(max(container_fs_usage_bytes{{pod=~"{pod}"}}/'
|
|
'container_fs_limit_bytes{{pod=~"{pod}"}}))',
|
|
'ByteIncomingVnfIntCp':
|
|
'sum(container_network_receive_bytes_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'PacketIncomingVnfIntCp':
|
|
'sum(container_network_receive_packets_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'ByteOutgoingVnfIntCp':
|
|
'sum(container_network_transmit_bytes_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'PacketOutgoingVnfIntCp':
|
|
'sum(container_network_transmit_packets_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'ByteIncomingVnfExtCp':
|
|
'sum(container_network_receive_bytes_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'PacketIncomingVnfExtCp':
|
|
'sum(container_network_receive_packets_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'ByteOutgoingVnfExtCp':
|
|
'sum(container_network_transmit_bytes_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
'PacketOutgoingVnfExtCp':
|
|
'sum(container_network_transmit_packets_total'
|
|
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
|
|
}
|
|
PrometheusPluginPm._instance = self
|
|
|
|
def set_callback(self, notification_callback):
|
|
self.notification_callback = notification_callback
|
|
|
|
def create_job(self, **kwargs):
|
|
return self.make_rules(kwargs['context'], kwargs['pm_job'])
|
|
|
|
def delete_job(self, **kwargs):
|
|
self.delete_rules(kwargs['context'], kwargs['pm_job'])
|
|
|
|
def alert(self, **kwargs):
|
|
try:
|
|
self._alert(kwargs['request'], body=kwargs['body'])
|
|
except Exception as e:
|
|
# All exceptions is ignored here and 204 response will always
|
|
# be returned. Because when tacker responds error to alertmanager,
|
|
# alertmanager may repeat the same reports.
|
|
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
|
|
|
|
def default_callback(self, context, entries):
|
|
self.rpc.store_job_info(context, entries)
|
|
|
|
def convert_measurement_unit(self, metric, value):
|
|
if re.match(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\..+', metric):
|
|
value = float(value)
|
|
elif re.match(r'^(Byte|Packet)(Incoming|Outgoing)Vnf(IntCp|ExtCp)',
|
|
metric):
|
|
value = int(value)
|
|
else:
|
|
raise sol_ex.PrometheusPluginError(
|
|
"Failed to convert annotations.value to measurement unit.")
|
|
return value
|
|
|
|
def get_datetime_of_latest_report(
|
|
self, context, pm_job, object_instance_id,
|
|
sub_object_instance_id, metric):
|
|
report = pm_job_utils.get_pm_report(context, pm_job.id)
|
|
if not report:
|
|
return None
|
|
|
|
report = list(map(lambda x: x.entries, report))
|
|
report = list(itertools.chain.from_iterable(report))
|
|
entries_of_same_object = list(
|
|
filter(
|
|
lambda x: (
|
|
x.objectInstanceId == object_instance_id and
|
|
(not x.obj_attr_is_set('subObjectInstanceId') or
|
|
x.subObjectInstanceId == sub_object_instance_id) and
|
|
x.performanceMetric == metric),
|
|
report))
|
|
if len(entries_of_same_object) == 0:
|
|
return None
|
|
values = sum(list(map(
|
|
lambda x: x.performanceValues, entries_of_same_object)), [])
|
|
return max(values, key=lambda value:
|
|
self.parse_datetime(value.timeStamp)).timeStamp
|
|
|
|
def filter_alert_by_time(
|
|
self, context, pm_job, datetime_now,
|
|
object_instance_id, sub_object_instance_id, metric):
|
|
# Ignore expired alert
|
|
reporting_boundary = pm_job.criteria.reportingBoundary\
|
|
if (pm_job.criteria.obj_attr_is_set('reportingBoundary') and
|
|
pm_job.criteria.reportingBoundary) else None
|
|
if (reporting_boundary and
|
|
datetime_now > self.parse_datetime(reporting_boundary)):
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
# Ignore short period alert
|
|
report_date = self.get_datetime_of_latest_report(
|
|
context, pm_job, object_instance_id, sub_object_instance_id,
|
|
metric)
|
|
|
|
# reporting_period_margin is some margin for timing inconsistency
|
|
# between prometheus and tacker.
|
|
if (report_date and report_date + datetime.timedelta(
|
|
seconds=(pm_job.criteria.reportingPeriod -
|
|
self.reporting_period_margin)) >= datetime_now):
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
def valid_alert(self, pm_job, object_instance_id, sub_object_instance_id):
|
|
object_instance_ids = (
|
|
pm_job.objectInstanceIds
|
|
if (pm_job.obj_attr_is_set('objectInstanceIds') and
|
|
pm_job.objectInstanceIds) else [])
|
|
if object_instance_id not in object_instance_ids:
|
|
LOG.error(
|
|
f"labels.object_instance_id {object_instance_id} "
|
|
f"doesn't match pmJob.")
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
sub_object_instance_ids = (
|
|
pm_job.subObjectInstanceIds
|
|
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
|
|
pm_job.subObjectInstanceIds) else [])
|
|
if (sub_object_instance_id and
|
|
(not sub_object_instance_ids or
|
|
sub_object_instance_id not in sub_object_instance_ids)):
|
|
LOG.error(
|
|
f"labels.sub_object_instance_id {sub_object_instance_id} "
|
|
f"doesn't match pmJob.")
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
@validator.schema(prometheus_plugin_schemas.AlertMessage)
|
|
def _alert(self, request, body):
|
|
result = []
|
|
context = request.context
|
|
datetime_now = datetime.datetime.now(datetime.timezone.utc)
|
|
for alt in body['alerts']:
|
|
if alt['labels']['function_type'] != 'vnfpm':
|
|
continue
|
|
try:
|
|
pm_job_id = alt['labels']['job_id']
|
|
object_instance_id = alt['labels']['object_instance_id']
|
|
metric = alt['labels']['metric']
|
|
sub_object_instance_id = alt['labels'].get(
|
|
'sub_object_instance_id')
|
|
value = alt['annotations']['value']
|
|
|
|
pm_job = pm_job_utils.get_pm_job(context, pm_job_id)
|
|
self.filter_alert_by_time(context, pm_job, datetime_now,
|
|
object_instance_id,
|
|
sub_object_instance_id, metric)
|
|
self.valid_alert(
|
|
pm_job, object_instance_id, sub_object_instance_id)
|
|
value = self.convert_measurement_unit(metric, value)
|
|
|
|
result.append({
|
|
'objectType': pm_job.objectType,
|
|
'objectInstanceId': object_instance_id,
|
|
'subObjectInstanceId': sub_object_instance_id,
|
|
'performanceMetric': metric,
|
|
'performanceValues': [{
|
|
'timeStamp': datetime_now,
|
|
'value': value
|
|
}]
|
|
})
|
|
except sol_ex.PrometheusPluginSkipped:
|
|
pass
|
|
if len(result) > 0:
|
|
if self.notification_callback:
|
|
# every job_id in body['alerts'] has same id
|
|
self.notification_callback(context, {
|
|
'id': uuidutils.generate_uuid(),
|
|
'jobId': pm_job_id,
|
|
'entries': result,
|
|
})
|
|
return result
|
|
|
|
def decompose_metrics_vnfc(self, pm_job):
|
|
metrics = pm_job.criteria.performanceMetric\
|
|
if pm_job.criteria.obj_attr_is_set('performanceMetric') else None
|
|
metrics = (list(filter(lambda x: (
|
|
re.match(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\..+', x) and
|
|
re.sub(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\.', '',
|
|
x) in pm_job.objectInstanceIds), metrics))
|
|
if metrics else [])
|
|
metric_grps = pm_job.criteria.performanceMetricGroup\
|
|
if (pm_job.criteria.obj_attr_is_set('performanceMetricGroup') and
|
|
pm_job.criteria.performanceMetricGroup) else []
|
|
for obj in pm_job.objectInstanceIds:
|
|
for grp in metric_grps:
|
|
if grp == 'VirtualisedComputeResource':
|
|
metrics.append(f'VCpuUsageMeanVnf.{obj}')
|
|
metrics.append(f'VCpuUsagePeakVnf.{obj}')
|
|
metrics.append(f'VMemoryUsageMeanVnf.{obj}')
|
|
metrics.append(f'VMemoryUsagePeakVnf.{obj}')
|
|
metrics.append(f'VDiskUsageMeanVnf.{obj}')
|
|
metrics.append(f'VDiskUsagePeakVnf.{obj}')
|
|
metrics = list(set(metrics))
|
|
if len(metrics) == 0:
|
|
raise sol_ex.PrometheusPluginError(
|
|
"Invalid performanceMetric or performanceMetricGroup."
|
|
)
|
|
return metrics
|
|
|
|
def make_prom_ql(self, target, pod, collection_period=30,
|
|
reporting_period=90, sub_object_instance_id='*'):
|
|
reporting_period = max(reporting_period, 30)
|
|
expr = self.sol_exprs[target].format(
|
|
pod=pod,
|
|
collection_period=collection_period,
|
|
reporting_period=reporting_period,
|
|
sub_object_instance_id=sub_object_instance_id
|
|
)
|
|
return expr
|
|
|
|
def make_rule(self, pm_job, object_instance_id, sub_object_instance_id,
|
|
metric, expression, collection_period):
|
|
labels = {
|
|
'alertname': '',
|
|
'receiver_type': 'tacker',
|
|
'function_type': 'vnfpm',
|
|
'job_id': pm_job.id,
|
|
'object_instance_id': object_instance_id,
|
|
'sub_object_instance_id': sub_object_instance_id,
|
|
'metric': metric
|
|
}
|
|
labels = {k: v for k, v in labels.items() if v is not None}
|
|
annotations = {
|
|
'value': r'{{$value}}'
|
|
}
|
|
rule = {
|
|
'alert': uuidutils.generate_uuid(),
|
|
'expr': expression,
|
|
'for': f'{collection_period}s',
|
|
'labels': labels,
|
|
'annotations': annotations
|
|
}
|
|
return rule
|
|
|
|
def get_vnfc_resource_info(self, _, vnf_instance_id, inst_map):
|
|
inst = inst_map[vnf_instance_id]
|
|
if not inst.obj_attr_is_set('instantiatedVnfInfo') or\
|
|
not inst.instantiatedVnfInfo.obj_attr_is_set(
|
|
'vnfcResourceInfo'):
|
|
return None
|
|
return inst.instantiatedVnfInfo.vnfcResourceInfo
|
|
|
|
def get_pod_regexp(self, resource_info):
|
|
# resource ids are like:
|
|
# ['test-test1-756757f8f-xcwmt',
|
|
# 'test-test2-756757f8f-kmghr', ...]
|
|
# convert them to a regex string such as:
|
|
# '(test-test1-[0-9a-f]{1,10}-[0-9a-z]{5}$|
|
|
# test-test2-[0-9a-f]{1,10}-[0-9a-z]{5}$|...)'
|
|
deployments = list(filter(
|
|
lambda r:
|
|
r.computeResource.obj_attr_is_set(
|
|
'vimLevelResourceType')
|
|
and r.computeResource.obj_attr_is_set(
|
|
'resourceId'
|
|
)
|
|
and r.computeResource.vimLevelResourceType ==
|
|
'Deployment', resource_info
|
|
))
|
|
deployments = list(set(list(map(
|
|
lambda d: re.sub(
|
|
r'\-[0-9a-f]{1,10}\-[0-9a-z]{5}$', '',
|
|
d.computeResource.resourceId) +
|
|
r'-[0-9a-f]{1,10}-[0-9a-z]{5}$',
|
|
deployments
|
|
))))
|
|
pods_regexp = '(' + '|'.join(deployments) + ')'
|
|
return deployments, pods_regexp
|
|
|
|
def _make_rules_for_each_obj(self, context, pm_job, inst_map, metric):
|
|
target = re.sub(r'\..+$', '', metric)
|
|
objs = pm_job.objectInstanceIds
|
|
collection_period = pm_job.criteria.collectionPeriod
|
|
reporting_period = pm_job.criteria.reportingPeriod
|
|
rules = []
|
|
for obj in objs:
|
|
# resource ids are like:
|
|
# ['test-test1-756757f8f-xcwmt',
|
|
# 'test-test2-756757f8f-kmghr', ...]
|
|
# convert them to a regex string such as:
|
|
# '(test-test1-[0-9a-f]{1,10}-[0-9a-z]{5}$|
|
|
# test-test2-[0-9a-f]{1,10}-[0-9a-z]{5}$|...)'
|
|
resource_info = self.get_vnfc_resource_info(context, obj, inst_map)
|
|
if not resource_info:
|
|
continue
|
|
deployments, pods_regexp = self.get_pod_regexp(resource_info)
|
|
if len(deployments) == 0:
|
|
continue
|
|
expr = self.make_prom_ql(
|
|
target, pods_regexp, collection_period=collection_period,
|
|
reporting_period=reporting_period)
|
|
rules.append(self.make_rule(
|
|
pm_job, obj, None, metric, expr,
|
|
collection_period))
|
|
return rules
|
|
|
|
def get_compute_resource_by_sub_obj(self, vnf_instance, sub_obj):
|
|
inst = vnf_instance
|
|
if (not inst.obj_attr_is_set('instantiatedVnfInfo') or
|
|
not inst.instantiatedVnfInfo.obj_attr_is_set(
|
|
'vnfcResourceInfo') or
|
|
not inst.instantiatedVnfInfo.obj_attr_is_set('vnfcInfo')):
|
|
return None
|
|
vnfc_info = list(filter(
|
|
lambda x: (x.obj_attr_is_set('vnfcResourceInfoId') and
|
|
x.id == sub_obj),
|
|
inst.instantiatedVnfInfo.vnfcInfo))
|
|
if len(vnfc_info) == 0:
|
|
return None
|
|
resources = list(filter(
|
|
lambda x: (vnfc_info[0].obj_attr_is_set('vnfcResourceInfoId') and
|
|
x.id == vnfc_info[0].vnfcResourceInfoId and
|
|
x.computeResource.obj_attr_is_set('vimLevelResourceType') and
|
|
x.computeResource.vimLevelResourceType == 'Deployment' and
|
|
x.computeResource.obj_attr_is_set('resourceId')),
|
|
inst.instantiatedVnfInfo.vnfcResourceInfo))
|
|
if len(resources) == 0:
|
|
return None
|
|
return resources[0].computeResource
|
|
|
|
def _make_rules_for_each_sub_obj(self, context, pm_job, inst_map, metric):
|
|
target = re.sub(r'\..+$', '', metric)
|
|
objs = pm_job.objectInstanceIds
|
|
sub_objs = pm_job.subObjectInstanceIds\
|
|
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
|
|
pm_job.subObjectInstanceIds) else []
|
|
collection_period = pm_job.criteria.collectionPeriod
|
|
reporting_period = pm_job.criteria.reportingPeriod
|
|
rules = []
|
|
resource_info = self.get_vnfc_resource_info(context, objs[0], inst_map)
|
|
if not resource_info:
|
|
return []
|
|
if pm_job.objectType in {'Vnf', 'Vnfc'}:
|
|
inst = inst_map[objs[0]]
|
|
for sub_obj in sub_objs:
|
|
compute_resource = self.get_compute_resource_by_sub_obj(
|
|
inst, sub_obj)
|
|
if not compute_resource:
|
|
continue
|
|
resource_id = compute_resource.resourceId
|
|
expr = self.make_prom_ql(
|
|
target, resource_id,
|
|
collection_period=collection_period,
|
|
reporting_period=reporting_period)
|
|
rules.append(self.make_rule(
|
|
pm_job, objs[0], sub_obj, metric, expr,
|
|
collection_period))
|
|
else:
|
|
deployments, pods_regexp = self.get_pod_regexp(resource_info)
|
|
if len(deployments) == 0:
|
|
return []
|
|
for sub_obj in sub_objs:
|
|
expr = self.make_prom_ql(
|
|
target, pods_regexp, collection_period=collection_period,
|
|
reporting_period=reporting_period,
|
|
sub_object_instance_id=sub_obj)
|
|
rules.append(self.make_rule(
|
|
pm_job, objs[0], sub_obj, metric, expr,
|
|
collection_period))
|
|
return rules
|
|
|
|
def _make_rules(self, context, pm_job, metric, inst_map):
|
|
sub_objs = pm_job.subObjectInstanceIds\
|
|
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
|
|
pm_job.subObjectInstanceIds) else []
|
|
# Cardinality of objectInstanceIds and subObjectInstanceIds
|
|
# is N:0 or 1:N.
|
|
if len(sub_objs) > 0:
|
|
return self._make_rules_for_each_sub_obj(
|
|
context, pm_job, inst_map, metric)
|
|
return self._make_rules_for_each_obj(
|
|
context, pm_job, inst_map, metric)
|
|
|
|
def decompose_metrics_vnfintextcp(self, pm_job):
|
|
group_name = 'VnfInternalCp'\
|
|
if pm_job.objectType == 'VnfIntCp' else 'VnfExternalCp'
|
|
metrics = pm_job.criteria.performanceMetric\
|
|
if pm_job.criteria.obj_attr_is_set('performanceMetric') else None
|
|
metrics = list(filter(lambda x: (
|
|
re.match(r'^(Byte|Packet)(Incoming|Outgoing)' + pm_job.objectType,
|
|
x)),
|
|
metrics)) if metrics else []
|
|
metric_grps = pm_job.criteria.performanceMetricGroup\
|
|
if (pm_job.criteria.obj_attr_is_set('performanceMetricGroup') and
|
|
pm_job.criteria.performanceMetricGroup) else []
|
|
for grp in metric_grps:
|
|
if grp == group_name:
|
|
metrics.append(f'ByteIncoming{pm_job.objectType}')
|
|
metrics.append(f'ByteOutgoing{pm_job.objectType}')
|
|
metrics.append(f'PacketIncoming{pm_job.objectType}')
|
|
metrics.append(f'PacketOutgoing{pm_job.objectType}')
|
|
metrics = list(set(metrics))
|
|
if len(metrics) == 0:
|
|
raise sol_ex.PrometheusPluginError(
|
|
"Invalid performanceMetric or performanceMetricGroup."
|
|
)
|
|
return metrics
|
|
|
|
def _delete_rule(self, host, port, user, password, path, pm_job_id):
|
|
with paramiko.Transport(sock=(host, port)) as client:
|
|
client.connect(username=user, password=password)
|
|
sftp = paramiko.SFTPClient.from_transport(client)
|
|
sftp.remove(f'{path}/{pm_job_id}.json')
|
|
|
|
def delete_rules(self, context, pm_job):
|
|
target_list, reload_list = self.get_access_info(pm_job)
|
|
for target in target_list:
|
|
try:
|
|
self._delete_rule(
|
|
target['host'], target['port'], target['user'],
|
|
target['password'], target['path'], pm_job.id)
|
|
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
|
|
paramiko.SSHException):
|
|
# This exception is ignored. DELETE /pm_jobs/{id}
|
|
# will be success even if _delete_rule() is failed.
|
|
# Because the rule file was already deleted.
|
|
pass
|
|
for uri in reload_list:
|
|
try:
|
|
self.reload_prom_server(context, uri)
|
|
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
|
|
paramiko.SSHException):
|
|
pass
|
|
|
|
def decompose_metrics(self, pm_job):
|
|
if pm_job.objectType in {'Vnf', 'Vnfc'}:
|
|
return self.decompose_metrics_vnfc(pm_job)
|
|
if pm_job.objectType in {'VnfIntCp', 'VnfExtCp'}:
|
|
return self.decompose_metrics_vnfintextcp(pm_job)
|
|
raise sol_ex.PrometheusPluginError(
|
|
f"Invalid objectType: {pm_job.objectType}.")
|
|
|
|
def reload_prom_server(self, context, reload_uri):
|
|
resp, _ = self.client.do_request(
|
|
reload_uri, "PUT", context=context)
|
|
if resp.status_code >= 400 and resp.status_code < 600:
|
|
raise sol_ex.PrometheusPluginError(
|
|
f"Reloading request to prometheus is failed: "
|
|
f"{resp.status_code}.")
|
|
|
|
def _upload_rule(self, rule_group, host, port, user, password, path,
|
|
pm_job_id):
|
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
with open(os.path.join(tmpdir, 'rule.json'),
|
|
'w+', encoding="utf-8") as fp:
|
|
json.dump(rule_group, fp, indent=4, ensure_ascii=False)
|
|
filename = fp.name
|
|
with paramiko.Transport(sock=(host, port)) as client:
|
|
LOG.info("Upload rule files to prometheus server: %s.", host)
|
|
client.connect(username=user, password=password)
|
|
sftp = paramiko.SFTPClient.from_transport(client)
|
|
sftp.put(filename, f'{path}/{pm_job_id}.json')
|
|
self.verify_rule(host, port, user, password, path, pm_job_id)
|
|
|
|
def verify_rule(self, host, port, user, password, path, pm_job_id):
|
|
if not CONF.prometheus_plugin.test_rule_with_promtool:
|
|
return
|
|
with paramiko.SSHClient() as client:
|
|
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
client.connect(host, port=port, username=user, password=password)
|
|
command = f"promtool check rules {path}/{pm_job_id}.json"
|
|
LOG.info("Rule file validation command: %s", command)
|
|
_, stdout, stderr = client.exec_command(command)
|
|
if stdout.channel.recv_exit_status() != 0:
|
|
error_byte = stderr.read()
|
|
error_str = error_byte.decode('utf-8')
|
|
LOG.error(
|
|
"Rule file validation with promtool failed: %s",
|
|
error_str)
|
|
raise sol_ex.PrometheusPluginError(
|
|
"Rule file validation with promtool failed.")
|
|
|
|
def get_access_info(self, pm_job):
|
|
target_list = []
|
|
reload_list = []
|
|
if (not pm_job.obj_attr_is_set('metadata')
|
|
or 'monitoring' not in pm_job.metadata):
|
|
raise sol_ex.PrometheusPluginError(
|
|
"monitoring info is missing at metadata field.")
|
|
access_info = pm_job.metadata['monitoring']
|
|
if (access_info.get('monitorName') != 'prometheus' or
|
|
access_info.get('driverType') != 'external'):
|
|
raise sol_ex.PrometheusPluginError(
|
|
"prometheus info is missing at metadata field.")
|
|
for info in access_info.get('targetsInfo', []):
|
|
host = info.get('prometheusHost', '')
|
|
port = info.get('prometheusHostPort', 22)
|
|
auth = info.get('authInfo', {})
|
|
user = auth.get('ssh_username', '')
|
|
password = auth.get('ssh_password', '')
|
|
path = info.get('alertRuleConfigPath', '')
|
|
uri = info.get('prometheusReloadApiEndpoint', '')
|
|
if not (host and user and path and uri):
|
|
continue
|
|
target_list.append({
|
|
'host': host,
|
|
'port': port,
|
|
'user': user,
|
|
'password': password,
|
|
'path': path
|
|
})
|
|
reload_list.append(uri)
|
|
return target_list, list(set(reload_list))
|
|
|
|
def upload_rules(
|
|
self, context, target_list, reload_list, rule_group, pm_job):
|
|
def _cleanup_error(target_list):
|
|
for target in target_list:
|
|
try:
|
|
self._delete_rule(target['host'], target['port'],
|
|
target['user'], target['password'], target['path'],
|
|
pm_job.id)
|
|
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
|
|
paramiko.SSHException):
|
|
pass
|
|
|
|
try:
|
|
for target in target_list:
|
|
self._upload_rule(
|
|
rule_group, target['host'], target['port'],
|
|
target['user'], target['password'], target['path'],
|
|
pm_job.id)
|
|
for uri in reload_list:
|
|
self.reload_prom_server(context, uri)
|
|
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
|
|
paramiko.SSHException) as e:
|
|
LOG.error("failed to upload rule files: %s", e.args[0])
|
|
_cleanup_error(target_list)
|
|
raise e
|
|
except Exception as e:
|
|
_cleanup_error(target_list)
|
|
raise e
|
|
|
|
def get_vnf_instances(self, context, pm_job):
|
|
object_instance_ids = list(set(pm_job.objectInstanceIds))
|
|
return dict(zip(
|
|
object_instance_ids,
|
|
list(map(
|
|
lambda inst: inst_utils.get_inst(context, inst),
|
|
object_instance_ids))))
|
|
|
|
def make_rules(self, context, pm_job):
|
|
target_list, reload_list = self.get_access_info(pm_job)
|
|
metrics = self.decompose_metrics(pm_job)
|
|
inst_map = self.get_vnf_instances(context, pm_job)
|
|
rules = sum([self._make_rules(context, pm_job, metric, inst_map)
|
|
for metric in metrics], [])
|
|
if len(rules) == 0:
|
|
raise sol_ex.PrometheusPluginError(
|
|
f"Converting from a PM job to alert rules is failed."
|
|
f" PM job id: {pm_job.id}")
|
|
rule_group = {
|
|
'groups': [
|
|
{
|
|
'name': f'tacker_{pm_job.id}',
|
|
'rules': rules
|
|
}
|
|
]
|
|
}
|
|
self.upload_rules(
|
|
context, target_list, reload_list, rule_group, pm_job)
|
|
return rule_group
|
|
|
|
|
|
class PrometheusPluginFm(PrometheusPlugin, mon_base.MonitoringPlugin):
|
|
_instance = None
|
|
|
|
@staticmethod
|
|
def instance():
|
|
if PrometheusPluginFm._instance is None:
|
|
if not CONF.prometheus_plugin.fault_management:
|
|
stub = mon_base.MonitoringPluginStub.instance()
|
|
PrometheusPluginFm._instance = stub
|
|
else:
|
|
PrometheusPluginFm()
|
|
return PrometheusPluginFm._instance
|
|
|
|
def __init__(self):
|
|
if PrometheusPluginFm._instance:
|
|
raise SystemError(
|
|
"Not constructor but instance() should be used.")
|
|
super(PrometheusPluginFm, self).__init__()
|
|
self.notification_callback = self.default_callback
|
|
self.endpoint = CONF.v2_vnfm.endpoint
|
|
PrometheusPluginFm._instance = self
|
|
|
|
def set_callback(self, notification_callback):
|
|
self.notification_callback = notification_callback
|
|
|
|
def alert(self, **kwargs):
|
|
try:
|
|
self._alert(kwargs['request'], body=kwargs['body'])
|
|
except Exception as e:
|
|
# All exceptions is ignored here and 204 response will always
|
|
# be returned. Because when tacker responds error to alertmanager,
|
|
# alertmanager may repeat the same reports.
|
|
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
|
|
|
|
def default_callback(self, context, alarm):
|
|
self.rpc.store_alarm_info(context, alarm)
|
|
|
|
def vnfc_instance_ids(
|
|
self, context, vnf_instance_id, alert_entry):
|
|
inst = inst_utils.get_inst(context, vnf_instance_id)
|
|
resources = (inst.instantiatedVnfInfo.vnfcResourceInfo
|
|
if inst.obj_attr_is_set('instantiatedVnfInfo') and
|
|
inst.instantiatedVnfInfo.obj_attr_is_set(
|
|
'vnfcResourceInfo') else [])
|
|
vnfc_info = (inst.instantiatedVnfInfo.vnfcInfo
|
|
if inst.obj_attr_is_set('instantiatedVnfInfo') and
|
|
inst.instantiatedVnfInfo.obj_attr_is_set(
|
|
'vnfcInfo') else [])
|
|
if 'pod' not in alert_entry['labels']:
|
|
return []
|
|
pod = alert_entry['labels']['pod']
|
|
|
|
deployments = list(filter(
|
|
lambda r: (
|
|
r.computeResource.obj_attr_is_set('vimLevelResourceType') and
|
|
r.computeResource.obj_attr_is_set('resourceId') and
|
|
(r.computeResource.vimLevelResourceType in
|
|
{'Deployment', 'Pod'}) and
|
|
re.match(pod, r.computeResource.resourceId)),
|
|
resources
|
|
))
|
|
vnfc_res_info_ids = list(map(
|
|
lambda res: res.id, deployments
|
|
))
|
|
vnfc_info = list(filter(
|
|
lambda info: (
|
|
info.obj_attr_is_set('vnfcResourceInfoId') and
|
|
info.vnfcResourceInfoId in vnfc_res_info_ids),
|
|
vnfc_info
|
|
))
|
|
vnfc_info = list(map(lambda info: info.id, vnfc_info))
|
|
return vnfc_info
|
|
|
|
def update_alarm(self, context, not_cleared, ends_at, datetime_now):
|
|
for alm in not_cleared:
|
|
alm.alarmClearedTime = ends_at
|
|
alm.alarmChangedTime = datetime_now
|
|
if self.notification_callback:
|
|
self.notification_callback(context, alm)
|
|
|
|
def create_new_alarm(self, context, alert_entry, datetime_now):
|
|
vnf_instance_id = alert_entry['labels']['vnf_instance_id']
|
|
fingerprint = alert_entry['fingerprint']
|
|
perceived_severity = alert_entry['labels']['perceived_severity']
|
|
|
|
fault_details = [
|
|
f"fingerprint: {fingerprint}",
|
|
f"detail: {alert_entry['annotations'].get('fault_details')}"
|
|
]
|
|
|
|
vnfc_instance_ids = self.vnfc_instance_ids(
|
|
context, vnf_instance_id, alert_entry)
|
|
if len(vnfc_instance_ids) == 0:
|
|
LOG.error("failed to specify vnfc_instance for the alert.")
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
new_alarm = objects.AlarmV1.from_dict({
|
|
'id':
|
|
uuidutils.generate_uuid(),
|
|
'managedObjectId':
|
|
vnf_instance_id,
|
|
'vnfcInstanceIds':
|
|
vnfc_instance_ids,
|
|
'alarmRaisedTime':
|
|
datetime_now.isoformat(),
|
|
'ackState':
|
|
'UNACKNOWLEDGED',
|
|
'perceivedSeverity':
|
|
perceived_severity,
|
|
'eventTime':
|
|
alert_entry['startsAt'],
|
|
'eventType':
|
|
alert_entry['labels'].get('event_type', ''),
|
|
'faultType':
|
|
alert_entry['annotations'].get('fault_type', ''),
|
|
'probableCause':
|
|
alert_entry['annotations'].get('probable_cause', ''),
|
|
'isRootCause':
|
|
False,
|
|
'faultDetails':
|
|
fault_details,
|
|
'_links': {}
|
|
})
|
|
|
|
_links = fm_alarm_utils.make_alarm_links(new_alarm, self.endpoint)
|
|
new_alarm._links = _links
|
|
if self.notification_callback:
|
|
self.notification_callback(context, new_alarm)
|
|
return new_alarm
|
|
|
|
def get_not_cleared_alarms(self, context, vnf_instance_id, fingerprint):
|
|
alms = fm_alarm_utils.get_not_cleared_alarms(context, vnf_instance_id)
|
|
fpstr = f'fingerprint: {fingerprint}'
|
|
return list(filter(
|
|
lambda x: (not x.obj_attr_is_set('alarmClearedTime') and
|
|
x.obj_attr_is_set('faultDetails') and
|
|
fpstr in x.faultDetails), alms))
|
|
|
|
def create_or_update_alarm(
|
|
self, context, alert_entry, datetime_now):
|
|
status = alert_entry['status']
|
|
vnf_instance_id = alert_entry['labels']['vnf_instance_id']
|
|
fingerprint = alert_entry['fingerprint']
|
|
not_cleared = self.get_not_cleared_alarms(
|
|
context, vnf_instance_id, fingerprint)
|
|
|
|
if status == 'resolved' and len(not_cleared) > 0:
|
|
ends_at = alert_entry['endsAt']
|
|
self.update_alarm(
|
|
context, not_cleared, ends_at, datetime_now)
|
|
return not_cleared
|
|
if status == 'firing' and len(not_cleared) == 0:
|
|
new_alarm = self.create_new_alarm(
|
|
context, alert_entry, datetime_now)
|
|
return [new_alarm]
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
@validator.schema(prometheus_plugin_schemas.AlertMessage)
|
|
def _alert(self, request, body):
|
|
now = datetime.datetime.now(datetime.timezone.utc)
|
|
result = []
|
|
for alt in body['alerts']:
|
|
if alt['labels']['function_type'] != 'vnffm':
|
|
continue
|
|
try:
|
|
alarms = self.create_or_update_alarm(
|
|
request.context, alt, now)
|
|
result.extend(alarms)
|
|
except sol_ex.PrometheusPluginSkipped:
|
|
pass
|
|
return result
|
|
|
|
|
|
class PrometheusPluginAutoScaling(PrometheusPlugin, mon_base.MonitoringPlugin):
|
|
_instance = None
|
|
|
|
@staticmethod
|
|
def instance():
|
|
if PrometheusPluginAutoScaling._instance is None:
|
|
if not CONF.prometheus_plugin.auto_scaling:
|
|
stub = mon_base.MonitoringPluginStub.instance()
|
|
PrometheusPluginAutoScaling._instance = stub
|
|
else:
|
|
PrometheusPluginAutoScaling()
|
|
return PrometheusPluginAutoScaling._instance
|
|
|
|
def __init__(self):
|
|
if PrometheusPluginAutoScaling._instance:
|
|
raise SystemError(
|
|
"Not constructor but instance() should be used.")
|
|
super(PrometheusPluginAutoScaling, self).__init__()
|
|
self.notification_callback = self.default_callback
|
|
PrometheusPluginAutoScaling._instance = self
|
|
|
|
def set_callback(self, notification_callback):
|
|
self.notification_callback = notification_callback
|
|
|
|
def alert(self, **kwargs):
|
|
try:
|
|
self._alert(kwargs['request'], body=kwargs['body'])
|
|
except Exception as e:
|
|
# All exceptions is ignored here and 204 response will always
|
|
# be returned. Because when tacker responds error to alertmanager,
|
|
# alertmanager may repeat the same reports.
|
|
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
|
|
|
|
def default_callback(self, context, vnf_instance_id, scaling_param):
|
|
self.rpc.request_scale(context, vnf_instance_id, scaling_param)
|
|
|
|
def skip_if_auto_scale_not_enabled(self, vnf_instance):
|
|
if (not vnf_instance.obj_attr_is_set('vnfConfigurableProperties') or
|
|
not vnf_instance.vnfConfigurableProperties.get(
|
|
'isAutoscaleEnabled')):
|
|
raise sol_ex.PrometheusPluginSkipped()
|
|
|
|
def process_auto_scale(self, request, vnf_instance_id, auto_scale_type,
|
|
aspect_id):
|
|
scaling_param = {
|
|
'type': auto_scale_type,
|
|
'aspectId': aspect_id,
|
|
}
|
|
context = request.context
|
|
if self.notification_callback:
|
|
self.notification_callback(context, vnf_instance_id, scaling_param)
|
|
|
|
@validator.schema(prometheus_plugin_schemas.AlertMessage)
|
|
def _alert(self, request, body):
|
|
result = []
|
|
for alt in body['alerts']:
|
|
if alt['labels']['function_type'] != 'auto_scale':
|
|
continue
|
|
try:
|
|
vnf_instance_id = alt['labels']['vnf_instance_id']
|
|
auto_scale_type = alt['labels']['auto_scale_type']
|
|
aspect_id = alt['labels']['aspect_id']
|
|
context = request.context
|
|
|
|
inst = inst_utils.get_inst(context, vnf_instance_id)
|
|
self.skip_if_auto_scale_not_enabled(inst)
|
|
self.process_auto_scale(
|
|
request, vnf_instance_id, auto_scale_type, aspect_id)
|
|
result.append((vnf_instance_id, auto_scale_type, aspect_id))
|
|
except sol_ex.PrometheusPluginSkipped:
|
|
pass
|
|
return result
|