tacker/tacker/sol_refactored/common/prometheus_plugin.py
Koji Shimizu 5b02232aaa Prometheus rule file syntax test
This is an improvement of the prometheus plugin
functional test. With this fix, the created
prometheus rule files are checked by promtool.
If their syntax is invalid, the test fails.

This check is disabled by default and is enabled when
CONF.prometheus_plugin.test_rule_with_promtool
is true.

Implements: blueprint support-auto-lcm
Change-Id: Ica331befe225156c607d5c8267462b7281669c91
2023-01-21 06:22:58 +00:00


# Copyright (C) 2022 Fujitsu
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
import itertools
import json
import os
import paramiko
import re
import tempfile
from keystoneauth1 import exceptions as ks_exc
from oslo_log import log as logging
from oslo_utils import uuidutils
from tacker.sol_refactored.api import prometheus_plugin_validator as validator
from tacker.sol_refactored.api.schemas import prometheus_plugin_schemas
from tacker.sol_refactored.common import config as cfg
from tacker.sol_refactored.common import exceptions as sol_ex
from tacker.sol_refactored.common import fm_alarm_utils
from tacker.sol_refactored.common import http_client
from tacker.sol_refactored.common import monitoring_plugin_base as mon_base
from tacker.sol_refactored.common import pm_job_utils
from tacker.sol_refactored.common import vnf_instance_utils as inst_utils
from tacker.sol_refactored.conductor import conductor_rpc_v2 as rpc
from tacker.sol_refactored import objects
LOG = logging.getLogger(__name__)
logging.getLogger("paramiko").setLevel(logging.WARNING)
CONF = cfg.CONF
class PrometheusPlugin():
def __init__(self):
self.rpc = rpc.PrometheusPluginConductor()
def parse_datetime(self, isodate):
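        # Accepts either a datetime or an ISO 8601 string; the 'Z'
        # suffix is rewritten so fromisoformat() can parse it.
        # Illustrative example (assumed input, not from the original
        # code): parse_datetime('2023-01-21T06:22:58Z')
        # -> 2023-01-21 06:22:58+00:00. A naive result is made
        # timezone-aware via astimezone().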
t = (isodate if isinstance(isodate, datetime.datetime)
else datetime.datetime.fromisoformat(
isodate.replace('Z', '+00:00')))
return t if t.tzinfo else t.astimezone()
class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
_instance = None
@staticmethod
def instance():
if PrometheusPluginPm._instance is None:
if not CONF.prometheus_plugin.performance_management:
stub = mon_base.MonitoringPluginStub.instance()
PrometheusPluginPm._instance = stub
else:
PrometheusPluginPm()
return PrometheusPluginPm._instance
def __init__(self):
if PrometheusPluginPm._instance:
raise SystemError(
"Not constructor but instance() should be used.")
super(PrometheusPluginPm, self).__init__()
self.notification_callback = None
auth_handle = http_client.NoAuthHandle()
self.client = http_client.HttpClient(auth_handle)
self.reporting_period_margin = (
CONF.prometheus_plugin.reporting_period_margin)
self.notification_callback = self.default_callback
        # A pod name can be specified in a PM job, but a container name
        # cannot. Some prometheus metrics need a container name, so the
        # max operator of PromQL is used instead in some measurements to
        # measure without a container name. That means only the most
        # impacted value among the containers is provided.
self.sol_exprs = {
'VCpuUsageMeanVnf':
'avg(sum(rate(pod_cpu_usage_seconds_total'
'{{pod=~"{pod}"}}[{reporting_period}s])))',
'VCpuUsagePeakVnf':
'max(sum(rate(pod_cpu_usage_seconds_total'
'{{pod=~"{pod}"}}[{reporting_period}s])))',
'VMemoryUsageMeanVnf':
'avg(pod_memory_working_set_bytes{{pod=~"{pod}"}} / '
'on(pod) (kube_node_status_capacity{{resource="memory"}} * '
'on(node) group_right kube_pod_info))',
'VMemoryUsagePeakVnf':
'max(pod_memory_working_set_bytes{{pod=~"{pod}"}} / '
'on(pod) (kube_node_status_capacity{{resource="memory"}} * '
'on(node) group_right kube_pod_info))',
'VDiskUsageMeanVnf':
'avg(max(container_fs_usage_bytes{{pod=~"{pod}"}}/'
'container_fs_limit_bytes{{pod=~"{pod}"}}))',
'VDiskUsagePeakVnf':
'max(max(container_fs_usage_bytes{{pod=~"{pod}"}}/'
'container_fs_limit_bytes{{pod=~"{pod}"}}))',
'ByteIncomingVnfIntCp':
'sum(container_network_receive_bytes_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'PacketIncomingVnfIntCp':
'sum(container_network_receive_packets_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'ByteOutgoingVnfIntCp':
'sum(container_network_transmit_bytes_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'PacketOutgoingVnfIntCp':
'sum(container_network_transmit_packets_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'ByteIncomingVnfExtCp':
'sum(container_network_receive_bytes_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'PacketIncomingVnfExtCp':
'sum(container_network_receive_packets_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'ByteOutgoingVnfExtCp':
'sum(container_network_transmit_bytes_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
'PacketOutgoingVnfExtCp':
'sum(container_network_transmit_packets_total'
'{{interface="{sub_object_instance_id}",pod=~"{pod}"}})',
}
PrometheusPluginPm._instance = self
def set_callback(self, notification_callback):
self.notification_callback = notification_callback
def create_job(self, **kwargs):
return self.make_rules(kwargs['context'], kwargs['pm_job'])
def delete_job(self, **kwargs):
self.delete_rules(kwargs['context'], kwargs['pm_job'])
def alert(self, **kwargs):
try:
self._alert(kwargs['request'], body=kwargs['body'])
except Exception as e:
            # All exceptions are ignored here and a 204 response is
            # always returned, because if tacker responded to
            # alertmanager with an error, alertmanager might repeat
            # the same reports.
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
def default_callback(self, context, entries):
self.rpc.store_job_info(context, entries)
def convert_measurement_unit(self, metric, value):
if re.match(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\..+', metric):
value = float(value)
elif re.match(r'^(Byte|Packet)(Incoming|Outgoing)Vnf(IntCp|ExtCp)',
metric):
value = int(value)
else:
raise sol_ex.PrometheusPluginError(
"Failed to convert annotations.value to measurement unit.")
return value
def get_datetime_of_latest_report(
self, context, pm_job, object_instance_id,
sub_object_instance_id, metric):
report = pm_job_utils.get_pm_report(context, pm_job.id)
if not report:
return None
report = list(map(lambda x: x.entries, report))
report = list(itertools.chain.from_iterable(report))
entries_of_same_object = list(
filter(
lambda x: (
x.objectInstanceId == object_instance_id and
(not x.obj_attr_is_set('subObjectInstanceId') or
x.subObjectInstanceId == sub_object_instance_id) and
x.performanceMetric == metric),
report))
if len(entries_of_same_object) == 0:
return None
values = sum(list(map(
lambda x: x.performanceValues, entries_of_same_object)), [])
return max(values, key=lambda value:
self.parse_datetime(value.timeStamp)).timeStamp
def filter_alert_by_time(
self, context, pm_job, datetime_now,
object_instance_id, sub_object_instance_id, metric):
# Ignore expired alert
reporting_boundary = pm_job.criteria.reportingBoundary\
if (pm_job.criteria.obj_attr_is_set('reportingBoundary') and
pm_job.criteria.reportingBoundary) else None
if (reporting_boundary and
datetime_now > self.parse_datetime(reporting_boundary)):
raise sol_ex.PrometheusPluginSkipped()
# Ignore short period alert
report_date = self.get_datetime_of_latest_report(
context, pm_job, object_instance_id, sub_object_instance_id,
metric)
        # reporting_period_margin is a margin that absorbs timing
        # inconsistency between prometheus and tacker.
if (report_date and report_date + datetime.timedelta(
seconds=(pm_job.criteria.reportingPeriod -
self.reporting_period_margin)) >= datetime_now):
raise sol_ex.PrometheusPluginSkipped()
def valid_alert(self, pm_job, object_instance_id, sub_object_instance_id):
object_instance_ids = (
pm_job.objectInstanceIds
if (pm_job.obj_attr_is_set('objectInstanceIds') and
pm_job.objectInstanceIds) else [])
if object_instance_id not in object_instance_ids:
LOG.error(
f"labels.object_instance_id {object_instance_id} "
f"doesn't match pmJob.")
raise sol_ex.PrometheusPluginSkipped()
sub_object_instance_ids = (
pm_job.subObjectInstanceIds
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
pm_job.subObjectInstanceIds) else [])
if (sub_object_instance_id and
(not sub_object_instance_ids or
sub_object_instance_id not in sub_object_instance_ids)):
LOG.error(
f"labels.sub_object_instance_id {sub_object_instance_id} "
f"doesn't match pmJob.")
raise sol_ex.PrometheusPluginSkipped()
@validator.schema(prometheus_plugin_schemas.AlertMessage)
def _alert(self, request, body):
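        # A sketch of one expected alert entry, based on the labels and
        # annotations set in make_rule(); the concrete values are
        # illustrative assumptions:
        #   {'labels': {'receiver_type': 'tacker',
        #               'function_type': 'vnfpm',
        #               'job_id': '<pm_job.id>',
        #               'object_instance_id': '<vnf instance id>',
        #               'sub_object_instance_id': '<sub object id>',
        #               'metric': 'VCpuUsageMeanVnf.<vnf instance id>'},
        #    'annotations': {'value': '0.58'}}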
result = []
context = request.context
datetime_now = datetime.datetime.now(datetime.timezone.utc)
for alt in body['alerts']:
if alt['labels']['function_type'] != 'vnfpm':
continue
try:
pm_job_id = alt['labels']['job_id']
object_instance_id = alt['labels']['object_instance_id']
metric = alt['labels']['metric']
sub_object_instance_id = alt['labels'].get(
'sub_object_instance_id')
value = alt['annotations']['value']
pm_job = pm_job_utils.get_pm_job(context, pm_job_id)
self.filter_alert_by_time(context, pm_job, datetime_now,
object_instance_id,
sub_object_instance_id, metric)
self.valid_alert(
pm_job, object_instance_id, sub_object_instance_id)
value = self.convert_measurement_unit(metric, value)
result.append({
'objectType': pm_job.objectType,
'objectInstanceId': object_instance_id,
'subObjectInstanceId': sub_object_instance_id,
'performanceMetric': metric,
'performanceValues': [{
'timeStamp': datetime_now,
'value': value
}]
})
except sol_ex.PrometheusPluginSkipped:
pass
if len(result) > 0:
if self.notification_callback:
                # every alert in body['alerts'] carries the same job_id
self.notification_callback(context, {
'id': uuidutils.generate_uuid(),
'jobId': pm_job_id,
'entries': result,
})
return result
def decompose_metrics_vnfc(self, pm_job):
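        # Example (illustrative): objectInstanceIds=['vnf-1'] with
        # performanceMetricGroup=['VirtualisedComputeResource'] expands
        # to the six metrics 'VCpuUsageMeanVnf.vnf-1',
        # 'VCpuUsagePeakVnf.vnf-1', 'VMemoryUsageMeanVnf.vnf-1',
        # 'VMemoryUsagePeakVnf.vnf-1', 'VDiskUsageMeanVnf.vnf-1' and
        # 'VDiskUsagePeakVnf.vnf-1'.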
metrics = pm_job.criteria.performanceMetric\
if pm_job.criteria.obj_attr_is_set('performanceMetric') else None
metrics = (list(filter(lambda x: (
re.match(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\..+', x) and
re.sub(r'^V(Cpu|Memory|Disk)Usage(Mean|Peak)Vnf\.', '',
x) in pm_job.objectInstanceIds), metrics))
if metrics else [])
metric_grps = pm_job.criteria.performanceMetricGroup\
if (pm_job.criteria.obj_attr_is_set('performanceMetricGroup') and
pm_job.criteria.performanceMetricGroup) else []
for obj in pm_job.objectInstanceIds:
for grp in metric_grps:
if grp == 'VirtualisedComputeResource':
metrics.append(f'VCpuUsageMeanVnf.{obj}')
metrics.append(f'VCpuUsagePeakVnf.{obj}')
metrics.append(f'VMemoryUsageMeanVnf.{obj}')
metrics.append(f'VMemoryUsagePeakVnf.{obj}')
metrics.append(f'VDiskUsageMeanVnf.{obj}')
metrics.append(f'VDiskUsagePeakVnf.{obj}')
metrics = list(set(metrics))
if len(metrics) == 0:
raise sol_ex.PrometheusPluginError(
"Invalid performanceMetric or performanceMetricGroup."
)
return metrics
def make_prom_ql(self, target, pod, collection_period=30,
reporting_period=90, sub_object_instance_id='*'):
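        # reporting_period is clamped to a minimum of 30s before it is
        # substituted into the PromQL range selector. Example rendering
        # (illustrative arguments):
        #   make_prom_ql('VCpuUsageMeanVnf',
        #                'test-[0-9a-f]{1,10}-[0-9a-z]{5}$',
        #                reporting_period=90)
        # returns:
        #   avg(sum(rate(pod_cpu_usage_seconds_total
        #       {pod=~"test-[0-9a-f]{1,10}-[0-9a-z]{5}$"}[90s])))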
reporting_period = max(reporting_period, 30)
expr = self.sol_exprs[target].format(
pod=pod,
collection_period=collection_period,
reporting_period=reporting_period,
sub_object_instance_id=sub_object_instance_id
)
return expr
def make_rule(self, pm_job, object_instance_id, sub_object_instance_id,
metric, expression, collection_period):
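        # The returned dict is a single Prometheus alerting rule. A
        # rendered sketch with illustrative values (None-valued labels
        # are dropped below):
        #   {'alert': '<generated uuid>',
        #    'expr': 'avg(sum(rate(...)))',
        #    'for': '30s',
        #    'labels': {'alertname': '', 'receiver_type': 'tacker',
        #               'function_type': 'vnfpm', 'job_id': '<pm_job.id>',
        #               'object_instance_id': '<obj id>',
        #               'metric': '<metric>'},
        #    'annotations': {'value': '{{$value}}'}}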
labels = {
'alertname': '',
'receiver_type': 'tacker',
'function_type': 'vnfpm',
'job_id': pm_job.id,
'object_instance_id': object_instance_id,
'sub_object_instance_id': sub_object_instance_id,
'metric': metric
}
labels = {k: v for k, v in labels.items() if v is not None}
annotations = {
'value': r'{{$value}}'
}
rule = {
'alert': uuidutils.generate_uuid(),
'expr': expression,
'for': f'{collection_period}s',
'labels': labels,
'annotations': annotations
}
return rule
def get_vnfc_resource_info(self, _, vnf_instance_id, inst_map):
inst = inst_map[vnf_instance_id]
if not inst.obj_attr_is_set('instantiatedVnfInfo') or\
not inst.instantiatedVnfInfo.obj_attr_is_set(
'vnfcResourceInfo'):
return None
return inst.instantiatedVnfInfo.vnfcResourceInfo
def get_pod_regexp(self, resource_info):
# resource ids are like:
# ['test-test1-756757f8f-xcwmt',
# 'test-test2-756757f8f-kmghr', ...]
# convert them to a regex string such as:
# '(test-test1-[0-9a-f]{1,10}-[0-9a-z]{5}$|
# test-test2-[0-9a-f]{1,10}-[0-9a-z]{5}$|...)'
deployments = list(filter(
lambda r:
r.computeResource.obj_attr_is_set(
'vimLevelResourceType')
and r.computeResource.obj_attr_is_set(
'resourceId'
)
and r.computeResource.vimLevelResourceType ==
'Deployment', resource_info
))
deployments = list(set(list(map(
lambda d: re.sub(
r'\-[0-9a-f]{1,10}\-[0-9a-z]{5}$', '',
d.computeResource.resourceId) +
r'-[0-9a-f]{1,10}-[0-9a-z]{5}$',
deployments
))))
pods_regexp = '(' + '|'.join(deployments) + ')'
return deployments, pods_regexp
def _make_rules_for_each_obj(self, context, pm_job, inst_map, metric):
target = re.sub(r'\..+$', '', metric)
objs = pm_job.objectInstanceIds
collection_period = pm_job.criteria.collectionPeriod
reporting_period = pm_job.criteria.reportingPeriod
rules = []
for obj in objs:
resource_info = self.get_vnfc_resource_info(context, obj, inst_map)
if not resource_info:
continue
deployments, pods_regexp = self.get_pod_regexp(resource_info)
if len(deployments) == 0:
continue
expr = self.make_prom_ql(
target, pods_regexp, collection_period=collection_period,
reporting_period=reporting_period)
rules.append(self.make_rule(
pm_job, obj, None, metric, expr,
collection_period))
return rules
def get_compute_resource_by_sub_obj(self, vnf_instance, sub_obj):
inst = vnf_instance
if (not inst.obj_attr_is_set('instantiatedVnfInfo') or
not inst.instantiatedVnfInfo.obj_attr_is_set(
'vnfcResourceInfo') or
not inst.instantiatedVnfInfo.obj_attr_is_set('vnfcInfo')):
return None
vnfc_info = list(filter(
lambda x: (x.obj_attr_is_set('vnfcResourceInfoId') and
x.id == sub_obj),
inst.instantiatedVnfInfo.vnfcInfo))
if len(vnfc_info) == 0:
return None
resources = list(filter(
lambda x: (vnfc_info[0].obj_attr_is_set('vnfcResourceInfoId') and
x.id == vnfc_info[0].vnfcResourceInfoId and
x.computeResource.obj_attr_is_set('vimLevelResourceType') and
x.computeResource.vimLevelResourceType == 'Deployment' and
x.computeResource.obj_attr_is_set('resourceId')),
inst.instantiatedVnfInfo.vnfcResourceInfo))
if len(resources) == 0:
return None
return resources[0].computeResource
def _make_rules_for_each_sub_obj(self, context, pm_job, inst_map, metric):
target = re.sub(r'\..+$', '', metric)
objs = pm_job.objectInstanceIds
sub_objs = pm_job.subObjectInstanceIds\
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
pm_job.subObjectInstanceIds) else []
collection_period = pm_job.criteria.collectionPeriod
reporting_period = pm_job.criteria.reportingPeriod
rules = []
resource_info = self.get_vnfc_resource_info(context, objs[0], inst_map)
if not resource_info:
return []
if pm_job.objectType in {'Vnf', 'Vnfc'}:
inst = inst_map[objs[0]]
for sub_obj in sub_objs:
compute_resource = self.get_compute_resource_by_sub_obj(
inst, sub_obj)
if not compute_resource:
continue
resource_id = compute_resource.resourceId
expr = self.make_prom_ql(
target, resource_id,
collection_period=collection_period,
reporting_period=reporting_period)
rules.append(self.make_rule(
pm_job, objs[0], sub_obj, metric, expr,
collection_period))
else:
deployments, pods_regexp = self.get_pod_regexp(resource_info)
if len(deployments) == 0:
return []
for sub_obj in sub_objs:
expr = self.make_prom_ql(
target, pods_regexp, collection_period=collection_period,
reporting_period=reporting_period,
sub_object_instance_id=sub_obj)
rules.append(self.make_rule(
pm_job, objs[0], sub_obj, metric, expr,
collection_period))
return rules
def _make_rules(self, context, pm_job, metric, inst_map):
sub_objs = pm_job.subObjectInstanceIds\
if (pm_job.obj_attr_is_set('subObjectInstanceIds') and
pm_job.subObjectInstanceIds) else []
# Cardinality of objectInstanceIds and subObjectInstanceIds
# is N:0 or 1:N.
if len(sub_objs) > 0:
return self._make_rules_for_each_sub_obj(
context, pm_job, inst_map, metric)
return self._make_rules_for_each_obj(
context, pm_job, inst_map, metric)
def decompose_metrics_vnfintextcp(self, pm_job):
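        # Example (illustrative): for objectType 'VnfIntCp', the metric
        # group 'VnfInternalCp' expands to 'ByteIncomingVnfIntCp',
        # 'ByteOutgoingVnfIntCp', 'PacketIncomingVnfIntCp' and
        # 'PacketOutgoingVnfIntCp'.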
group_name = 'VnfInternalCp'\
if pm_job.objectType == 'VnfIntCp' else 'VnfExternalCp'
metrics = pm_job.criteria.performanceMetric\
if pm_job.criteria.obj_attr_is_set('performanceMetric') else None
metrics = list(filter(lambda x: (
re.match(r'^(Byte|Packet)(Incoming|Outgoing)' + pm_job.objectType,
x)),
metrics)) if metrics else []
metric_grps = pm_job.criteria.performanceMetricGroup\
if (pm_job.criteria.obj_attr_is_set('performanceMetricGroup') and
pm_job.criteria.performanceMetricGroup) else []
for grp in metric_grps:
if grp == group_name:
metrics.append(f'ByteIncoming{pm_job.objectType}')
metrics.append(f'ByteOutgoing{pm_job.objectType}')
metrics.append(f'PacketIncoming{pm_job.objectType}')
metrics.append(f'PacketOutgoing{pm_job.objectType}')
metrics = list(set(metrics))
if len(metrics) == 0:
raise sol_ex.PrometheusPluginError(
"Invalid performanceMetric or performanceMetricGroup."
)
return metrics
def _delete_rule(self, host, port, user, password, path, pm_job_id):
with paramiko.Transport(sock=(host, port)) as client:
client.connect(username=user, password=password)
sftp = paramiko.SFTPClient.from_transport(client)
sftp.remove(f'{path}/{pm_job_id}.json')
def delete_rules(self, context, pm_job):
target_list, reload_list = self.get_access_info(pm_job)
for target in target_list:
try:
self._delete_rule(
target['host'], target['port'], target['user'],
target['password'], target['path'], pm_job.id)
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
paramiko.SSHException):
                # These exceptions are ignored: DELETE /pm_jobs/{id}
                # should succeed even if _delete_rule() fails, e.g.
                # because the rule file has already been deleted.
pass
for uri in reload_list:
try:
self.reload_prom_server(context, uri)
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
paramiko.SSHException):
pass
def decompose_metrics(self, pm_job):
if pm_job.objectType in {'Vnf', 'Vnfc'}:
return self.decompose_metrics_vnfc(pm_job)
if pm_job.objectType in {'VnfIntCp', 'VnfExtCp'}:
return self.decompose_metrics_vnfintextcp(pm_job)
raise sol_ex.PrometheusPluginError(
f"Invalid objectType: {pm_job.objectType}.")
def reload_prom_server(self, context, reload_uri):
resp, _ = self.client.do_request(
reload_uri, "PUT", context=context)
if resp.status_code >= 400 and resp.status_code < 600:
raise sol_ex.PrometheusPluginError(
f"Reloading request to prometheus is failed: "
f"{resp.status_code}.")
def _upload_rule(self, rule_group, host, port, user, password, path,
pm_job_id):
with tempfile.TemporaryDirectory() as tmpdir:
with open(os.path.join(tmpdir, 'rule.json'),
'w+', encoding="utf-8") as fp:
json.dump(rule_group, fp, indent=4, ensure_ascii=False)
filename = fp.name
with paramiko.Transport(sock=(host, port)) as client:
LOG.info("Upload rule files to prometheus server: %s.", host)
client.connect(username=user, password=password)
sftp = paramiko.SFTPClient.from_transport(client)
sftp.put(filename, f'{path}/{pm_job_id}.json')
self.verify_rule(host, port, user, password, path, pm_job_id)
def verify_rule(self, host, port, user, password, path, pm_job_id):
if not CONF.prometheus_plugin.test_rule_with_promtool:
return
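        # promtool is assumed to be installed on the prometheus server.
        # 'promtool check rules <file>' exits non-zero for a
        # syntactically invalid rule file, which is reported below as a
        # PrometheusPluginError.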
with paramiko.SSHClient() as client:
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(host, port=port, username=user, password=password)
command = f"promtool check rules {path}/{pm_job_id}.json"
LOG.info("Rule file validation command: %s", command)
_, stdout, stderr = client.exec_command(command)
if stdout.channel.recv_exit_status() != 0:
error_byte = stderr.read()
error_str = error_byte.decode('utf-8')
LOG.error(
"Rule file validation with promtool failed: %s",
error_str)
raise sol_ex.PrometheusPluginError(
"Rule file validation with promtool failed.")
def get_access_info(self, pm_job):
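        # Expected pm_job.metadata layout; the concrete values shown
        # here are illustrative assumptions:
        #   {'monitoring': {
        #       'monitorName': 'prometheus',
        #       'driverType': 'external',
        #       'targetsInfo': [{
        #           'prometheusHost': '192.168.0.1',
        #           'prometheusHostPort': 22,
        #           'authInfo': {'ssh_username': 'user',
        #                        'ssh_password': 'password'},
        #           'alertRuleConfigPath': '/etc/prometheus/rules',
        #           'prometheusReloadApiEndpoint':
        #               'http://192.168.0.1:9090/-/reload'}]}}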
target_list = []
reload_list = []
if (not pm_job.obj_attr_is_set('metadata')
or 'monitoring' not in pm_job.metadata):
raise sol_ex.PrometheusPluginError(
"monitoring info is missing at metadata field.")
access_info = pm_job.metadata['monitoring']
if (access_info.get('monitorName') != 'prometheus' or
access_info.get('driverType') != 'external'):
raise sol_ex.PrometheusPluginError(
"prometheus info is missing at metadata field.")
for info in access_info.get('targetsInfo', []):
host = info.get('prometheusHost', '')
port = info.get('prometheusHostPort', 22)
auth = info.get('authInfo', {})
user = auth.get('ssh_username', '')
password = auth.get('ssh_password', '')
path = info.get('alertRuleConfigPath', '')
uri = info.get('prometheusReloadApiEndpoint', '')
if not (host and user and path and uri):
continue
target_list.append({
'host': host,
'port': port,
'user': user,
'password': password,
'path': path
})
reload_list.append(uri)
return target_list, list(set(reload_list))
def upload_rules(
self, context, target_list, reload_list, rule_group, pm_job):
def _cleanup_error(target_list):
for target in target_list:
try:
self._delete_rule(target['host'], target['port'],
target['user'], target['password'], target['path'],
pm_job.id)
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
paramiko.SSHException):
pass
try:
for target in target_list:
self._upload_rule(
rule_group, target['host'], target['port'],
target['user'], target['password'], target['path'],
pm_job.id)
for uri in reload_list:
self.reload_prom_server(context, uri)
except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
paramiko.SSHException) as e:
LOG.error("failed to upload rule files: %s", e.args[0])
_cleanup_error(target_list)
raise e
except Exception as e:
_cleanup_error(target_list)
raise e
def get_vnf_instances(self, context, pm_job):
object_instance_ids = list(set(pm_job.objectInstanceIds))
return dict(zip(
object_instance_ids,
list(map(
lambda inst: inst_utils.get_inst(context, inst),
object_instance_ids))))
def make_rules(self, context, pm_job):
target_list, reload_list = self.get_access_info(pm_job)
metrics = self.decompose_metrics(pm_job)
inst_map = self.get_vnf_instances(context, pm_job)
rules = sum([self._make_rules(context, pm_job, metric, inst_map)
for metric in metrics], [])
if len(rules) == 0:
raise sol_ex.PrometheusPluginError(
f"Converting from a PM job to alert rules is failed."
f" PM job id: {pm_job.id}")
rule_group = {
'groups': [
{
'name': f'tacker_{pm_job.id}',
'rules': rules
}
]
}
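        # The resulting rule file, uploaded as {path}/{pm_job.id}.json,
        # serializes to (sketch):
        #   {"groups": [{"name": "tacker_<pm_job.id>",
        #                "rules": [<rules built above>]}]}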
self.upload_rules(
context, target_list, reload_list, rule_group, pm_job)
return rule_group
class PrometheusPluginFm(PrometheusPlugin, mon_base.MonitoringPlugin):
_instance = None
@staticmethod
def instance():
if PrometheusPluginFm._instance is None:
if not CONF.prometheus_plugin.fault_management:
stub = mon_base.MonitoringPluginStub.instance()
PrometheusPluginFm._instance = stub
else:
PrometheusPluginFm()
return PrometheusPluginFm._instance
def __init__(self):
if PrometheusPluginFm._instance:
raise SystemError(
"Not constructor but instance() should be used.")
super(PrometheusPluginFm, self).__init__()
self.notification_callback = self.default_callback
self.endpoint = CONF.v2_vnfm.endpoint
PrometheusPluginFm._instance = self
def set_callback(self, notification_callback):
self.notification_callback = notification_callback
def alert(self, **kwargs):
try:
self._alert(kwargs['request'], body=kwargs['body'])
except Exception as e:
            # All exceptions are ignored here and a 204 response is
            # always returned, because if tacker responded to
            # alertmanager with an error, alertmanager might repeat
            # the same reports.
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
def default_callback(self, context, alarm):
self.rpc.store_alarm_info(context, alarm)
def vnfc_instance_ids(
self, context, vnf_instance_id, alert_entry):
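        # Maps the alert's 'pod' label to VnfcInfo ids: the label is
        # treated as a regexp matched against vnfcResourceInfo
        # resourceIds, and matching entries are joined to vnfcInfo
        # through vnfcResourceInfoId.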
inst = inst_utils.get_inst(context, vnf_instance_id)
resources = (inst.instantiatedVnfInfo.vnfcResourceInfo
if inst.obj_attr_is_set('instantiatedVnfInfo') and
inst.instantiatedVnfInfo.obj_attr_is_set(
'vnfcResourceInfo') else [])
vnfc_info = (inst.instantiatedVnfInfo.vnfcInfo
if inst.obj_attr_is_set('instantiatedVnfInfo') and
inst.instantiatedVnfInfo.obj_attr_is_set(
'vnfcInfo') else [])
if 'pod' not in alert_entry['labels']:
return []
pod = alert_entry['labels']['pod']
deployments = list(filter(
lambda r: (
r.computeResource.obj_attr_is_set('vimLevelResourceType') and
r.computeResource.obj_attr_is_set('resourceId') and
(r.computeResource.vimLevelResourceType in
{'Deployment', 'Pod'}) and
re.match(pod, r.computeResource.resourceId)),
resources
))
vnfc_res_info_ids = list(map(
lambda res: res.id, deployments
))
vnfc_info = list(filter(
lambda info: (
info.obj_attr_is_set('vnfcResourceInfoId') and
info.vnfcResourceInfoId in vnfc_res_info_ids),
vnfc_info
))
vnfc_info = list(map(lambda info: info.id, vnfc_info))
return vnfc_info
def update_alarm(self, context, not_cleared, ends_at, datetime_now):
for alm in not_cleared:
alm.alarmClearedTime = ends_at
alm.alarmChangedTime = datetime_now
if self.notification_callback:
self.notification_callback(context, alm)
def create_new_alarm(self, context, alert_entry, datetime_now):
vnf_instance_id = alert_entry['labels']['vnf_instance_id']
fingerprint = alert_entry['fingerprint']
perceived_severity = alert_entry['labels']['perceived_severity']
fault_details = [
f"fingerprint: {fingerprint}",
f"detail: {alert_entry['annotations'].get('fault_details')}"
]
vnfc_instance_ids = self.vnfc_instance_ids(
context, vnf_instance_id, alert_entry)
if len(vnfc_instance_ids) == 0:
LOG.error("failed to specify vnfc_instance for the alert.")
raise sol_ex.PrometheusPluginSkipped()
new_alarm = objects.AlarmV1.from_dict({
'id':
uuidutils.generate_uuid(),
'managedObjectId':
vnf_instance_id,
'vnfcInstanceIds':
vnfc_instance_ids,
'alarmRaisedTime':
datetime_now.isoformat(),
'ackState':
'UNACKNOWLEDGED',
'perceivedSeverity':
perceived_severity,
'eventTime':
alert_entry['startsAt'],
'eventType':
alert_entry['labels'].get('event_type', ''),
'faultType':
alert_entry['annotations'].get('fault_type', ''),
'probableCause':
alert_entry['annotations'].get('probable_cause', ''),
'isRootCause':
False,
'faultDetails':
fault_details,
'_links': {}
})
_links = fm_alarm_utils.make_alarm_links(new_alarm, self.endpoint)
new_alarm._links = _links
if self.notification_callback:
self.notification_callback(context, new_alarm)
return new_alarm
def get_not_cleared_alarms(self, context, vnf_instance_id, fingerprint):
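        # Alarms created by this plugin record the alertmanager
        # fingerprint in faultDetails (see create_new_alarm()), so a
        # 'resolved' alert can be matched back to its open alarm here.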
alms = fm_alarm_utils.get_not_cleared_alarms(context, vnf_instance_id)
fpstr = f'fingerprint: {fingerprint}'
return list(filter(
lambda x: (not x.obj_attr_is_set('alarmClearedTime') and
x.obj_attr_is_set('faultDetails') and
fpstr in x.faultDetails), alms))
def create_or_update_alarm(
self, context, alert_entry, datetime_now):
status = alert_entry['status']
vnf_instance_id = alert_entry['labels']['vnf_instance_id']
fingerprint = alert_entry['fingerprint']
not_cleared = self.get_not_cleared_alarms(
context, vnf_instance_id, fingerprint)
if status == 'resolved' and len(not_cleared) > 0:
ends_at = alert_entry['endsAt']
self.update_alarm(
context, not_cleared, ends_at, datetime_now)
return not_cleared
if status == 'firing' and len(not_cleared) == 0:
new_alarm = self.create_new_alarm(
context, alert_entry, datetime_now)
return [new_alarm]
raise sol_ex.PrometheusPluginSkipped()
@validator.schema(prometheus_plugin_schemas.AlertMessage)
def _alert(self, request, body):
now = datetime.datetime.now(datetime.timezone.utc)
result = []
for alt in body['alerts']:
if alt['labels']['function_type'] != 'vnffm':
continue
try:
alarms = self.create_or_update_alarm(
request.context, alt, now)
result.extend(alarms)
except sol_ex.PrometheusPluginSkipped:
pass
return result
class PrometheusPluginAutoScaling(PrometheusPlugin, mon_base.MonitoringPlugin):
_instance = None
@staticmethod
def instance():
if PrometheusPluginAutoScaling._instance is None:
if not CONF.prometheus_plugin.auto_scaling:
stub = mon_base.MonitoringPluginStub.instance()
PrometheusPluginAutoScaling._instance = stub
else:
PrometheusPluginAutoScaling()
return PrometheusPluginAutoScaling._instance
def __init__(self):
if PrometheusPluginAutoScaling._instance:
raise SystemError(
"Not constructor but instance() should be used.")
super(PrometheusPluginAutoScaling, self).__init__()
self.notification_callback = self.default_callback
PrometheusPluginAutoScaling._instance = self
def set_callback(self, notification_callback):
self.notification_callback = notification_callback
def alert(self, **kwargs):
try:
self._alert(kwargs['request'], body=kwargs['body'])
except Exception as e:
            # All exceptions are ignored here and a 204 response is
            # always returned, because if tacker responded to
            # alertmanager with an error, alertmanager might repeat
            # the same reports.
LOG.error("%s: %s", e.__class__.__name__, e.args[0])
def default_callback(self, context, vnf_instance_id, scaling_param):
self.rpc.request_scale(context, vnf_instance_id, scaling_param)
def skip_if_auto_scale_not_enabled(self, vnf_instance):
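        # Scaling is requested only when the VNF instance sets
        # vnfConfigurableProperties['isAutoscaleEnabled'] to a truthy
        # value; otherwise the alert is skipped.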
if (not vnf_instance.obj_attr_is_set('vnfConfigurableProperties') or
not vnf_instance.vnfConfigurableProperties.get(
'isAutoscaleEnabled')):
raise sol_ex.PrometheusPluginSkipped()
def process_auto_scale(self, request, vnf_instance_id, auto_scale_type,
aspect_id):
scaling_param = {
'type': auto_scale_type,
'aspectId': aspect_id,
}
context = request.context
if self.notification_callback:
self.notification_callback(context, vnf_instance_id, scaling_param)
@validator.schema(prometheus_plugin_schemas.AlertMessage)
def _alert(self, request, body):
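        # Expected labels for an auto scaling alert; values other than
        # function_type are illustrative assumptions:
        #   {'function_type': 'auto_scale',
        #    'vnf_instance_id': '<vnf instance id>',
        #    'auto_scale_type': 'SCALE_OUT',
        #    'aspect_id': '<aspect id>'}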
result = []
for alt in body['alerts']:
if alt['labels']['function_type'] != 'auto_scale':
continue
try:
vnf_instance_id = alt['labels']['vnf_instance_id']
auto_scale_type = alt['labels']['auto_scale_type']
aspect_id = alt['labels']['aspect_id']
context = request.context
inst = inst_utils.get_inst(context, vnf_instance_id)
self.skip_if_auto_scale_not_enabled(inst)
self.process_auto_scale(
request, vnf_instance_id, auto_scale_type, aspect_id)
result.append((vnf_instance_id, auto_scale_type, aspect_id))
except sol_ex.PrometheusPluginSkipped:
pass
return result