Merge "Prometheus rule file syntax test"

Zuul authored 2023-02-06 02:05:03 +00:00; committed by Gerrit Code Review
7 changed files with 87 additions and 18 deletions

View File

@@ -586,9 +586,10 @@
           v2_vnfm:
             kubernetes_vim_rsc_wait_timeout: 800
           prometheus_plugin:
-            fault_management: True
-            performance_management: True
-            auto_scaling: True
+            fault_management: true
+            performance_management: true
+            auto_scaling: true
+            test_rule_with_promtool: true
       tox_envlist: dsvm-functional-sol-kubernetes-v2
     vars:
       prometheus_setup: true

View File

@@ -52,6 +52,9 @@ performance_management, fault_management or auto_scaling below.
   * - ``CONF.prometheus_plugin.auto_scaling``
     - false
     - Enable prometheus plugin autoscaling.
+  * - ``CONF.prometheus_plugin.test_rule_with_promtool``
+    - false
+    - Enable rule file validation using promtool.
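
For example, the check can be switched on in tacker.conf (the section name
matches the option registration in config.py; a minimal sketch)::

    [prometheus_plugin]
    test_rule_with_promtool = true
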
System
~~~~~~
@@ -241,6 +244,14 @@ needs to activate sshd.
 - The directory indicated by the "rule_files" setting of the prometheus
   server config should be accessible by SSH.
+
+Supported versions
+------------------
+
+Tacker Zed release
+
+- Prometheus: 2.37
+- Alertmanager: 0.24

Alert rule registration
~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -161,6 +161,9 @@ PROMETHEUS_PLUGIN_OPTS = [
                        'This configuration is changed in case of replacing '
                        'the original function with a vendor specific '
                        'function.')),
+    cfg.BoolOpt('test_rule_with_promtool',
+                default=False,
+                help=_('Enable rule file validation using promtool.')),
 ]

 CONF.register_opts(PROMETHEUS_PLUGIN_OPTS, 'prometheus_plugin')
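
Once registered, the flag reads like any other oslo.config boolean. A minimal
sketch of the consuming side (maybe_validate() is a hypothetical helper; the
guard mirrors what verify_rule() does in the plugin below):

    from oslo_config import cfg

    CONF = cfg.CONF

    def maybe_validate(rule_path):
        # Validation is opt-in: test_rule_with_promtool defaults to False,
        # so deployments without promtool installed are unaffected.
        if not CONF.prometheus_plugin.test_rule_with_promtool:
            return
        # ... run "promtool check rules" against rule_path here.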

View File

@@ -21,6 +21,7 @@ import paramiko
 import re
 import tempfile

+from keystoneauth1 import exceptions as ks_exc
 from oslo_log import log as logging
 from oslo_utils import uuidutils

 from tacker.sol_refactored.api import prometheus_plugin_validator as validator
@@ -37,6 +38,7 @@ from tacker.sol_refactored import objects
LOG = logging.getLogger(__name__)
logging.getLogger("paramiko").setLevel(logging.WARNING)
CONF = cfg.CONF
@@ -510,12 +512,23 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
     def delete_rules(self, context, pm_job):
         target_list, reload_list = self.get_access_info(pm_job)
-        for info in target_list:
-            self._delete_rule(
-                info['host'], info['port'], info['user'],
-                info['password'], info['path'], pm_job.id)
+        for target in target_list:
+            try:
+                self._delete_rule(
+                    target['host'], target['port'], target['user'],
+                    target['password'], target['path'], pm_job.id)
+            except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                    paramiko.SSHException):
+                # This exception is ignored: DELETE /pm_jobs/{id} will
+                # succeed even if _delete_rule() fails, because the rule
+                # file has already been deleted.
+                pass
         for uri in reload_list:
-            self.reload_prom_server(context, uri)
+            try:
+                self.reload_prom_server(context, uri)
+            except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                    paramiko.SSHException):
+                pass

     def decompose_metrics(self, pm_job):
         if pm_job.objectType in {'Vnf', 'Vnfc'}:
@@ -528,9 +541,10 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
     def reload_prom_server(self, context, reload_uri):
         resp, _ = self.client.do_request(
             reload_uri, "PUT", context=context)
-        if resp.status_code != 202:
-            LOG.error("reloading request to prometheus is failed: %d.",
-                      resp.status_code)
+        if resp.status_code >= 400 and resp.status_code < 600:
+            raise sol_ex.PrometheusPluginError(
+                f"Reloading request to prometheus failed: "
+                f"{resp.status_code}.")
     def _upload_rule(self, rule_group, host, port, user, password, path,
                      pm_job_id):
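
For context, each URI in reload_list is typically Prometheus's lifecycle
endpoint, which accepts PUT or POST when the server is started with
--web.enable-lifecycle. A minimal standalone sketch (the host and port are
illustrative):

    import urllib.request

    # Prometheus accepts PUT or POST on /-/reload when started with
    # --web.enable-lifecycle; urlopen() raises HTTPError on a 4xx/5xx reply.
    req = urllib.request.Request(
        "http://prometheus.example.com:9090/-/reload", method="PUT")
    with urllib.request.urlopen(req) as resp:
        print(resp.status)  # 200 means config and rule files were reloaded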
@@ -544,6 +558,25 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
             client.connect(username=user, password=password)
             sftp = paramiko.SFTPClient.from_transport(client)
             sftp.put(filename, f'{path}/{pm_job_id}.json')
+        self.verify_rule(host, port, user, password, path, pm_job_id)
+
+    def verify_rule(self, host, port, user, password, path, pm_job_id):
+        if not CONF.prometheus_plugin.test_rule_with_promtool:
+            return
+        with paramiko.SSHClient() as client:
+            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+            client.connect(host, port=port, username=user, password=password)
+            command = f"promtool check rules {path}/{pm_job_id}.json"
+            LOG.info("Rule file validation command: %s", command)
+            _, stdout, stderr = client.exec_command(command)
+            if stdout.channel.recv_exit_status() != 0:
+                error_byte = stderr.read()
+                error_str = error_byte.decode('utf-8')
+                LOG.error(
+                    "Rule file validation with promtool failed: %s",
+                    error_str)
+                raise sol_ex.PrometheusPluginError(
+                    "Rule file validation with promtool failed.")

     def get_access_info(self, pm_job):
         target_list = []
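
To illustrate what the new check catches, here is a minimal sketch of a rule
file that "promtool check rules" accepts; the group, alert, and expression
are illustrative. JSON works here because Prometheus rule files are YAML and
JSON is (essentially) a YAML subset, which is why the plugin's
<pm_job_id>.json files can be checked directly:

    {
      "groups": [{
        "name": "example",
        "rules": [{
          "alert": "HighErrorRate",
          "expr": "rate(http_requests_total{status=\"500\"}[5m]) > 0.1",
          "labels": {"severity": "critical"}
        }]
      }]
    }

A malformed expression (for example, an unbalanced parenthesis in expr) makes
promtool exit non-zero, which verify_rule() above turns into
PrometheusPluginError.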
@@ -579,12 +612,32 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
     def upload_rules(
             self, context, target_list, reload_list, rule_group, pm_job):
-        for info in target_list:
-            self._upload_rule(
-                rule_group, info['host'], info['port'], info['user'],
-                info['password'], info['path'], pm_job.id)
-        for uri in reload_list:
-            self.reload_prom_server(context, uri)
+        def _cleanup_error(target_list):
+            for target in target_list:
+                try:
+                    self._delete_rule(target['host'], target['port'],
+                                      target['user'], target['password'],
+                                      target['path'], pm_job.id)
+                except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                        paramiko.SSHException):
+                    pass
+
+        try:
+            for target in target_list:
+                self._upload_rule(
+                    rule_group, target['host'], target['port'],
+                    target['user'], target['password'], target['path'],
+                    pm_job.id)
+            for uri in reload_list:
+                self.reload_prom_server(context, uri)
+        except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                paramiko.SSHException) as e:
+            LOG.error("failed to upload rule files: %s", e.args[0])
+            _cleanup_error(target_list)
+            raise e
+        except Exception as e:
+            _cleanup_error(target_list)
+            raise e

     def get_vnf_instances(self, context, pm_job):
         object_instance_ids = list(set(pm_job.objectInstanceIds))

View File

@@ -60,4 +60,4 @@ class PrometheusPluginDriver():
         url = f'{ep}/vnflcm/v2/vnf_instances/{vnf_instance_id}/scale'
         resp, _ = self.client.do_request(
             url, "POST", context=context, body=scale_req, version="2.0.0")
-        LOG.info("AutoHealing request is processed: %d.", resp.status_code)
+        LOG.info("AutoScaling request is processed: %d.", resp.status_code)

View File

@@ -199,6 +199,7 @@ class VnfPmControllerV2(sol_wsgi.SolAPIController):
         try:
             self.plugin.create_job(context=context, pm_job=pm_job)
         except sol_ex.PrometheusPluginError as e:
+            LOG.error("Failed to create PM job: %s", e.args[0])
             raise sol_ex.PrometheusSettingFailed from e
         pm_job.create(context)