diff --git a/.zuul.yaml b/.zuul.yaml
index 150c3eddd..3349d3d08 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -586,9 +586,10 @@
               v2_vnfm:
                 kubernetes_vim_rsc_wait_timeout: 800
               prometheus_plugin:
-                fault_management: True
-                performance_management: True
-                auto_scaling: True
+                fault_management: true
+                performance_management: true
+                auto_scaling: true
+                test_rule_with_promtool: true
         tox_envlist: dsvm-functional-sol-kubernetes-v2
     vars:
       prometheus_setup: true
diff --git a/doc/source/user/prometheus_plugin_use_case_guide.rst b/doc/source/user/prometheus_plugin_use_case_guide.rst
index 3aa4fb29d..2f6a52b28 100644
--- a/doc/source/user/prometheus_plugin_use_case_guide.rst
+++ b/doc/source/user/prometheus_plugin_use_case_guide.rst
@@ -52,6 +52,9 @@ performance_management, fault_management or auto_scaling below.
    * - ``CONF.prometheus_plugin.auto_scaling``
      - false
      - Enable prometheus plugin autoscaling.
+   * - ``CONF.prometheus_plugin.test_rule_with_promtool``
+     - false
+     - Enable rule file validation using promtool.
 
 System
 ~~~~~~
@@ -241,6 +244,14 @@ needs to activate sshd.
 - The directory indicated by "rule_files" setting of prometheus
   server config should be accessible by SSH.
 
+Supported versions
+------------------
+
+Tacker Zed release
+
+- Prometheus: 2.37
+- Alertmanager: 0.24
+
 Alert rule registration
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/tacker/sol_refactored/common/config.py b/tacker/sol_refactored/common/config.py
index cd2be997e..456092ec5 100644
--- a/tacker/sol_refactored/common/config.py
+++ b/tacker/sol_refactored/common/config.py
@@ -161,6 +161,9 @@ PROMETHEUS_PLUGIN_OPTS = [
              'This configuration is changed in case of replacing '
              'the original function with a vendor specific '
              'function.')),
+    cfg.BoolOpt('test_rule_with_promtool',
+                default=False,
+                help=_('Enable rule file validation using promtool.')),
 ]
 
 CONF.register_opts(PROMETHEUS_PLUGIN_OPTS, 'prometheus_plugin')
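For operators, the resulting `tacker.conf` section would look something like the sketch below; all four options default to false, so promtool validation stays off unless explicitly enabled (the values shown are illustrative):

```ini
[prometheus_plugin]
# Options documented in prometheus_plugin_use_case_guide.rst;
# each defaults to false.
performance_management = true
fault_management = true
auto_scaling = true
# New option added by this change: validate generated rule files with
# promtool before the Prometheus server is reloaded.
test_rule_with_promtool = true
```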
diff --git a/tacker/sol_refactored/common/prometheus_plugin.py b/tacker/sol_refactored/common/prometheus_plugin.py
index 68e6bdfdc..f33093ac9 100644
--- a/tacker/sol_refactored/common/prometheus_plugin.py
+++ b/tacker/sol_refactored/common/prometheus_plugin.py
@@ -21,6 +21,7 @@
 import paramiko
 import re
 import tempfile
+from keystoneauth1 import exceptions as ks_exc
 from oslo_log import log as logging
 from oslo_utils import uuidutils
 from tacker.sol_refactored.api import prometheus_plugin_validator as validator
@@ -37,6 +38,7 @@ from tacker.sol_refactored import objects
 
 LOG = logging.getLogger(__name__)
+logging.getLogger("paramiko").setLevel(logging.WARNING)
 CONF = cfg.CONF
@@ -510,12 +512,23 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
 
     def delete_rules(self, context, pm_job):
         target_list, reload_list = self.get_access_info(pm_job)
-        for info in target_list:
-            self._delete_rule(
-                info['host'], info['port'], info['user'],
-                info['password'], info['path'], pm_job.id)
+        for target in target_list:
+            try:
+                self._delete_rule(
+                    target['host'], target['port'], target['user'],
+                    target['password'], target['path'], pm_job.id)
+            except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                    paramiko.SSHException):
+                # This exception is ignored: DELETE /pm_jobs/{id}
+                # should succeed even if _delete_rule() fails, e.g.
+                # because the rule file was already deleted.
+                pass
         for uri in reload_list:
-            self.reload_prom_server(context, uri)
+            try:
+                self.reload_prom_server(context, uri)
+            except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                    paramiko.SSHException):
+                pass
 
     def decompose_metrics(self, pm_job):
         if pm_job.objectType in {'Vnf', 'Vnfc'}:
@@ -528,9 +541,10 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
     def reload_prom_server(self, context, reload_uri):
         resp, _ = self.client.do_request(
             reload_uri, "PUT", context=context)
-        if resp.status_code != 202:
-            LOG.error("reloading request to prometheus is failed: %d.",
-                      resp.status_code)
+        if 400 <= resp.status_code < 600:
+            raise sol_ex.PrometheusPluginError(
+                "Reloading request to prometheus failed: "
+                f"{resp.status_code}.")
 
     def _upload_rule(self, rule_group, host, port, user,
                      password, path, pm_job_id):
@@ -544,6 +558,25 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
             client.connect(username=user, password=password)
             sftp = paramiko.SFTPClient.from_transport(client)
             sftp.put(filename, f'{path}/{pm_job_id}.json')
+        self.verify_rule(host, port, user, password, path, pm_job_id)
+
+    def verify_rule(self, host, port, user, password, path, pm_job_id):
+        if not CONF.prometheus_plugin.test_rule_with_promtool:
+            return
+        with paramiko.SSHClient() as client:
+            client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+            client.connect(host, port=port, username=user, password=password)
+            command = f"promtool check rules {path}/{pm_job_id}.json"
+            LOG.info("Rule file validation command: %s", command)
+            _, stdout, stderr = client.exec_command(command)
+            if stdout.channel.recv_exit_status() != 0:
+                error_byte = stderr.read()
+                error_str = error_byte.decode('utf-8')
+                LOG.error(
+                    "Rule file validation with promtool failed: %s",
+                    error_str)
+                raise sol_ex.PrometheusPluginError(
+                    "Rule file validation with promtool failed.")
 
     def get_access_info(self, pm_job):
         target_list = []
@@ -579,12 +612,32 @@ class PrometheusPluginPm(PrometheusPlugin, mon_base.MonitoringPlugin):
 
     def upload_rules(
            self, context, target_list, reload_list, rule_group, pm_job):
-        for info in target_list:
-            self._upload_rule(
-                rule_group, info['host'], info['port'], info['user'],
-                info['password'], info['path'], pm_job.id)
-        for uri in reload_list:
-            self.reload_prom_server(context, uri)
+        def _cleanup_error(target_list):
+            for target in target_list:
+                try:
+                    self._delete_rule(target['host'], target['port'],
+                        target['user'], target['password'], target['path'],
+                        pm_job.id)
+                except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                        paramiko.SSHException):
+                    pass
+
+        try:
+            for target in target_list:
+                self._upload_rule(
+                    rule_group, target['host'], target['port'],
+                    target['user'], target['password'], target['path'],
+                    pm_job.id)
+            for uri in reload_list:
+                self.reload_prom_server(context, uri)
+        except (sol_ex.PrometheusPluginError, ks_exc.ClientException,
+                paramiko.SSHException) as e:
+            LOG.error("Failed to upload rule files: %s", e.args[0])
+            _cleanup_error(target_list)
+            raise e
+        except Exception as e:
+            _cleanup_error(target_list)
+            raise e
 
     def get_vnf_instances(self, context, pm_job):
         object_instance_ids = list(set(pm_job.objectInstanceIds))
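The new verify_rule() runs `promtool check rules` on the uploaded file over SSH and raises PrometheusPluginError on a non-zero exit status. The same check can be reproduced locally; a minimal sketch, assuming promtool is on PATH (the rule file path is illustrative):

```python
# Local equivalent of the check verify_rule() runs over SSH: promtool
# exits non-zero when the rule file is invalid.
import subprocess


def check_rule_file(path):
    result = subprocess.run(
        ["promtool", "check", "rules", path],
        capture_output=True, text=True)
    if result.returncode != 0:
        # promtool reports the reason for the failure in its output.
        raise RuntimeError(
            f"rule file validation failed:\n{result.stdout}{result.stderr}")


check_rule_file("/etc/prometheus/rules/pm-job-example.json")  # illustrative
```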
diff --git a/tacker/sol_refactored/conductor/prometheus_plugin_driver.py b/tacker/sol_refactored/conductor/prometheus_plugin_driver.py
index 2688bf711..7a93480d1 100644
--- a/tacker/sol_refactored/conductor/prometheus_plugin_driver.py
+++ b/tacker/sol_refactored/conductor/prometheus_plugin_driver.py
@@ -60,4 +60,4 @@ class PrometheusPluginDriver():
         url = f'{ep}/vnflcm/v2/vnf_instances/{vnf_instance_id}/scale'
         resp, _ = self.client.do_request(
             url, "POST", context=context, body=scale_req, version="2.0.0")
-        LOG.info("AutoHealing request is processed: %d.", resp.status_code)
+        LOG.info("AutoScaling request is processed: %d.", resp.status_code)
diff --git a/tacker/sol_refactored/controller/vnfpm_v2.py b/tacker/sol_refactored/controller/vnfpm_v2.py
index f40d2f796..a25ca38b2 100644
--- a/tacker/sol_refactored/controller/vnfpm_v2.py
+++ b/tacker/sol_refactored/controller/vnfpm_v2.py
@@ -199,6 +199,7 @@ class VnfPmControllerV2(sol_wsgi.SolAPIController):
         try:
             self.plugin.create_job(context=context, pm_job=pm_job)
         except sol_ex.PrometheusPluginError as e:
+            LOG.error("Failed to create PM job: %s", e.args[0])
             raise sol_ex.PrometheusSettingFailed from e
         pm_job.create(context)
diff --git a/tacker/tests/functional/sol_kubernetes_v2/samples/tacker-monitoring-test.zip b/tacker/tests/functional/sol_kubernetes_v2/samples/tacker-monitoring-test.zip
index 94b4370c0..babbf5456 100644
Binary files a/tacker/tests/functional/sol_kubernetes_v2/samples/tacker-monitoring-test.zip and b/tacker/tests/functional/sol_kubernetes_v2/samples/tacker-monitoring-test.zip differ
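Note the shape of the reworked upload_rules() above: the upload is now all-or-nothing, deleting any rule files that were already pushed before re-raising the first error. A generic sketch of that pattern, with illustrative names rather than Tacker APIs:

```python
# All-or-nothing upload with best-effort rollback, as in upload_rules():
# if any push fails, remove whatever was already pushed, then re-raise.
def push_all(targets, push, cleanup):
    try:
        for target in targets:
            push(target)
    except Exception:
        for target in targets:
            try:
                cleanup(target)
            except Exception:
                pass  # rollback is best effort; keep the original error
        raise
```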