Add retry around Nagios Request

Added a retry loop around the Nagios request calls. Updated the code
based on review comments and feedback.

Change-Id: I24588c112e2b5ec954f857550bda7d78bdf6d03e
Chris Straut (cs4987) 2021-07-27 13:44:11 -05:00
parent 110fef8a2c
commit 5522c1856e
4 changed files with 218 additions and 137 deletions
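The retry added in each of the four scripts follows the same shape: re-issue the request on failure, up to max_retry attempts, before giving up and reporting the error. A minimal sketch of that pattern is shown below; the helper name, URL, and backoff value are illustrative assumptions and are not part of the change itself.

import time

import requests


def fetch_with_retry(url, timeout=10, max_retry=5, backoff_seconds=1):
    # Retry a GET on timeout, re-raising the last error once max_retry is hit.
    # Illustrative helper only; the scripts in this commit inline the loop instead.
    for attempt in range(1, max_retry + 1):
        try:
            return requests.get(url, timeout=timeout, verify=False)  # nosec
        except requests.exceptions.Timeout:
            if attempt == max_retry:
                raise
            print('Request timeout, Retrying - {}'.format(attempt))
            time.sleep(backoff_seconds)

Each script below inlines this loop around its own requests call (GET for the exporter, URL, and Prometheus checks, POST for the Nagios event) rather than sharing a helper.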

View File

@@ -74,13 +74,13 @@ def main():
for key, value in metrics.items():
if value == args.critical:
criticalMessages.append("Critical: {metric_name} metric is a critical value of {metric_value}({detail})".format(
metric_name=args.health_metric, metric_value=value, detail=key))
elif value == args.warning:
warningMessages.append("Warning: {metric_name} metric is a warning value of {metric_value}({detail})".format(
metric_name=args.health_metric, metric_value=value, detail=key))
else:
print("Unknown: Query response for {metric_name} has Null value({detail})".format(
metric_name=args.health_metric, detail=str(metrics)))
sys.exit(STATE_UNKNOWN)
if criticalMessages:
@@ -96,37 +96,50 @@ def main():
def query_exporter_metric(exporter_namespace, label_selector, metric_name):
exporter_endpoint = find_active_endpoint(exporter_namespace, label_selector)
exporter_endpoint = find_active_endpoint(
exporter_namespace, label_selector)
error_messages = []
metrics = dict()
try:
response = requests.get(include_schema(exporter_endpoint), verify=False) # nosec
line_item_metrics = re.findall(
"^{}.*".format(metric_name),
response.text,
re.MULTILINE)
for metric in line_item_metrics:
metric_with_labels, value = metric.split(" ")
metrics[metric_with_labels] = float(value)
except Exception as e:
error_messages.append(
"ERROR retrieving exporter endpoint {}".format(
str(e)))
max_retry = 5
retry = 1
while retry < max_retry:
try:
response = requests.get(include_schema(
exporter_endpoint), verify=False) # nosec
line_item_metrics = re.findall(
"^{}.*".format(metric_name),
response.text,
re.MULTILINE)
for metric in line_item_metrics:
metric_with_labels, value = metric.split(" ")
metrics[metric_with_labels] = float(value)
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR retrieving exporter endpoint {}".format(
str(e)))
return metrics, error_messages
def get_kubernetes_api():
kubernetes.config.load_incluster_config()
api = kubernetes.client.CoreV1Api()
return api
def get_kubernetes_endpoints(namespace, label_selector):
kube_api = get_kubernetes_api()
try:
endpoint_list = kube_api.list_namespaced_endpoints(namespace=namespace, label_selector=label_selector)
endpoint_list = kube_api.list_namespaced_endpoints(
namespace=namespace, label_selector=label_selector)
except ApiException as e:
print("Exception when calling CoreV1Api->list_namespaced_endpoints: %s\n" % e)
return endpoint_list.items
def get_endpoint_metric_port(endpoint):
ports = endpoint.ports
for port in ports:
@@ -135,6 +148,7 @@ def get_endpoint_metric_port(endpoint):
print("No metrics ports exposed on {} endpoint".format(endpoint))
sys.exit(STATE_CRITICAL)
def get_kubernetes_endpoint_addresses(endpoints):
addresses = []
for endpoint in endpoints:
@@ -144,6 +158,7 @@ def get_kubernetes_endpoint_addresses(endpoints):
addresses.append("{}:{}/metrics".format(address.ip, port))
return addresses
def find_active_endpoint(namespace, label_selector):
kube_api = get_kubernetes_api()
exporter_endpoints = get_kubernetes_endpoints(namespace, label_selector)
@@ -152,9 +167,11 @@ def find_active_endpoint(namespace, label_selector):
response = requests.get(include_schema(address), verify=False) # nosec
if response.text:
return address
print("No active exporters in {} namespace with selectors {} found!".format(namespace, label_selector))
print("No active exporters in {} namespace with selectors {} found!".format(
namespace, label_selector))
sys.exit(STATE_CRITICAL)
def include_schema(endpoint):
if endpoint.startswith("http://") or endpoint.startswith("https://"):
return endpoint

View File

@@ -77,6 +77,7 @@ def main():
timeout_seconds = 10
warning_seconds = timeout_seconds
critical_seconds = timeout_seconds
max_retry = 5
if args.warning_response_seconds:
warning_seconds = int(args.warning_response_seconds)
@@ -108,50 +109,59 @@ def main():
proxies["https"] = args.https_proxy
parsed = urlparse(args.url)
replaced = parsed._replace(netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
replaced = parsed._replace(
netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
screened_url = replaced.geturl()
try:
response = requests.get(
include_schema(
args.url),
proxies=proxies,
timeout=timeout_seconds,
verify=False) # nosec
retry = 1
response_seconds = response.elapsed.total_seconds()
response_time = "[RT={:.4f}]".format(response_seconds)
while retry < max_retry:
try:
response = requests.get(
include_schema(
args.url),
proxies=proxies,
timeout=timeout_seconds,
verify=False) # nosec
if response.status_code not in expected_response_codes:
print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
screened_url, expected_response_codes, response.status_code, response_time))
response_seconds = response.elapsed.total_seconds()
response_time = "[RT={:.4f}]".format(response_seconds)
if response_seconds >= warning_seconds and response_seconds < critical_seconds:
print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
screened_url, response_seconds, warning_seconds, response_time))
sys.exit(STATE_WARNING)
if response.status_code not in expected_response_codes:
print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
screened_url, expected_response_codes, response.status_code, response_time))
sys.exit(STATE_CRITICAL)
if response_seconds >= critical_seconds:
print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
screened_url, response_seconds, critical_seconds, response_time))
sys.exit(STATE_CRITICAL)
print("OK: URL {} returned response code {}. {}".format(
screened_url, response.status_code, response_time))
sys.exit(STATE_OK)
except requests.exceptions.Timeout:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
else:
print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
timeout_seconds, screened_url))
sys.exit(STATE_CRITICAL)
except Exception as e:
print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
screened_url, e))
sys.exit(STATE_CRITICAL)
if response_seconds >= warning_seconds and response_seconds < critical_seconds:
print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
screened_url, response_seconds, warning_seconds, response_time))
sys.exit(STATE_WARNING)
if response_seconds >= critical_seconds:
print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
screened_url, response_seconds, critical_seconds, response_time))
sys.exit(STATE_CRITICAL)
print("OK: URL {} returned response code {}. {}".format(
screened_url, response.status_code, response_time))
sys.exit(STATE_OK)
except requests.exceptions.Timeout:
print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
timeout_seconds, screened_url))
sys.exit(STATE_CRITICAL)
except Exception as e:
print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
screened_url, e))
sys.exit(STATE_CRITICAL)
sys.exit(STATE_OK)
def include_schema(api):
if api.startswith(

View File

@@ -96,10 +96,10 @@ def main():
severity = metric['metric']['severity']
message = args.msg_format.format(**metric['metric'])
if alertstate == 'firing':
if severity == 'page':
firingScalarMessages_critical.append(message)
if severity == 'warning':
firingScalarMessages_warning.append(message)
if firingScalarMessages_critical:
print(",".join(firingScalarMessages_critical))
@@ -134,34 +134,54 @@ def main():
def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
error_messages = []
response_json = dict()
try:
promql = 'ALERTS{alertname="' + alertname + '"'
if labels_csv:
promql = promql + "," + labels_csv
promql = promql + "}"
query = {'query': promql}
kwargs = {
'params': query,
'timeout': timeout
}
cacert = os.getenv('CA_CERT_PATH', "")
if cacert:
kwargs['verify'] = cacert
max_retry = 5
retry = 1
while retry < max_retry:
try:
promql = 'ALERTS{alertname="' + alertname + '"'
if labels_csv:
promql = promql + "," + labels_csv
promql = promql + "}"
query = {'query': promql}
kwargs = {
'params': query,
'timeout': timeout
}
cacert = os.getenv('CA_CERT_PATH', "")
if cacert:
kwargs['verify'] = cacert
response = requests.get(include_schema(prometheus_api) + "/api/v1/query", **kwargs)
response_json = response.json()
except requests.exceptions.Timeout:
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api) , timeout))
except requests.exceptions.ConnectionError:
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
response = requests.get(include_schema(
prometheus_api) + "/api/v1/query", **kwargs)
response_json = response.json()
except requests.exceptions.Timeout:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
return response_json, error_messages
@@ -169,40 +189,58 @@ def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
def check_prom_metrics_available(prometheus_api, metrics, labels_csv, timeout):
error_messages = []
metrics_available = False
try:
metrics_with_query = []
for metric in metrics:
if labels_csv:
metrics_with_query.append(
"absent({metric}{{{labels}}})".format(
metric=metric, labels=labels_csv))
else:
metrics_with_query.append(
"absent({metric})".format(metric=metric))
promql = " OR ".join(metrics_with_query)
query = {'query': promql}
response = requests.get(
include_schema(prometheus_api) +
"/api/v1/query",
params=query, timeout=timeout)
response_json = response.json()
if response_json['data']['result']:
if response_json['data']['result'][0]['value'][1] == "1":
metrics_available = False
else:
metrics_available = True
except requests.exceptions.Timeout:
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
max_retry = 5
retry = 1
while retry < max_retry:
try:
metrics_with_query = []
for metric in metrics:
if labels_csv:
metrics_with_query.append(
"absent({metric}{{{labels}}})".format(
metric=metric, labels=labels_csv))
else:
metrics_with_query.append(
"absent({metric})".format(metric=metric))
promql = " OR ".join(metrics_with_query)
query = {'query': promql}
response = requests.get(
include_schema(prometheus_api) +
"/api/v1/query",
params=query, timeout=timeout)
response_json = response.json()
if response_json['data']['result']:
if response_json['data']['result'][0]['value'][1] == "1":
metrics_available = False
else:
metrics_available = True
except requests.exceptions.Timeout:
if retry < max_retry:
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
return metrics_available, error_messages
@@ -214,12 +252,14 @@ def include_schema(prometheus_api):
else:
return "http://{}".format(prometheus_api)
def clean_api_address(prometheus_api):
try:
match = re.match(r'(http(s?):\/\/(.[^:@]*):)(.[^@]*)', prometheus_api)
return re.sub(match.group(4), 'REDACTED', prometheus_api)
except:
return prometheus_api
def get_label_names(s):
d = {}

View File

@@ -32,7 +32,7 @@
# --state-id 2
# --output 'nova-compute stop/waiting'
# --monitoring-hostname 'nagioshost.x.y.com'
# sends HTTP POST with following payload:
# "SvcEvent":{
# "SvcHostname":"hostwithevent.y.x.com",
# "SvcDesc":"Service_nova-compute",
@@ -103,6 +103,8 @@ parser.add_argument(
args = parser.parse_args()
payload = {}
max_retry = 5
retry = 1
if args.type == 'host':
payload['HostEvent'] = {
@@ -123,23 +125,35 @@ elif args.type == 'service':
'MonitoringHostName': args.monitoring_hostname
}
try:
requests.post(
args.primary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
except Exception as e:
pass
if args.secondary_url:
while retry < max_retry:
try:
requests.post(
args.secondary_url,
args.primary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
pass
if args.secondary_url:
retry = 1
while retry < max_retry:
try:
requests.post(
args.secondary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
break
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
pass
sys.exit(0)