Add retry around Nagios Request

Added a retry loop around the Nagios request calls. Updated the code
based on review comments and feedback.

Change-Id: I24588c112e2b5ec954f857550bda7d78bdf6d03e
Chris Straut (cs4987) 2021-07-27 13:44:11 -05:00
parent 110fef8a2c
commit 5522c1856e
4 changed files with 218 additions and 137 deletions
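The retry added in each of the four scripts follows the same shape: re-issue the request on failure, up to max_retry attempts, before giving up and reporting the error. A minimal sketch of that pattern is shown below; the helper name, URL, and backoff value are illustrative assumptions and are not part of the change itself.

import time

import requests


def fetch_with_retry(url, timeout=10, max_retry=5, backoff_seconds=1):
    # Retry a GET on timeout, re-raising the last error once max_retry is hit.
    # Illustrative helper only; the scripts in this commit inline the loop instead.
    for attempt in range(1, max_retry + 1):
        try:
            return requests.get(url, timeout=timeout, verify=False)  # nosec
        except requests.exceptions.Timeout:
            if attempt == max_retry:
                raise
            print('Request timeout, Retrying - {}'.format(attempt))
            time.sleep(backoff_seconds)

Each script below inlines this loop around its own requests call (GET for the exporter, URL, and Prometheus checks, POST for the Nagios event) rather than sharing a helper.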

View File

@@ -74,13 +74,13 @@ def main():
for key, value in metrics.items():
if value == args.critical:
criticalMessages.append("Critical: {metric_name} metric is a critical value of {metric_value}({detail})".format(
metric_name=args.health_metric, metric_value=value, detail=key))
elif value == args.warning:
warningMessages.append("Warning: {metric_name} metric is a warning value of {metric_value}({detail})".format(
metric_name=args.health_metric, metric_value=value, detail=key))
else:
print("Unknown: Query response for {metric_name} has Null value({detail})".format(
metric_name=args.health_metric, detail=str(metrics)))
sys.exit(STATE_UNKNOWN)
if criticalMessages:
@@ -96,37 +96,50 @@ def main():
def query_exporter_metric(exporter_namespace, label_selector, metric_name):
exporter_endpoint = find_active_endpoint(exporter_namespace, label_selector)
exporter_endpoint = find_active_endpoint(
exporter_namespace, label_selector)
error_messages = []
metrics = dict()
try:
response = requests.get(include_schema(exporter_endpoint), verify=False) # nosec
line_item_metrics = re.findall(
"^{}.*".format(metric_name),
response.text,
re.MULTILINE)
for metric in line_item_metrics:
metric_with_labels, value = metric.split(" ")
metrics[metric_with_labels] = float(value)
except Exception as e:
error_messages.append(
"ERROR retrieving exporter endpoint {}".format(
str(e)))
max_retry = 5
retry = 1
while retry < max_retry:
try:
response = requests.get(include_schema(
exporter_endpoint), verify=False) # nosec
line_item_metrics = re.findall(
"^{}.*".format(metric_name),
response.text,
re.MULTILINE)
for metric in line_item_metrics:
metric_with_labels, value = metric.split(" ")
metrics[metric_with_labels] = float(value)
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR retrieving exporter endpoint {}".format(
str(e)))
return metrics, error_messages
def get_kubernetes_api():
kubernetes.config.load_incluster_config()
api = kubernetes.client.CoreV1Api()
return api
def get_kubernetes_endpoints(namespace, label_selector):
kube_api = get_kubernetes_api()
try:
endpoint_list = kube_api.list_namespaced_endpoints(namespace=namespace, label_selector=label_selector)
endpoint_list = kube_api.list_namespaced_endpoints(
namespace=namespace, label_selector=label_selector)
except ApiException as e:
print("Exception when calling CoreV1Api->list_namespaced_endpoints: %s\n" % e)
return endpoint_list.items
def get_endpoint_metric_port(endpoint):
ports = endpoint.ports
for port in ports:
@@ -135,6 +148,7 @@ def get_endpoint_metric_port(endpoint):
print("No metrics ports exposed on {} endpoint".format(endpoint))
sys.exit(STATE_CRITICAL)
def get_kubernetes_endpoint_addresses(endpoints):
addresses = []
for endpoint in endpoints:
@@ -144,6 +158,7 @@ def get_kubernetes_endpoint_addresses(endpoints):
addresses.append("{}:{}/metrics".format(address.ip, port))
return addresses
def find_active_endpoint(namespace, label_selector):
kube_api = get_kubernetes_api()
exporter_endpoints = get_kubernetes_endpoints(namespace, label_selector)
@@ -152,9 +167,11 @@ def find_active_endpoint(namespace, label_selector):
response = requests.get(include_schema(address), verify=False) # nosec
if response.text:
return address
print("No active exporters in {} namespace with selectors {} found!".format(namespace, label_selector))
print("No active exporters in {} namespace with selectors {} found!".format(
namespace, label_selector))
sys.exit(STATE_CRITICAL)
def include_schema(endpoint):
if endpoint.startswith("http://") or endpoint.startswith("https://"):
return endpoint

View File

@@ -77,6 +77,7 @@ def main():
timeout_seconds = 10
warning_seconds = timeout_seconds
critical_seconds = timeout_seconds
max_retry = 5
if args.warning_response_seconds:
warning_seconds = int(args.warning_response_seconds)
@@ -108,50 +109,59 @@ def main():
proxies["https"] = args.https_proxy
parsed = urlparse(args.url)
replaced = parsed._replace(netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
replaced = parsed._replace(
netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
screened_url = replaced.geturl()
try:
response = requests.get(
include_schema(
args.url),
proxies=proxies,
timeout=timeout_seconds,
verify=False) # nosec
retry = 1
response_seconds = response.elapsed.total_seconds()
response_time = "[RT={:.4f}]".format(response_seconds)
while retry < max_retry:
try:
response = requests.get(
include_schema(
args.url),
proxies=proxies,
timeout=timeout_seconds,
verify=False) # nosec
if response.status_code not in expected_response_codes:
print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
screened_url, expected_response_codes, response.status_code, response_time))
response_seconds = response.elapsed.total_seconds()
response_time = "[RT={:.4f}]".format(response_seconds)
if response_seconds >= warning_seconds and response_seconds < critical_seconds:
print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
screened_url, response_seconds, warning_seconds, response_time))
sys.exit(STATE_WARNING)
if response.status_code not in expected_response_codes:
print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
screened_url, expected_response_codes, response.status_code, response_time))
sys.exit(STATE_CRITICAL)
if response_seconds >= critical_seconds:
print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
screened_url, response_seconds, critical_seconds, response_time))
sys.exit(STATE_CRITICAL)
print("OK: URL {} returned response code {}. {}".format(
screened_url, response.status_code, response_time))
sys.exit(STATE_OK)
except requests.exceptions.Timeout:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
else:
print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
timeout_seconds, screened_url))
sys.exit(STATE_CRITICAL)
except Exception as e:
print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
screened_url, e))
sys.exit(STATE_CRITICAL)
if response_seconds >= warning_seconds and response_seconds < critical_seconds:
print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
screened_url, response_seconds, warning_seconds, response_time))
sys.exit(STATE_WARNING)
if response_seconds >= critical_seconds:
print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
screened_url, response_seconds, critical_seconds, response_time))
sys.exit(STATE_CRITICAL)
print("OK: URL {} returned response code {}. {}".format(
screened_url, response.status_code, response_time))
sys.exit(STATE_OK)
except requests.exceptions.Timeout:
print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
timeout_seconds, screened_url))
sys.exit(STATE_CRITICAL)
except Exception as e:
print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
screened_url, e))
sys.exit(STATE_CRITICAL)
sys.exit(STATE_OK)
def include_schema(api):
if api.startswith(

View File

@@ -96,10 +96,10 @@ def main():
severity = metric['metric']['severity']
message = args.msg_format.format(**metric['metric'])
if alertstate == 'firing':
if severity == 'page':
firingScalarMessages_critical.append(message)
if severity == 'warning':
firingScalarMessages_warning.append(message)
if firingScalarMessages_critical:
print(",".join(firingScalarMessages_critical))
@@ -134,34 +134,54 @@ def main():
def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
error_messages = []
response_json = dict()
try:
promql = 'ALERTS{alertname="' + alertname + '"'
if labels_csv:
promql = promql + "," + labels_csv
promql = promql + "}"
query = {'query': promql}
kwargs = {
'params': query,
'timeout': timeout
}
cacert = os.getenv('CA_CERT_PATH', "")
if cacert:
kwargs['verify'] = cacert
max_retry = 5
retry = 1
while retry < max_retry:
try:
promql = 'ALERTS{alertname="' + alertname + '"'
if labels_csv:
promql = promql + "," + labels_csv
promql = promql + "}"
query = {'query': promql}
kwargs = {
'params': query,
'timeout': timeout
}
cacert = os.getenv('CA_CERT_PATH', "")
if cacert:
kwargs['verify'] = cacert
response = requests.get(include_schema(prometheus_api) + "/api/v1/query", **kwargs)
response_json = response.json()
except requests.exceptions.Timeout:
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api) , timeout))
except requests.exceptions.ConnectionError:
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
response = requests.get(include_schema(
prometheus_api) + "/api/v1/query", **kwargs)
response_json = response.json()
except requests.exceptions.Timeout:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
return response_json, error_messages
@@ -169,40 +189,58 @@ def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
def check_prom_metrics_available(prometheus_api, metrics, labels_csv, timeout):
error_messages = []
metrics_available = False
try:
metrics_with_query = []
for metric in metrics:
if labels_csv:
metrics_with_query.append(
"absent({metric}{{{labels}}})".format(
metric=metric, labels=labels_csv))
else:
metrics_with_query.append(
"absent({metric})".format(metric=metric))
promql = " OR ".join(metrics_with_query)
query = {'query': promql}
response = requests.get(
include_schema(prometheus_api) +
"/api/v1/query",
params=query, timeout=timeout)
response_json = response.json()
if response_json['data']['result']:
if response_json['data']['result'][0]['value'][1] == "1":
metrics_available = False
else:
metrics_available = True
except requests.exceptions.Timeout:
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
max_retry = 5
retry = 1
while retry < max_retry:
try:
metrics_with_query = []
for metric in metrics:
if labels_csv:
metrics_with_query.append(
"absent({metric}{{{labels}}})".format(
metric=metric, labels=labels_csv))
else:
metrics_with_query.append(
"absent({metric})".format(metric=metric))
promql = " OR ".join(metrics_with_query)
query = {'query': promql}
response = requests.get(
include_schema(prometheus_api) +
"/api/v1/query",
params=query, timeout=timeout)
response_json = response.json()
if response_json['data']['result']:
if response_json['data']['result'][0]['value'][1] == "1":
metrics_available = False
else:
metrics_available = True
except requests.exceptions.Timeout:
if retry < max_retry:
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
except requests.exceptions.ConnectionError:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
except requests.exceptions.RequestException:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
error_messages.append(
"ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
return metrics_available, error_messages
@@ -214,12 +252,14 @@ def include_schema(prometheus_api):
else:
return "http://{}".format(prometheus_api)
def clean_api_address(prometheus_api):
try:
match = re.match(r'(http(s?):\/\/(.[^:@]*):)(.[^@]*)', prometheus_api)
return re.sub(match.group(4), 'REDACTED', prometheus_api)
except:
return prometheus_api
def get_label_names(s):
d = {}

View File

@@ -32,7 +32,7 @@
# --state-id 2
# --output 'nova-compute stop/waiting'
# --monitoring-hostname 'nagioshost.x.y.com'
# sends HTTP POST with following payload:
# "SvcEvent":{
# "SvcHostname":"hostwithevent.y.x.com",
# "SvcDesc":"Service_nova-compute",
@@ -103,6 +103,8 @@ parser.add_argument(
args = parser.parse_args()
payload = {}
max_retry = 5
retry = 1
if args.type == 'host':
payload['HostEvent'] = {
@@ -123,23 +125,35 @@ elif args.type == 'service':
'MonitoringHostName': args.monitoring_hostname
}
try:
requests.post(
args.primary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
except Exception as e:
pass
if args.secondary_url:
while retry < max_retry:
try:
requests.post(
args.secondary_url,
args.primary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
pass
if args.secondary_url:
retry = 1
while retry < max_retry:
try:
requests.post(
args.secondary_url,
data=json.dumps(payload),
timeout=args.timeout,
verify=False)
break
except Exception as e:
if retry < max_retry:
print('Request timeout, Retrying - {}'.format(retry))
retry += 1
continue
pass
sys.exit(0)