Add retry around Nagios Request
Added a retry around the Nagios request commands. Updated the code based on comments and feedback.

Change-Id: I24588c112e2b5ec954f857550bda7d78bdf6d03e
commit 5522c1856e
parent 110fef8a2c
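The change applies the same technique in every plugin: a bounded retry loop wrapped around each requests call. As a minimal, self-contained sketch of that pattern (the wrapper name and parameters below are illustrative, not part of the patch; this sketch returns on success, which is what ends the loop):

import requests


def get_with_retry(url, timeout=10, max_retry=5):
    # Illustrative helper, not from the patch: attempt the GET a bounded
    # number of times, re-raising the last Timeout once retries are spent.
    for attempt in range(1, max_retry + 1):
        try:
            # Returning on success terminates the loop immediately.
            return requests.get(url, timeout=timeout, verify=False)  # nosec
        except requests.exceptions.Timeout:
            if attempt == max_retry:
                raise
            print('Request timeout, Retrying - {}'.format(attempt))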
@@ -74,13 +74,13 @@ def main():
     for key, value in metrics.items():
         if value == args.critical:
             criticalMessages.append("Critical: {metric_name} metric is a critical value of {metric_value}({detail})".format(
-                metric_name=args.health_metric, metric_value=value, detail=key))
+                metric_name=args.health_metric, metric_value=value, detail=key))
         elif value == args.warning:
             warningMessages.append("Warning: {metric_name} metric is a warning value of {metric_value}({detail})".format(
-                metric_name=args.health_metric, metric_value=value, detail=key))
+                metric_name=args.health_metric, metric_value=value, detail=key))
         else:
             print("Unknown: Query response for {metric_name} has Null value({detail})".format(
-                metric_name=args.health_metric, detail=str(metrics)))
+                metric_name=args.health_metric, detail=str(metrics)))
             sys.exit(STATE_UNKNOWN)
 
     if criticalMessages:
@@ -96,37 +96,50 @@ def main():
 
 
 def query_exporter_metric(exporter_namespace, label_selector, metric_name):
-    exporter_endpoint = find_active_endpoint(exporter_namespace, label_selector)
+    exporter_endpoint = find_active_endpoint(
+        exporter_namespace, label_selector)
     error_messages = []
     metrics = dict()
-    try:
-        response = requests.get(include_schema(exporter_endpoint), verify=False)  # nosec
-        line_item_metrics = re.findall(
-            "^{}.*".format(metric_name),
-            response.text,
-            re.MULTILINE)
-        for metric in line_item_metrics:
-            metric_with_labels, value = metric.split(" ")
-            metrics[metric_with_labels] = float(value)
-    except Exception as e:
-        error_messages.append(
-            "ERROR retrieving exporter endpoint {}".format(
-                str(e)))
+    max_retry = 5
+    retry = 1
+    while retry < max_retry:
+        try:
+            response = requests.get(include_schema(
+                exporter_endpoint), verify=False)  # nosec
+            line_item_metrics = re.findall(
+                "^{}.*".format(metric_name),
+                response.text,
+                re.MULTILINE)
+            for metric in line_item_metrics:
+                metric_with_labels, value = metric.split(" ")
+                metrics[metric_with_labels] = float(value)
+        except Exception as e:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR retrieving exporter endpoint {}".format(
+                    str(e)))
     return metrics, error_messages
 
 
 def get_kubernetes_api():
     kubernetes.config.load_incluster_config()
     api = kubernetes.client.CoreV1Api()
     return api
 
 
 def get_kubernetes_endpoints(namespace, label_selector):
     kube_api = get_kubernetes_api()
     try:
-        endpoint_list = kube_api.list_namespaced_endpoints(namespace=namespace, label_selector=label_selector)
+        endpoint_list = kube_api.list_namespaced_endpoints(
+            namespace=namespace, label_selector=label_selector)
     except ApiException as e:
         print("Exception when calling CoreV1Api->list_namespaced_endpoints: %s\n" % e)
     return endpoint_list.items
 
 
 def get_endpoint_metric_port(endpoint):
     ports = endpoint.ports
     for port in ports:
@@ -135,6 +148,7 @@ def get_endpoint_metric_port(endpoint):
     print("No metrics ports exposed on {} endpoint".format(endpoint))
     sys.exit(STATE_CRITICAL)
 
+
 def get_kubernetes_endpoint_addresses(endpoints):
     addresses = []
     for endpoint in endpoints:
@@ -144,6 +158,7 @@ def get_kubernetes_endpoint_addresses(endpoints):
             addresses.append("{}:{}/metrics".format(address.ip, port))
     return addresses
 
+
 def find_active_endpoint(namespace, label_selector):
     kube_api = get_kubernetes_api()
     exporter_endpoints = get_kubernetes_endpoints(namespace, label_selector)
@@ -152,9 +167,11 @@ def find_active_endpoint(namespace, label_selector):
         response = requests.get(include_schema(address), verify=False)  # nosec
         if response.text:
             return address
-    print("No active exporters in {} namespace with selectors {} found!".format(namespace, label_selector))
+    print("No active exporters in {} namespace with selectors {} found!".format(
+        namespace, label_selector))
     sys.exit(STATE_CRITICAL)
 
+
 def include_schema(endpoint):
     if endpoint.startswith("http://") or endpoint.startswith("https://"):
         return endpoint
@@ -77,6 +77,7 @@ def main():
     timeout_seconds = 10
     warning_seconds = timeout_seconds
     critical_seconds = timeout_seconds
+    max_retry = 5
 
     if args.warning_response_seconds:
         warning_seconds = int(args.warning_response_seconds)
@@ -108,50 +109,59 @@ def main():
         proxies["https"] = args.https_proxy
 
     parsed = urlparse(args.url)
-    replaced = parsed._replace(netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
+    replaced = parsed._replace(
+        netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
     screened_url = replaced.geturl()
 
-    try:
-        response = requests.get(
-            include_schema(
-                args.url),
-            proxies=proxies,
-            timeout=timeout_seconds,
-            verify=False)  # nosec
-
-        response_seconds = response.elapsed.total_seconds()
-        response_time = "[RT={:.4f}]".format(response_seconds)
-
-        if response.status_code not in expected_response_codes:
-            print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
-                screened_url, expected_response_codes, response.status_code, response_time))
-            sys.exit(STATE_CRITICAL)
-
-        if response_seconds >= warning_seconds and response_seconds < critical_seconds:
-            print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
-                screened_url, response_seconds, warning_seconds, response_time))
-            sys.exit(STATE_WARNING)
-
-        if response_seconds >= critical_seconds:
-            print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
-                screened_url, response_seconds, critical_seconds, response_time))
-            sys.exit(STATE_CRITICAL)
-
-        print("OK: URL {} returned response code {}. {}".format(
-            screened_url, response.status_code, response_time))
-        sys.exit(STATE_OK)
-
-    except requests.exceptions.Timeout:
-        print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
-            timeout_seconds, screened_url))
-        sys.exit(STATE_CRITICAL)
-    except Exception as e:
-        print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
-            screened_url, e))
-        sys.exit(STATE_CRITICAL)
+    retry = 1
+    while retry < max_retry:
+        try:
+            response = requests.get(
+                include_schema(
+                    args.url),
+                proxies=proxies,
+                timeout=timeout_seconds,
+                verify=False)  # nosec
+
+            response_seconds = response.elapsed.total_seconds()
+            response_time = "[RT={:.4f}]".format(response_seconds)
+
+            if response_seconds >= warning_seconds and response_seconds < critical_seconds:
+                print("WARNING: using URL {} response seconds {} is more than warning threshold {} seconds. {}".format(
+                    screened_url, response_seconds, warning_seconds, response_time))
+                sys.exit(STATE_WARNING)
+
+            if response.status_code not in expected_response_codes:
+                print("CRITICAL: using URL {} expected HTTP status codes {} but got {}. {}".format(
+                    screened_url, expected_response_codes, response.status_code, response_time))
+                sys.exit(STATE_CRITICAL)
+
+            if response_seconds >= critical_seconds:
+                print("CRITICAL: using URL {} response seconds {} is more than critical threshold {} seconds. {}".format(
+                    screened_url, response_seconds, critical_seconds, response_time))
+                sys.exit(STATE_CRITICAL)
+
+            print("OK: URL {} returned response code {}. {}".format(
+                screened_url, response.status_code, response_time))
+            sys.exit(STATE_OK)
+
+        except requests.exceptions.Timeout:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            else:
+                print("CRITICAL: Timeout in {} seconds to fetch from URL {}".format(
+                    timeout_seconds, screened_url))
+                sys.exit(STATE_CRITICAL)
+        except Exception as e:
+            print("CRITICAL: Failed to fetch from URL {} with reason {}".format(
+                screened_url, e))
+            sys.exit(STATE_CRITICAL)
+
+    sys.exit(STATE_OK)
 
 
 def include_schema(api):
     if api.startswith(
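The screened_url handling in the hunk above masks the URL password before it is printed. A standalone illustration of that urlparse/_replace trick (Python 3 import shown; the example URL is made up):

from urllib.parse import urlparse

url = "https://monitor:s3cret@nagios.example.com/status"  # hypothetical URL
parsed = urlparse(url)
# Swap the real password for "???" in the netloc before logging.
replaced = parsed._replace(
    netloc="{}:{}@{}".format(parsed.username, "???", parsed.hostname))
print(replaced.geturl())  # https://monitor:???@nagios.example.com/status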
@@ -96,10 +96,10 @@ def main():
             severity = metric['metric']['severity']
             message = args.msg_format.format(**metric['metric'])
             if alertstate == 'firing':
-                if severity == 'page':
-                    firingScalarMessages_critical.append(message)
-                if severity == 'warning':
-                    firingScalarMessages_warning.append(message)
+                if severity == 'page':
+                    firingScalarMessages_critical.append(message)
+                if severity == 'warning':
+                    firingScalarMessages_warning.append(message)
 
     if firingScalarMessages_critical:
         print(",".join(firingScalarMessages_critical))
@@ -134,34 +134,54 @@ def main():
 def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
     error_messages = []
     response_json = dict()
-    try:
-        promql = 'ALERTS{alertname="' + alertname + '"'
-        if labels_csv:
-            promql = promql + "," + labels_csv
-        promql = promql + "}"
-        query = {'query': promql}
-        kwargs = {
-            'params': query,
-            'timeout': timeout
-        }
-        cacert = os.getenv('CA_CERT_PATH', "")
-        if cacert:
-            kwargs['verify'] = cacert
-
-        response = requests.get(include_schema(prometheus_api) + "/api/v1/query", **kwargs)
-        response_json = response.json()
-    except requests.exceptions.Timeout:
-        error_messages.append(
-            "ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
-    except requests.exceptions.ConnectionError:
-        error_messages.append(
-            "ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
-    except requests.exceptions.RequestException:
-        error_messages.append(
-            "ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
-    except Exception as e:
-        error_messages.append(
-            "ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
+    max_retry = 5
+    retry = 1
+    while retry < max_retry:
+        try:
+            promql = 'ALERTS{alertname="' + alertname + '"'
+            if labels_csv:
+                promql = promql + "," + labels_csv
+            promql = promql + "}"
+            query = {'query': promql}
+            kwargs = {
+                'params': query,
+                'timeout': timeout
+            }
+            cacert = os.getenv('CA_CERT_PATH', "")
+            if cacert:
+                kwargs['verify'] = cacert
+
+            response = requests.get(include_schema(
+                prometheus_api) + "/api/v1/query", **kwargs)
+            response_json = response.json()
+        except requests.exceptions.Timeout:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
+        except requests.exceptions.ConnectionError:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
+        except requests.exceptions.RequestException:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
+        except Exception as e:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
 
     return response_json, error_messages
@@ -169,40 +189,58 @@ def query_prometheus(prometheus_api, alertname, labels_csv, timeout):
 def check_prom_metrics_available(prometheus_api, metrics, labels_csv, timeout):
     error_messages = []
     metrics_available = False
-    try:
-        metrics_with_query = []
-        for metric in metrics:
-            if labels_csv:
-                metrics_with_query.append(
-                    "absent({metric}{{{labels}}})".format(
-                        metric=metric, labels=labels_csv))
-            else:
-                metrics_with_query.append(
-                    "absent({metric})".format(metric=metric))
-        promql = " OR ".join(metrics_with_query)
-        query = {'query': promql}
-        response = requests.get(
-            include_schema(prometheus_api) +
-            "/api/v1/query",
-            params=query, timeout=timeout)
-        response_json = response.json()
-        if response_json['data']['result']:
-            if response_json['data']['result'][0]['value'][1] == "1":
-                metrics_available = False
-            else:
-                metrics_available = True
-    except requests.exceptions.Timeout:
-        error_messages.append(
-            "ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
-    except requests.exceptions.ConnectionError:
-        error_messages.append(
-            "ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
-    except requests.exceptions.RequestException:
-        error_messages.append(
-            "ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
-    except Exception as e:
-        error_messages.append(
-            "ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
+    max_retry = 5
+    retry = 1
+    while retry < max_retry:
+        try:
+            metrics_with_query = []
+            for metric in metrics:
+                if labels_csv:
+                    metrics_with_query.append(
+                        "absent({metric}{{{labels}}})".format(
+                            metric=metric, labels=labels_csv))
+                else:
+                    metrics_with_query.append(
+                        "absent({metric})".format(metric=metric))
+            promql = " OR ".join(metrics_with_query)
+            query = {'query': promql}
+            response = requests.get(
+                include_schema(prometheus_api) +
+                "/api/v1/query",
+                params=query, timeout=timeout)
+            response_json = response.json()
+            if response_json['data']['result']:
+                if response_json['data']['result'][0]['value'][1] == "1":
+                    metrics_available = False
+                else:
+                    metrics_available = True
+        except requests.exceptions.Timeout:
+            if retry < max_retry:
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api connection timed out, using URL {}, the maximum timeout value is {} seconds".format(clean_api_address(prometheus_api), timeout))
+        except requests.exceptions.ConnectionError:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api cannot be connected[connection refused], using URL {}".format(clean_api_address(prometheus_api)))
+        except requests.exceptions.RequestException:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR: Prometheus api connection failed, using URL {}".format(clean_api_address(prometheus_api)))
+        except Exception as e:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            error_messages.append(
+                "ERROR while invoking prometheus api using URL {}, got error: {}".format(clean_api_address(prometheus_api), e))
 
     return metrics_available, error_messages
@@ -214,12 +252,14 @@ def include_schema(prometheus_api):
     else:
        return "http://{}".format(prometheus_api)
 
 
 def clean_api_address(prometheus_api):
-    try:
-        match = re.match(r'(http(s?):\/\/(.[^:@]*):)(.[^@]*)', prometheus_api)
-        return re.sub(match.group(4), 'REDACTED', prometheus_api)
-    except:
-        return prometheus_api
+    try:
+        match = re.match(r'(http(s?):\/\/(.[^:@]*):)(.[^@]*)', prometheus_api)
+        return re.sub(match.group(4), 'REDACTED', prometheus_api)
+    except:
+        return prometheus_api
+
 
 def get_label_names(s):
     d = {}
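clean_api_address() in the hunk above strips credentials out of the Prometheus URL before it is echoed into error messages. A standalone illustration of the same regex (the address below is made up):

import re

prometheus_api = "http://admin:s3cret@prometheus.example.com:9090"  # hypothetical
match = re.match(r'(http(s?):\/\/(.[^:@]*):)(.[^@]*)', prometheus_api)
# group(4) captures everything between "user:" and "@", i.e. the password.
print(re.sub(match.group(4), 'REDACTED', prometheus_api))
# -> http://admin:REDACTED@prometheus.example.com:9090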
@@ -32,7 +32,7 @@
 # --state-id 2
 # --output 'nova-compute stop/waiting'
 # --monitoring-hostname 'nagioshost.x.y.com'
-# sends HTTP POST with following payload:
+# sends HTTP POST with following payload:
 # "SvcEvent":{
 #    "SvcHostname":"hostwithevent.y.x.com",
 #    "SvcDesc":"Service_nova-compute",
@@ -103,6 +103,8 @@ parser.add_argument(
 args = parser.parse_args()
 
 payload = {}
+max_retry = 5
+retry = 1
 
 if args.type == 'host':
     payload['HostEvent'] = {
@@ -123,23 +125,35 @@ elif args.type == 'service':
         'MonitoringHostName': args.monitoring_hostname
     }
 
-try:
-    requests.post(
-        args.primary_url,
-        data=json.dumps(payload),
-        timeout=args.timeout,
-        verify=False)
-except Exception as e:
-    pass
-
-if args.secondary_url:
-    try:
-        requests.post(
-            args.secondary_url,
-            data=json.dumps(payload),
-            timeout=args.timeout,
-            verify=False)
-    except Exception as e:
-        pass
+while retry < max_retry:
+    try:
+        requests.post(
+            args.primary_url,
+            data=json.dumps(payload),
+            timeout=args.timeout,
+            verify=False)
+    except Exception as e:
+        if retry < max_retry:
+            print('Request timeout, Retrying - {}'.format(retry))
+            retry += 1
+            continue
+        pass
+
+if args.secondary_url:
+    retry = 1
+    while retry < max_retry:
+        try:
+            requests.post(
+                args.secondary_url,
+                data=json.dumps(payload),
+                timeout=args.timeout,
+                verify=False)
+            break
+        except Exception as e:
+            if retry < max_retry:
+                print('Request timeout, Retrying - {}'.format(retry))
+                retry += 1
+                continue
+            pass
 
 sys.exit(0)