Added metrics for failures caused by OpenStack services.

In Kuryr-kubernetes we mainly rely on two services: Neutron for port creation and Octavia for load balancers. Sometimes we observe that either a port or a load balancer hangs in some state indefinitely. In such a case, after we time out, the kuryr controller will crash, leaving an ambiguous log message. In this patch we introduce two new Prometheus metrics - one for ports and the other for load balancers - which will be updated in such situations. Change-Id: Ifa4792019be97c21d1d531d9409f95ffd6d54499
2021-06-28 14:37:11 +02:00 · 2021-06-28 14:37:11 +02:00 · 8336a8f6ba
parent c4278f9da2
commit 8336a8f6ba
1 changed files with 18 additions and 0 deletions
--- a/kuryr_kubernetes/controller/managers/prometheus_exporter.py
+++ b/kuryr_kubernetes/controller/managers/prometheus_exporter.py
@ -62,6 +62,14 @@ class ControllerPrometheusExporter(object):
        """Records pod creation duration to the registry"""
        self.pod_creation_latency.observe(duration)

+    def record_lb_failure(self):
+        """Increase failure count for Load Balancer readiness"""
+        self.load_balancer_readiness.inc()
+
+    def record_port_failure(self):
+        """Increase failure count for Port readiness"""
+        self.port_readiness.inc()
+
    @classmethod
    def get_instance(cls):
        if not ControllerPrometheusExporter.instance:
@ -172,3 +180,13 @@ class ControllerPrometheusExporter(object):
        self.pod_creation_latency = prometheus_client.Histogram(
            'kuryr_pod_creation_latency', 'Time taken for a pod to have'
            ' Kuryr annotations set', buckets=buckets, registry=self.registry)
+
+        self.load_balancer_readiness = prometheus_client.Counter(
+            'kuryr_load_balancer_readiness', 'This counter is increased when '
+            'Kuryr notices that an Octavia load balancer is stuck in an '
+            'unexpected state', registry=self.registry)
+
+        self.port_readiness = prometheus_client.Counter(
+            'kuryr_port_readiness', 'This counter is increased when Kuryr '
+            'times out waiting for Neutron to move port to ACTIVE',
+            registry=self.registry)