kuryr-kubernetes/kuryr_kubernetes/controller/managers/prometheus_exporter.py
gryf 8336a8f6ba Added metrics for failures caused by OpenStack services.
In Kuryr-kubernetes we mainly rely on two services: Neutron for creating
ports and Octavia for load balancers. Sometimes we observe that either
ports or load balancers hang in some state indefinitely. In such a case,
after we time out, the kuryr controller will crash, leaving an ambiguous
log message.

In this patch we introduce two new Prometheus metrics - one for ports and
the other for load balancers - which will be updated in such
situations.

Change-Id: Ifa4792019be97c21d1d531d9409f95ffd6d54499
2021-10-05 22:33:46 +00:00

193 lines
7.9 KiB
Python

# Copyright 2020 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import flask
import netaddr
import prometheus_client
from prometheus_client.exposition import generate_latest
from oslo_config import cfg
from oslo_log import log as logging
from kuryr_kubernetes import clients
from kuryr_kubernetes import config
from kuryr_kubernetes import utils
# Module-level logger and the global oslo.config object.
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
# Keys looked up in the quota details returned by the network client; one
# "kuryr_quota_free_count" sample is published per entry.
RESOURCES = ('ports', 'subnets', 'networks', 'security_groups',
'security_group_rules')
# Lowest availability (limit - used) that is still published for a resource.
_NO_QUOTA = 0
# Upper bound of the last pod creation latency histogram bucket.
_INF = float("inf")
# Quota limit value that is published as is instead of computing
# "limit - used" (presumably the network service's marker for an unlimited
# quota - confirm against the Neutron API).
_NO_LIMIT = -1
class ControllerPrometheusExporter(object):
    """Flask application exposing Kuryr controller metrics to Prometheus.

    All metrics live in a dedicated ``CollectorRegistry``. The quota, subnet
    and critical load balancer metrics are refreshed from the OpenStack
    network and load balancer clients every time ``/metrics`` is requested,
    while the pod creation histogram and the failure counters are updated
    through the ``record_*`` methods.

    Use :meth:`get_instance` to obtain the shared exporter.
    """

    # Lazily created shared exporter, see get_instance().
    instance = None

    def __init__(self):
        """Set up the Flask app, the OpenStack clients and the metrics."""
        self.application = flask.Flask('prometheus-exporter')
        self.ctx = None
        self.application.add_url_rule(
            '/metrics', methods=['GET'], view_func=self.metrics)
        self.headers = {'Connection': 'close'}
        self._os_net = clients.get_network_client()
        self._os_lb = clients.get_loadbalancer_client()
        self._project_id = config.CONF.neutron_defaults.project
        self._create_metrics()

    def metrics(self):
        """Refresh the on-demand metrics and render the whole registry.

        :returns: ``flask.Response`` carrying the output of
                  ``generate_latest`` with a ``text/plain`` mimetype.
        """
        self._record_quota_free_count_metric()
        self._record_ports_quota_per_subnet_metric()
        self._record_lbs_metrics()
        collected_metric = generate_latest(self.registry)
        return flask.Response(collected_metric, mimetype='text/plain')

    def record_pod_creation_metric(self, duration):
        """Record a pod creation duration in the latency histogram.

        :param duration: value to observe; the histogram buckets range from
                         10 to 120 in steps of 10.
        """
        self.pod_creation_latency.observe(duration)

    def record_lb_failure(self):
        """Increase the failure count for Load Balancer readiness."""
        self.load_balancer_readiness.inc()

    def record_port_failure(self):
        """Increase the failure count for Port readiness."""
        self.port_readiness.inc()

    @classmethod
    def get_instance(cls):
        """Return the shared exporter, creating it on first use.

        The instance is always stored on ``ControllerPrometheusExporter``
        itself, so subclasses share the same slot.
        """
        if ControllerPrometheusExporter.instance is None:
            ControllerPrometheusExporter.instance = cls()
        return ControllerPrometheusExporter.instance

    def run(self):
        """Serve the Flask application on the configured exporter port.

        The server binds to the IPv6 wildcard address ``::``.

        :raises Exception: anything raised by ``flask.Flask.run``; it is
                           logged and then re-raised.
        """
        # Disable obtrusive werkzeug logs.
        logging.getLogger('werkzeug').setLevel(logging.WARNING)
        address = '::'
        try:
            LOG.info('Starting Prometheus exporter')
            self.application.run(
                address, CONF.prometheus_exporter.controller_exporter_port)
        except Exception:
            LOG.exception('Failed to start Prometheus exporter')
            raise

    def _record_quota_free_count_metric(self):
        """Record Network resources availability in the registry.

        For every entry of ``RESOURCES`` the gauge is set to
        ``limit - used``. A limit equal to ``_NO_LIMIT`` is published as is,
        and a negative availability leaves the previous sample untouched.
        """
        quota = self._os_net.get_quota(quota=self._project_id, details=True)
        for resource in RESOURCES:
            resource_quota = quota[resource]
            gauge = self.quota_free_count.labels(resource=resource)
            quota_limit = resource_quota['limit']
            if quota_limit == _NO_LIMIT:
                gauge.set(quota_limit)
                continue
            availability = quota_limit - resource_quota['used']
            if availability >= _NO_QUOTA:
                gauge.set(availability)

    def _record_ports_quota_per_subnet_metric(self):
        """Record the number of free addresses per namespace subnet.

        Only subnets whose name contains ``ns/`` are considered. The value
        is the total size of the subnet allocation pools minus the number of
        ports that have a fixed IP on the subnet (gateway port excluded).
        """
        subnets = self._os_net.subnets(project_id=self._project_id)
        namespace_prefix = 'ns/'
        for subnet in subnets:
            if namespace_prefix not in subnet.name:
                continue
            total_num_addresses = sum(
                netaddr.IPRange(
                    netaddr.IPAddress(allocation['start']),
                    netaddr.IPAddress(allocation['end'])).size
                for allocation in subnet.allocation_pools)
            # Count the ports lazily instead of building a throwaway list.
            ports_count = sum(1 for _ in self._os_net.ports(
                fixed_ips=[f'subnet_id={subnet.id}'],
                project_id=self._project_id))
            # NOTE(maysams): As the allocation pools range does not take
            # into account the Gateway IP, that port IP shouldn't
            # be included when counting the used ports. The result is
            # clamped at zero so that a subnet without any visible port
            # never reports more free addresses than its pools contain.
            ports_count = max(ports_count - 1, 0)
            labels = {'subnet_id': subnet.id, 'subnet_name': subnet.name}
            ports_availability = total_num_addresses - ports_count
            self.port_quota_per_subnet.labels(**labels).set(
                ports_availability)

    def _record_lbs_metrics(self):
        """Record the state and per-pool member count of critical LBs.

        The critical load balancers are looked up through their
        KuryrLoadBalancer resource; entries without a load balancer ID in
        their status are skipped. A load balancer that can no longer be
        found is reported with the ``DELETED`` state.
        """
        critical_lbs = [
            ('dns-default', 'openshift-dns'),
            ('kubernetes', 'default')]
        for name, namespace in critical_lbs:
            klb = utils.get_kuryrloadbalancer(name, namespace)
            lb_id = klb.get('status', {}).get('loadbalancer', {}).get('id')
            if not lb_id:
                continue
            lb = self._os_lb.find_load_balancer(lb_id)
            labels = {'lb_name': namespace + '/' + name}
            if not lb:
                self.lbs_state.labels(**labels).state('DELETED')
                continue
            self.lbs_state.labels(**labels).state(lb.provisioning_status)
            pools = self._os_lb.pools(loadbalancer_id=lb.id)
            for pool in pools:
                # NOTE(review): the member count is labelled with the name
                # reported by the load balancer service, while the state
                # above uses '<namespace>/<name>' - confirm both match.
                labels = {'lb_name': lb.name, 'lb_pool_name': pool.name}
                self.lbs_members_count.labels(**labels).set(
                    len(pool.members))

    def _create_metrics(self):
        """Create the registry and register every exported metric.

        Label names are passed as tuples so that their order is the same on
        every run; a set literal would yield an arbitrary order.
        """
        self.registry = prometheus_client.CollectorRegistry()
        self.quota_free_count = prometheus_client.Gauge(
            'kuryr_quota_free_count', 'Amount of quota available'
            ' for the network resource', labelnames=('resource',),
            registry=self.registry)
        self.port_quota_per_subnet = prometheus_client.Gauge(
            'kuryr_port_quota_per_subnet', 'Amount of ports available'
            ' on Subnet', labelnames=('subnet_id', 'subnet_name'),
            registry=self.registry)
        self.lbs_members_count = prometheus_client.Gauge(
            'kuryr_critical_lb_members_count', 'Amount of members per '
            'critical Load Balancer pool',
            labelnames=('lb_name', 'lb_pool_name'),
            registry=self.registry)
        self.lbs_state = prometheus_client.Enum(
            'kuryr_critical_lb_state', 'Critical Load Balancer State',
            labelnames=('lb_name',),
            states=['ERROR', 'ACTIVE', 'DELETED', 'PENDING_CREATE',
                    'PENDING_UPDATE', 'PENDING_DELETE'],
            registry=self.registry)
        buckets = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, _INF)
        self.pod_creation_latency = prometheus_client.Histogram(
            'kuryr_pod_creation_latency', 'Time taken for a pod to have'
            ' Kuryr annotations set', buckets=buckets, registry=self.registry)
        self.load_balancer_readiness = prometheus_client.Counter(
            'kuryr_load_balancer_readiness', 'This counter is increased when '
            'Kuryr notices that an Octavia load balancer is stuck in an '
            'unexpected state', registry=self.registry)
        self.port_readiness = prometheus_client.Counter(
            'kuryr_port_readiness', 'This counter is increased when Kuryr '
            'times out waiting for Neutron to move port to ACTIVE',
            registry=self.registry)