8336a8f6ba
In Kuryr-kubernetes we mainly rely on two services: Neutron for creating ports and Octavia for load balancers. Sometimes we observe that either a port or a load balancer hangs in some state indefinitely. In such a case, after we time out, the kuryr controller crashes, leaving an ambiguous log message. This patch introduces two new Prometheus metrics - one for ports and the other for load balancers - which are updated in such situations. Change-Id: Ifa4792019be97c21d1d531d9409f95ffd6d54499
193 lines
7.9 KiB
Python
193 lines
7.9 KiB
Python
# Copyright 2020 Red Hat, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import flask
|
|
import netaddr
|
|
import prometheus_client
|
|
from prometheus_client.exposition import generate_latest
|
|
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
|
|
from kuryr_kubernetes import clients
|
|
from kuryr_kubernetes import config
|
|
from kuryr_kubernetes import utils
|
|
|
|
# Module-level logger and shortcut to the global oslo.config options.
LOG = logging.getLogger(__name__)
CONF = cfg.CONF

# Network resource types whose quota usage is exported as a metric.
RESOURCES = (
    'ports',
    'subnets',
    'networks',
    'security_groups',
    'security_group_rules',
)

# Lowest quota availability that still gets recorded.
_NO_QUOTA = 0
# Upper bound of the last pod-creation latency histogram bucket.
_INF = float("inf")
# Quota limit value that is recorded as-is instead of computing availability.
_NO_LIMIT = -1
|
|
|
|
|
|
class ControllerPrometheusExporter(object):
    """Provides metrics to Prometheus

    A small Flask application exposes a single ``/metrics`` endpoint. Each
    request refreshes the quota, per-subnet port and critical load balancer
    gauges and then serialises the private collector registry. The pod
    creation histogram and the two failure counters are updated through the
    ``record_*`` methods instead.
    """

    # Process-wide exporter, created lazily by get_instance().
    instance = None

    def __init__(self):
        """Creates the Flask app, the OpenStack clients and the metrics"""
        self.application = flask.Flask('prometheus-exporter')
        # NOTE(review): ctx and headers are not read anywhere in this class.
        self.ctx = None
        self.application.add_url_rule(
            '/metrics', methods=['GET'], view_func=self.metrics)
        self.headers = {'Connection': 'close'}
        self._os_net = clients.get_network_client()
        self._os_lb = clients.get_loadbalancer_client()
        self._project_id = config.CONF.neutron_defaults.project
        self._create_metrics()

    def metrics(self):
        """Provides the registered metrics

        :returns: a ``flask.Response`` carrying the text exposition of the
                  registry.
        """
        # Refresh every gauge that is computed on demand before serialising.
        self._record_quota_free_count_metric()
        self._record_ports_quota_per_subnet_metric()
        self._record_lbs_metrics()

        collected_metric = generate_latest(self.registry)
        # Advertise the exposition format produced by generate_latest()
        # rather than a bare 'text/plain'.
        return flask.Response(
            collected_metric,
            content_type=prometheus_client.CONTENT_TYPE_LATEST)

    def record_pod_creation_metric(self, duration):
        """Records pod creation duration to the registry

        :param duration: value observed by the pod creation latency
                         histogram.
        """
        self.pod_creation_latency.observe(duration)

    def record_lb_failure(self):
        """Increase failure count for Load Balancer readiness"""
        self.load_balancer_readiness.inc()

    def record_port_failure(self):
        """Increase failure count to Port readiness"""
        self.port_readiness.inc()

    @classmethod
    def get_instance(cls):
        """Returns the shared exporter, creating it on first use"""
        # The instance is always stored on the base class, so subclasses
        # calling this method share the same slot.
        if ControllerPrometheusExporter.instance is None:
            ControllerPrometheusExporter.instance = cls()
        return ControllerPrometheusExporter.instance

    def run(self):
        """Starts the Flask application serving the metrics

        Listens on ``::`` and the configured controller exporter port.

        :raises Exception: any error raised while running the application
                           is logged and re-raised.
        """
        # Disable obtrusive werkzeug logs.
        logging.getLogger('werkzeug').setLevel(logging.WARNING)

        address = '::'
        try:
            LOG.info('Starting Prometheus exporter')
            self.application.run(
                address, CONF.prometheus_exporter.controller_exporter_port)
        except Exception:
            LOG.exception('Failed to start Prometheus exporter')
            raise

    def _record_quota_free_count_metric(self):
        """Records Network resources availability to the registry"""
        quota = self._os_net.get_quota(quota=self._project_id, details=True)
        for resource in RESOURCES:
            resource_quota = quota[resource]
            labels = {'resource': resource}
            quota_limit = resource_quota['limit']
            if quota_limit == _NO_LIMIT:
                # No limit configured: export the limit marker itself.
                self.quota_free_count.labels(**labels).set(quota_limit)
                continue
            quota_used = resource_quota['used']
            availability = quota_limit - quota_used
            # A negative availability is not recorded, so the gauge keeps
            # whatever value it had before.
            if availability >= _NO_QUOTA:
                self.quota_free_count.labels(**labels).set(availability)

    def _record_ports_quota_per_subnet_metric(self):
        """Records the ports quota per subnet to the registry

        Only subnets of the configured project whose name contains ``ns/``
        are considered. The value is the number of addresses in the
        allocation pools minus the ports found on the subnet.
        """
        subnets = self._os_net.subnets(project_id=self._project_id)
        namespace_prefix = 'ns/'
        for subnet in subnets:
            if namespace_prefix not in subnet.name:
                continue
            total_num_addresses = sum(
                netaddr.IPRange(
                    netaddr.IPAddress(allocation['start']),
                    netaddr.IPAddress(allocation['end'])).size
                for allocation in subnet.allocation_pools)
            ports_count = len(list(self._os_net.ports(
                fixed_ips=[f'subnet_id={subnet.id}'],
                project_id=self._project_id)))
            # NOTE(maysams): As the allocation pools range does not take
            # into account the Gateway IP, that port IP shouldn't
            # be include when counting the used ports.
            # Clamp at zero: when no port is listed at all there is nothing
            # to exclude, and a negative count would report more free
            # addresses than the pools contain.
            ports_count = max(ports_count - 1, 0)
            labels = {'subnet_id': subnet.id, 'subnet_name': subnet.name}
            ports_availability = total_num_addresses - ports_count
            self.port_quota_per_subnet.labels(**labels).set(ports_availability)

    def _record_lbs_metrics(self):
        """Records the number of members available per LB and the LB state"""
        critical_lbs = [
            ('dns-default', 'openshift-dns'),
            ('kubernetes', 'default')]
        for name, namespace in critical_lbs:
            klb = utils.get_kuryrloadbalancer(name, namespace)
            # Tolerate a missing CRD whether the helper reports it as an
            # empty mapping or as None.
            if not klb:
                continue
            lb = klb.get('status', {}).get('loadbalancer', {})
            lb_id = lb.get('id')
            if not lb_id:
                continue
            lb = self._os_lb.find_load_balancer(lb_id)
            labels = {'lb_name': namespace + '/' + name}
            if not lb:
                # The CRD still references a load balancer that could not
                # be found.
                self.lbs_state.labels(**labels).state('DELETED')
                continue
            self.lbs_state.labels(**labels).state(lb.provisioning_status)
            pools = self._os_lb.pools(loadbalancer_id=lb.id)
            for pool in pools:
                labels = {'lb_name': lb.name, 'lb_pool_name': pool.name}
                self.lbs_members_count.labels(**labels).set(len(pool.members))

    def _create_metrics(self):
        """Creates a registry and records metrics"""
        self.registry = prometheus_client.CollectorRegistry()
        # Label names are given as tuples so that the label order in the
        # exposition is the same on every run; a set would make it depend
        # on string hashing.
        self.quota_free_count = prometheus_client.Gauge(
            'kuryr_quota_free_count', 'Amount of quota available'
            ' for the network resource', labelnames=('resource',),
            registry=self.registry)

        self.port_quota_per_subnet = prometheus_client.Gauge(
            'kuryr_port_quota_per_subnet', 'Amount of ports available'
            ' on Subnet', labelnames=('subnet_id', 'subnet_name'),
            registry=self.registry)

        self.lbs_members_count = prometheus_client.Gauge(
            'kuryr_critical_lb_members_count', 'Amount of members per '
            'critical Load Balancer pool',
            labelnames=('lb_name', 'lb_pool_name'),
            registry=self.registry)

        self.lbs_state = prometheus_client.Enum(
            'kuryr_critical_lb_state', 'Critical Load Balancer State',
            labelnames=('lb_name',),
            states=['ERROR', 'ACTIVE', 'DELETED', 'PENDING_CREATE',
                    'PENDING_UPDATE', 'PENDING_DELETE'],
            registry=self.registry)

        buckets = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, _INF)
        self.pod_creation_latency = prometheus_client.Histogram(
            'kuryr_pod_creation_latency', 'Time taken for a pod to have'
            ' Kuryr annotations set', buckets=buckets, registry=self.registry)

        self.load_balancer_readiness = prometheus_client.Counter(
            'kuryr_load_balancer_readiness', 'This counter is increased when '
            'Kuryr notices that an Octavia load balancer is stuck in an '
            'unexpected state', registry=self.registry)

        self.port_readiness = prometheus_client.Counter(
            'kuryr_port_readiness', 'This counter is increased when Kuryr '
            'times out waiting for Neutron to move port to ACTIVE',
            registry=self.registry)
|