Add readiness and liveness checks to CNI.
This patch adds readiness and liveness to CNI. It checks presence of NET_ADMIN capabilities, IPDB in working order, connection to Kubernetes API, quantity of CNI add failures, health of CNI components and existence of memory leaks. Implements: blueprint cni-daemon-readiness-liveness Change-Id: I9a4b871d196dbadfed687df93bb3cad97c957bfb
This commit is contained in:
parent
519391fac0
commit
1e4b7f1109
@ -461,8 +461,10 @@ EOF
|
||||
|
||||
function generate_cni_daemon_set() {
|
||||
output_dir=$1
|
||||
cni_bin_dir=${2:-/opt/cni/bin}
|
||||
cni_conf_dir=${3:-/etc/cni/net.d}
|
||||
cni_health_server_port=$2
|
||||
cni_daemon=${3:-False}
|
||||
cni_bin_dir=${4:-/opt/cni/bin}
|
||||
cni_conf_dir=${5:-/etc/cni/net.d}
|
||||
mkdir -p "$output_dir"
|
||||
rm -f ${output_dir}/cni_ds.yml
|
||||
cat >> "${output_dir}/cni_ds.yml" << EOF
|
||||
@ -513,6 +515,24 @@ spec:
|
||||
mountPath: /host_proc
|
||||
- name: openvswitch
|
||||
mountPath: /var/run/openvswitch
|
||||
EOF
|
||||
if [ "$cni_daemon" == "True" ]; then
|
||||
cat >> "${output_dir}/cni_ds.yml" << EOF
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: ${cni_health_server_port}
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 15
|
||||
timeoutSeconds: 5
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /alive
|
||||
port: ${cni_health_server_port}
|
||||
initialDelaySeconds: 15
|
||||
EOF
|
||||
fi
|
||||
cat >> "${output_dir}/cni_ds.yml" << EOF
|
||||
volumes:
|
||||
- name: bin
|
||||
hostPath:
|
||||
|
@ -121,6 +121,9 @@ function configure_kuryr {
|
||||
}
|
||||
|
||||
function generate_containerized_kuryr_resources {
|
||||
local cni_daemon
|
||||
cni_daemon=$1
|
||||
|
||||
# Containerized deployment will use tokens provided by k8s itself.
|
||||
inicomment "$KURYR_CONFIG" kubernetes ssl_client_crt_file
|
||||
inicomment "$KURYR_CONFIG" kubernetes ssl_client_key_file
|
||||
@ -138,7 +141,7 @@ function generate_containerized_kuryr_resources {
|
||||
generate_kuryr_configmap $output_dir $KURYR_CONFIG $KURYR_CNI_CONFIG
|
||||
generate_kuryr_service_account $output_dir
|
||||
generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT
|
||||
generate_cni_daemon_set $output_dir $CNI_BIN_DIR $CNI_CONF_DIR
|
||||
generate_cni_daemon_set $output_dir $KURYR_CNI_HEALTH_SERVER_PORT $cni_daemon $CNI_BIN_DIR $CNI_CONF_DIR
|
||||
}
|
||||
|
||||
function run_containerized_kuryr_resources {
|
||||
@ -712,10 +715,11 @@ if [[ "$1" == "stack" && "$2" == "extra" ]]; then
|
||||
else
|
||||
if is_service_enabled kuryr-daemon; then
|
||||
build_kuryr_containers $CNI_BIN_DIR $CNI_CONF_DIR True
|
||||
generate_containerized_kuryr_resources True
|
||||
else
|
||||
build_kuryr_containers $CNI_BIN_DIR $CNI_CONF_DIR False
|
||||
generate_containerized_kuryr_resources False
|
||||
fi
|
||||
generate_containerized_kuryr_resources
|
||||
run_containerized_kuryr_resources
|
||||
fi
|
||||
fi
|
||||
|
@ -80,3 +80,6 @@ KURYR_HEALTH_SERVER_PORT=${KURYR_HEALTH_SERVER_PORT:-8082}
|
||||
|
||||
# OVS HOST PATH
|
||||
OVS_HOST_PATH=${OVS_HOST_PATH:-/var/run/openvswitch}
|
||||
|
||||
# Health Server
|
||||
KURYR_CNI_HEALTH_SERVER_PORT=${KURYR_CNI_HEALTH_SERVER_PORT:-8090}
|
||||
|
@ -20,11 +20,11 @@ Kuryr Kubernetes Health Manager Design
|
||||
Purpose
|
||||
-------
|
||||
The purpose of this document is to present the design decision behind
|
||||
Kuryr Kubernetes Health Manager.
|
||||
Kuryr Kubernetes Health Managers.
|
||||
|
||||
The main purpose of the Health Manager is to perform Health verifications
|
||||
that assures Kuryr Controller readiness and liveness, and so improve the
|
||||
management that Kubernetes does on Kuryr Controller pod.
|
||||
The main purpose of the Health Managers is to perform Health verifications that
|
||||
assures readiness and liveness to Kuryr Controller and CNI pod, and so improve
|
||||
the management that Kubernetes does on Kuryr-Kubernetes pods.
|
||||
|
||||
Overview
|
||||
--------
|
||||
@ -36,17 +36,43 @@ It is important to check health of these services so that Kubernetes and
|
||||
its users know when Kuryr Controller it is ready to perform its networking
|
||||
tasks. Also, it is necessary to check the health state of Kuryr components in
|
||||
order to assure Kuryr Controller service is alive. To provide these
|
||||
functionalities, Health Manager will verify and serve the health state of
|
||||
these services and components to the probe.
|
||||
functionalities, Controller's Health Manager will verify and serve the health
|
||||
state of these services and components to the probes.
|
||||
|
||||
Besides these problems on the Controller, Kuryr CNI daemon also might get to a
|
||||
broken state as a result of its components being not healthy and necessary
|
||||
configurations not present. It is essencial that CNI components health and
|
||||
configurations are properly verified to assure CNI daemon is in a good shape.
|
||||
On this way, the CNI Health Manager will check and serve the health state to
|
||||
Kubenetes readiness and liveness probes.
|
||||
|
||||
Proposed Solution
|
||||
-----------------
|
||||
One of the endpoints provided by The Health Manager will check whether it is
|
||||
able to watch the Kubernetes API, authenticate with Keystone and talk to
|
||||
Neutron, since these are services needed by Kuryr Controller. These checks
|
||||
will assure the Controller readiness. The other endpoint, will verify
|
||||
the health state of Kuryr components and guarantee Controller liveness.
|
||||
One of the endpoints provided by the Controller Health Manager will check
|
||||
whether it is able to watch the Kubernetes API, authenticate with Keystone
|
||||
and talk to Neutron, since these are services needed by Kuryr Controller.
|
||||
These checks will assure the Controller readiness. The other endpoint, will
|
||||
verify the health state of Kuryr components and guarantee Controller liveness.
|
||||
|
||||
The idea behind the Manager is to combine all the necessary checks in a
|
||||
server running inside Kuryr Controller pod and provide the checks result
|
||||
to the probe.
|
||||
The CNI Health Manager also provides two endpoins to Kubernetes probes.
|
||||
The endpoint that provides readiness state to the probe checks connection
|
||||
to Kubernetes API and presence of NET_ADMIN capabilities. The other endpoint,
|
||||
which provides liveness, validates whether IPDB is in working order, maximum
|
||||
CNI ADD failure is reached, health of CNI components and existence of memory
|
||||
leak.
|
||||
|
||||
.. note::
|
||||
The CNI Health Manager will be started with the check for memory leak disabled.
|
||||
In order to enable, set the following option in kuryr.conf to a limit value
|
||||
of memory in MiBs.
|
||||
|
||||
[cni_health_server]
|
||||
max_memory_usage = -1
|
||||
|
||||
The CNI Health Manager is added as a process to CNI daemon and communicates
|
||||
to the other two processes i.e. Watcher and Server with a shared boolean object,
|
||||
which indicates the current health state of each component.
|
||||
|
||||
The idea behind these two Managers is to combine all the necessary checks in
|
||||
servers running inside Kuryr Controller and CNI pods to provide the checks result
|
||||
to the probes.
|
||||
|
@ -75,14 +75,18 @@ def _configure_l3(vif, ifname, netns):
|
||||
dst='default').commit()
|
||||
|
||||
|
||||
def connect(vif, instance_info, ifname, netns=None):
|
||||
def connect(vif, instance_info, ifname, netns=None, report_health=None):
|
||||
driver = _get_binding_driver(vif)
|
||||
if report_health:
|
||||
report_health(driver.is_healthy())
|
||||
os_vif.plug(vif, instance_info)
|
||||
driver.connect(vif, ifname, netns)
|
||||
_configure_l3(vif, ifname, netns)
|
||||
|
||||
|
||||
def disconnect(vif, instance_info, ifname, netns=None):
|
||||
def disconnect(vif, instance_info, ifname, netns=None, report_health=None):
|
||||
driver = _get_binding_driver(vif)
|
||||
if report_health:
|
||||
report_health(driver.is_healthy())
|
||||
driver.disconnect(vif, ifname, netns)
|
||||
os_vif.unplug(vif, instance_info)
|
||||
|
@ -12,18 +12,23 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import os
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
|
||||
from kuryr_kubernetes.cni.binding import base as b_base
|
||||
from kuryr_kubernetes.handlers import health
|
||||
from kuryr_kubernetes import linux_net_utils as net_utils
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
CONF = cfg.CONF
|
||||
|
||||
|
||||
class BaseBridgeDriver(object):
|
||||
class BaseBridgeDriver(health.HealthHandler):
|
||||
|
||||
def __init__(self):
|
||||
super(BaseBridgeDriver, self).__init__()
|
||||
|
||||
def connect(self, vif, ifname, netns):
|
||||
host_ifname = vif.vif_name
|
||||
|
||||
@ -59,6 +64,9 @@ class BaseBridgeDriver(object):
|
||||
|
||||
|
||||
class BridgeDriver(BaseBridgeDriver):
|
||||
def __init__(self):
|
||||
super(BridgeDriver, self).__init__()
|
||||
|
||||
def connect(self, vif, ifname, netns):
|
||||
super(BridgeDriver, self).connect(vif, ifname, netns)
|
||||
host_ifname = vif.vif_name
|
||||
@ -75,6 +83,10 @@ class BridgeDriver(BaseBridgeDriver):
|
||||
|
||||
|
||||
class VIFOpenVSwitchDriver(BaseBridgeDriver):
|
||||
|
||||
def __init__(self):
|
||||
super(VIFOpenVSwitchDriver, self).__init__()
|
||||
|
||||
def connect(self, vif, ifname, netns):
|
||||
super(VIFOpenVSwitchDriver, self).connect(vif, ifname, netns)
|
||||
# FIXME(irenab) use pod_id (neutron port device_id)
|
||||
@ -86,3 +98,13 @@ class VIFOpenVSwitchDriver(BaseBridgeDriver):
|
||||
def disconnect(self, vif, ifname, netns):
|
||||
super(VIFOpenVSwitchDriver, self).disconnect(vif, ifname, netns)
|
||||
net_utils.delete_ovs_vif_port(vif.bridge_name, vif.vif_name)
|
||||
|
||||
def is_healthy(self):
|
||||
bridge_name = CONF.neutron_defaults.ovs_bridge
|
||||
try:
|
||||
with b_base.get_ipdb() as h_ipdb:
|
||||
h_ipdb.interfaces[bridge_name]
|
||||
return True
|
||||
except Exception:
|
||||
LOG.debug("Reporting Driver not healthy.")
|
||||
return False
|
||||
|
@ -17,6 +17,7 @@ import six
|
||||
|
||||
from kuryr_kubernetes.cni.binding import base as b_base
|
||||
from kuryr_kubernetes import config
|
||||
from kuryr_kubernetes.handlers import health
|
||||
from kuryr_kubernetes import utils
|
||||
|
||||
VLAN_KIND = 'vlan'
|
||||
@ -25,7 +26,10 @@ MACVLAN_MODE_BRIDGE = 'bridge'
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
|
||||
class NestedDriver(object):
|
||||
class NestedDriver(health.HealthHandler):
|
||||
|
||||
def __init__(self):
|
||||
super(NestedDriver, self).__init__()
|
||||
|
||||
@abc.abstractmethod
|
||||
def _get_iface_create_args(self, vif):
|
||||
@ -65,10 +69,18 @@ class NestedDriver(object):
|
||||
|
||||
|
||||
class VlanDriver(NestedDriver):
|
||||
|
||||
def __init__(self):
|
||||
super(VlanDriver, self).__init__()
|
||||
|
||||
def _get_iface_create_args(self, vif):
|
||||
return {'kind': VLAN_KIND, 'vlan_id': vif.vlan_id}
|
||||
|
||||
|
||||
class MacvlanDriver(NestedDriver):
|
||||
|
||||
def __init__(self):
|
||||
super(MacvlanDriver, self).__init__()
|
||||
|
||||
def _get_iface_create_args(self, vif):
|
||||
return {'kind': MACVLAN_KIND, 'macvlan_mode': MACVLAN_MODE_BRIDGE}
|
||||
|
@ -12,11 +12,14 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ctypes import c_bool
|
||||
import multiprocessing
|
||||
import os
|
||||
from six.moves import http_client as httplib
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
import cotyledon
|
||||
import flask
|
||||
@ -35,6 +38,7 @@ from kuryr_kubernetes import clients
|
||||
from kuryr_kubernetes.cni import api
|
||||
from kuryr_kubernetes.cni.binding import base as b_base
|
||||
from kuryr_kubernetes.cni import handlers as h_cni
|
||||
from kuryr_kubernetes.cni import health
|
||||
from kuryr_kubernetes.cni import utils
|
||||
from kuryr_kubernetes import config
|
||||
from kuryr_kubernetes import constants as k_const
|
||||
@ -45,6 +49,7 @@ from kuryr_kubernetes import watcher as k_watcher
|
||||
LOG = logging.getLogger(__name__)
|
||||
CONF = cfg.CONF
|
||||
RETRY_DELAY = 1000 # 1 second in milliseconds
|
||||
HEALTH_CHECKER_DELAY = 5
|
||||
|
||||
# TODO(dulek): Another corner case is (and was) when pod is deleted before it's
|
||||
# annotated by controller or even noticed by any watcher. Kubelet
|
||||
@ -55,7 +60,8 @@ RETRY_DELAY = 1000 # 1 second in milliseconds
|
||||
|
||||
|
||||
class K8sCNIRegistryPlugin(api.CNIPlugin):
|
||||
def __init__(self, registry):
|
||||
def __init__(self, registry, healthy):
|
||||
self.healthy = healthy
|
||||
self.registry = registry
|
||||
|
||||
def _get_name(self, pod):
|
||||
@ -108,6 +114,12 @@ class K8sCNIRegistryPlugin(api.CNIPlugin):
|
||||
pass
|
||||
self._do_work(params, b_base.disconnect)
|
||||
|
||||
def report_drivers_health(self, driver_healthy):
|
||||
if not driver_healthy:
|
||||
with self.healthy.get_lock():
|
||||
LOG.debug("Reporting CNI driver not healthy.")
|
||||
self.healthy.value = driver_healthy
|
||||
|
||||
def _do_work(self, params, fn):
|
||||
pod_name = params.args.K8S_POD_NAME
|
||||
|
||||
@ -126,7 +138,8 @@ class K8sCNIRegistryPlugin(api.CNIPlugin):
|
||||
except KeyError:
|
||||
raise exceptions.ResourceNotReady(pod_name)
|
||||
|
||||
fn(vif, self._get_inst(pod), params.CNI_IFNAME, params.CNI_NETNS)
|
||||
fn(vif, self._get_inst(pod), params.CNI_IFNAME, params.CNI_NETNS,
|
||||
self.report_drivers_health)
|
||||
return vif
|
||||
|
||||
def _get_inst(self, pod):
|
||||
@ -135,10 +148,11 @@ class K8sCNIRegistryPlugin(api.CNIPlugin):
|
||||
|
||||
|
||||
class DaemonServer(object):
|
||||
def __init__(self, plugin):
|
||||
def __init__(self, plugin, healthy):
|
||||
self.ctx = None
|
||||
self.plugin = plugin
|
||||
|
||||
self.healthy = healthy
|
||||
self.failure_count = multiprocessing.Value('i', 0)
|
||||
self.application = flask.Flask('kuryr-daemon')
|
||||
self.application.add_url_rule(
|
||||
'/addNetwork', methods=['POST'], view_func=self.add)
|
||||
@ -157,6 +171,7 @@ class DaemonServer(object):
|
||||
try:
|
||||
params = self._prepare_request()
|
||||
except Exception:
|
||||
self._check_failure()
|
||||
LOG.exception('Exception when reading CNI params.')
|
||||
return '', httplib.BAD_REQUEST, self.headers
|
||||
|
||||
@ -164,10 +179,12 @@ class DaemonServer(object):
|
||||
vif = self.plugin.add(params)
|
||||
data = jsonutils.dumps(vif.obj_to_primitive())
|
||||
except exceptions.ResourceNotReady as e:
|
||||
self._check_failure()
|
||||
LOG.error("Timed out waiting for requested pod to appear in "
|
||||
"registry: %s.", e)
|
||||
return '', httplib.GATEWAY_TIMEOUT, self.headers
|
||||
except Exception:
|
||||
self._check_failure()
|
||||
LOG.exception('Error when processing addNetwork request. CNI '
|
||||
'Params: %s', params)
|
||||
return '', httplib.INTERNAL_SERVER_ERROR, self.headers
|
||||
@ -215,16 +232,26 @@ class DaemonServer(object):
|
||||
LOG.exception('Failed to start kuryr-daemon.')
|
||||
raise
|
||||
|
||||
def _check_failure(self):
|
||||
with self.failure_count.get_lock():
|
||||
if self.failure_count.value < CONF.cni_daemon.cni_failures_count:
|
||||
self.failure_count.value += 1
|
||||
else:
|
||||
with self.healthy.get_lock():
|
||||
LOG.debug("Reporting maximun CNI ADD failures reached.")
|
||||
self.healthy.value = False
|
||||
|
||||
|
||||
class CNIDaemonServerService(cotyledon.Service):
|
||||
name = "server"
|
||||
|
||||
def __init__(self, worker_id, registry):
|
||||
def __init__(self, worker_id, registry, healthy):
|
||||
super(CNIDaemonServerService, self).__init__(worker_id)
|
||||
self.run_queue_reading = False
|
||||
self.registry = registry
|
||||
self.plugin = K8sCNIRegistryPlugin(registry)
|
||||
self.server = DaemonServer(self.plugin)
|
||||
self.healthy = healthy
|
||||
self.plugin = K8sCNIRegistryPlugin(registry, self.healthy)
|
||||
self.server = DaemonServer(self.plugin, self.healthy)
|
||||
|
||||
def run(self):
|
||||
# NOTE(dulek): We might do a *lot* of pyroute2 operations, let's
|
||||
@ -239,11 +266,13 @@ class CNIDaemonServerService(cotyledon.Service):
|
||||
class CNIDaemonWatcherService(cotyledon.Service):
|
||||
name = "watcher"
|
||||
|
||||
def __init__(self, worker_id, registry):
|
||||
def __init__(self, worker_id, registry, healthy):
|
||||
super(CNIDaemonWatcherService, self).__init__(worker_id)
|
||||
self.pipeline = None
|
||||
self.watcher = None
|
||||
self.health_thread = None
|
||||
self.registry = registry
|
||||
self.healthy = healthy
|
||||
|
||||
def _get_nodename(self):
|
||||
# NOTE(dulek): At first try to get it using environment variable,
|
||||
@ -262,8 +291,20 @@ class CNIDaemonWatcherService(cotyledon.Service):
|
||||
"%(base)s/pods?fieldSelector=spec.nodeName=%(node_name)s" % {
|
||||
'base': k_const.K8S_API_BASE,
|
||||
'node_name': self._get_nodename()})
|
||||
self.is_running = True
|
||||
self.health_thread = threading.Thread(
|
||||
target=self._start_watcher_health_checker)
|
||||
self.health_thread.start()
|
||||
self.watcher.start()
|
||||
|
||||
def _start_watcher_health_checker(self):
|
||||
while self.is_running:
|
||||
if not self.watcher.is_healthy():
|
||||
LOG.debug("Reporting watcher not healthy.")
|
||||
with self.healthy.get_lock():
|
||||
self.healthy.value = False
|
||||
time.sleep(HEALTH_CHECKER_DELAY)
|
||||
|
||||
def on_done(self, pod, vif):
|
||||
pod_name = pod['metadata']['name']
|
||||
vif_dict = vif.obj_to_primitive()
|
||||
@ -284,10 +325,24 @@ class CNIDaemonWatcherService(cotyledon.Service):
|
||||
self.registry[pod_name] = pod_dict
|
||||
|
||||
def terminate(self):
|
||||
self.is_running = False
|
||||
if self.health_thread:
|
||||
self.health_thread.join()
|
||||
if self.watcher:
|
||||
self.watcher.stop()
|
||||
|
||||
|
||||
class CNIDaemonHealthServerService(cotyledon.Service):
|
||||
name = "health"
|
||||
|
||||
def __init__(self, worker_id, healthy):
|
||||
super(CNIDaemonHealthServerService, self).__init__(worker_id)
|
||||
self.health_server = health.CNIHealthServer(healthy)
|
||||
|
||||
def run(self):
|
||||
self.health_server.run()
|
||||
|
||||
|
||||
class CNIDaemonServiceManager(cotyledon.ServiceManager):
|
||||
def __init__(self):
|
||||
super(CNIDaemonServiceManager, self).__init__()
|
||||
@ -302,8 +357,10 @@ class CNIDaemonServiceManager(cotyledon.ServiceManager):
|
||||
|
||||
self.manager = multiprocessing.Manager()
|
||||
registry = self.manager.dict() # For Watcher->Server communication.
|
||||
self.add(CNIDaemonWatcherService, workers=1, args=(registry,))
|
||||
self.add(CNIDaemonServerService, workers=1, args=(registry,))
|
||||
healthy = multiprocessing.Value(c_bool, True)
|
||||
self.add(CNIDaemonWatcherService, workers=1, args=(registry, healthy,))
|
||||
self.add(CNIDaemonServerService, workers=1, args=(registry, healthy,))
|
||||
self.add(CNIDaemonHealthServerService, workers=1, args=(healthy,))
|
||||
self.register_hooks(on_terminate=self.terminate)
|
||||
|
||||
def run(self):
|
||||
|
129
kuryr_kubernetes/cni/health.py
Normal file
129
kuryr_kubernetes/cni/health.py
Normal file
@ -0,0 +1,129 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import os
|
||||
import psutil
|
||||
import requests
|
||||
from six.moves import http_client as httplib
|
||||
import subprocess
|
||||
|
||||
from flask import Flask
|
||||
from pyroute2 import IPDB
|
||||
|
||||
from kuryr.lib._i18n import _
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
CONF = cfg.CONF
|
||||
|
||||
cni_health_server_opts = [
|
||||
cfg.IntOpt('port',
|
||||
help=_('Port for CNI Health HTTP Server.'),
|
||||
default=8090),
|
||||
cfg.IntOpt('max_memory_usage',
|
||||
help=_('Maximum memory usage (MiB) for CNI Health Server '
|
||||
'process. If this value is exceeded kuryr-daemon '
|
||||
'will be marked as unhealthy.'),
|
||||
default=-1),
|
||||
]
|
||||
|
||||
CONF.register_opts(cni_health_server_opts, "cni_health_server")
|
||||
|
||||
BYTES_AMOUNT = 1048576
|
||||
|
||||
|
||||
class CNIHealthServer(object):
|
||||
"""Server used by readiness and liveness probe to manage CNI health checks.
|
||||
|
||||
Verifies presence of NET_ADMIN capabilities, IPDB in working order,
|
||||
connectivity to Kubernetes API, quantity of CNI add failure, health of
|
||||
CNI components and existence of memory leaks.
|
||||
"""
|
||||
|
||||
def __init__(self, components_healthy):
|
||||
|
||||
self.ctx = None
|
||||
self._components_healthy = components_healthy
|
||||
self.application = Flask('cni-health-daemon')
|
||||
self.application.add_url_rule(
|
||||
'/ready', methods=['GET'], view_func=self.readiness_status)
|
||||
self.application.add_url_rule(
|
||||
'/alive', methods=['GET'], view_func=self.liveness_status)
|
||||
self.headers = {'Connection': 'close'}
|
||||
|
||||
def readiness_status(self):
|
||||
net_admin_command = 'capsh --print | grep "Current:" | ' \
|
||||
'cut -d" " -f3 | grep -q cap_net_admin'
|
||||
return_code = subprocess.call(net_admin_command, shell=True)
|
||||
data = 'ok'
|
||||
k8s_conn, k8s_status = self.verify_k8s_connection()
|
||||
|
||||
if return_code != 0:
|
||||
error_message = 'NET_ADMIN capabilities not present.'
|
||||
LOG.error(error_message)
|
||||
return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers
|
||||
if not k8s_conn:
|
||||
error_message = 'Error when processing k8s healthz request.'
|
||||
LOG.error(error_message)
|
||||
return error_message, k8s_status, self.headers
|
||||
|
||||
LOG.info('CNI driver readiness verified.')
|
||||
return data, httplib.OK, self.headers
|
||||
|
||||
def liveness_status(self):
|
||||
data = 'ok'
|
||||
no_limit = -1
|
||||
try:
|
||||
with IPDB() as a:
|
||||
a.release()
|
||||
except Exception:
|
||||
error_message = 'IPDB not in working order.'
|
||||
LOG.debug(error_message)
|
||||
return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers
|
||||
|
||||
if CONF.cni_health_server.max_memory_usage != no_limit:
|
||||
# Force gc to release unreferenced memory before actually checking
|
||||
# the memory.
|
||||
gc.collect()
|
||||
process = psutil.Process(os.getpid())
|
||||
mem_usage = process.memory_info().rss / BYTES_AMOUNT
|
||||
if mem_usage > CONF.cni_health_server.max_memory_usage:
|
||||
err_message = 'CNI daemon exceeded maximum memory usage.'
|
||||
LOG.debug(err_message)
|
||||
return err_message, httplib.INTERNAL_SERVER_ERROR, self.headers
|
||||
|
||||
with self._components_healthy.get_lock():
|
||||
if not self._components_healthy.value:
|
||||
err_message = 'Kuryr CNI components not healthy.'
|
||||
LOG.debug(err_message)
|
||||
return err_message, httplib.INTERNAL_SERVER_ERROR, self.headers
|
||||
|
||||
LOG.debug('Kuryr CNI Liveness verified.')
|
||||
return data, httplib.OK, self.headers
|
||||
|
||||
def run(self):
|
||||
address = ''
|
||||
try:
|
||||
LOG.info('Starting CNI health check server.')
|
||||
self.application.run(address, CONF.cni_health_server.port)
|
||||
except Exception:
|
||||
LOG.exception('Failed to start CNI health check server.')
|
||||
raise
|
||||
|
||||
def verify_k8s_connection(self):
|
||||
path = '/healthz'
|
||||
address = CONF.kubernetes.api_root
|
||||
url = address + path
|
||||
resp = requests.get(url, headers={'Connection': 'close'})
|
||||
return resp.content == 'ok', resp.status_code
|
@ -70,6 +70,11 @@ daemon_opts = [
|
||||
"host network namespaces, which is essential for Kuryr "
|
||||
"to work."),
|
||||
default=None),
|
||||
cfg.IntOpt('cni_failures_count',
|
||||
help=_('Maximum number of consecutive failures of kuryr-daemon '
|
||||
'when processing requests. If this number is exceeded, '
|
||||
'kuryr-daemon will be marked as unhealthy.'),
|
||||
default=3),
|
||||
]
|
||||
|
||||
k8s_opts = [
|
||||
|
@ -14,6 +14,7 @@ import copy
|
||||
from oslo_log import _options
|
||||
|
||||
from kuryr.lib import opts as lib_opts
|
||||
from kuryr_kubernetes.cni import health as cni_health
|
||||
from kuryr_kubernetes import config
|
||||
from kuryr_kubernetes.controller.drivers import default_subnet
|
||||
from kuryr_kubernetes.controller.drivers import nested_vif
|
||||
@ -33,6 +34,7 @@ _kuryr_k8s_opts = [
|
||||
('pool_manager', pool.pool_manager_opts),
|
||||
('cni_daemon', config.daemon_opts),
|
||||
('health_server', health.health_server_opts),
|
||||
('cni_health_server', cni_health.cni_health_server_opts),
|
||||
]
|
||||
|
||||
|
||||
|
@ -77,20 +77,26 @@ class TestDriverMixin(test_base.TestCase):
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.base.get_ipdb')
|
||||
@mock.patch('os_vif.plug')
|
||||
def _test_connect(self, m_vif_plug, m_get_ipdb):
|
||||
def _test_connect(self, m_vif_plug, m_get_ipdb, report=None):
|
||||
def get_ipdb(netns=None):
|
||||
return self.ipdbs[netns]
|
||||
|
||||
m_get_ipdb.side_effect = get_ipdb
|
||||
|
||||
base.connect(self.vif, self.instance_info, self.ifname, self.netns)
|
||||
base.connect(self.vif, self.instance_info, self.ifname, self.netns,
|
||||
report)
|
||||
m_vif_plug.assert_called_once_with(self.vif, self.instance_info)
|
||||
self.m_c_iface.add_ip.assert_called_once_with('192.168.0.2/24')
|
||||
if report:
|
||||
report.assert_called_once()
|
||||
|
||||
@mock.patch('os_vif.unplug')
|
||||
def _test_disconnect(self, m_vif_unplug):
|
||||
base.disconnect(self.vif, self.instance_info, self.ifname, self.netns)
|
||||
def _test_disconnect(self, m_vif_unplug, report=None):
|
||||
base.disconnect(self.vif, self.instance_info, self.ifname, self.netns,
|
||||
report)
|
||||
m_vif_unplug.assert_called_once_with(self.vif, self.instance_info)
|
||||
if report:
|
||||
report.assert_called_once()
|
||||
|
||||
|
||||
class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
|
||||
@ -98,11 +104,13 @@ class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
|
||||
super(TestOpenVSwitchDriver, self).setUp()
|
||||
self.vif = fake._fake_vif(osv_objects.vif.VIFOpenVSwitch)
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.daemon.service.K8sCNIRegistryPlugin.'
|
||||
'report_drivers_health')
|
||||
@mock.patch('os.getpid', mock.Mock(return_value=123))
|
||||
@mock.patch('kuryr_kubernetes.linux_net_utils.create_ovs_vif_port')
|
||||
def test_connect(self, mock_create_ovs):
|
||||
self._test_connect()
|
||||
self.assertEqual(2, self.h_ipdb_exit.call_count)
|
||||
def test_connect(self, mock_create_ovs, m_report):
|
||||
self._test_connect(report=m_report)
|
||||
self.assertEqual(3, self.h_ipdb_exit.call_count)
|
||||
self.assertEqual(2, self.c_ipdb_exit.call_count)
|
||||
self.c_ipdb.create.assert_called_once_with(
|
||||
ifname=self.ifname, peer='h_interface', kind='veth')
|
||||
@ -118,9 +126,11 @@ class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
|
||||
'bridge', 'h_interface', '89eccd45-43e9-43d8-b4cc-4c13db13f782',
|
||||
'3e:94:b7:31:a0:83', 'kuryr')
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.daemon.service.K8sCNIRegistryPlugin.'
|
||||
'report_drivers_health')
|
||||
@mock.patch('kuryr_kubernetes.linux_net_utils.delete_ovs_vif_port')
|
||||
def test_disconnect(self, mock_delete_ovs):
|
||||
self._test_disconnect()
|
||||
def test_disconnect(self, mock_delete_ovs, m_report):
|
||||
self._test_disconnect(report=m_report)
|
||||
mock_delete_ovs.assert_called_once_with('bridge', 'h_interface')
|
||||
|
||||
|
||||
|
89
kuryr_kubernetes/tests/unit/cni/test_health.py
Normal file
89
kuryr_kubernetes/tests/unit/cni/test_health.py
Normal file
@ -0,0 +1,89 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ctypes import c_bool
|
||||
from kuryr_kubernetes.cni import health
|
||||
from kuryr_kubernetes.tests import base
|
||||
import mock
|
||||
import multiprocessing
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
|
||||
class TestResourceUsage(object):
|
||||
pass
|
||||
|
||||
|
||||
class TestCNIHealthServer(base.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
super(TestCNIHealthServer, self).setUp()
|
||||
healthy = multiprocessing.Value(c_bool, True)
|
||||
self.srv = health.CNIHealthServer(healthy)
|
||||
self.srv.application.testing = True
|
||||
self.test_client = self.srv.application.test_client()
|
||||
|
||||
@mock.patch('subprocess.call')
|
||||
@mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
|
||||
'verify_k8s_connection')
|
||||
def test_readiness_status(self, m_verify_k8s_conn, m_subprocess):
|
||||
m_subprocess.return_value = 0
|
||||
m_verify_k8s_conn.return_value = True, 200
|
||||
resp = self.test_client.get('/ready')
|
||||
self.assertEqual(200, resp.status_code)
|
||||
|
||||
@mock.patch('subprocess.call')
|
||||
@mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
|
||||
'verify_k8s_connection')
|
||||
def test_readiness_status_net_admin_error(self, m_verify_k8s_conn,
|
||||
m_subprocess):
|
||||
m_subprocess.return_value = 1
|
||||
m_verify_k8s_conn.return_value = True, 200
|
||||
resp = self.test_client.get('/ready')
|
||||
self.assertEqual(500, resp.status_code)
|
||||
|
||||
@mock.patch('subprocess.call')
|
||||
@mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
|
||||
'verify_k8s_connection')
|
||||
def test_readiness_status_k8s_error(self, m_verify_k8s_conn, m_subprocess):
|
||||
m_subprocess.return_value = 0
|
||||
m_verify_k8s_conn.return_value = False, 503
|
||||
resp = self.test_client.get('/ready')
|
||||
self.assertEqual(503, resp.status_code)
|
||||
|
||||
@mock.patch('pyroute2.IPDB.release')
|
||||
def test_liveness_status(self, m_ipdb):
|
||||
self.srv._components_healthy.value = True
|
||||
resp = self.test_client.get('/alive')
|
||||
m_ipdb.assert_called()
|
||||
self.assertEqual(200, resp.status_code)
|
||||
|
||||
def test_liveness_status_components_error(self):
|
||||
self.srv._components_healthy.value = False
|
||||
resp = self.test_client.get('/alive')
|
||||
self.assertEqual(500, resp.status_code)
|
||||
|
||||
@mock.patch('pyroute2.IPDB.release')
|
||||
def test_liveness_status_ipdb_error(self, m_ipdb):
|
||||
m_ipdb.side_effect = Exception
|
||||
resp = self.test_client.get('/alive')
|
||||
self.assertEqual(500, resp.status_code)
|
||||
|
||||
@mock.patch('psutil.Process.memory_info')
|
||||
def test_liveness_status_mem_usage_error(self, m_resource):
|
||||
cfg.CONF.set_override('max_memory_usage', 4096,
|
||||
group='cni_health_server')
|
||||
cls = TestResourceUsage()
|
||||
cls.rss = 5368709120
|
||||
m_resource.return_value = cls
|
||||
resp = self.test_client.get('/alive')
|
||||
self.assertEqual(500, resp.status_code)
|
@ -30,7 +30,8 @@ class TestK8sCNIRegistryPlugin(base.TestCase):
|
||||
self.vif = fake._fake_vif_dict()
|
||||
registry = {'foo': {'pod': self.pod, 'vif': self.vif,
|
||||
'containerid': None}}
|
||||
self.plugin = service.K8sCNIRegistryPlugin(registry)
|
||||
healthy = mock.Mock()
|
||||
self.plugin = service.K8sCNIRegistryPlugin(registry, healthy)
|
||||
self.params = mock.Mock(args=mock.Mock(K8S_POD_NAME='foo'),
|
||||
CNI_IFNAME='baz', CNI_NETNS=123,
|
||||
CNI_CONTAINERID='cont_id')
|
||||
@ -41,20 +42,22 @@ class TestK8sCNIRegistryPlugin(base.TestCase):
|
||||
self.plugin.add(self.params)
|
||||
|
||||
m_lock.assert_called_with('foo', external=True)
|
||||
m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
|
||||
m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123, mock.ANY)
|
||||
self.assertEqual('cont_id', self.plugin.registry['foo']['containerid'])
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.base.disconnect')
|
||||
def test_del_present(self, m_disconnect):
|
||||
self.plugin.delete(self.params)
|
||||
|
||||
m_disconnect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
|
||||
m_disconnect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123,
|
||||
mock.ANY)
|
||||
|
||||
@mock.patch('kuryr_kubernetes.cni.binding.base.disconnect')
|
||||
def test_del_wrong_container_id(self, m_disconnect):
|
||||
registry = {'foo': {'pod': self.pod, 'vif': self.vif,
|
||||
'containerid': 'different'}}
|
||||
self.plugin = service.K8sCNIRegistryPlugin(registry)
|
||||
healthy = mock.Mock()
|
||||
self.plugin = service.K8sCNIRegistryPlugin(registry, healthy)
|
||||
self.plugin.delete(self.params)
|
||||
|
||||
m_disconnect.assert_not_called()
|
||||
@ -77,7 +80,7 @@ class TestK8sCNIRegistryPlugin(base.TestCase):
|
||||
m_setitem.assert_called_once_with('foo', {'pod': self.pod,
|
||||
'vif': self.vif,
|
||||
'containerid': 'cont_id'})
|
||||
m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
|
||||
m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123, mock.ANY)
|
||||
|
||||
@mock.patch('time.sleep', mock.Mock())
|
||||
def test_add_not_present(self):
|
||||
@ -95,8 +98,10 @@ class TestK8sCNIRegistryPlugin(base.TestCase):
|
||||
class TestDaemonServer(base.TestCase):
|
||||
def setUp(self):
|
||||
super(TestDaemonServer, self).setUp()
|
||||
self.plugin = service.K8sCNIRegistryPlugin({})
|
||||
self.srv = service.DaemonServer(self.plugin)
|
||||
healthy = mock.Mock()
|
||||
self.plugin = service.K8sCNIRegistryPlugin({}, healthy)
|
||||
self.health_registry = mock.Mock()
|
||||
self.srv = service.DaemonServer(self.plugin, self.health_registry)
|
||||
|
||||
self.srv.application.testing = True
|
||||
self.test_client = self.srv.application.test_client()
|
||||
|
@ -16,6 +16,7 @@ oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
|
||||
oslo.service!=1.28.1,>=1.24.0 # Apache-2.0
|
||||
oslo.utils>=3.33.0 # Apache-2.0
|
||||
os-vif!=1.8.0,>=1.7.0 # Apache-2.0
|
||||
psutil>=3.2.2 # BSD
|
||||
pyroute2>=0.4.21;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)
|
||||
retrying!=1.3.0,>=1.2.3 # Apache-2.0
|
||||
six>=1.10.0 # MIT
|
||||
|
@ -107,4 +107,5 @@ generate_kuryr_configmap $OUTPUT_DIR $CONTROLLER_CONF_PATH $CNI_CONF_PATH
|
||||
generate_kuryr_service_account $OUTPUT_DIR
|
||||
health_server_port=${KURYR_HEALTH_SERVER_PORT:-8082}
|
||||
generate_controller_deployment $OUTPUT_DIR $health_server_port
|
||||
generate_cni_daemon_set $OUTPUT_DIR
|
||||
cni_health_server_port=${KURYR_CNI_HEALTH_SERVER_PORT:-8090}
|
||||
generate_cni_daemon_set $OUTPUT_DIR $cni_health_server_port
|
||||
|
Loading…
Reference in New Issue
Block a user