From 1e4b7f1109e3307bc7c7574172ee707d67613c26 Mon Sep 17 00:00:00 2001
From: Maysa Macedo
Date: Thu, 25 Jan 2018 02:05:44 +0000
Subject: [PATCH] Add readiness and liveness checks to CNI.

This patch adds readiness and liveness checks to the CNI daemon. It
checks the presence of NET_ADMIN capabilities, that IPDB is in working
order, the connection to the Kubernetes API, the number of CNI ADD
failures, the health of CNI components and the absence of memory leaks.

Implements: blueprint cni-daemon-readiness-liveness

Change-Id: I9a4b871d196dbadfed687df93bb3cad97c957bfb
---
 devstack/lib/kuryr_kubernetes              |  24 +++-
 devstack/plugin.sh                         |   8 +-
 devstack/settings                          |   3 +
 doc/source/devref/health_manager.rst       |  54 ++++++--
 kuryr_kubernetes/cni/binding/base.py       |   8 +-
 kuryr_kubernetes/cni/binding/bridge.py     |  28 +++-
 kuryr_kubernetes/cni/binding/nested.py     |  14 +-
 kuryr_kubernetes/cni/daemon/service.py     |  77 +++++++++--
 kuryr_kubernetes/cni/health.py             | 129 ++++++++++++++++++
 kuryr_kubernetes/config.py                 |   5 +
 kuryr_kubernetes/opts.py                   |   2 +
 .../tests/unit/cni/test_binding.py         |  28 ++--
 .../tests/unit/cni/test_health.py          |  89 ++++++++++++
 .../tests/unit/cni/test_service.py         |  19 ++-
 requirements.txt                           |   1 +
 tools/generate_k8s_resource_definitions.sh |   3 +-
 16 files changed, 441 insertions(+), 51 deletions(-)
 create mode 100644 kuryr_kubernetes/cni/health.py
 create mode 100644 kuryr_kubernetes/tests/unit/cni/test_health.py

diff --git a/devstack/lib/kuryr_kubernetes b/devstack/lib/kuryr_kubernetes
index 3b570a7b4..f4245d7a8 100644
--- a/devstack/lib/kuryr_kubernetes
+++ b/devstack/lib/kuryr_kubernetes
@@ -461,8 +461,10 @@ EOF
 
 function generate_cni_daemon_set() {
     output_dir=$1
-    cni_bin_dir=${2:-/opt/cni/bin}
-    cni_conf_dir=${3:-/etc/cni/net.d}
+    cni_health_server_port=$2
+    cni_daemon=${3:-False}
+    cni_bin_dir=${4:-/opt/cni/bin}
+    cni_conf_dir=${5:-/etc/cni/net.d}
     mkdir -p "$output_dir"
     rm -f ${output_dir}/cni_ds.yml
     cat >> "${output_dir}/cni_ds.yml" << EOF
@@ -513,6 +515,24 @@ spec:
           mountPath: /host_proc
         - name: openvswitch
           mountPath: /var/run/openvswitch
+EOF
+    if [ "$cni_daemon" == "True" ]; then
+        cat >> "${output_dir}/cni_ds.yml" << EOF
+        readinessProbe:
+          httpGet:
+            path: /ready
+            port: ${cni_health_server_port}
+            scheme: HTTP
+          initialDelaySeconds: 15
+          timeoutSeconds: 5
+        livenessProbe:
+          httpGet:
+            path: /alive
+            port: ${cni_health_server_port}
+          initialDelaySeconds: 15
+EOF
+    fi
+    cat >> "${output_dir}/cni_ds.yml" << EOF
       volumes:
       - name: bin
         hostPath:
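NOTE: the readinessProbe and livenessProbe entries generated above are plain
HTTP GETs issued by the kubelet against the new CNI health server. A minimal
sketch of the equivalent check (the probe() helper and the localhost URL are
illustrative; the /ready and /alive paths and the 8090 default port come from
this patch):

    import requests

    def probe(path, port=8090, timeout=5):
        # True when the health server answers the probe with HTTP 200.
        try:
            resp = requests.get('http://localhost:%d%s' % (port, path),
                                timeout=timeout)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    print(probe('/ready'))   # readinessProbe
    print(probe('/alive'))   # livenessProbe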
inicomment "$KURYR_CONFIG" kubernetes ssl_client_crt_file inicomment "$KURYR_CONFIG" kubernetes ssl_client_key_file @@ -138,7 +141,7 @@ function generate_containerized_kuryr_resources { generate_kuryr_configmap $output_dir $KURYR_CONFIG $KURYR_CNI_CONFIG generate_kuryr_service_account $output_dir generate_controller_deployment $output_dir $KURYR_HEALTH_SERVER_PORT - generate_cni_daemon_set $output_dir $CNI_BIN_DIR $CNI_CONF_DIR + generate_cni_daemon_set $output_dir $KURYR_CNI_HEALTH_SERVER_PORT $cni_daemon $CNI_BIN_DIR $CNI_CONF_DIR } function run_containerized_kuryr_resources { @@ -712,10 +715,11 @@ if [[ "$1" == "stack" && "$2" == "extra" ]]; then else if is_service_enabled kuryr-daemon; then build_kuryr_containers $CNI_BIN_DIR $CNI_CONF_DIR True + generate_containerized_kuryr_resources True else build_kuryr_containers $CNI_BIN_DIR $CNI_CONF_DIR False + generate_containerized_kuryr_resources False fi - generate_containerized_kuryr_resources run_containerized_kuryr_resources fi fi diff --git a/devstack/settings b/devstack/settings index 6977116c0..b39d04522 100644 --- a/devstack/settings +++ b/devstack/settings @@ -80,3 +80,6 @@ KURYR_HEALTH_SERVER_PORT=${KURYR_HEALTH_SERVER_PORT:-8082} # OVS HOST PATH OVS_HOST_PATH=${OVS_HOST_PATH:-/var/run/openvswitch} + +# Health Server +KURYR_CNI_HEALTH_SERVER_PORT=${KURYR_CNI_HEALTH_SERVER_PORT:-8090} diff --git a/doc/source/devref/health_manager.rst b/doc/source/devref/health_manager.rst index 286d5733c..d2b94f481 100644 --- a/doc/source/devref/health_manager.rst +++ b/doc/source/devref/health_manager.rst @@ -20,11 +20,11 @@ Kuryr Kubernetes Health Manager Design Purpose ------- The purpose of this document is to present the design decision behind -Kuryr Kubernetes Health Manager. +Kuryr Kubernetes Health Managers. -The main purpose of the Health Manager is to perform Health verifications -that assures Kuryr Controller readiness and liveness, and so improve the -management that Kubernetes does on Kuryr Controller pod. +The main purpose of the Health Managers is to perform Health verifications that +assures readiness and liveness to Kuryr Controller and CNI pod, and so improve +the management that Kubernetes does on Kuryr-Kubernetes pods. Overview -------- @@ -36,17 +36,43 @@ It is important to check health of these services so that Kubernetes and its users know when Kuryr Controller it is ready to perform its networking tasks. Also, it is necessary to check the health state of Kuryr components in order to assure Kuryr Controller service is alive. To provide these -functionalities, Health Manager will verify and serve the health state of -these services and components to the probe. +functionalities, Controller's Health Manager will verify and serve the health +state of these services and components to the probes. + +Besides these problems on the Controller, Kuryr CNI daemon also might get to a +broken state as a result of its components being not healthy and necessary +configurations not present. It is essencial that CNI components health and +configurations are properly verified to assure CNI daemon is in a good shape. +On this way, the CNI Health Manager will check and serve the health state to +Kubenetes readiness and liveness probes. Proposed Solution ----------------- -One of the endpoints provided by The Health Manager will check whether it is -able to watch the Kubernetes API, authenticate with Keystone and talk to -Neutron, since these are services needed by Kuryr Controller. These checks -will assure the Controller readiness. 
The other endpoint, will verify -the health state of Kuryr components and guarantee Controller liveness. +One of the endpoints provided by the Controller Health Manager will check +whether it is able to watch the Kubernetes API, authenticate with Keystone +and talk to Neutron, since these are services needed by Kuryr Controller. +These checks will assure the Controller readiness. The other endpoint, will +verify the health state of Kuryr components and guarantee Controller liveness. -The idea behind the Manager is to combine all the necessary checks in a -server running inside Kuryr Controller pod and provide the checks result -to the probe. +The CNI Health Manager also provides two endpoins to Kubernetes probes. +The endpoint that provides readiness state to the probe checks connection +to Kubernetes API and presence of NET_ADMIN capabilities. The other endpoint, +which provides liveness, validates whether IPDB is in working order, maximum +CNI ADD failure is reached, health of CNI components and existence of memory +leak. + +.. note:: + The CNI Health Manager will be started with the check for memory leak disabled. + In order to enable, set the following option in kuryr.conf to a limit value + of memory in MiBs. + + [cni_health_server] + max_memory_usage = -1 + +The CNI Health Manager is added as a process to CNI daemon and communicates +to the other two processes i.e. Watcher and Server with a shared boolean object, +which indicates the current health state of each component. + +The idea behind these two Managers is to combine all the necessary checks in +servers running inside Kuryr Controller and CNI pods to provide the checks result +to the probes. diff --git a/kuryr_kubernetes/cni/binding/base.py b/kuryr_kubernetes/cni/binding/base.py index e4c468bac..af9b1db1e 100644 --- a/kuryr_kubernetes/cni/binding/base.py +++ b/kuryr_kubernetes/cni/binding/base.py @@ -75,14 +75,18 @@ def _configure_l3(vif, ifname, netns): dst='default').commit() -def connect(vif, instance_info, ifname, netns=None): +def connect(vif, instance_info, ifname, netns=None, report_health=None): driver = _get_binding_driver(vif) + if report_health: + report_health(driver.is_healthy()) os_vif.plug(vif, instance_info) driver.connect(vif, ifname, netns) _configure_l3(vif, ifname, netns) -def disconnect(vif, instance_info, ifname, netns=None): +def disconnect(vif, instance_info, ifname, netns=None, report_health=None): driver = _get_binding_driver(vif) + if report_health: + report_health(driver.is_healthy()) driver.disconnect(vif, ifname, netns) os_vif.unplug(vif, instance_info) diff --git a/kuryr_kubernetes/cni/binding/bridge.py b/kuryr_kubernetes/cni/binding/bridge.py index f2d33eb2a..e9d0c669f 100644 --- a/kuryr_kubernetes/cni/binding/bridge.py +++ b/kuryr_kubernetes/cni/binding/bridge.py @@ -12,18 +12,23 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. 
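NOTE: the shared boolean object described in the devref above can be sketched
with the same multiprocessing primitives the daemon uses; the watcher and
reporter functions below are illustrative stand-ins, not the real services:

    from ctypes import c_bool
    import multiprocessing

    # One flag shared by every process of the daemon; starts out healthy.
    healthy = multiprocessing.Value(c_bool, True)

    def watcher(flag):
        # A component that detects a problem flips the flag under its lock.
        with flag.get_lock():
            flag.value = False

    def health_report(flag):
        # The health server only reads the flag and reports its value.
        with flag.get_lock():
            return 'ok' if flag.value else 'unhealthy'

    if __name__ == '__main__':
        p = multiprocessing.Process(target=watcher, args=(healthy,))
        p.start()
        p.join()
        print(health_report(healthy))  # unhealthy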
-
 import os
-
+from oslo_config import cfg
 from oslo_log import log
 
 from kuryr_kubernetes.cni.binding import base as b_base
+from kuryr_kubernetes.handlers import health
 from kuryr_kubernetes import linux_net_utils as net_utils
 
 LOG = log.getLogger(__name__)
+CONF = cfg.CONF
 
 
-class BaseBridgeDriver(object):
+class BaseBridgeDriver(health.HealthHandler):
+
+    def __init__(self):
+        super(BaseBridgeDriver, self).__init__()
+
     def connect(self, vif, ifname, netns):
         host_ifname = vif.vif_name
@@ -59,6 +64,9 @@ class BaseBridgeDriver(object):
 
 
 class BridgeDriver(BaseBridgeDriver):
+    def __init__(self):
+        super(BridgeDriver, self).__init__()
+
     def connect(self, vif, ifname, netns):
         super(BridgeDriver, self).connect(vif, ifname, netns)
         host_ifname = vif.vif_name
@@ -75,6 +83,10 @@
 
 
 class VIFOpenVSwitchDriver(BaseBridgeDriver):
+
+    def __init__(self):
+        super(VIFOpenVSwitchDriver, self).__init__()
+
     def connect(self, vif, ifname, netns):
         super(VIFOpenVSwitchDriver, self).connect(vif, ifname, netns)
         # FIXME(irenab) use pod_id (neutron port device_id)
@@ -86,3 +98,13 @@
     def disconnect(self, vif, ifname, netns):
         super(VIFOpenVSwitchDriver, self).disconnect(vif, ifname, netns)
         net_utils.delete_ovs_vif_port(vif.bridge_name, vif.vif_name)
+
+    def is_healthy(self):
+        bridge_name = CONF.neutron_defaults.ovs_bridge
+        try:
+            with b_base.get_ipdb() as h_ipdb:
+                h_ipdb.interfaces[bridge_name]
+            return True
+        except Exception:
+            LOG.debug("Reporting Driver not healthy.")
+            return False
diff --git a/kuryr_kubernetes/cni/binding/nested.py b/kuryr_kubernetes/cni/binding/nested.py
index 5670a9cfa..3eef252cd 100644
--- a/kuryr_kubernetes/cni/binding/nested.py
+++ b/kuryr_kubernetes/cni/binding/nested.py
@@ -17,6 +17,7 @@ import six
 
 from kuryr_kubernetes.cni.binding import base as b_base
 from kuryr_kubernetes import config
+from kuryr_kubernetes.handlers import health
 from kuryr_kubernetes import utils
 
 VLAN_KIND = 'vlan'
@@ -25,7 +26,10 @@ MACVLAN_MODE_BRIDGE = 'bridge'
 
 
 @six.add_metaclass(abc.ABCMeta)
-class NestedDriver(object):
+class NestedDriver(health.HealthHandler):
+
+    def __init__(self):
+        super(NestedDriver, self).__init__()
 
     @abc.abstractmethod
     def _get_iface_create_args(self, vif):
@@ -65,10 +69,18 @@ class NestedDriver(object):
 
 
 class VlanDriver(NestedDriver):
+
+    def __init__(self):
+        super(VlanDriver, self).__init__()
+
     def _get_iface_create_args(self, vif):
         return {'kind': VLAN_KIND, 'vlan_id': vif.vlan_id}
 
 
 class MacvlanDriver(NestedDriver):
+
+    def __init__(self):
+        super(MacvlanDriver, self).__init__()
+
     def _get_iface_create_args(self, vif):
         return {'kind': MACVLAN_KIND, 'macvlan_mode': MACVLAN_MODE_BRIDGE}
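NOTE: the binding drivers above all follow one pattern: inherit an
is_healthy() that defaults to healthy and override it where a concrete
resource can be verified, as VIFOpenVSwitchDriver does with the OVS bridge
lookup. A condensed, self-contained sketch (the class names and the dict
standing in for the host IPDB view are illustrative, not the real kuryr
hierarchy):

    class HealthHandler(object):
        # Base behaviour: assume healthy unless a subclass proves otherwise.
        def is_healthy(self):
            return True

    class OVSBridgeDriver(HealthHandler):
        def __init__(self, bridge_name, host_interfaces):
            self.bridge_name = bridge_name
            self.host_interfaces = host_interfaces  # stand-in for IPDB view

        def is_healthy(self):
            # Healthy only while the integration bridge is visible on the
            # host.
            try:
                self.host_interfaces[self.bridge_name]
                return True
            except KeyError:
                return False

    print(OVSBridgeDriver('br-int', {'br-int': {}}).is_healthy())  # True
    print(OVSBridgeDriver('br-int', {}).is_healthy())              # False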
 
+from ctypes import c_bool
 import multiprocessing
 import os
 from six.moves import http_client as httplib
 import socket
 import sys
+import threading
+import time
 
 import cotyledon
 import flask
@@ -35,6 +38,7 @@ from kuryr_kubernetes import clients
 from kuryr_kubernetes.cni import api
 from kuryr_kubernetes.cni.binding import base as b_base
 from kuryr_kubernetes.cni import handlers as h_cni
+from kuryr_kubernetes.cni import health
 from kuryr_kubernetes.cni import utils
 from kuryr_kubernetes import config
 from kuryr_kubernetes import constants as k_const
@@ -45,6 +49,7 @@ from kuryr_kubernetes import watcher as k_watcher
 
 LOG = logging.getLogger(__name__)
 CONF = cfg.CONF
 RETRY_DELAY = 1000  # 1 second in milliseconds
+HEALTH_CHECKER_DELAY = 5
 
 # TODO(dulek): Another corner case is (and was) when pod is deleted before it's
 # annotated by controller or even noticed by any watcher. Kubelet
@@ -55,7 +60,8 @@ RETRY_DELAY = 1000  # 1 second in milliseconds
 
 
 class K8sCNIRegistryPlugin(api.CNIPlugin):
-    def __init__(self, registry):
+    def __init__(self, registry, healthy):
+        self.healthy = healthy
         self.registry = registry
 
     def _get_name(self, pod):
@@ -108,6 +114,12 @@
                 pass
         self._do_work(params, b_base.disconnect)
 
+    def report_drivers_health(self, driver_healthy):
+        if not driver_healthy:
+            with self.healthy.get_lock():
+                LOG.debug("Reporting CNI driver not healthy.")
+                self.healthy.value = driver_healthy
+
     def _do_work(self, params, fn):
         pod_name = params.args.K8S_POD_NAME
 
@@ -126,7 +138,8 @@
         except KeyError:
             raise exceptions.ResourceNotReady(pod_name)
 
-        fn(vif, self._get_inst(pod), params.CNI_IFNAME, params.CNI_NETNS)
+        fn(vif, self._get_inst(pod), params.CNI_IFNAME, params.CNI_NETNS,
+           self.report_drivers_health)
         return vif
 
     def _get_inst(self, pod):
@@ -135,10 +148,11 @@ class DaemonServer(object):
-    def __init__(self, plugin):
+    def __init__(self, plugin, healthy):
         self.ctx = None
         self.plugin = plugin
-
+        self.healthy = healthy
+        self.failure_count = multiprocessing.Value('i', 0)
         self.application = flask.Flask('kuryr-daemon')
         self.application.add_url_rule(
             '/addNetwork', methods=['POST'], view_func=self.add)
@@ -157,6 +171,7 @@
         try:
             params = self._prepare_request()
         except Exception:
+            self._check_failure()
             LOG.exception('Exception when reading CNI params.')
             return '', httplib.BAD_REQUEST, self.headers
 
@@ -164,10 +179,12 @@ try:
             vif = self.plugin.add(params)
             data = jsonutils.dumps(vif.obj_to_primitive())
         except exceptions.ResourceNotReady as e:
+            self._check_failure()
             LOG.error("Timed out waiting for requested pod to appear in "
                       "registry: %s.", e)
             return '', httplib.GATEWAY_TIMEOUT, self.headers
         except Exception:
+            self._check_failure()
             LOG.exception('Error when processing addNetwork request. CNI '
                           'Params: %s', params)
             return '', httplib.INTERNAL_SERVER_ERROR, self.headers
@@ -215,16 +232,26 @@
             LOG.exception('Failed to start kuryr-daemon.')
             raise
 
+    def _check_failure(self):
+        with self.failure_count.get_lock():
+            if self.failure_count.value < CONF.cni_daemon.cni_failures_count:
+                self.failure_count.value += 1
+            else:
+                with self.healthy.get_lock():
+                    LOG.debug("Reporting maximum CNI ADD failures reached.")
+                    self.healthy.value = False
+
 
 class CNIDaemonServerService(cotyledon.Service):
     name = "server"
 
-    def __init__(self, worker_id, registry):
+    def __init__(self, worker_id, registry, healthy):
         super(CNIDaemonServerService, self).__init__(worker_id)
         self.run_queue_reading = False
         self.registry = registry
-        self.plugin = K8sCNIRegistryPlugin(registry)
-        self.server = DaemonServer(self.plugin)
+        self.healthy = healthy
+        self.plugin = K8sCNIRegistryPlugin(registry, self.healthy)
+        self.server = DaemonServer(self.plugin, self.healthy)
 
     def run(self):
         # NOTE(dulek): We might do a *lot* of pyroute2 operations, let's
@@ -239,11 +266,13 @@ class CNIDaemonWatcherService(cotyledon.Service):
     name = "watcher"
 
-    def __init__(self, worker_id, registry):
+    def __init__(self, worker_id, registry, healthy):
         super(CNIDaemonWatcherService, self).__init__(worker_id)
         self.pipeline = None
         self.watcher = None
+        self.health_thread = None
         self.registry = registry
+        self.healthy = healthy
 
     def _get_nodename(self):
         # NOTE(dulek): At first try to get it using environment variable,
@@ -262,8 +291,20 @@
             "%(base)s/pods?fieldSelector=spec.nodeName=%(node_name)s" % {
                 'base': k_const.K8S_API_BASE,
                 'node_name': self._get_nodename()})
+        self.is_running = True
+        self.health_thread = threading.Thread(
+            target=self._start_watcher_health_checker)
+        self.health_thread.start()
         self.watcher.start()
 
+    def _start_watcher_health_checker(self):
+        while self.is_running:
+            if not self.watcher.is_healthy():
+                LOG.debug("Reporting watcher not healthy.")
+                with self.healthy.get_lock():
+                    self.healthy.value = False
+            time.sleep(HEALTH_CHECKER_DELAY)
+
     def on_done(self, pod, vif):
         pod_name = pod['metadata']['name']
         vif_dict = vif.obj_to_primitive()
@@ -284,10 +325,24 @@
             self.registry[pod_name] = pod_dict
 
     def terminate(self):
+        self.is_running = False
+        if self.health_thread:
+            self.health_thread.join()
         if self.watcher:
             self.watcher.stop()
 
 
+class CNIDaemonHealthServerService(cotyledon.Service):
+    name = "health"
+
+    def __init__(self, worker_id, healthy):
+        super(CNIDaemonHealthServerService, self).__init__(worker_id)
+        self.health_server = health.CNIHealthServer(healthy)
+
+    def run(self):
+        self.health_server.run()
+
+
 class CNIDaemonServiceManager(cotyledon.ServiceManager):
     def __init__(self):
         super(CNIDaemonServiceManager, self).__init__()
@@ -302,8 +357,10 @@
         self.manager = multiprocessing.Manager()
         registry = self.manager.dict()  # For Watcher->Server communication.
-        self.add(CNIDaemonWatcherService, workers=1, args=(registry,))
-        self.add(CNIDaemonServerService, workers=1, args=(registry,))
+        healthy = multiprocessing.Value(c_bool, True)
+        self.add(CNIDaemonWatcherService, workers=1, args=(registry, healthy,))
+        self.add(CNIDaemonServerService, workers=1, args=(registry, healthy,))
+        self.add(CNIDaemonHealthServerService, workers=1, args=(healthy,))
         self.register_hooks(on_terminate=self.terminate)
 
     def run(self):
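NOTE: _check_failure() above implements a saturating counter: each failed CNI
request increments it up to the configured threshold, after which the shared
health flag is flipped and stays flipped. A standalone sketch of that
behaviour (CNI_FAILURES_COUNT mirrors the new [cni_daemon]/cni_failures_count
option, default 3):

    from ctypes import c_bool
    import multiprocessing

    CNI_FAILURES_COUNT = 3  # mirrors CONF.cni_daemon.cni_failures_count

    failure_count = multiprocessing.Value('i', 0)
    healthy = multiprocessing.Value(c_bool, True)

    def check_failure():
        with failure_count.get_lock():
            if failure_count.value < CNI_FAILURES_COUNT:
                failure_count.value += 1
            else:
                with healthy.get_lock():
                    healthy.value = False

    for _ in range(5):
        check_failure()
    print(healthy.value)  # False: the liveness endpoint now reports 500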
diff --git a/kuryr_kubernetes/cni/health.py b/kuryr_kubernetes/cni/health.py
new file mode 100644
index 000000000..f5799fbc5
--- /dev/null
+++ b/kuryr_kubernetes/cni/health.py
@@ -0,0 +1,129 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import os
+import psutil
+import requests
+from six.moves import http_client as httplib
+import subprocess
+
+from flask import Flask
+from pyroute2 import IPDB
+
+from kuryr.lib._i18n import _
+from oslo_config import cfg
+from oslo_log import log as logging
+
+LOG = logging.getLogger(__name__)
+CONF = cfg.CONF
+
+cni_health_server_opts = [
+    cfg.IntOpt('port',
+               help=_('Port for CNI Health HTTP Server.'),
+               default=8090),
+    cfg.IntOpt('max_memory_usage',
+               help=_('Maximum memory usage (MiB) for CNI Health Server '
+                      'process. If this value is exceeded kuryr-daemon '
+                      'will be marked as unhealthy.'),
+               default=-1),
+]
+
+CONF.register_opts(cni_health_server_opts, "cni_health_server")
+
+BYTES_AMOUNT = 1048576
+
+
+class CNIHealthServer(object):
+    """Server used by readiness and liveness probes for CNI health checks.
+
+    Verifies the presence of NET_ADMIN capabilities, that IPDB is in working
+    order, connectivity to the Kubernetes API, the number of CNI ADD
+    failures, the health of CNI components and the absence of memory leaks.
+    """
+
+    def __init__(self, components_healthy):
+
+        self.ctx = None
+        self._components_healthy = components_healthy
+        self.application = Flask('cni-health-daemon')
+        self.application.add_url_rule(
+            '/ready', methods=['GET'], view_func=self.readiness_status)
+        self.application.add_url_rule(
+            '/alive', methods=['GET'], view_func=self.liveness_status)
+        self.headers = {'Connection': 'close'}
+
+    def readiness_status(self):
+        net_admin_command = 'capsh --print | grep "Current:" | ' \
+                            'cut -d" " -f3 | grep -q cap_net_admin'
+        return_code = subprocess.call(net_admin_command, shell=True)
+        data = 'ok'
+        k8s_conn, k8s_status = self.verify_k8s_connection()
+
+        if return_code != 0:
+            error_message = 'NET_ADMIN capabilities not present.'
+            LOG.error(error_message)
+            return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers
+        if not k8s_conn:
+            error_message = 'Error when processing k8s healthz request.'
+            LOG.error(error_message)
+            return error_message, k8s_status, self.headers
+
+        LOG.info('CNI driver readiness verified.')
+        return data, httplib.OK, self.headers
+
+    def liveness_status(self):
+        data = 'ok'
+        no_limit = -1
+        try:
+            with IPDB() as a:
+                a.release()
+        except Exception:
+            error_message = 'IPDB not in working order.'
+            LOG.debug(error_message)
+            return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers
+
+        if CONF.cni_health_server.max_memory_usage != no_limit:
+            # Force gc to release unreferenced memory before actually checking
+            # the memory.
+            gc.collect()
+            process = psutil.Process(os.getpid())
+            mem_usage = process.memory_info().rss / BYTES_AMOUNT
+            if mem_usage > CONF.cni_health_server.max_memory_usage:
+                err_message = 'CNI daemon exceeded maximum memory usage.'
+                LOG.debug(err_message)
+                return err_message, httplib.INTERNAL_SERVER_ERROR, self.headers
+
+        with self._components_healthy.get_lock():
+            if not self._components_healthy.value:
+                err_message = 'Kuryr CNI components not healthy.'
+                LOG.debug(err_message)
+                return err_message, httplib.INTERNAL_SERVER_ERROR, self.headers
+
+        LOG.debug('Kuryr CNI Liveness verified.')
+        return data, httplib.OK, self.headers
+
+    def run(self):
+        address = ''
+        try:
+            LOG.info('Starting CNI health check server.')
+            self.application.run(address, CONF.cni_health_server.port)
+        except Exception:
+            LOG.exception('Failed to start CNI health check server.')
+            raise
+
+    def verify_k8s_connection(self):
+        path = '/healthz'
+        address = CONF.kubernetes.api_root
+        url = address + path
+        resp = requests.get(url, headers={'Connection': 'close'})
+        return resp.content == 'ok', resp.status_code
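NOTE: the liveness memory check above compares the daemon's resident set
size, converted from bytes to MiB via BYTES_AMOUNT (1048576), with
max_memory_usage. A worked example using the same numbers as the unit test
below (a 5 GiB RSS against a 4096 MiB limit):

    import os

    import psutil

    BYTES_AMOUNT = 1048576   # bytes per MiB, as in health.py
    max_memory_usage = 4096  # MiB; the default of -1 disables the check

    # Example arithmetic: 5368709120 B / 1048576 = 5120 MiB > 4096 MiB,
    # so /alive would return 500 INTERNAL_SERVER_ERROR.
    rss = psutil.Process(os.getpid()).memory_info().rss  # bytes
    mem_usage = rss / BYTES_AMOUNT                       # MiB
    print(mem_usage > max_memory_usage)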
diff --git a/kuryr_kubernetes/config.py b/kuryr_kubernetes/config.py
index 463e03b0c..60df4456b 100644
--- a/kuryr_kubernetes/config.py
+++ b/kuryr_kubernetes/config.py
@@ -70,6 +70,11 @@ daemon_opts = [
                    "host network namespaces, which is essential for Kuryr "
                    "to work."),
               default=None),
+    cfg.IntOpt('cni_failures_count',
+               help=_('Maximum number of consecutive failures of kuryr-daemon '
+                      'when processing requests. If this number is exceeded, '
+                      'kuryr-daemon will be marked as unhealthy.'),
+               default=3),
 ]
 
 k8s_opts = [
diff --git a/kuryr_kubernetes/opts.py b/kuryr_kubernetes/opts.py
index c1e270386..7b49ab7e8 100644
--- a/kuryr_kubernetes/opts.py
+++ b/kuryr_kubernetes/opts.py
@@ -14,6 +14,7 @@ import copy
 from oslo_log import _options
 
 from kuryr.lib import opts as lib_opts
+from kuryr_kubernetes.cni import health as cni_health
 from kuryr_kubernetes import config
 from kuryr_kubernetes.controller.drivers import default_subnet
 from kuryr_kubernetes.controller.drivers import nested_vif
@@ -33,6 +34,7 @@ _kuryr_k8s_opts = [
     ('pool_manager', pool.pool_manager_opts),
     ('cni_daemon', config.daemon_opts),
     ('health_server', health.health_server_opts),
+    ('cni_health_server', cni_health.cni_health_server_opts),
 ]
diff --git a/kuryr_kubernetes/tests/unit/cni/test_binding.py b/kuryr_kubernetes/tests/unit/cni/test_binding.py
index e58a3feae..597e89382 100644
--- a/kuryr_kubernetes/tests/unit/cni/test_binding.py
+++ b/kuryr_kubernetes/tests/unit/cni/test_binding.py
@@ -77,20 +77,26 @@ class TestDriverMixin(test_base.TestCase):
 
     @mock.patch('kuryr_kubernetes.cni.binding.base.get_ipdb')
     @mock.patch('os_vif.plug')
-    def _test_connect(self, m_vif_plug, m_get_ipdb):
+    def _test_connect(self, m_vif_plug, m_get_ipdb, report=None):
         def get_ipdb(netns=None):
             return self.ipdbs[netns]
         m_get_ipdb.side_effect = get_ipdb
 
-        base.connect(self.vif, self.instance_info, self.ifname, self.netns)
+        base.connect(self.vif, self.instance_info, self.ifname, self.netns,
+                     report)
         m_vif_plug.assert_called_once_with(self.vif, self.instance_info)
         self.m_c_iface.add_ip.assert_called_once_with('192.168.0.2/24')
+        if report:
+            report.assert_called_once()
 
     @mock.patch('os_vif.unplug')
-    def _test_disconnect(self, m_vif_unplug):
-        base.disconnect(self.vif, self.instance_info, self.ifname, self.netns)
+    def _test_disconnect(self, m_vif_unplug, report=None):
+        base.disconnect(self.vif, self.instance_info, self.ifname, self.netns,
+                        report)
         m_vif_unplug.assert_called_once_with(self.vif, self.instance_info)
+        if report:
+            report.assert_called_once()
 
 
 class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
     def setUp(self):
@@ -98,11 +104,13 @@ class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
         super(TestOpenVSwitchDriver, self).setUp()
         self.vif = fake._fake_vif(osv_objects.vif.VIFOpenVSwitch)
 
+    @mock.patch('kuryr_kubernetes.cni.daemon.service.K8sCNIRegistryPlugin.'
+                'report_drivers_health')
     @mock.patch('os.getpid', mock.Mock(return_value=123))
     @mock.patch('kuryr_kubernetes.linux_net_utils.create_ovs_vif_port')
-    def test_connect(self, mock_create_ovs):
-        self._test_connect()
-        self.assertEqual(2, self.h_ipdb_exit.call_count)
+    def test_connect(self, mock_create_ovs, m_report):
+        self._test_connect(report=m_report)
+        self.assertEqual(3, self.h_ipdb_exit.call_count)
         self.assertEqual(2, self.c_ipdb_exit.call_count)
         self.c_ipdb.create.assert_called_once_with(
             ifname=self.ifname, peer='h_interface', kind='veth')
@@ -118,9 +126,11 @@ class TestOpenVSwitchDriver(TestDriverMixin, test_base.TestCase):
             'bridge', 'h_interface', '89eccd45-43e9-43d8-b4cc-4c13db13f782',
             '3e:94:b7:31:a0:83', 'kuryr')
 
+    @mock.patch('kuryr_kubernetes.cni.daemon.service.K8sCNIRegistryPlugin.'
+                'report_drivers_health')
     @mock.patch('kuryr_kubernetes.linux_net_utils.delete_ovs_vif_port')
-    def test_disconnect(self, mock_delete_ovs):
-        self._test_disconnect()
+    def test_disconnect(self, mock_delete_ovs, m_report):
+        self._test_disconnect(report=m_report)
         mock_delete_ovs.assert_called_once_with('bridge', 'h_interface')
 
 
diff --git a/kuryr_kubernetes/tests/unit/cni/test_health.py b/kuryr_kubernetes/tests/unit/cni/test_health.py
new file mode 100644
index 000000000..f8900ecf7
--- /dev/null
+++ b/kuryr_kubernetes/tests/unit/cni/test_health.py
@@ -0,0 +1,89 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ctypes import c_bool
+from kuryr_kubernetes.cni import health
+from kuryr_kubernetes.tests import base
+import mock
+import multiprocessing
+
+from oslo_config import cfg
+
+
+class TestResourceUsage(object):
+    pass
+
+
+class TestCNIHealthServer(base.TestCase):
+
+    def setUp(self):
+        super(TestCNIHealthServer, self).setUp()
+        healthy = multiprocessing.Value(c_bool, True)
+        self.srv = health.CNIHealthServer(healthy)
+        self.srv.application.testing = True
+        self.test_client = self.srv.application.test_client()
+
+    @mock.patch('subprocess.call')
+    @mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
+                'verify_k8s_connection')
+    def test_readiness_status(self, m_verify_k8s_conn, m_subprocess):
+        m_subprocess.return_value = 0
+        m_verify_k8s_conn.return_value = True, 200
+        resp = self.test_client.get('/ready')
+        self.assertEqual(200, resp.status_code)
+
+    @mock.patch('subprocess.call')
+    @mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
+                'verify_k8s_connection')
+    def test_readiness_status_net_admin_error(self, m_verify_k8s_conn,
+                                              m_subprocess):
+        m_subprocess.return_value = 1
+        m_verify_k8s_conn.return_value = True, 200
+        resp = self.test_client.get('/ready')
+        self.assertEqual(500, resp.status_code)
+
+    @mock.patch('subprocess.call')
+    @mock.patch('kuryr_kubernetes.cni.health.CNIHealthServer.'
+                'verify_k8s_connection')
+    def test_readiness_status_k8s_error(self, m_verify_k8s_conn, m_subprocess):
+        m_subprocess.return_value = 0
+        m_verify_k8s_conn.return_value = False, 503
+        resp = self.test_client.get('/ready')
+        self.assertEqual(503, resp.status_code)
+
+    @mock.patch('pyroute2.IPDB.release')
+    def test_liveness_status(self, m_ipdb):
+        self.srv._components_healthy.value = True
+        resp = self.test_client.get('/alive')
+        m_ipdb.assert_called()
+        self.assertEqual(200, resp.status_code)
+
+    def test_liveness_status_components_error(self):
+        self.srv._components_healthy.value = False
+        resp = self.test_client.get('/alive')
+        self.assertEqual(500, resp.status_code)
+
+    @mock.patch('pyroute2.IPDB.release')
+    def test_liveness_status_ipdb_error(self, m_ipdb):
+        m_ipdb.side_effect = Exception
+        resp = self.test_client.get('/alive')
+        self.assertEqual(500, resp.status_code)
+
+    @mock.patch('psutil.Process.memory_info')
+    def test_liveness_status_mem_usage_error(self, m_resource):
+        cfg.CONF.set_override('max_memory_usage', 4096,
+                              group='cni_health_server')
+        cls = TestResourceUsage()
+        cls.rss = 5368709120
+        m_resource.return_value = cls
+        resp = self.test_client.get('/alive')
+        self.assertEqual(500, resp.status_code)
diff --git a/kuryr_kubernetes/tests/unit/cni/test_service.py b/kuryr_kubernetes/tests/unit/cni/test_service.py
index 3f601d1f1..15fece923 100644
--- a/kuryr_kubernetes/tests/unit/cni/test_service.py
+++ b/kuryr_kubernetes/tests/unit/cni/test_service.py
@@ -30,7 +30,8 @@ class TestK8sCNIRegistryPlugin(base.TestCase):
         self.vif = fake._fake_vif_dict()
         registry = {'foo': {'pod': self.pod, 'vif': self.vif,
                             'containerid': None}}
-        self.plugin = service.K8sCNIRegistryPlugin(registry)
+        healthy = mock.Mock()
+        self.plugin = service.K8sCNIRegistryPlugin(registry, healthy)
         self.params = mock.Mock(args=mock.Mock(K8S_POD_NAME='foo'),
                                 CNI_IFNAME='baz', CNI_NETNS=123,
                                 CNI_CONTAINERID='cont_id')
@@ -41,20 +42,22 @@
         self.plugin.add(self.params)
 
         m_lock.assert_called_with('foo', external=True)
-        m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
+        m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123, mock.ANY)
         self.assertEqual('cont_id', self.plugin.registry['foo']['containerid'])
 
     @mock.patch('kuryr_kubernetes.cni.binding.base.disconnect')
     def test_del_present(self, m_disconnect):
         self.plugin.delete(self.params)
 
-        m_disconnect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
+        m_disconnect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123,
+                                        mock.ANY)
 
     @mock.patch('kuryr_kubernetes.cni.binding.base.disconnect')
     def test_del_wrong_container_id(self, m_disconnect):
         registry = {'foo': {'pod': self.pod, 'vif': self.vif,
                             'containerid': 'different'}}
-        self.plugin = service.K8sCNIRegistryPlugin(registry)
+        healthy = mock.Mock()
+        self.plugin = service.K8sCNIRegistryPlugin(registry, healthy)
         self.plugin.delete(self.params)
 
         m_disconnect.assert_not_called()
@@ -77,7 +80,7 @@
         m_setitem.assert_called_once_with('foo', {'pod': self.pod,
                                                   'vif': self.vif,
                                                   'containerid': 'cont_id'})
-        m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123)
+        m_connect.assert_called_with(mock.ANY, mock.ANY, 'baz', 123, mock.ANY)
 
     @mock.patch('time.sleep', mock.Mock())
     def test_add_not_present(self):
@@ -95,8 +98,10 @@
 class TestDaemonServer(base.TestCase):
     def setUp(self):
         super(TestDaemonServer, self).setUp()
-        self.plugin = service.K8sCNIRegistryPlugin({})
-        self.srv = service.DaemonServer(self.plugin)
+        healthy = mock.Mock()
+        self.plugin = service.K8sCNIRegistryPlugin({}, healthy)
+        self.health_registry = mock.Mock()
+        self.srv = service.DaemonServer(self.plugin, self.health_registry)
 
         self.srv.application.testing = True
         self.test_client = self.srv.application.test_client()
diff --git a/requirements.txt b/requirements.txt
index 416c48ae7..45eccab51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,7 @@ oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
 oslo.service!=1.28.1,>=1.24.0 # Apache-2.0
 oslo.utils>=3.33.0 # Apache-2.0
 os-vif!=1.8.0,>=1.7.0 # Apache-2.0
+psutil>=3.2.2 # BSD
 pyroute2>=0.4.21;sys_platform!='win32' # Apache-2.0 (+ dual licensed GPL2)
 retrying!=1.3.0,>=1.2.3 # Apache-2.0
 six>=1.10.0 # MIT
diff --git a/tools/generate_k8s_resource_definitions.sh b/tools/generate_k8s_resource_definitions.sh
index d4cadb875..3cb77a882 100755
--- a/tools/generate_k8s_resource_definitions.sh
+++ b/tools/generate_k8s_resource_definitions.sh
@@ -107,4 +107,5 @@ generate_kuryr_configmap $OUTPUT_DIR $CONTROLLER_CONF_PATH $CNI_CONF_PATH
 generate_kuryr_service_account $OUTPUT_DIR
 health_server_port=${KURYR_HEALTH_SERVER_PORT:-8082}
 generate_controller_deployment $OUTPUT_DIR $health_server_port
-generate_cni_daemon_set $OUTPUT_DIR
+cni_health_server_port=${KURYR_CNI_HEALTH_SERVER_PORT:-8090}
+generate_cni_daemon_set $OUTPUT_DIR $cni_health_server_port
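NOTE: as the unit tests above show, the health server can be poked without a
running daemon through Flask's test client; a minimal sketch, assuming the
module paths from this patch and a Linux host where pyroute2 can open an
IPDB:

    from ctypes import c_bool
    import multiprocessing

    from kuryr_kubernetes.cni import health

    healthy = multiprocessing.Value(c_bool, True)
    srv = health.CNIHealthServer(healthy)
    srv.application.testing = True
    client = srv.application.test_client()

    # 200 while every CNI component reports healthy, 500 otherwise.
    print(client.get('/alive').status_code)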