# kuryr-kubernetes/kuryr_kubernetes/cni/health.py
#
# 130 lines
# 4.7 KiB
# Python

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import os
import psutil
import requests
from six.moves import http_client as httplib
import subprocess
from flask import Flask
from pyroute2 import IPDB
from kuryr.lib._i18n import _
from oslo_config import cfg
from oslo_log import log as logging
LOG = logging.getLogger(__name__)
CONF = cfg.CONF

# Options registered under the [cni_health_server] configuration group.
cni_health_server_opts = [
    cfg.IntOpt(
        'port',
        default=8090,
        help=_('Port for CNI Health HTTP Server.'),
    ),
    # The default of -1 is the value the liveness check treats as "no limit".
    cfg.IntOpt(
        'max_memory_usage',
        default=-1,
        help=_(
            'Maximum memory usage (MiB) for CNI Health Server process. '
            'If this value is exceeded kuryr-daemon will be marked as '
            'unhealthy.'
        ),
    ),
]
CONF.register_opts(cni_health_server_opts, group="cni_health_server")

# Bytes per MiB; used to convert the process RSS into MiB.
BYTES_AMOUNT = 1024 * 1024
class CNIHealthServer(object):
    """Serve the readiness and liveness probes of the CNI daemon.

    ``/ready`` verifies that the NET_ADMIN capability is present and that
    the Kubernetes API ``/healthz`` endpoint answers ``ok``. ``/alive``
    verifies that IPDB can be instantiated, that the process memory usage
    stays within the configured limit and that the shared components-health
    flag is set.
    """

    # Seconds to wait for the Kubernetes API before giving up. Without a
    # timeout a hung API server would block the probe handler forever.
    K8S_HEALTHZ_TIMEOUT = 5

    def __init__(self, components_healthy):
        """Create the Flask application and register the probe endpoints.

        :param components_healthy: shared object exposing ``get_lock()``
            and ``value``; a falsy ``value`` makes the liveness probe fail.
        """
        self.ctx = None
        self._components_healthy = components_healthy
        self.application = Flask('cni-health-daemon')
        self.application.add_url_rule(
            '/ready', methods=['GET'], view_func=self.readiness_status)
        self.application.add_url_rule(
            '/alive', methods=['GET'], view_func=self.liveness_status)
        self.headers = {'Connection': 'close'}

    def readiness_status(self):
        """Handle ``GET /ready``.

        :returns: ``(body, status, headers)`` tuple. The status is 200 only
            when NET_ADMIN is present and the Kubernetes API is healthy.
        """
        # Constant command without external input; shell=True is needed
        # only for the pipeline syntax.
        net_admin_command = 'capsh --print | grep "Current:" | ' \
                            'cut -d" " -f3 | grep -q cap_net_admin'
        return_code = subprocess.call(net_admin_command, shell=True)
        if return_code != 0:
            error_message = 'NET_ADMIN capabilities not present.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers

        # The API is queried only after the local check has passed, so a
        # missing capability is reported without a network round trip.
        k8s_conn, k8s_status = self.verify_k8s_connection()
        if not k8s_conn:
            error_message = 'Error when processing k8s healthz request.'
            LOG.error(error_message)
            # A failed check must never be answered with a success status,
            # otherwise the probe would still pass (e.g. a 200 response
            # whose body is not "ok").
            if k8s_status < httplib.BAD_REQUEST:
                k8s_status = httplib.INTERNAL_SERVER_ERROR
            return error_message, k8s_status, self.headers

        LOG.info('CNI driver readiness verified.')
        return 'ok', httplib.OK, self.headers

    def liveness_status(self):
        """Handle ``GET /alive``.

        :returns: ``(body, status, headers)`` tuple. The status is 200 only
            when IPDB works, memory usage is within the configured limit
            and the components-health flag is set.
        """
        no_limit = -1
        try:
            # Creating an IPDB instance is the check itself; the context
            # manager releases it on exit.
            with IPDB():
                pass
        except Exception:
            error_message = 'IPDB not in working order.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, self.headers

        max_memory_usage = CONF.cni_health_server.max_memory_usage
        if max_memory_usage != no_limit:
            # Force gc to release unreferenced memory before actually
            # checking the memory.
            gc.collect()
            process = psutil.Process(os.getpid())
            mem_usage = process.memory_info().rss / BYTES_AMOUNT
            if mem_usage > max_memory_usage:
                err_message = 'CNI daemon exceeded maximum memory usage.'
                LOG.error(err_message)
                return (err_message, httplib.INTERNAL_SERVER_ERROR,
                        self.headers)

        with self._components_healthy.get_lock():
            if not self._components_healthy.value:
                err_message = 'Kuryr CNI components not healthy.'
                LOG.error(err_message)
                return (err_message, httplib.INTERNAL_SERVER_ERROR,
                        self.headers)

        LOG.debug('Kuryr CNI Liveness verified.')
        return 'ok', httplib.OK, self.headers

    def run(self):
        """Start the Flask server on the configured port.

        An empty host string is passed to Flask. Any exception raised
        while running the server is logged and re-raised.
        """
        address = ''
        try:
            LOG.info('Starting CNI health check server.')
            self.application.run(address, CONF.cni_health_server.port)
        except Exception:
            LOG.exception('Failed to start CNI health check server.')
            raise

    def verify_k8s_connection(self):
        """Query the Kubernetes API ``/healthz`` endpoint.

        :returns: ``(healthy, status_code)``. ``healthy`` is True only when
            the response body is exactly ``ok``. A failed request is
            reported as ``(False, 500)`` instead of raising.
        """
        url = CONF.kubernetes.api_root + '/healthz'
        try:
            resp = requests.get(url, headers={'Connection': 'close'},
                                timeout=self.K8S_HEALTHZ_TIMEOUT)
        except requests.exceptions.RequestException:
            LOG.exception('Unable to query Kubernetes API at %s.', url)
            return False, httplib.INTERNAL_SERVER_ERROR
        # resp.content is bytes; comparing it with a text literal is always
        # False on Python 3, so compare against a bytes literal instead.
        return resp.content == b'ok', resp.status_code