kuryr-kubernetes/kuryr_kubernetes/cni/health.py
Michał Dulko d8892d2e72 Civilize logging
Our logs are awful and this commit attempts to fix some issues with
them:
* Make sure we always indicate why a readiness or liveness probe
  fails.
* Suppress INFO logs from werkzeug (so that we don't see every probe
  call on INFO level).
* Remove logging of successful probe checks.
* Make watcher restart logs less scary and include more cases.
* Add backoff to watcher restarts so that we don't spam logs when K8s
  API is briefly unavailable.
* Add warnings for low quotas.
* Suppress some long logs on K8s healthz failures - we don't need full
  message from K8s printed twice.

I also refactored CNI and controller health probes servers to make sure
they're not duplicating code.

Change-Id: Ia3db4863af8f28cfbaf2317042c8631cc63d9745
2020-07-03 15:09:52 +02:00

141 lines
5.2 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from http import client as httplib
import os
from oslo_config import cfg
from oslo_log import log as logging
from pyroute2 import IPDB
from kuryr.lib._i18n import _
from kuryr_kubernetes.cni import utils
from kuryr_kubernetes import health as base_server
# Module-wide logger and the global oslo.config configuration object.
LOG = logging.getLogger(__name__)
CONF = cfg.CONF

# Options registered below under the "cni_health_server" group.
cni_health_server_opts = [
    cfg.IntOpt(
        'port',
        default=8090,
        help=_('Port for CNI Health HTTP Server.'),
    ),
    cfg.IntOpt(
        'max_memory_usage',
        default=-1,
        help=_('Maximum memory usage (MiB) for CNI Health Server '
               'process. If this value is exceeded kuryr-daemon '
               'will be marked as unhealthy.'),
    ),
    cfg.StrOpt(
        'cg_path',
        default='/sys/fs/cgroup/memory/system.slice/kuryr-cni.service',
        help=_('sysfs path to the CNI cgroup. This is used for resource '
               'tracking and as such should point to the cgroup hierarchy '
               'leaf. It only applies when non containerized'),
    ),
]

CONF.register_opts(cni_health_server_opts, "cni_health_server")
# Top-level directory of the memory cgroup hierarchy.
TOP_CGROUP_MEMORY_PATH = '/sys/fs/cgroup/memory'

# Name of the cgroup file holding memory plus swap usage, in bytes.
MEMSW_FILENAME = 'memory.memsw.usage_in_bytes'

# Number of bytes in one MiB; used to convert byte counts to MiB.
BYTES_AMOUNT = 1024 * 1024

# Bit number of CAP_NET_ADMIN, taken from linux/capabilities.h.
CAP_NET_ADMIN = 12

# Prefix of the effective-capabilities line in a process status file.
EFFECTIVE_CAPS = 'CapEff:\t'
def _has_cap(capability, entry, proc_status_path='/proc/self/status'):
    """Return whether a process has the specified capability.

    :param capability: the bit number for the capability to check as seen
                       in linux/capabilities.h.
    :param entry: Prefix of the status line to inspect, i.e. whether to
                  check CapInh, CapEff or CapBnd.
    :param proc_status_path: Which process status should be checked. If none
                             is passed, it will check the current process.
    :return: True if the capability bit is set in the first line starting
             with ``entry``; False if the bit is clear or if no line
             starts with ``entry``.
    :raises OSError: if the status file cannot be opened.
    :raises ValueError: if the text after ``entry`` is not a hex number.
    """
    with open(proc_status_path, 'r') as pstat:
        for line in pstat:
            if line.startswith(entry):
                # The rest of the line is the capability bitmask in hex.
                caps = int(line[len(entry):], 16)
                return (caps & (1 << capability)) != 0
    # No matching entry was found: report the capability as absent instead
    # of falling through and implicitly returning None.
    return False
def _get_cni_cgroup_path():
    """Return the memory cgroup directory to inspect for the CNI process.

    :return: the configured ``cni_health_server.cg_path`` when not running
             under a container runtime, otherwise the top-level memory
             cgroup path.
    """
    if not utils.running_under_container_runtime():
        return CONF.cni_health_server.cg_path
    # Inside a container the root cgroup is the one to track, as it will be
    # the CNI parent process.
    return TOP_CGROUP_MEMORY_PATH
def _get_memsw_usage(cgroup_mem_path):
    """Return the cgroup's memory plus swap usage, converted to MiB.

    :param cgroup_mem_path: directory containing the memsw usage file.
    :return: usage read from the file (bytes) divided by ``BYTES_AMOUNT``.
    """
    usage_file = os.path.join(cgroup_mem_path, MEMSW_FILENAME)
    with open(usage_file) as usage:
        raw_value = usage.read()
    return int(raw_value) / BYTES_AMOUNT
class CNIHealthServer(base_server.BaseHealthServer):
    """Server used by readiness and liveness probe to manage CNI health checks.

    Readiness verifies connectivity to the Kubernetes API and presence of
    the NET_ADMIN capability. Liveness verifies that IPDB is in working
    order, that memory usage stays below the configured maximum and that
    the CNI components report themselves healthy through the shared
    ``components_healthy`` flag.
    """

    def __init__(self, components_healthy):
        """Set up the health server on the configured port.

        :param components_healthy: shared flag object exposing ``get_lock()``
                                   and ``value``; a falsy ``value`` marks the
                                   CNI components as unhealthy.
        """
        super().__init__('daemon-health', CONF.cni_health_server.port)
        self._components_healthy = components_healthy

    def readiness_status(self):
        """Run the readiness checks.

        :return: a ``(message, HTTP status code, headers dict)`` tuple;
                 ``('ok', 200, {})`` when every check passes, otherwise the
                 failure reason with a 500 status code.
        """
        k8s_conn = self.verify_k8s_connection()

        if not _has_cap(CAP_NET_ADMIN, EFFECTIVE_CAPS):
            error_message = 'NET_ADMIN capabilities not present.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        if not k8s_conn:
            error_message = 'K8s API healtz endpoint failed.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        return 'ok', httplib.OK, {}

    def liveness_status(self):
        """Run the liveness checks.

        :return: a ``(message, HTTP status code, headers dict)`` tuple;
                 ``('ok', 200, {})`` when every check passes, otherwise the
                 failure reason with a 500 status code.
        """
        # Value of max_memory_usage that disables the memory check.
        no_limit = -1

        try:
            # Entering and leaving the context is enough to prove IPDB works.
            with IPDB():
                pass
        except Exception:
            error_message = 'IPDB not in working order.'
            # Log the traceback too, so the log shows why IPDB failed and
            # not only that it did.
            LOG.exception(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        max_memory_usage = CONF.cni_health_server.max_memory_usage
        if max_memory_usage != no_limit:
            mem_usage = _get_memsw_usage(_get_cni_cgroup_path())
            if mem_usage > max_memory_usage:
                error_message = 'CNI daemon exceeded maximum memory usage.'
                LOG.error(error_message)
                return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        # The flag is shared, so read it while holding its lock.
        with self._components_healthy.get_lock():
            if not self._components_healthy.value:
                error_message = 'Kuryr CNI components not healthy.'
                LOG.error(error_message)
                return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        return 'ok', httplib.OK, {}