d8892d2e72
Our logs are awful and this commit attempts to fix some issues with them: * Make sure we always indicate why a readiness or liveness probe fails. * Suppress INFO logs from werkzeug (so that we don't see every probe call on INFO level). * Remove logging of successful probe checks. * Make watcher restart logs less scary and include more cases. * Add backoff to watcher restarts so that we don't spam logs when the K8s API is briefly unavailable. * Add warnings for low quotas. * Suppress some long logs on K8s healthz failures - we don't need the full message from K8s printed twice. I also refactored the CNI and controller health probe servers to make sure they're not duplicating code. Change-Id: Ia3db4863af8f28cfbaf2317042c8631cc63d9745
141 lines
5.2 KiB
Python
141 lines
5.2 KiB
Python
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from http import client as httplib
|
|
import os
|
|
|
|
from oslo_config import cfg
|
|
from oslo_log import log as logging
|
|
from pyroute2 import IPDB
|
|
|
|
from kuryr.lib._i18n import _
|
|
from kuryr_kubernetes.cni import utils
|
|
from kuryr_kubernetes import health as base_server
|
|
|
|
# Module-level logger and handle on the global oslo.config object.
LOG = logging.getLogger(__name__)
CONF = cfg.CONF

# Options registered below under the "cni_health_server" group.
cni_health_server_opts = [
    cfg.IntOpt(
        'port',
        default=8090,
        help=_('Port for CNI Health HTTP Server.'),
    ),
    # The liveness probe treats -1 as "no limit" and skips the memory check.
    cfg.IntOpt(
        'max_memory_usage',
        default=-1,
        help=_('Maximum memory usage (MiB) for CNI Health Server '
               'process. If this value is exceeded kuryr-daemon '
               'will be marked as unhealthy.'),
    ),
    cfg.StrOpt(
        'cg_path',
        default='/sys/fs/cgroup/memory/system.slice/kuryr-cni.service',
        help=_('sysfs path to the CNI cgroup. This is used for resource '
               'tracking and as such should point to the cgroup hierarchy '
               'leaf. It only applies when non containerized'),
    ),
]

CONF.register_opts(cni_health_server_opts, group="cni_health_server")
|
|
|
|
# Memory cgroup directory that is tracked when running inside a container.
TOP_CGROUP_MEMORY_PATH = '/sys/fs/cgroup/memory'
# File, inside a memory cgroup directory, holding memory plus swap usage.
MEMSW_FILENAME = 'memory.memsw.usage_in_bytes'
# Divisor used to convert a byte count into MiB.
BYTES_AMOUNT = 1024 * 1024
CAP_NET_ADMIN = 12  # Taken from linux/capabilities.h
# Prefix of the effective-capabilities line in a process status file.
EFFECTIVE_CAPS = 'CapEff:\t'
|
|
|
|
|
|
def _has_cap(capability, entry, proc_status_path='/proc/self/status'):
|
|
"""Returns true iff the process has the specified capability.
|
|
|
|
:param capability: the bit number for the capability to check as seen
|
|
in linux/capabilities.h.
|
|
:param entry: Whether to check CapInh, CapEff or CapBnd.
|
|
:param proc_status_path: Which process status should be checked. If none
|
|
is passed, it will check the current process.
|
|
:return: Whether the specified process has the capability bit set
|
|
"""
|
|
with open(proc_status_path, 'r') as pstat:
|
|
for line in pstat:
|
|
if line.startswith(entry):
|
|
caps = int(line[len(entry):], 16)
|
|
return (caps & (1 << capability)) != 0
|
|
|
|
|
|
def _get_cni_cgroup_path():
    """Return the memory cgroup directory to inspect for the CNI process.

    :return: the configured ``cni_health_server.cg_path`` when not running
             under a container runtime, otherwise the top memory cgroup
             path.
    """
    if not utils.running_under_container_runtime():
        return CONF.cni_health_server.cg_path

    # We are running inside a container. This means the root cgroup
    # is the one we need to track as it will be the CNI parent proc
    return TOP_CGROUP_MEMORY_PATH
|
|
|
|
|
|
def _get_memsw_usage(cgroup_mem_path):
    """Return the cgroup's memory plus swap usage converted to MiB.

    :param cgroup_mem_path: directory of the memory cgroup to inspect.
    :return: the value read from the usage file divided by BYTES_AMOUNT.
    """
    usage_file = os.path.join(cgroup_mem_path, MEMSW_FILENAME)
    with open(usage_file) as handle:
        usage_in_bytes = int(handle.read())
    return usage_in_bytes / BYTES_AMOUNT
|
|
|
|
|
|
class CNIHealthServer(base_server.BaseHealthServer):
    """Server used by readiness and liveness probes for CNI health checks.

    Readiness verifies the presence of the NET_ADMIN capability and the
    connectivity to the Kubernetes API. Liveness verifies that IPDB is in
    working order, that memory usage stays within the configured limit
    (when one is set) and that the CNI components report healthy.
    """

    def __init__(self, components_healthy):
        """Set up the health server on the configured port.

        :param components_healthy: shared flag object providing
                                   ``get_lock()`` and a ``value`` attribute
                                   that is falsy when CNI components are
                                   unhealthy.
        """
        super().__init__('daemon-health', CONF.cni_health_server.port)
        self._components_healthy = components_healthy

    def readiness_status(self):
        """Report whether the CNI daemon is ready.

        :return: tuple of (message, HTTP status code, empty dict); the
                 status is OK only when every check passes.
        """
        # Queried up front, before the capability check, so the call to the
        # base class helper always happens.
        k8s_conn = self.verify_k8s_connection()

        if not _has_cap(CAP_NET_ADMIN, EFFECTIVE_CAPS):
            error_message = 'NET_ADMIN capabilities not present.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}
        if not k8s_conn:
            error_message = 'K8s API healtz endpoint failed.'
            LOG.error(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        return 'ok', httplib.OK, {}

    def liveness_status(self):
        """Report whether the CNI daemon is alive.

        :return: tuple of (message, HTTP status code, empty dict); the
                 status is OK only when every check passes.
        """
        try:
            # Creating and releasing an IPDB instance is the whole check.
            with IPDB():
                pass
        except Exception:
            error_message = 'IPDB not in working order.'
            # Record the traceback so the log shows why IPDB failed.
            LOG.exception(error_message)
            return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        # A max_memory_usage of -1 means no limit is enforced.
        no_limit = -1
        if CONF.cni_health_server.max_memory_usage != no_limit:
            mem_usage = _get_memsw_usage(_get_cni_cgroup_path())

            if mem_usage > CONF.cni_health_server.max_memory_usage:
                error_message = 'CNI daemon exceeded maximum memory usage.'
                LOG.error(error_message)
                return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        with self._components_healthy.get_lock():
            if not self._components_healthy.value:
                error_message = 'Kuryr CNI components not healthy.'
                LOG.error(error_message)
                return error_message, httplib.INTERNAL_SERVER_ERROR, {}

        return 'ok', httplib.OK, {}
|