Handle kube ApiException during collectd platform monitoring
During stress testing or high platform load, it is possible that the kube-apiserver responds with a kube ApiException. As platform monitoring of CPU and memory should not be affected by an unresponsive kube-apiserver, allow the kube ApiException to be handled and the remaining platform resource utilization monitoring to proceed. This could help identify the issue by allowing the platform alarms to be raised (e.g. 100.101 Platform CPU threshold exceeded, 100.103 Memory threshold exceeded). Verified: o Platform CPU Alarm is raised with stress test o Platform CPU Alarm is raised with stress test and intermittent ApiException o Memory Alarm is raised with stress test o Memory Alarm is raised with stress test and intermittent ApiException o the above alarm conditions are cleared after debounce when stress condition is removed Closes-Bug: 1939172 Signed-off-by: John Kung <john.kung@windriver.com> Change-Id: I2c9c39a390af1d7ae752ad00db18384479cf6e99
This commit is contained in:
parent
aa8665cebf
commit
ecd744ba0a
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (c) 2018-2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2018-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -24,6 +24,8 @@ import socket
|
|||
import time
|
||||
import tsconfig.tsconfig as tsc
|
||||
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
PLUGIN = 'platform cpu usage plugin'
|
||||
PLUGIN_DEBUG = 'DEBUG platform cpu'
|
||||
|
||||
|
@ -257,38 +259,44 @@ def update_cpu_data(init=False):
|
|||
if obj.debug:
|
||||
collectd.info('%s Refresh k8s pod information.' % (PLUGIN_DEBUG))
|
||||
obj.k8s_pods = set()
|
||||
pods = obj._k8s_client.kube_get_local_pods()
|
||||
for i in pods:
|
||||
# NOTE: parent pod cgroup name contains annotation config.hash as
|
||||
# part of its name, otherwise it contains the pod uid.
|
||||
uid = i.metadata.uid
|
||||
if ((i.metadata.annotations) and
|
||||
(pc.POD_ANNOTATION_KEY in i.metadata.annotations)):
|
||||
hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY,
|
||||
None)
|
||||
if hash_uid:
|
||||
if obj.debug:
|
||||
collectd.info('%s POD_ANNOTATION_KEY: '
|
||||
'hash=%s, uid=%s, '
|
||||
'name=%s, namespace=%s, qos_class=%s'
|
||||
% (PLUGIN_DEBUG,
|
||||
hash_uid,
|
||||
i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class))
|
||||
uid = hash_uid
|
||||
try:
|
||||
pods = obj._k8s_client.kube_get_local_pods()
|
||||
for i in pods:
|
||||
# NOTE: parent pod cgroup name contains annotation config.hash as
|
||||
# part of its name, otherwise it contains the pod uid.
|
||||
uid = i.metadata.uid
|
||||
if ((i.metadata.annotations) and
|
||||
(pc.POD_ANNOTATION_KEY in i.metadata.annotations)):
|
||||
hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY,
|
||||
None)
|
||||
if hash_uid:
|
||||
if obj.debug:
|
||||
collectd.info('%s POD_ANNOTATION_KEY: '
|
||||
'hash=%s, uid=%s, '
|
||||
'name=%s, namespace=%s, qos_class=%s'
|
||||
% (PLUGIN_DEBUG,
|
||||
hash_uid,
|
||||
i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class))
|
||||
uid = hash_uid
|
||||
|
||||
obj.k8s_pods.add(uid)
|
||||
if uid not in obj._cache:
|
||||
obj._cache[uid] = pc.POD_object(i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class)
|
||||
# Remove stale _cache entries
|
||||
remove_uids = set(obj._cache.keys()) - obj.k8s_pods
|
||||
for uid in remove_uids:
|
||||
del obj._cache[uid]
|
||||
obj.k8s_pods.add(uid)
|
||||
if uid not in obj._cache:
|
||||
obj._cache[uid] = pc.POD_object(i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class)
|
||||
|
||||
# Remove stale _cache entries
|
||||
remove_uids = set(obj._cache.keys()) - obj.k8s_pods
|
||||
for uid in remove_uids:
|
||||
del obj._cache[uid]
|
||||
except ApiException:
|
||||
# continue with remainder of calculations, keeping cache
|
||||
collectd.warning("cpu plugin encountered kube ApiException")
|
||||
pass
|
||||
|
||||
# Save initial state information
|
||||
if init:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (c) 2018-2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2018-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -25,6 +25,8 @@ import re
|
|||
import socket
|
||||
import time
|
||||
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
PLUGIN = 'platform memory usage'
|
||||
PLUGIN_NORM = '4K memory usage'
|
||||
PLUGIN_NUMA = '4K numa memory usage'
|
||||
|
@ -425,38 +427,44 @@ def read_func():
|
|||
if obj.debug:
|
||||
collectd.info('%s: Refresh k8s pod information.' % (PLUGIN_DEBUG))
|
||||
obj.k8s_pods = set()
|
||||
pods = obj._k8s_client.kube_get_local_pods()
|
||||
for i in pods:
|
||||
# NOTE: parent pod cgroup name contains annotation config.hash as
|
||||
# part of its name, otherwise it contains the pod uid.
|
||||
uid = i.metadata.uid
|
||||
if ((i.metadata.annotations) and
|
||||
(pc.POD_ANNOTATION_KEY in i.metadata.annotations)):
|
||||
hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY,
|
||||
None)
|
||||
if hash_uid:
|
||||
if obj.debug:
|
||||
collectd.info('%s: POD_ANNOTATION_KEY: '
|
||||
'hash=%s, uid=%s, '
|
||||
'name=%s, namespace=%s, qos_class=%s'
|
||||
% (PLUGIN_DEBUG,
|
||||
hash_uid,
|
||||
i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class))
|
||||
uid = hash_uid
|
||||
try:
|
||||
pods = obj._k8s_client.kube_get_local_pods()
|
||||
for i in pods:
|
||||
# NOTE: parent pod cgroup name contains annotation config.hash as
|
||||
# part of its name, otherwise it contains the pod uid.
|
||||
uid = i.metadata.uid
|
||||
if ((i.metadata.annotations) and
|
||||
(pc.POD_ANNOTATION_KEY in i.metadata.annotations)):
|
||||
hash_uid = i.metadata.annotations.get(pc.POD_ANNOTATION_KEY,
|
||||
None)
|
||||
if hash_uid:
|
||||
if obj.debug:
|
||||
collectd.info('%s: POD_ANNOTATION_KEY: '
|
||||
'hash=%s, uid=%s, '
|
||||
'name=%s, namespace=%s, qos_class=%s'
|
||||
% (PLUGIN_DEBUG,
|
||||
hash_uid,
|
||||
i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class))
|
||||
uid = hash_uid
|
||||
|
||||
obj.k8s_pods.add(uid)
|
||||
if uid not in obj._cache:
|
||||
obj._cache[uid] = pc.POD_object(i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class)
|
||||
# Remove stale _cache entries
|
||||
remove_uids = set(obj._cache.keys()) - obj.k8s_pods
|
||||
for uid in remove_uids:
|
||||
del obj._cache[uid]
|
||||
obj.k8s_pods.add(uid)
|
||||
if uid not in obj._cache:
|
||||
obj._cache[uid] = pc.POD_object(i.metadata.uid,
|
||||
i.metadata.name,
|
||||
i.metadata.namespace,
|
||||
i.status.qos_class)
|
||||
|
||||
# Remove stale _cache entries
|
||||
remove_uids = set(obj._cache.keys()) - obj.k8s_pods
|
||||
for uid in remove_uids:
|
||||
del obj._cache[uid]
|
||||
except ApiException:
|
||||
# continue with remainder of calculations, keeping cache
|
||||
collectd.warning("memory plugin encountered kube ApiException")
|
||||
pass
|
||||
|
||||
# Summarize memory usage for various groupings
|
||||
for g in pc.OVERALL_GROUPS:
|
||||
|
|
Loading…
Reference in New Issue