
The collectd cpu plugin and monitor-tools are updated to support
diagnosing high CPU usage at shorter time scales. This includes tools
that help System Engineering determine where CPU time is coming from.

The collectd cpu plugin is updated to support Kubernetes services under
system.slice or k8splatform.slice. The read-function sampling frequency
is changed to 1 second, so we now see logs with instantaneous CPU
spikes at the cgroup level. The dispatch of results still occurs at the
original plugin interval of 30 seconds.

The logging of the 1-second samples is configurable via the
/etc/collectd.d/starlingx/python_plugins.conf field 'hires = <true|false>'.
The high-resolution samples are always collected and used for a
histogram, but it is not always desired to log them due to the volume
of output.

This adds new logs for occupancy wait. This is similar to cpu
occupancy, but instead of the real time used, it measures the aggregate
percent of time a given cgroup is waiting to schedule. This is a
measure of CPU contention.

This adds new logs for occupancy histograms for all cgroups and for
aggregated groupings, based on the 1-second occupancy samples. The
histograms are displayed in hirunner order, along with the mean, the
95th percentile, and the max value. The histograms are logged at
5-minute intervals.

This reduces the collectd cgroup CPUShares from 1024 to 256. This
smooths out the behaviour of poorly behaved audits.

The 'schedtop' tool is updated to display a 'cgroup' field. This is the
systemd cgroup name, or the abbreviated pod name. It also handles the
kernel sched output format changes in 6.6.

New tool 'portscanner' is added to monitor-tools to diagnose local host
processes that are using specific ports. This has been instrumental in
discovering gunicorn/keystone API users.

New tool 'k8smetrics' is added to monitor-tools to display the delay
histogram and percentiles for kube-apiserver and etcdserver. This gives
a way to quantify performance as a result of system load.

Partial-Bug: 2084714

TEST PLAN: AIO-SX, AIO-DX, Standard, Storage, DC:
PASS: Fresh install ISO
PASS: Verify /var/log/collectd.logs for 1 second cpu/wait logs, and
      that they contain the etcd, kubelet, and containerd services.
PASS: Verify we are dispatching at 30 second granularity.
PASS: Verify we are displaying histograms every 5 minutes.
PASS: Verify we can enable/disable the display of high-resolution logs
      with /etc/collectd.d/starlingx/python_plugins.conf field
      'hires = <true|false>'.
PASS: Verify schedtop contains 'cgroup' output.
PASS: Verify output from 'k8smetrics'. Cross check against Prometheus
      GUI for apiserver percentiles.
PASS: Verify output from portscanner with port 5000. Verify 1-to-1
      mapping against /var/log/keystone/keystone-all.log.

Change-Id: I82d4f414afdf1cecbcc99680b360cbad702ba140
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
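For illustration, a typical invocation of the new k8smetrics tool and the
'hires' toggle might look like the following (the flag names come from the
script below; the installed command name/path may differ):

    # Report kube-apiserver/etcd latency percentiles once per minute for 5 minutes
    k8smetrics --period_min 5 --interval_min 1

    # /etc/collectd.d/starlingx/python_plugins.conf (only the relevant field shown)
    hires = true
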
#!/usr/bin/env python
########################################################################
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# Calculate Kubernetes latency percentile metrics (50%, 95%, and 99%) for
# etcdserver and kube-apiserver. This is based on Prometheus-format raw
# metrics histograms within kube-apiserver.
#
# This obtains the current Kubernetes raw metrics cumulative counters
# (e.g., kubectl get --raw /metrics). The counters represent the cumulative
# frequency of delays <= value. This calculates the delta from the previous
# snapshot and then does the percentile calculation.
#
# Example:
# kubectl get --raw /metrics
#
# To see API calls:
# kubectl get --raw /metrics -v 6
#
# This does minimal parsing and aggregation to yield the equivalent of the
# following Prometheus PromQL queries using data over a time-window:
# histogram_quantile(0.95, sum(rate(etcd_request_duration_seconds_bucket[5m])) by (le))
# histogram_quantile(0.95, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"CONNECT|WATCH|WATCHLIST|PROXY"}[5m])) by (le))
# histogram_quantile(0.95, sum(rate(workqueue_queue_duration_seconds_bucket[5m])) by (le))
# histogram_quantile(0.95, sum(rate(rest_client_request_duration_seconds_bucket[5m])) by (le))
#
# Specific verbs are excluded to eliminate tooling anomalies, otherwise
# histograms are polluted with >>40 second delays.
#
# TODO(jgauld): Migrate code to use the prometheus_client API;
# prometheus_client is not currently installed.
#
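
# Illustrative walk-through of the delta calculation (hypothetical numbers):
# if the cumulative counter for the le="0.1" bucket moves from 100 at t0 to
# 180 at t1, and the le="+Inf" counter moves from 160 to 280, then 80 of the
# 120 requests completed during that interval took <= 0.1s; repeating the
# delta for every bucket yields a per-interval frequency histogram from which
# the percentiles below are interpolated.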

import argparse
from copy import deepcopy
from datetime import datetime, timedelta
import logging
import logging.handlers
import os
import pprint
import re
import subprocess
import sys
import tempfile
import time

LOG = logging.getLogger(__name__)

KUBECONFIG = '/etc/kubernetes/admin.conf'
re_bucket = re.compile(r'^([a-zA-Z0-9:_]+)_bucket{(.*)}\s+(\d+)')
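
# For reference, the Prometheus exposition lines that re_bucket matches look
# roughly like this (label values are illustrative):
#   apiserver_request_duration_seconds_bucket{component="apiserver",verb="GET",le="0.25"} 123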

def get_raw_metrics(rawfile=None):
    if rawfile is None:
        fd, rawfile = tempfile.mkstemp(dir='/tmp', prefix='k8s-prom-raw-', suffix='.log')
        with os.fdopen(fd, 'w') as f:
            cmd = ['kubectl', '--kubeconfig={}'.format(KUBECONFIG),
                   'get', '--raw', '/metrics']
            try:
                subprocess.check_call(cmd, stdout=f, timeout=5)
            except subprocess.TimeoutExpired as e:
                LOG.error('get_raw_metrics: error=%s' % (str(e)))
            except subprocess.CalledProcessError as e:
                LOG.error('get_raw_metrics: error=%s' % (str(e)))
            except Exception as e:
                LOG.error('get_raw_metrics: error=%s' % (str(e)))
    return rawfile


def read_raw_metrics(rawfile=None):
    patterns = {
        'apiserver_request_duration_seconds': {'exclude_verbs': ['CONNECT', 'WATCH', 'WATCHLIST', 'PROXY']},
        'etcd_request_duration_seconds': {},
        'workqueue_queue_duration_seconds': {},
        'rest_client_request_duration_seconds': {},
    }
    names = patterns.keys()

    # Store aggregate bucket values metric[name][le]
    metrics = {}
    for name in names:
        metrics[name] = {}

    cleanup = False
    if rawfile is None:
        cleanup = True
        rawfile = get_raw_metrics()

    with open(rawfile) as f:
        for l in f:
            if l.startswith(tuple(names)):
                # THIS IS TOO VERBOSE FOR TYPICAL DEBUG
                #LOG.debug(l.rstrip())

                match = re_bucket.search(l)
                if match:
                    name = match.group(1)
                    tags = match.group(2)
                    count = int(match.group(3))

                    D = {}
                    for key_value in tags.split(','):
                        key, value = key_value.split('=')
                        value = value.replace('"', '')
                        D.update({key: value})

                    # make sure we have a valid "le" bucket
                    bucket = D.get('le')
                    if bucket is None:
                        continue

                    # filter out specific verbs
                    exclude_verbs = patterns[name].get('exclude_verbs', {})
                    if 'verb' in D and D['verb'] in exclude_verbs:
                        continue

                    # Aggregate metric for matching name and "le" bucket
                    if bucket not in metrics[name]:
                        metrics[name][bucket] = 0
                    metrics[name][bucket] += count

    if cleanup:
        os.unlink(rawfile)

    return metrics


def percentile(hist, q=0.95):
    # Input: dictionary hist[le_bin] = freq

    # these are sorted
    le_bins = sorted(list(hist.keys()), key=float)

    # Calculate number of binned samples
    count = 0
    for x in le_bins:
        count += hist[x]

    p0 = 0.0
    x0 = 0.0

    for x in le_bins:
        x1 = float(x)
        p = float(hist[x]) / float(count)
        p1 = p0 + p
        if p1 >= q:
            percentile = x0 + (x1 - x0) * (q - p0) / (p1 - p0)
            break
        p0 = p1
        x0 = x1
        percentile = x1

    return percentile
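
# Illustrative sanity check for percentile() (hypothetical histogram): with
# hist = {'0.1': 80, '0.5': 38, '1.0': 2} and q=0.95, the cumulative fraction
# reaches 0.95 within the (0.1, 0.5] bucket, and linear interpolation between
# the bucket bounds gives roughly 0.46 seconds.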


def k8smetrics(args=None):
    # Read prometheus raw metrics snapshot at time t1
    now = datetime.now()
    tstamp1 = now
    t1 = read_raw_metrics()
    if args.debug:
        LOG.debug("t1:")
        pprint.pprint(t1, indent=1)

    start_time = now
    while now - start_time < timedelta(minutes=args.period_min):
        # Copy all state information for time t0
        t0 = deepcopy(t1)
        tstamp0 = tstamp1

        time.sleep(args.interval_min*60)

        # Read prometheus raw metrics snapshot at time t1
        now = datetime.now()
        tstamp1 = now
        t1 = read_raw_metrics()
        if args.debug:
            LOG.debug("t1:")
            pprint.pprint(t1, indent=1)

        # Print tool header for this interval
        duration = tstamp1 - tstamp0
        LOG.info('Samples from: %s - %s, duration: %s'
                 % (tstamp0, tstamp1, duration))

        # Calculate delta between cumulative snapshots
        delta = {}
        for name in t1.keys():
            delta[name] = {}
            for bucket in t1[name]:
                v0 = t0[name].get(bucket, 0)
                delta[name][bucket] = t1[name][bucket] - v0

        # NOTE: le="+Inf" is identical to value of x_count
        # le="y" is upper-bound of the bucket
        hist = {}
        for name in delta.keys():
            hist[name] = {}
            inf = delta[name].pop('+Inf', None)
            if inf is None:
                continue
            buckets = sorted(list(delta[name].keys()), key=float)

            # Calculate frequency distribution from cumulative frequency
            maxbin = 0.0
            v0 = 0
            for x in buckets:
                v = delta[name][x]
                d = v - v0
                # in the case of an anomalous value (yeah, we're going crazy)
                if d < 0:
                    if args.debug:
                        LOG.debug('d<0: x=%s, v0=%s, v=%s, d=%s, inf=%s' % (x, v0, v, d, inf))
                    d = 0
                if d > 0:
                    maxbin = float(x)
                v0 = v
                hist[name][x] = d

            index = name.rfind('_seconds')
            text = name[:index]
            percentile_50 = 1000.0*percentile(hist[name], q=0.50)
            percentile_95 = 1000.0*percentile(hist[name], q=0.95)
            percentile_99 = 1000.0*percentile(hist[name], q=0.99)

            # Print histogram summary and percentiles for each metric
            print("{} : count: {}, p50: {:.0f} ms, p95: {:.0f} ms, p99: {:.0f} ms, maxbin: {:.0f} ms".format(
                text, inf, percentile_50, percentile_95, percentile_99, 1000.0*maxbin))
            print('bins:', end=' ')
            [print('{0:5g}'.format(1000.0*float(x)), end=' ') for x in buckets]
            print()
            print(' <=:', end=' ')
            [print('{0:5.0f}'.format(delta[name][x]), end=' ') for x in buckets]
            print()
            print('hist:', end=' ')
            [print('{0:5.0f}'.format(hist[name][x]), end=' ') for x in buckets]
            print()

            # blank line between metrics
            print()

    return 0
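
# Example of the per-interval summary printed above (all numbers hypothetical):
#   apiserver_request_duration : count: 1246, p50: 9 ms, p95: 54 ms, p99: 159 ms, maxbin: 500 ms
#   bins:     5    10    25    50   100   250   500
#    <=:   312   705  1034  1180  1228  1242  1246
#   hist:   312   393   329   146    48    14     4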


def main():
    # Instantiate the parser
    parser = argparse.ArgumentParser(description='Kubernetes latency percentile metrics')

    # Optional arguments
    parser.add_argument('--period_min', type=int, default=1,
                        help='sampling period in minutes')
    parser.add_argument('--interval_min', type=int, default=1,
                        help='sampling interval in minutes')
    parser.add_argument('--debug', action='store_true',
                        help='enable tool debug')

    args = parser.parse_args()

    # Configure logging
    if args.debug:
        level = logging.DEBUG
    else:
        level = logging.INFO
    out_hdlr = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        '%(asctime)s %(process)s %(levelname)s %(module)s: %(message)s')
    out_hdlr.setFormatter(formatter)
    out_hdlr.setLevel(level)
    LOG.addHandler(out_hdlr)
    LOG.setLevel(level)

    LOG.info("Kubernetes latency percentiles: period:%s mins, interval=%s mins",
             args.period_min, args.interval_min)

    try:
        ret = k8smetrics(args=args)
        sys.exit(ret)

    except KeyboardInterrupt as e:
        LOG.info('caught: %r, shutting down', e)
        sys.exit(0)

    except IOError:
        sys.exit(0)

    except Exception as e:
        LOG.error('exception: %r', e, exc_info=1)
        sys.exit(-4)


if __name__ == '__main__':
    main()