openstack-helm-infra/kubernetes-node-problem-detector/values.yaml
Steve Wilkerson a31bb2b049 Add node-problem-detector chart
This adds a chart for the node problem detector. This chart
will help provide additional insight into the status of the
underlying infrastructure of a deployment.

Updated the chart with new yamllint checks.

Change-Id: I21a24b67b121388107b20ab38ac7703c7a33f1c1
Signed-off-by: Steve Wilkerson <sw5822@att.com>
2020-06-22 13:00:55 -05:00

466 lines
14 KiB
YAML

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values for node-exporter.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
---
images:
tags:
node_problem_detector: k8s.gcr.io/node-problem-detector:v0.7.0
dep_check: quay.io/airshipit/kubernetes-entrypoint:v1.0.0
image_repo_sync: docker.io/docker:17.07.0
pull_policy: IfNotPresent
local_registry:
active: false
exclude:
- dep_check
- image_repo_sync
labels:
node_problem_detector:
node_selector_key: openstack-control-plane
node_selector_value: enabled
job:
node_selector_key: openstack-control-plane
node_selector_value: enabled
pod:
security_context:
node_problem_detector:
container:
node_problem_detector:
privileged: true
affinity:
anti:
type:
default: preferredDuringSchedulingIgnoredDuringExecution
topologyKey:
default: kubernetes.io/hostname
mounts:
node_problem_detector:
node_problem_detector:
init_container: null
lifecycle:
upgrades:
daemonsets:
pod_replacement_strategy: RollingUpdate
node_problem_detector:
enabled: true
min_ready_seconds: 0
revision_history: 3
pod_replacement_strategy: RollingUpdate
rolling_update:
max_unavailable: 1
max_surge: 3
termination_grace_period:
node_problem_detector:
timeout: 30
resources:
enabled: false
node_problem_detector:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
jobs:
image_repo_sync:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "1024Mi"
cpu: "2000m"
tolerations:
node_problem_detector:
enabled: false
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
- key: node-role.kubernetes.io/node
operator: Exists
dependencies:
dynamic:
common:
local_image_registry:
jobs:
- node-exporter-image-repo-sync
services:
- endpoint: node
service: local_image_registry
static:
image_repo_sync:
services:
- endpoint: internal
service: local_image_registry
node_problem_detector:
services: null
monitoring:
prometheus:
pod:
enabled: true
service:
enabled: false
node_problem_detector:
scrape: true
port: 20257
endpoints:
cluster_domain_suffix: cluster.local
local_image_registry:
name: docker-registry
namespace: docker-registry
hosts:
default: localhost
internal: docker-registry
node: localhost
host_fqdn_override:
default: null
port:
registry:
node: 5000
node_problem_detector:
name: node-problem-detector
namespace: null
hosts:
default: node-problem-detector
host_fqdn_override:
default: null
path:
default: null
port:
metrics:
default: 20257
manifests:
configmap_bin: true
configmap_etc: true
daemonset: true
job_image_repo_sync: true
service: false
conf:
monitors:
system-log-monitor:
enabled:
- /config/kernel-monitor.json
- /config/docker-monitor.json
- /config/systemd-monitor.json
scripts:
enabled: null
source: null
config:
kernel-monitor:
plugin: kmsg
logPath: "/dev/kmsg"
lookback: 5m
bufferSize: 10
source: kernel-monitor
conditions:
- type: KernelDeadlock
reason: KernelHasNoDeadlock
message: kernel has no deadlock
- type: ReadonlyFilesystem
reason: FilesystemIsNotReadOnly
message: Filesystem is not read-only
rules:
- type: temporary
reason: OOMKilling
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
- type: temporary
reason: TaskHung
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
- type: temporary
reason: UnregisterNetDevice
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
- type: temporary
reason: KernelOops
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
- type: temporary
reason: KernelOops
pattern: 'divide error: 0000 \[#\d+\] SMP'
- type: permanent
condition: KernelDeadlock
reason: AUFSUmountHung
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
- type: permanent
condition: KernelDeadlock
reason: DockerHung
pattern: task docker:\w+ blocked for more than \w+ seconds\.
- type: permanent
condition: ReadonlyFilesystem
reason: FilesystemIsReadOnly
pattern: Remounting filesystem read-only
kernel-monitor-filelog:
plugin: filelog
pluginConfig:
timestamp: "^.{15}"
message: 'kernel: \[.*\] (.*)'
timestampFormat: Jan _2 15:04:05
logPath: "/var/log/kern.log"
lookback: 5m
bufferSize: 10
source: kernel-monitor
conditions:
- type: KernelDeadlock
reason: KernelHasNoDeadlock
message: kernel has no deadlock
rules:
- type: temporary
reason: OOMKilling
pattern: Kill process \d+ (.+) score \d+ or sacrifice child\nKilled process \d+
(.+) total-vm:\d+kB, anon-rss:\d+kB, file-rss:\d+kB.*
- type: temporary
reason: TaskHung
pattern: task \S+:\w+ blocked for more than \w+ seconds\.
- type: temporary
reason: UnregisterNetDevice
pattern: 'unregister_netdevice: waiting for \w+ to become free. Usage count = \d+'
- type: temporary
reason: KernelOops
pattern: 'BUG: unable to handle kernel NULL pointer dereference at .*'
- type: temporary
reason: KernelOops
pattern: 'divide error: 0000 \[#\d+\] SMP'
- type: permanent
condition: KernelDeadlock
reason: AUFSUmountHung
pattern: task umount\.aufs:\w+ blocked for more than \w+ seconds\.
- type: permanent
condition: KernelDeadlock
reason: DockerHung
pattern: task docker:\w+ blocked for more than \w+ seconds\.
kernel-monitor-counter:
plugin: custom
pluginConfig:
invoke_interval: 5m
timeout: 1m
max_output_length: 80
concurrency: 1
source: kernel-monitor
conditions:
- type: FrequentUnregisterNetDevice
reason: NoFrequentUnregisterNetDevice
message: node is functioning properly
rules:
- type: permanent
condition: FrequentUnregisterNetDevice
reason: UnregisterNetDevice
path: "/home/kubernetes/bin/log-counter"
args:
- "--journald-source=kernel"
- "--log-path=/var/log/journal"
- "--lookback=20m"
- "--count=3"
- "--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count
= \\d+"
timeout: 1m
docker-monitor:
plugin: journald
pluginConfig:
source: dockerd
logPath: "/var/log/journal"
lookback: 5m
bufferSize: 10
source: docker-monitor
conditions: []
rules:
- type: temporary
reason: CorruptDockerImage
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
/var/lib/docker/image/(.+): directory not empty.*'
docker-monitor-filelog:
plugin: filelog
pluginConfig:
timestamp: ^time="(\S*)"
message: |-
msg="([^
]*)"
timestampFormat: '2006-01-02T15:04:05.999999999-07:00'
logPath: "/var/log/docker.log"
lookback: 5m
bufferSize: 10
source: docker-monitor
conditions: []
rules:
- type: temporary
reason: CorruptDockerImage
pattern: 'Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+)
/var/lib/docker/image/(.+): directory not empty.*'
docker-monitor-counter:
plugin: custom
pluginConfig:
invoke_interval: 5m
timeout: 1m
max_output_length: 80
concurrency: 1
source: docker-monitor
conditions:
- type: CorruptDockerOverlay2
reason: NoCorruptDockerOverlay2
message: docker overlay2 is functioning properly
rules:
- type: permanent
condition: CorruptDockerOverlay2
reason: CorruptDockerOverlay2
path: "/home/kubernetes/bin/log-counter"
args:
- "--journald-source=dockerd"
- "--log-path=/var/log/journal"
- "--lookback=5m"
- "--count=10"
- "--pattern=returned error: readlink /var/lib/docker/overlay2.*: invalid argument.*"
timeout: 1m
systemd-monitor:
plugin: journald
pluginConfig:
source: systemd
logPath: "/var/log/journal"
lookback: ''
bufferSize: 10
source: systemd-monitor
conditions: []
rules:
- type: temporary
reason: KubeletStart
pattern: Started Kubernetes kubelet.
- type: temporary
reason: DockerStart
pattern: Starting Docker Application Container Engine...
- type: temporary
reason: ContainerdStart
pattern: Starting containerd container runtime...
systemd-monitor-counter:
plugin: custom
pluginConfig:
invoke_interval: 5m
timeout: 1m
max_output_length: 80
concurrency: 1
source: systemd-monitor
conditions:
- type: FrequentKubeletRestart
reason: NoFrequentKubeletRestart
message: kubelet is functioning properly
- type: FrequentDockerRestart
reason: NoFrequentDockerRestart
message: docker is functioning properly
- type: FrequentContainerdRestart
reason: NoFrequentContainerdRestart
message: containerd is functioning properly
rules:
- type: permanent
condition: FrequentKubeletRestart
reason: FrequentKubeletRestart
path: "/home/kubernetes/bin/log-counter"
args:
- "--journald-source=systemd"
- "--log-path=/var/log/journal"
- "--lookback=20m"
- "--delay=5m"
- "--count=5"
- "--pattern=Started Kubernetes kubelet."
timeout: 1m
- type: permanent
condition: FrequentDockerRestart
reason: FrequentDockerRestart
path: "/home/kubernetes/bin/log-counter"
args:
- "--journald-source=systemd"
- "--log-path=/var/log/journal"
- "--lookback=20m"
- "--count=5"
- "--pattern=Starting Docker Application Container Engine..."
timeout: 1m
- type: permanent
condition: FrequentContainerdRestart
reason: FrequentContainerdRestart
path: "/home/kubernetes/bin/log-counter"
args:
- "--journald-source=systemd"
- "--log-path=/var/log/journal"
- "--lookback=20m"
- "--count=5"
- "--pattern=Starting containerd container runtime..."
timeout: 1m
custom-plugin-monitor:
enabled:
- /config/network-problem-monitor.json
scripts:
enabled:
- network_problem.sh
source:
network_problem.sh: |
#!/bin/bash
# This plugin checks for common network issues. Currently, it only checks
# if the conntrack table is full.
OK=0
NONOK=1
UNKNOWN=2
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_max ] || exit $UNKNOWN
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_count ] || exit $UNKNOWN
conntrack_max=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_max)
conntrack_count=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count)
if (( conntrack_count >= conntrack_max )); then
echo "Conntrack table full"
exit $NONOK
fi
echo "Conntrack table available"
exit $OK
config:
network-problem-monitor:
plugin: custom
pluginConfig:
invoke_interval: 30s
timeout: 5s
max_output_length: 80
concurrency: 3
source: network-custom-plugin-monitor
conditions: []
rules:
- type: temporary
reason: ConntrackFull
path: "./config/plugin/network_problem.sh"
timeout: 3s
system-stats-monitor:
enabled:
- /config/system-stats-monitor.json
scripts:
enabled: null
source: null
config:
system-stats-monitor:
disk:
metricsConfigs:
disk/io_time:
displayName: disk/io_time
disk/weighted_io:
displayName: disk/weighted_io
disk/avg_queue_len:
displayName: disk/avg_queue_len
includeRootBlk: true
includeAllAttachedBlk: true
lsblkTimeout: 5s
invokeInterval: 60s
...