[fedora-atomic-k8s] Adding Node Problem Detector

Deploying Node Problem Detector to all nodes to detect problems which
can be leverage by auto healing. This is the first step of enabling
the auto healing feature.

Task: 29886
Story: 2004782

Change-Id: I1b6075025c5f369821b4136783e68b16535dc6ef
This commit is contained in:
Feilong Wang 2019-03-08 14:52:15 +13:00
parent 722fc56eb3
commit c39f1150e5
6 changed files with 136 additions and 1 deletions

View File

@ -0,0 +1,117 @@
#!/bin/sh
step="enable-auto-healing"
printf "Starting to run ${step}\n"
. /etc/sysconfig/heat-params
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}
# Generate Node Problem Detector manifest file
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml
[ -f ${NPD_DEPLOY} ] || {
echo "Writing File: $NPD_DEPLOY"
mkdir -p $(dirname ${NPD_DEPLOY})
cat << EOF > ${NPD_DEPLOY}
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-problem-detector
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: npd-binding
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:node-problem-detector
subjects:
- kind: ServiceAccount
name: node-problem-detector
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: npd
namespace: kube-system
labels:
k8s-app: node-problem-detector
version: ${NODE_PROBLEM_DETECTOR_TAG}
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
spec:
selector:
matchLabels:
k8s-app: node-problem-detector
version: ${NODE_PROBLEM_DETECTOR_TAG}
template:
metadata:
labels:
k8s-app: node-problem-detector
version: ${NODE_PROBLEM_DETECTOR_TAG}
kubernetes.io/cluster-service: "true"
spec:
containers:
- name: node-problem-detector
image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
command:
- "/bin/sh"
- "-c"
# Pass both config to support both journald and syslog.
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
securityContext:
privileged: true
resources:
limits:
cpu: "200m"
memory: "100Mi"
requests:
cpu: "20m"
memory: "20Mi"
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: log
mountPath: /var/log
- name: localtime
mountPath: /etc/localtime
readOnly: true
volumes:
- name: log
hostPath:
path: /var/log/
- name: localtime
hostPath:
path: /etc/localtime
type: "FileOrCreate"
serviceAccountName: node-problem-detector
tolerations:
- operator: "Exists"
effect: "NoExecute"
- key: "CriticalAddonsOnly"
operator: "Exists"
EOF
}
echo "Waiting for Kubernetes API..."
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
do
sleep 5
done
kubectl apply -f ${NPD_DEPLOY}
printf "Finished running ${step}\n"

View File

@ -86,3 +86,4 @@ write_files:
TILLER_ENABLED="$TILLER_ENABLED"
TILLER_TAG="$TILLER_TAG"
TILLER_NAMESPACE="$TILLER_NAMESPACE"
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"

View File

@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
'tiller_enabled',
'tiller_tag',
'tiller_namespace']
'tiller_namespace',
'node_problem_detector_tag']
for label in label_list:
label_value = cluster.labels.get(label)

View File

@ -555,6 +555,11 @@ parameters:
description: namespace where tiller will be installed.
default: "magnum-tiller"
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
default: v0.6.2
resources:
######################################################################
@ -818,6 +823,7 @@ resources:
tiller_enabled: {get_param: tiller_enabled}
tiller_tag: {get_param: tiller_tag}
tiller_namespace: {get_param: tiller_namespace}
node_problem_detector_tag: {get_param: node_problem_detector_tag}
kube_cluster_config:
type: OS::Heat::SoftwareConfig
@ -848,6 +854,7 @@ resources:
template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh}
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
kube_cluster_deploy:
type: OS::Heat::SoftwareDeployment

View File

@ -430,6 +430,10 @@ parameters:
type: string
description: namespace where tiller will be installed
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
resources:
######################################################################
#
@ -539,6 +543,7 @@ resources:
"$TILLER_ENABLED": {get_param: tiller_enabled}
"$TILLER_TAG": {get_param: tiller_tag}
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
install_openstack_ca:
type: OS::Heat::SoftwareConfig

View File

@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag')
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_enabled': tiller_enabled,
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,
@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag')
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_enabled': tiller_enabled,
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,