[fedora-atomic-k8s] Adding Node Problem Detector
Deploy Node Problem Detector to all nodes to detect node problems, which can then be leveraged by auto healing. This is the first step of enabling the auto healing feature. Task: 29886 Story: 2004782 Change-Id: I1b6075025c5f369821b4136783e68b16535dc6ef
This commit is contained in:
parent
722fc56eb3
commit
c39f1150e5
|
@ -0,0 +1,117 @@
|
||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
step="enable-auto-healing"
|
||||||
|
printf "Starting to run ${step}\n"
|
||||||
|
|
||||||
|
. /etc/sysconfig/heat-params
|
||||||
|
|
||||||
|
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}
|
||||||
|
|
||||||
|
# Generate Node Problem Detector manifest file
|
||||||
|
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml
|
||||||
|
|
||||||
|
[ -f ${NPD_DEPLOY} ] || {
|
||||||
|
echo "Writing File: $NPD_DEPLOY"
|
||||||
|
mkdir -p $(dirname ${NPD_DEPLOY})
|
||||||
|
cat << EOF > ${NPD_DEPLOY}
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: node-problem-detector
|
||||||
|
namespace: kube-system
|
||||||
|
labels:
|
||||||
|
kubernetes.io/cluster-service: "true"
|
||||||
|
addonmanager.kubernetes.io/mode: Reconcile
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: npd-binding
|
||||||
|
labels:
|
||||||
|
kubernetes.io/cluster-service: "true"
|
||||||
|
addonmanager.kubernetes.io/mode: Reconcile
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: system:node-problem-detector
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: node-problem-detector
|
||||||
|
namespace: kube-system
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: npd
|
||||||
|
namespace: kube-system
|
||||||
|
labels:
|
||||||
|
k8s-app: node-problem-detector
|
||||||
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||||
|
kubernetes.io/cluster-service: "true"
|
||||||
|
addonmanager.kubernetes.io/mode: Reconcile
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
k8s-app: node-problem-detector
|
||||||
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
k8s-app: node-problem-detector
|
||||||
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||||
|
kubernetes.io/cluster-service: "true"
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: node-problem-detector
|
||||||
|
image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
|
||||||
|
command:
|
||||||
|
- "/bin/sh"
|
||||||
|
- "-c"
|
||||||
|
# Pass both config to support both journald and syslog.
|
||||||
|
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
|
||||||
|
securityContext:
|
||||||
|
privileged: true
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: "200m"
|
||||||
|
memory: "100Mi"
|
||||||
|
requests:
|
||||||
|
cpu: "20m"
|
||||||
|
memory: "20Mi"
|
||||||
|
env:
|
||||||
|
- name: NODE_NAME
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: spec.nodeName
|
||||||
|
volumeMounts:
|
||||||
|
- name: log
|
||||||
|
mountPath: /var/log
|
||||||
|
- name: localtime
|
||||||
|
mountPath: /etc/localtime
|
||||||
|
readOnly: true
|
||||||
|
volumes:
|
||||||
|
- name: log
|
||||||
|
hostPath:
|
||||||
|
path: /var/log/
|
||||||
|
- name: localtime
|
||||||
|
hostPath:
|
||||||
|
path: /etc/localtime
|
||||||
|
type: "FileOrCreate"
|
||||||
|
serviceAccountName: node-problem-detector
|
||||||
|
tolerations:
|
||||||
|
- operator: "Exists"
|
||||||
|
effect: "NoExecute"
|
||||||
|
- key: "CriticalAddonsOnly"
|
||||||
|
operator: "Exists"
|
||||||
|
EOF
|
||||||
|
}
|
||||||
|
|
||||||
|
echo "Waiting for Kubernetes API..."
|
||||||
|
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
|
||||||
|
do
|
||||||
|
sleep 5
|
||||||
|
done
|
||||||
|
|
||||||
|
kubectl apply -f ${NPD_DEPLOY}
|
||||||
|
|
||||||
|
printf "Finished running ${step}\n"
|
|
@ -86,3 +86,4 @@ write_files:
|
||||||
TILLER_ENABLED="$TILLER_ENABLED"
|
TILLER_ENABLED="$TILLER_ENABLED"
|
||||||
TILLER_TAG="$TILLER_TAG"
|
TILLER_TAG="$TILLER_TAG"
|
||||||
TILLER_NAMESPACE="$TILLER_NAMESPACE"
|
TILLER_NAMESPACE="$TILLER_NAMESPACE"
|
||||||
|
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"
|
||||||
|
|
|
@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
|
||||||
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
|
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
|
||||||
'tiller_enabled',
|
'tiller_enabled',
|
||||||
'tiller_tag',
|
'tiller_tag',
|
||||||
'tiller_namespace']
|
'tiller_namespace',
|
||||||
|
'node_problem_detector_tag']
|
||||||
|
|
||||||
for label in label_list:
|
for label in label_list:
|
||||||
label_value = cluster.labels.get(label)
|
label_value = cluster.labels.get(label)
|
||||||
|
|
|
@ -555,6 +555,11 @@ parameters:
|
||||||
description: namespace where tiller will be installed.
|
description: namespace where tiller will be installed.
|
||||||
default: "magnum-tiller"
|
default: "magnum-tiller"
|
||||||
|
|
||||||
|
node_problem_detector_tag:
|
||||||
|
type: string
|
||||||
|
description: tag of the node problem detector container
|
||||||
|
default: v0.6.2
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
|
@ -818,6 +823,7 @@ resources:
|
||||||
tiller_enabled: {get_param: tiller_enabled}
|
tiller_enabled: {get_param: tiller_enabled}
|
||||||
tiller_tag: {get_param: tiller_tag}
|
tiller_tag: {get_param: tiller_tag}
|
||||||
tiller_namespace: {get_param: tiller_namespace}
|
tiller_namespace: {get_param: tiller_namespace}
|
||||||
|
node_problem_detector_tag: {get_param: node_problem_detector_tag}
|
||||||
|
|
||||||
kube_cluster_config:
|
kube_cluster_config:
|
||||||
type: OS::Heat::SoftwareConfig
|
type: OS::Heat::SoftwareConfig
|
||||||
|
@ -848,6 +854,7 @@ resources:
|
||||||
template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh}
|
template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh}
|
||||||
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
|
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
|
||||||
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
|
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
|
||||||
|
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
|
||||||
|
|
||||||
kube_cluster_deploy:
|
kube_cluster_deploy:
|
||||||
type: OS::Heat::SoftwareDeployment
|
type: OS::Heat::SoftwareDeployment
|
||||||
|
|
|
@ -430,6 +430,10 @@ parameters:
|
||||||
type: string
|
type: string
|
||||||
description: namespace where tiller will be installed
|
description: namespace where tiller will be installed
|
||||||
|
|
||||||
|
node_problem_detector_tag:
|
||||||
|
type: string
|
||||||
|
description: tag of the node problem detector container
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
######################################################################
|
######################################################################
|
||||||
#
|
#
|
||||||
|
@ -539,6 +543,7 @@ resources:
|
||||||
"$TILLER_ENABLED": {get_param: tiller_enabled}
|
"$TILLER_ENABLED": {get_param: tiller_enabled}
|
||||||
"$TILLER_TAG": {get_param: tiller_tag}
|
"$TILLER_TAG": {get_param: tiller_tag}
|
||||||
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
|
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
|
||||||
|
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
|
||||||
|
|
||||||
install_openstack_ca:
|
install_openstack_ca:
|
||||||
type: OS::Heat::SoftwareConfig
|
type: OS::Heat::SoftwareConfig
|
||||||
|
|
|
@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
||||||
'tiller_tag')
|
'tiller_tag')
|
||||||
tiller_namespace = mock_cluster.labels.get(
|
tiller_namespace = mock_cluster.labels.get(
|
||||||
'tiller_namespace')
|
'tiller_namespace')
|
||||||
|
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
|
||||||
|
|
||||||
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
||||||
|
|
||||||
|
@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
||||||
'tiller_enabled': tiller_enabled,
|
'tiller_enabled': tiller_enabled,
|
||||||
'tiller_tag': tiller_tag,
|
'tiller_tag': tiller_tag,
|
||||||
'tiller_namespace': tiller_namespace,
|
'tiller_namespace': tiller_namespace,
|
||||||
|
'node_problem_detector_tag': npd_tag,
|
||||||
}}
|
}}
|
||||||
mock_get_params.assert_called_once_with(mock_context,
|
mock_get_params.assert_called_once_with(mock_context,
|
||||||
mock_cluster_template,
|
mock_cluster_template,
|
||||||
|
@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
||||||
'tiller_tag')
|
'tiller_tag')
|
||||||
tiller_namespace = mock_cluster.labels.get(
|
tiller_namespace = mock_cluster.labels.get(
|
||||||
'tiller_namespace')
|
'tiller_namespace')
|
||||||
|
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
|
||||||
|
|
||||||
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
||||||
|
|
||||||
|
@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
||||||
'tiller_enabled': tiller_enabled,
|
'tiller_enabled': tiller_enabled,
|
||||||
'tiller_tag': tiller_tag,
|
'tiller_tag': tiller_tag,
|
||||||
'tiller_namespace': tiller_namespace,
|
'tiller_namespace': tiller_namespace,
|
||||||
|
'node_problem_detector_tag': npd_tag,
|
||||||
}}
|
}}
|
||||||
mock_get_params.assert_called_once_with(mock_context,
|
mock_get_params.assert_called_once_with(mock_context,
|
||||||
mock_cluster_template,
|
mock_cluster_template,
|
||||||
|
|
Loading…
Reference in New Issue