[fedora-atomic-k8s] Adding Node Problem Detector
Deploying Node Problem Detector to all nodes to detect problems that can be leveraged by auto healing. This is the first step of enabling the auto healing feature. Task: 29886 Story: 2004782 Change-Id: I1b6075025c5f369821b4136783e68b16535dc6ef
This commit is contained in:
parent
722fc56eb3
commit
c39f1150e5
|
@ -0,0 +1,117 @@
|
|||
#!/bin/sh
|
||||
|
||||
step="enable-auto-healing"
|
||||
printf "Starting to run ${step}\n"
|
||||
|
||||
. /etc/sysconfig/heat-params
|
||||
|
||||
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}
|
||||
|
||||
# Generate Node Problem Detector manifest file
|
||||
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml
|
||||
|
||||
[ -f ${NPD_DEPLOY} ] || {
|
||||
echo "Writing File: $NPD_DEPLOY"
|
||||
mkdir -p $(dirname ${NPD_DEPLOY})
|
||||
cat << EOF > ${NPD_DEPLOY}
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: npd-binding
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: npd
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: node-problem-detector
|
||||
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: node-problem-detector
|
||||
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
||||
kubernetes.io/cluster-service: "true"
|
||||
spec:
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
|
||||
command:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
# Pass both config to support both journald and syslog.
|
||||
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
|
||||
securityContext:
|
||||
privileged: true
|
||||
resources:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "100Mi"
|
||||
requests:
|
||||
cpu: "20m"
|
||||
memory: "20Mi"
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: log
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
type: "FileOrCreate"
|
||||
serviceAccountName: node-problem-detector
|
||||
tolerations:
|
||||
- operator: "Exists"
|
||||
effect: "NoExecute"
|
||||
- key: "CriticalAddonsOnly"
|
||||
operator: "Exists"
|
||||
EOF
|
||||
}
|
||||
|
||||
echo "Waiting for Kubernetes API..."
|
||||
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
|
||||
do
|
||||
sleep 5
|
||||
done
|
||||
|
||||
kubectl apply -f ${NPD_DEPLOY}
|
||||
|
||||
printf "Finished running ${step}\n"
|
|
@ -86,3 +86,4 @@ write_files:
|
|||
TILLER_ENABLED="$TILLER_ENABLED"
|
||||
TILLER_TAG="$TILLER_TAG"
|
||||
TILLER_NAMESPACE="$TILLER_NAMESPACE"
|
||||
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"
|
||||
|
|
|
@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
|
|||
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
|
||||
'tiller_enabled',
|
||||
'tiller_tag',
|
||||
'tiller_namespace']
|
||||
'tiller_namespace',
|
||||
'node_problem_detector_tag']
|
||||
|
||||
for label in label_list:
|
||||
label_value = cluster.labels.get(label)
|
||||
|
|
|
@ -555,6 +555,11 @@ parameters:
|
|||
description: namespace where tiller will be installed.
|
||||
default: "magnum-tiller"
|
||||
|
||||
node_problem_detector_tag:
|
||||
type: string
|
||||
description: tag of the node problem detector container
|
||||
default: v0.6.2
|
||||
|
||||
resources:
|
||||
|
||||
######################################################################
|
||||
|
@ -818,6 +823,7 @@ resources:
|
|||
tiller_enabled: {get_param: tiller_enabled}
|
||||
tiller_tag: {get_param: tiller_tag}
|
||||
tiller_namespace: {get_param: tiller_namespace}
|
||||
node_problem_detector_tag: {get_param: node_problem_detector_tag}
|
||||
|
||||
kube_cluster_config:
|
||||
type: OS::Heat::SoftwareConfig
|
||||
|
@ -848,6 +854,7 @@ resources:
|
|||
template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh}
|
||||
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
|
||||
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
|
||||
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
|
||||
|
||||
kube_cluster_deploy:
|
||||
type: OS::Heat::SoftwareDeployment
|
||||
|
|
|
@ -430,6 +430,10 @@ parameters:
|
|||
type: string
|
||||
description: namespace where tiller will be installed
|
||||
|
||||
node_problem_detector_tag:
|
||||
type: string
|
||||
description: tag of the node problem detector container
|
||||
|
||||
resources:
|
||||
######################################################################
|
||||
#
|
||||
|
@ -539,6 +543,7 @@ resources:
|
|||
"$TILLER_ENABLED": {get_param: tiller_enabled}
|
||||
"$TILLER_TAG": {get_param: tiller_tag}
|
||||
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
|
||||
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
|
||||
|
||||
install_openstack_ca:
|
||||
type: OS::Heat::SoftwareConfig
|
||||
|
|
|
@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_tag')
|
||||
tiller_namespace = mock_cluster.labels.get(
|
||||
'tiller_namespace')
|
||||
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
|
||||
|
||||
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
||||
|
||||
|
@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_enabled': tiller_enabled,
|
||||
'tiller_tag': tiller_tag,
|
||||
'tiller_namespace': tiller_namespace,
|
||||
'node_problem_detector_tag': npd_tag,
|
||||
}}
|
||||
mock_get_params.assert_called_once_with(mock_context,
|
||||
mock_cluster_template,
|
||||
|
@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_tag')
|
||||
tiller_namespace = mock_cluster.labels.get(
|
||||
'tiller_namespace')
|
||||
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
|
||||
|
||||
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
|
||||
|
||||
|
@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_enabled': tiller_enabled,
|
||||
'tiller_tag': tiller_tag,
|
||||
'tiller_namespace': tiller_namespace,
|
||||
'node_problem_detector_tag': npd_tag,
|
||||
}}
|
||||
mock_get_params.assert_called_once_with(mock_context,
|
||||
mock_cluster_template,
|
||||
|
|
Loading…
Reference in New Issue