From c39f1150e5a005abb34b35f8eaef348bcd39843d Mon Sep 17 00:00:00 2001 From: Feilong Wang Date: Fri, 8 Mar 2019 14:52:15 +1300 Subject: [PATCH] [fedora-atomic-k8s] Adding Node Problem Detector Deploying Node Problem Detector to all nodes to detect problems which can be leveraged by auto healing. This is the first step of enabling the auto healing feature. Task: 29886 Story: 2004782 Change-Id: I1b6075025c5f369821b4136783e68b16535dc6ef --- .../fragments/enable-auto-healing.sh | 117 ++++++++++++++++++ .../fragments/write-heat-params-master.yaml | 1 + .../drivers/heat/k8s_fedora_template_def.py | 3 +- .../templates/kubecluster.yaml | 7 ++ .../templates/kubemaster.yaml | 5 + .../unit/drivers/test_template_definition.py | 4 + 6 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh new file mode 100644 index 0000000000..abb70867a9 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh @@ -0,0 +1,117 @@ +#!/bin/sh + +step="enable-auto-healing" +printf "Starting to run ${step}\n" + +. 
/etc/sysconfig/heat-params + +_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/} + +# Generate Node Problem Detector manifest file +NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml + +[ -f ${NPD_DEPLOY} ] || { + echo "Writing File: $NPD_DEPLOY" + mkdir -p $(dirname ${NPD_DEPLOY}) + cat << EOF > ${NPD_DEPLOY} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-problem-detector + namespace: kube-system + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: npd-binding + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:node-problem-detector +subjects: +- kind: ServiceAccount + name: node-problem-detector + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npd + namespace: kube-system + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +spec: + selector: + matchLabels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + template: + metadata: + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + spec: + containers: + - name: node-problem-detector + image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG} + command: + - "/bin/sh" + - "-c" + # Pass both config to support both journald and syslog. 
+ - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1" + securityContext: + privileged: true + resources: + limits: + cpu: "200m" + memory: "100Mi" + requests: + cpu: "20m" + memory: "20Mi" + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + - name: localtime + mountPath: /etc/localtime + readOnly: true + volumes: + - name: log + hostPath: + path: /var/log/ + - name: localtime + hostPath: + path: /etc/localtime + type: "FileOrCreate" + serviceAccountName: node-problem-detector + tolerations: + - operator: "Exists" + effect: "NoExecute" + - key: "CriticalAddonsOnly" + operator: "Exists" +EOF +} + +echo "Waiting for Kubernetes API..." +until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ] +do + sleep 5 +done + +kubectl apply -f ${NPD_DEPLOY} + +printf "Finished running ${step}\n" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml index 6cd20bf2a9..d7a22fd73e 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml @@ -86,3 +86,4 @@ write_files: TILLER_ENABLED="$TILLER_ENABLED" TILLER_TAG="$TILLER_TAG" TILLER_NAMESPACE="$TILLER_NAMESPACE" + NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG" diff --git a/magnum/drivers/heat/k8s_fedora_template_def.py b/magnum/drivers/heat/k8s_fedora_template_def.py index 76713943bd..272dd19a3d 100644 --- a/magnum/drivers/heat/k8s_fedora_template_def.py +++ b/magnum/drivers/heat/k8s_fedora_template_def.py @@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition): 
'keystone_auth_enabled', 'k8s_keystone_auth_tag', 'tiller_enabled', 'tiller_tag', - 'tiller_namespace'] + 'tiller_namespace', + 'node_problem_detector_tag'] for label in label_list: label_value = cluster.labels.get(label) diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml index ba23f3edc2..a711c7e394 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml @@ -555,6 +555,11 @@ parameters: description: namespace where tiller will be installed. default: "magnum-tiller" + node_problem_detector_tag: + type: string + description: tag of the node problem detector container + default: v0.6.2 + resources: ###################################################################### @@ -818,6 +823,7 @@ resources: tiller_enabled: {get_param: tiller_enabled} tiller_tag: {get_param: tiller_tag} tiller_namespace: {get_param: tiller_namespace} + node_problem_detector_tag: {get_param: node_problem_detector_tag} kube_cluster_config: type: OS::Heat::SoftwareConfig @@ -848,6 +854,7 @@ resources: template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh} - get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh - get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh + - get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh kube_cluster_deploy: type: OS::Heat::SoftwareDeployment diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml index 356a3126fe..3c9ac66154 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml @@ -430,6 +430,10 @@ parameters: type: string description: namespace where tiller will be installed + node_problem_detector_tag: + 
type: string + description: tag of the node problem detector container + resources: ###################################################################### # @@ -539,6 +543,7 @@ resources: "$TILLER_ENABLED": {get_param: tiller_enabled} "$TILLER_TAG": {get_param: tiller_tag} "$TILLER_NAMESPACE": {get_param: tiller_namespace} + "$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag} install_openstack_ca: type: OS::Heat::SoftwareConfig diff --git a/magnum/tests/unit/drivers/test_template_definition.py b/magnum/tests/unit/drivers/test_template_definition.py index b06649175f..85ad654e42 100644 --- a/magnum/tests/unit/drivers/test_template_definition.py +++ b/magnum/tests/unit/drivers/test_template_definition.py @@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, mock_cluster_template, @@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, mock_cluster_template,