diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh new file mode 100644 index 0000000000..abb70867a9 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh @@ -0,0 +1,117 @@ +#!/bin/sh + +step="enable-auto-healing" +printf "Starting to run ${step}\n" + +. /etc/sysconfig/heat-params + +_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/} + +# Generate Node Problem Detector manifest file +NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml + +[ -f ${NPD_DEPLOY} ] || { + echo "Writing File: $NPD_DEPLOY" + mkdir -p $(dirname ${NPD_DEPLOY}) + cat << EOF > ${NPD_DEPLOY} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-problem-detector + namespace: kube-system + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: npd-binding + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:node-problem-detector +subjects: +- kind: ServiceAccount + name: node-problem-detector + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npd + namespace: kube-system + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +spec: + selector: + matchLabels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + template: + metadata: + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + spec: + containers: + - name: node-problem-detector + image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG} + command: + - "/bin/sh" + - "-c" + # Pass both config to support both journald and syslog. + - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1" + securityContext: + privileged: true + resources: + limits: + cpu: "200m" + memory: "100Mi" + requests: + cpu: "20m" + memory: "20Mi" + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + - name: localtime + mountPath: /etc/localtime + readOnly: true + volumes: + - name: log + hostPath: + path: /var/log/ + - name: localtime + hostPath: + path: /etc/localtime + type: "FileOrCreate" + serviceAccountName: node-problem-detector + tolerations: + - operator: "Exists" + effect: "NoExecute" + - key: "CriticalAddonsOnly" + operator: "Exists" +EOF +} + +echo "Waiting for Kubernetes API..." +until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ] +do + sleep 5 +done + +kubectl apply -f ${NPD_DEPLOY} + +printf "Finished running ${step}\n" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml index 6cd20bf2a9..d7a22fd73e 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml @@ -86,3 +86,4 @@ write_files: TILLER_ENABLED="$TILLER_ENABLED" TILLER_TAG="$TILLER_TAG" TILLER_NAMESPACE="$TILLER_NAMESPACE" + NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG" diff --git a/magnum/drivers/heat/k8s_fedora_template_def.py b/magnum/drivers/heat/k8s_fedora_template_def.py index 76713943bd..272dd19a3d 100644 --- a/magnum/drivers/heat/k8s_fedora_template_def.py +++ b/magnum/drivers/heat/k8s_fedora_template_def.py @@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition): 'keystone_auth_enabled', 'k8s_keystone_auth_tag', 'tiller_enabled', 'tiller_tag', - 'tiller_namespace'] + 'tiller_namespace', + 'node_problem_detector_tag'] for label in label_list: label_value = cluster.labels.get(label) diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml index ba23f3edc2..a711c7e394 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml @@ -555,6 +555,11 @@ parameters: description: namespace where tiller will be installed. default: "magnum-tiller" + node_problem_detector_tag: + type: string + description: tag of the node problem detector container + default: v0.6.2 + resources: ###################################################################### @@ -818,6 +823,7 @@ resources: tiller_enabled: {get_param: tiller_enabled} tiller_tag: {get_param: tiller_tag} tiller_namespace: {get_param: tiller_namespace} + node_problem_detector_tag: {get_param: node_problem_detector_tag} kube_cluster_config: type: OS::Heat::SoftwareConfig @@ -848,6 +854,7 @@ resources: template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh} - get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh - get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh + - get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh kube_cluster_deploy: type: OS::Heat::SoftwareDeployment diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml index 356a3126fe..3c9ac66154 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml @@ -430,6 +430,10 @@ parameters: type: string description: namespace where tiller will be installed + node_problem_detector_tag: + type: string + description: tag of the node problem detector container + resources: ###################################################################### # @@ -539,6 +543,7 @@ resources: "$TILLER_ENABLED": {get_param: tiller_enabled} "$TILLER_TAG": {get_param: tiller_tag} "$TILLER_NAMESPACE": {get_param: tiller_namespace} + "$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag} install_openstack_ca: type: OS::Heat::SoftwareConfig diff --git a/magnum/tests/unit/drivers/test_template_definition.py b/magnum/tests/unit/drivers/test_template_definition.py index b06649175f..85ad654e42 100644 --- a/magnum/tests/unit/drivers/test_template_definition.py +++ b/magnum/tests/unit/drivers/test_template_definition.py @@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, mock_cluster_template, @@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, mock_cluster_template,