# Patch summary: deploy Node Problem Detector (NPD) on the k8s_fedora_atomic_v1
# driver and make its image tag configurable via a `node_problem_detector_tag`
# label / Heat parameter.
#
# Per-file changes contained in this patch:
#   * fragments/enable-auto-healing.sh (new file):
#       - sources /etc/sysconfig/heat-params;
#       - writes /srv/magnum/kubernetes/manifests/npd.yaml only when that file
#         does not already exist. The manifest declares a ServiceAccount, a
#         ClusterRoleBinding to the ClusterRole `system:node-problem-detector`,
#         and a DaemonSet named `npd` in kube-system;
#       - the heredoc delimiter is unquoted, so ${_gcr_prefix} and
#         ${NODE_PROBLEM_DETECTOR_TAG} are expanded by the shell when the
#         manifest is written (_gcr_prefix falls back to "k8s.gcr.io/" when
#         CONTAINER_INFRA_PREFIX is unset or empty);
#       - polls http://127.0.0.1:8080/healthz every 5 seconds until the reply
#         is exactly "ok" (the loop has no timeout), then runs
#         `kubectl apply -f` on the manifest.
#   * fragments/write-heat-params-master.yaml: adds a
#     NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG" entry.
#   * drivers/heat/k8s_fedora_template_def.py: appends
#     'node_problem_detector_tag' to the list of labels that is iterated with
#     cluster.labels.get(label).
#   * k8s_fedora_atomic_v1/templates/kubecluster.yaml: declares the
#     `node_problem_detector_tag` parameter (default v0.6.2), passes it on next
#     to the tiller_* values, and adds enable-auto-healing.sh after
#     enable-keystone-auth.sh in the get_file list.
#   * k8s_fedora_atomic_v1/templates/kubemaster.yaml: declares the parameter
#     (no default in this file) and maps "$NODE_PROBLEM_DETECTOR_TAG" to it.
#   * tests/unit/drivers/test_template_definition.py: two hunks in
#     AtomicK8sTemplateDefinitionTestCase add 'node_problem_detector_tag' to
#     the expected parameters.
#
# NOTE(review): in this copy the patch's newlines and indentation appear to
# have been collapsed into single spaces (the first hunk header announces 117
# added lines). It presumably will not apply as-is; confirm against the
# original change before using it.
diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh new file mode 100644 index 0000000000..abb70867a9 --- /dev/null +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh @@ -0,0 +1,117 @@ +#!/bin/sh + +step="enable-auto-healing" +printf "Starting to run ${step}\n" + +. /etc/sysconfig/heat-params + +_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/} + +# Generate Node Problem Detector manifest file +NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml + +[ -f ${NPD_DEPLOY} ] || { + echo "Writing File: $NPD_DEPLOY" + mkdir -p $(dirname ${NPD_DEPLOY}) + cat << EOF > ${NPD_DEPLOY} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-problem-detector + namespace: kube-system + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: npd-binding + labels: + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:node-problem-detector +subjects: +- kind: ServiceAccount + name: node-problem-detector + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: npd + namespace: kube-system + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + addonmanager.kubernetes.io/mode: Reconcile +spec: + selector: + matchLabels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + template: + metadata: + labels: + k8s-app: node-problem-detector + version: ${NODE_PROBLEM_DETECTOR_TAG} + kubernetes.io/cluster-service: "true" + spec: + containers: + - name: node-problem-detector + image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG} + command: + - "/bin/sh" + - "-c" + # 
Pass both config to support both journald and syslog. + - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1" + securityContext: + privileged: true + resources: + limits: + cpu: "200m" + memory: "100Mi" + requests: + cpu: "20m" + memory: "20Mi" + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: log + mountPath: /var/log + - name: localtime + mountPath: /etc/localtime + readOnly: true + volumes: + - name: log + hostPath: + path: /var/log/ + - name: localtime + hostPath: + path: /etc/localtime + type: "FileOrCreate" + serviceAccountName: node-problem-detector + tolerations: + - operator: "Exists" + effect: "NoExecute" + - key: "CriticalAddonsOnly" + operator: "Exists" +EOF +} + +echo "Waiting for Kubernetes API..." +until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ] +do + sleep 5 +done + +kubectl apply -f ${NPD_DEPLOY} + +printf "Finished running ${step}\n" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml index ba741d8b36..7ce9f22cb4 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml @@ -87,3 +87,4 @@ write_files: TILLER_ENABLED="$TILLER_ENABLED" TILLER_TAG="$TILLER_TAG" TILLER_NAMESPACE="$TILLER_NAMESPACE" + NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG" diff --git a/magnum/drivers/heat/k8s_fedora_template_def.py b/magnum/drivers/heat/k8s_fedora_template_def.py index 6e8f2c8e8f..674c7b8866 100644 --- a/magnum/drivers/heat/k8s_fedora_template_def.py +++ b/magnum/drivers/heat/k8s_fedora_template_def.py @@ -118,7 +118,8 @@ class 
K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition): 'keystone_auth_enabled', 'k8s_keystone_auth_tag', 'tiller_enabled', 'tiller_tag', - 'tiller_namespace'] + 'tiller_namespace', + 'node_problem_detector_tag'] for label in label_list: label_value = cluster.labels.get(label) diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml index f927bfacda..81c003086b 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml @@ -560,6 +560,11 @@ parameters: description: namespace where tiller will be installed. default: "magnum-tiller" + node_problem_detector_tag: + type: string + description: tag of the node problem detector container + default: v0.6.2 + resources: ###################################################################### @@ -824,6 +829,7 @@ resources: tiller_enabled: {get_param: tiller_enabled} tiller_tag: {get_param: tiller_tag} tiller_namespace: {get_param: tiller_namespace} + node_problem_detector_tag: {get_param: node_problem_detector_tag} kube_cluster_config: type: OS::Heat::SoftwareConfig @@ -855,6 +861,7 @@ resources: template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh} - get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh - get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh + - get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh kube_cluster_deploy: type: OS::Heat::SoftwareDeployment diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml index 09253e9bf5..09bcd4656b 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml @@ -434,6 +434,10 @@ parameters: type: string description: 
namespace where tiller will be installed + node_problem_detector_tag: + type: string + description: tag of the node problem detector container + resources: ###################################################################### # @@ -544,6 +548,7 @@ resources: "$TILLER_ENABLED": {get_param: tiller_enabled} "$TILLER_TAG": {get_param: tiller_tag} "$TILLER_NAMESPACE": {get_param: tiller_namespace} + "$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag} install_openstack_ca: type: OS::Heat::SoftwareConfig diff --git a/magnum/tests/unit/drivers/test_template_definition.py b/magnum/tests/unit/drivers/test_template_definition.py index 9064c970c6..4f99833e9e 100644 --- a/magnum/tests/unit/drivers/test_template_definition.py +++ b/magnum/tests/unit/drivers/test_template_definition.py @@ -425,6 +425,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -486,6 +487,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, mock_cluster_template, @@ -793,6 +795,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_tag') tiller_namespace = mock_cluster.labels.get( 'tiller_namespace') + npd_tag = mock_cluster.labels.get('node_problem_detector_tag') k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition() @@ -856,6 +859,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_enabled': tiller_enabled, 'tiller_tag': tiller_tag, 'tiller_namespace': tiller_namespace, + 'node_problem_detector_tag': npd_tag, }} mock_get_params.assert_called_once_with(mock_context, 
mock_cluster_template,