Browse Source

Merge "[fedora_atomic] Support auto healing for k8s"

changes/32/645632/9
Zuul 3 years ago
committed by Gerrit Code Review
parent
commit
29f6eab346
  1. 44
      doc/source/user/index.rst
  2. 4
      magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh
  3. 110
      magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh
  4. 185
      magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh
  5. 6
      magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml
  6. 1
      magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml
  7. 34
      magnum/drivers/heat/k8s_fedora_template_def.py
  8. 43
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
  9. 34
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml
  10. 7
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml
  11. 15
      magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py
  12. 28
      magnum/tests/unit/drivers/test_template_definition.py
  13. 11
      releasenotes/notes/support-auto-healing-3e07c16c55209b0a.yaml

44
doc/source/user/index.rst

@ -392,6 +392,22 @@ the table are linked to more details elsewhere in the user guide.
| `master_lb_floating_ip_enabled`_ | - true | see below |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `auto_healing_enabled`_ | - true | false |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `auto_scaling_enabled`_ | - true | true |
| | - false | |
+---------------------------------------+--------------------+---------------+
| `node_problem_detector_tag`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `draino_tag`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `autoscaler_tag`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `min_node_count`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
| `max_node_count`_ | see below | see below |
+---------------------------------------+--------------------+---------------+
Cluster
-------
@ -1119,6 +1135,9 @@ _`container_infra_prefix`
* quay.io/coreos/configmap-reload:v0.0.1
* quay.io/coreos/prometheus-config-reloader:v0.26.0
* quay.io/prometheus/prometheus:v2.5.0
* k8s.gcr.io/node-problem-detector:v0.6.2
* docker.io/planetlabs/draino:abf028a
* docker.io/openstackmagnum/cluster-autoscaler:v1.0
_`kube_tag`
This label allows users to select `a specific Kubernetes release,
@ -1257,6 +1276,31 @@ _`master_lb_floating_ip_enabled`
``master_lb_enabled`` is set. If not specified, the default value is the same
as template property ``floating_ip_enabled``.
_`auto_healing_enabled`
If set to true, auto healing feature will be enabled. Defaults to false.
_`auto_scaling_enabled`
If set to true, auto scaling feature will be enabled. Defaults to true.
_`node_problem_detector_tag`
This label allows users to select a specific Node Problem Detector
version.
_`draino_tag`
This label allows users to select a specific Draino version.
_`autoscaler_tag`
This label allows users to select a specific Cluster Autoscaler version.
_`min_node_count`
The minimum node count of the cluster when doing auto scaling or auto
healing. Defaults to 1.
_`max_node_count`
The maximum node count of the cluster when doing auto scaling or auto
healing.
External load balancer for services
-----------------------------------

4
magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh

@ -154,6 +154,10 @@ KUBELET_ARGS="${KUBELET_ARGS} --client-ca-file=${CERT_DIR}/ca.crt --tls-cert-fil
# specified cgroup driver
KUBELET_ARGS="${KUBELET_ARGS} --cgroup-driver=${CGROUP_DRIVER}"
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
KUBELET_ARGS="${KUBELET_ARGS} --node-labels=draino-enabled=true"
fi
systemctl disable docker
if cat /usr/lib/systemd/system/docker.service | grep 'native.cgroupdriver'; then
cp /usr/lib/systemd/system/docker.service /etc/systemd/system/

110
magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh

@ -1,6 +1,6 @@
#!/bin/sh
step="enable-auto-healing"
step="enable-node-problem-detector"
printf "Starting to run ${step}\n"
. /etc/sysconfig/heat-params
@ -68,7 +68,7 @@ spec:
- "/bin/sh"
- "-c"
# Pass both config to support both journald and syslog.
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json 2>&1 | tee /var/log/node-problem-detector.log"
securityContext:
privileged: true
resources:
@ -115,3 +115,109 @@ done
kubectl apply -f ${NPD_DEPLOY}
printf "Finished running ${step}\n"
_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
step="enable-auto-healing"
printf "Starting to run ${step}\n"
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
# Generate Draino manifest file
DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml
[ -f ${DRAINO_DEPLOY} ] || {
echo "Writing File: $DRAINO_DEPLOY"
mkdir -p $(dirname ${DRAINO_DEPLOY})
cat << EOF > ${DRAINO_DEPLOY}
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels: {component: draino}
name: draino
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels: {component: draino}
name: draino
rules:
- apiGroups: ['']
resources: [events]
verbs: [create, patch, update]
- apiGroups: ['']
resources: [nodes]
verbs: [get, watch, list, update]
- apiGroups: ['']
resources: [nodes/status]
verbs: [patch]
- apiGroups: ['']
resources: [pods]
verbs: [get, watch, list]
- apiGroups: ['']
resources: [pods/eviction]
verbs: [create]
- apiGroups: [extensions]
resources: [daemonsets]
verbs: [get, watch, list]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels: {component: draino}
name: draino
roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
subjects:
- {kind: ServiceAccount, name: draino, namespace: kube-system}
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels: {component: draino}
name: draino
namespace: kube-system
spec:
# Draino does not currently support locking/master election, so you should
# only run one draino at a time. Draino won't start draining nodes immediately
# so it's usually safe for multiple drainos to exist for a brief period of
# time.
replicas: 1
selector:
matchLabels: {component: draino}
template:
metadata:
labels: {component: draino}
name: draino
namespace: kube-system
spec:
nodeSelector:
node-role.kubernetes.io/master: ""
hostNetwork: true
tolerations:
- effect: NoSchedule
operator: Exists
- key: CriticalAddonsOnly
operator: Exists
- effect: NoExecute
operator: Exists
- key: node.cloudprovider.kubernetes.io/uninitialized
value: "true"
effect: NoSchedule
- key: node-role.kubernetes.io/master
effect: NoSchedule
containers:
# You'll want to change these labels and conditions to suit your deployment.
- command: [/draino, --node-label=draino-enabled=true, --evict-daemonset-pods, --evict-emptydir-pods, NotReady]
image: ${_docker_draino_prefix}draino:${DRAINO_TAG}
livenessProbe:
httpGet: {path: /healthz, port: 10002}
initialDelaySeconds: 30
name: draino
serviceAccountName: draino
EOF
}
kubectl apply -f ${DRAINO_DEPLOY}
fi
printf "Finished running ${step}\n"

185
magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh

@ -0,0 +1,185 @@
#!/bin/sh

step="enable-auto-scaling"
printf "Starting to run ${step}\n"

. /etc/sysconfig/heat-params

_docker_ca_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}

# Either auto scaling or auto healing we need CA (Cluster Autoscaler) to be
# deployed.
# NOTE: "[ a || b ]" is invalid POSIX test syntax; the two conditions must be
# separate test commands joined with the shell's ||.
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ] || [ "$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    # Generate Autoscaler manifest file
    AUTOSCALER_DEPLOY=/srv/magnum/kubernetes/manifests/autoscaler.yaml

    [ -f ${AUTOSCALER_DEPLOY} ] || {
        echo "Writing File: $AUTOSCALER_DEPLOY"
        mkdir -p $(dirname ${AUTOSCALER_DEPLOY})
        cat << EOF > ${AUTOSCALER_DEPLOY}
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: cluster-autoscaler-role
rules:
  - apiGroups: [""]
    resources: ["events", "endpoints"]
    verbs: ["create", "patch"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["pods/status"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["endpoints"]
    resourceNames: ["cluster-autoscaler"]
    verbs: ["get", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["watch", "list", "get", "update"]
  - apiGroups: [""]
    resources:
      - "pods"
      - "services"
      - "replicationcontrollers"
      - "persistentvolumeclaims"
      - "persistentvolumes"
    verbs: ["watch", "list", "get"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["watch", "list"]
  - apiGroups: ["apps"]
    resources: ["daemonsets", "replicasets", "statefulsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["watch", "list", "get"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["cluster-autoscaler-status"]
    verbs: ["delete", "get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler-rolebinding
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler-role
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler-account
    namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-autoscaler-account
  namespace: kube-system
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    app: cluster-autoscaler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      namespace: kube-system
      labels:
        app: cluster-autoscaler
    spec:
      nodeSelector:
        node-role.kubernetes.io/master: ""
      securityContext:
        runAsUser: 1001
      hostNetwork: True
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - key: CriticalAddonsOnly
          operator: Exists
        - effect: NoExecute
          operator: Exists
        - key: node.cloudprovider.kubernetes.io/uninitialized
          value: "true"
          effect: NoSchedule
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      serviceAccountName: cluster-autoscaler-account
      containers:
        - name: cluster-autoscaler
          image: ${_docker_ca_prefix}cluster-autoscaler:${AUTOSCALER_TAG}
          imagePullPolicy: Always
          command:
            - ./cluster-autoscaler
            - --alsologtostderr
            - --cloud-provider=magnum
            - --cluster-name=${CLUSTER_UUID}
            - --cloud-config=/config/cloud-config
            - --nodes=${MIN_NODE_COUNT}:${MAX_NODE_COUNT}:default-worker
            - --scale-down-unneeded-time=10m
            - --scale-down-delay-after-failure=3m
            - --scale-down-delay-after-add=10m
          volumeMounts:
            - name: ca-bundle
              mountPath: /etc/kubernetes
              readOnly: true
            - name: cloud-config
              mountPath: /config
              readOnly: true
      volumes:
        - name: ca-bundle
          secret:
            secretName: ca-bundle
        - name: cloud-config
          secret:
            secretName: cluster-autoscaler-cloud-config
EOF
    }

    # The autoscaler talks to the local (insecure) API endpoint; wait until
    # it answers before creating any resources.
    echo "Waiting for Kubernetes API..."
    until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
    do
        sleep 5
    done

    # CA bundle secret mounted read-only into the autoscaler pod.
    kubectl create secret generic ca-bundle --from-file=/etc/kubernetes/ca-bundle.crt -n kube-system

    # Trustee credentials for the magnum cloud provider; rendered here (not in
    # the manifest file on disk) so the password is not persisted to disk.
    cat <<EOF | kubectl apply -f -
---
apiVersion: v1
kind: Secret
metadata:
  name: cluster-autoscaler-cloud-config
  namespace: kube-system
type: Opaque
stringData:
  cloud-config: |-
    [Global]
    auth-url=$AUTH_URL
    user-id=$TRUSTEE_USER_ID
    password=$TRUSTEE_PASSWORD
    trust-id=$TRUST_ID
    region=$REGION_NAME
    ca-file=/etc/kubernetes/ca-bundle.crt
EOF

    kubectl apply -f ${AUTOSCALER_DEPLOY}
fi

printf "Finished running ${step}\n"

6
magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml

@ -90,3 +90,9 @@ write_files:
TILLER_NAMESPACE="$TILLER_NAMESPACE"
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"
NGINX_INGRESS_CONTROLLER_TAG="$NGINX_INGRESS_CONTROLLER_TAG"
AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"
AUTO_SCALING_ENABLED="$AUTO_SCALING_ENABLED"
DRAINO_TAG="$DRAINO_TAG"
AUTOSCALER_TAG="$AUTOSCALER_TAG"
MIN_NODE_COUNT="$MIN_NODE_COUNT"
MAX_NODE_COUNT="$MAX_NODE_COUNT"

1
magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.yaml

@ -53,3 +53,4 @@ write_files:
KUBEPROXY_OPTIONS="$KUBEPROXY_OPTIONS"
OCTAVIA_ENABLED="$OCTAVIA_ENABLED"
HEAT_CONTAINER_AGENT_TAG="$HEAT_CONTAINER_AGENT_TAG"
AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"

34
magnum/drivers/heat/k8s_fedora_template_def.py

@ -116,15 +116,12 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'to be true or unset.'))
label_list = ['kube_tag', 'container_infra_prefix',
'availability_zone',
'cgroup_driver',
'availability_zone', 'cgroup_driver',
'calico_tag', 'calico_cni_tag',
'calico_kube_controllers_tag', 'calico_ipv4pool',
'etcd_tag', 'flannel_tag', 'flannel_cni_tag',
'cloud_provider_enabled',
'cloud_provider_tag',
'prometheus_tag',
'grafana_tag',
'cloud_provider_enabled', 'cloud_provider_tag',
'prometheus_tag', 'grafana_tag',
'heat_container_agent_tag',
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
'monitoring_enabled',
@ -132,7 +129,10 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'tiller_tag',
'tiller_namespace',
'node_problem_detector_tag',
'nginx_ingress_controller_tag']
'nginx_ingress_controller_tag',
'auto_healing_enabled', 'auto_scaling_enabled',
'draino_tag', 'autoscaler_tag',
'min_node_count', 'max_node_count']
for label in label_list:
label_value = cluster.labels.get(label)
@ -146,6 +146,19 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
extra_params['kube_service_account_private_key'] = \
csr_keys["private_key"].replace("\n", "\\n")
extra_params['project_id'] = cluster.project_id
if not extra_params.get('max_node_count'):
extra_params['max_node_count'] = cluster.node_count + 1
self._set_cert_manager_params(cluster, extra_params)
return super(K8sFedoraTemplateDefinition,
self).get_params(context, cluster_template, cluster,
extra_params=extra_params,
**kwargs)
def _set_cert_manager_params(self, cluster, extra_params):
cert_manager_api = cluster.labels.get('cert_manager_api')
if strutils.bool_from_string(cert_manager_api):
extra_params['cert_manager_api'] = cert_manager_api
@ -161,13 +174,6 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
ca_cert.get_private_key(),
ca_cert.get_private_key_passphrase()).replace("\n", "\\n")
extra_params['project_id'] = cluster.project_id
return super(K8sFedoraTemplateDefinition,
self).get_params(context, cluster_template, cluster,
extra_params=extra_params,
**kwargs)
def get_env_files(self, cluster_template, cluster):
env_files = []

43
magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml

@ -565,6 +565,20 @@ parameters:
description: namespace where tiller will be installed.
default: "magnum-tiller"
auto_healing_enabled:
type: boolean
description: >
true if the auto healing feature should be enabled
default:
false
auto_scaling_enabled:
type: boolean
description: >
true if the auto scaling feature should be enabled
default:
false
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
@ -575,6 +589,27 @@ parameters:
description: nginx ingress controller docker image tag
default: 0.23.0
draino_tag:
type: string
description: tag of the draino container
default: abf028a
autoscaler_tag:
type: string
description: tag of the autoscaler container
default: v1.0
min_node_count:
type: number
description: >
minimum node count of cluster workers when doing scale down
default: 1
max_node_count:
type: number
description: >
maximum node count of cluster workers when doing scale up
resources:
######################################################################
@ -852,6 +887,12 @@ resources:
tiller_namespace: {get_param: tiller_namespace}
node_problem_detector_tag: {get_param: node_problem_detector_tag}
nginx_ingress_controller_tag: {get_param: nginx_ingress_controller_tag}
auto_healing_enabled: {get_param: auto_healing_enabled}
auto_scaling_enabled: {get_param: auto_scaling_enabled}
draino_tag: {get_param: draino_tag}
autoscaler_tag: {get_param: autoscaler_tag}
min_node_count: {get_param: min_node_count}
max_node_count: {get_param: max_node_count}
kube_cluster_config:
type: OS::Heat::SoftwareConfig
@ -882,6 +923,7 @@ resources:
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-scaling.sh
# Helm Based Installation Configuration Scripts
- get_file: ../../common/templates/kubernetes/helm/metrics-server.sh
- str_replace:
@ -979,6 +1021,7 @@ resources:
kubeproxy_options: {get_param: kubeproxy_options}
octavia_enabled: {get_param: octavia_enabled}
heat_container_agent_tag: {get_param: heat_container_agent_tag}
auto_healing_enabled: {get_param: auto_healing_enabled}
outputs:

34
magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml

@ -439,6 +439,16 @@ parameters:
type: string
description: namespace where tiller will be installed
auto_healing_enabled:
type: boolean
description: >
true if the auto healing feature should be enabled
auto_scaling_enabled:
type: boolean
description: >
true if the auto scaling feature should be enabled
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
@ -447,6 +457,24 @@ parameters:
type: string
description: nginx ingress controller docker image tag
draino_tag:
type: string
description: tag of the draino container
autoscaler_tag:
type: string
description: tag of the autoscaler container
min_node_count:
type: number
description: >
minimum node count of cluster workers when doing scale down
max_node_count:
type: number
description: >
maximum node count of cluster workers when doing scale up
resources:
######################################################################
#
@ -560,6 +588,12 @@ resources:
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
"$NGINX_INGRESS_CONTROLLER_TAG": {get_param: nginx_ingress_controller_tag}
"$AUTO_HEALING_ENABLED": {get_param: auto_healing_enabled}
"$AUTO_SCALING_ENABLED": {get_param: auto_scaling_enabled}
"$DRAINO_TAG": {get_param: draino_tag}
"$AUTOSCALER_TAG": {get_param: autoscaler_tag}
"$MIN_NODE_COUNT": {get_param: min_node_count}
"$MAX_NODE_COUNT": {get_param: max_node_count}
install_openstack_ca:
type: OS::Heat::SoftwareConfig

7
magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml

@ -276,6 +276,11 @@ parameters:
type: string
description: tag of the heat_container_agent system container
auto_healing_enabled:
type: boolean
description: >
true if the auto healing feature should be enabled
resources:
start_container_agent:
@ -355,6 +360,8 @@ resources:
$KUBEPROXY_OPTIONS: {get_param: kubeproxy_options}
$OCTAVIA_ENABLED: {get_param: octavia_enabled}
$HEAT_CONTAINER_AGENT_TAG: {get_param: heat_container_agent_tag}
$AUTO_HEALING_ENABLED: {get_param: auto_healing_enabled}
install_openstack_ca:
type: OS::Heat::SoftwareConfig

15
magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py

@ -331,11 +331,15 @@ class TestClusterConductorWithK8s(base.TestCase):
'kube_service_account_key': 'public_key',
'kube_service_account_private_key': 'private_key',
'portal_network_cidr': '10.254.0.0/16',
'project_id': 'project_id'
'project_id': 'project_id',
'max_node_count': 2,
}
if missing_attr is not None:
expected.pop(mapping[missing_attr], None)
if missing_attr == 'node_count':
expected['max_node_count'] = None
self.assertEqual(expected, definition)
self.assertEqual(
['../../common/templates/environments/no_private_network.yaml',
@ -459,7 +463,8 @@ class TestClusterConductorWithK8s(base.TestCase):
'kube_service_account_key': 'public_key',
'kube_service_account_private_key': 'private_key',
'portal_network_cidr': '10.254.0.0/16',
'project_id': 'project_id'
'project_id': 'project_id',
'max_node_count': 2,
}
self.assertEqual(expected, definition)
@ -574,7 +579,8 @@ class TestClusterConductorWithK8s(base.TestCase):
'kube_service_account_key': 'public_key',
'kube_service_account_private_key': 'private_key',
'portal_network_cidr': '10.254.0.0/16',
'project_id': 'project_id'
'project_id': 'project_id',
'max_node_count': 2,
}
self.assertEqual(expected, definition)
self.assertEqual(
@ -1000,7 +1006,8 @@ class TestClusterConductorWithK8s(base.TestCase):
'kube_service_account_key': 'public_key',
'kube_service_account_private_key': 'private_key',
'portal_network_cidr': '10.254.0.0/16',
'project_id': 'project_id'
'project_id': 'project_id',
'max_node_count': 2,
}
self.assertEqual(expected, definition)
self.assertEqual(

28
magnum/tests/unit/drivers/test_template_definition.py

@ -517,6 +517,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
auto_healing_enabled = mock_cluster.labels.get(
'auto_healing_enabled')
auto_scaling_enabled = mock_cluster.labels.get(
'auto_scaling_enabled')
draino_tag = mock_cluster.labels.get('draino_tag')
autoscaler_tag = mock_cluster.labels.get('autoscaler_tag')
min_node_count = mock_cluster.labels.get('min_node_count')
max_node_count = mock_cluster.labels.get('max_node_count')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -581,6 +589,12 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
'auto_healing_enabled': auto_healing_enabled,
'auto_scaling_enabled': auto_scaling_enabled,
'draino_tag': draino_tag,
'autoscaler_tag': autoscaler_tag,
'min_node_count': min_node_count,
'max_node_count': max_node_count,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,
@ -893,6 +907,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
auto_healing_enabled = mock_cluster.labels.get(
'auto_healing_enabled')
auto_scaling_enabled = mock_cluster.labels.get(
'auto_scaling_enabled')
draino_tag = mock_cluster.labels.get('draino_tag')
autoscaler_tag = mock_cluster.labels.get('autoscaler_tag')
min_node_count = mock_cluster.labels.get('min_node_count')
max_node_count = mock_cluster.labels.get('max_node_count')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -959,6 +981,12 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
'auto_healing_enabled': auto_healing_enabled,
'auto_scaling_enabled': auto_scaling_enabled,
'draino_tag': draino_tag,
'autoscaler_tag': autoscaler_tag,
'min_node_count': min_node_count,
'max_node_count': max_node_count,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,

11
releasenotes/notes/support-auto-healing-3e07c16c55209b0a.yaml

@ -0,0 +1,11 @@
---
features:
- |
Using Node Problem Detector, Draino and AutoScaler to support
auto healing for K8s clusters; users can use the new label
"auto_healing_enabled" to turn it on/off.
Meanwhile, a new label "auto_scaling_enabled" is also introduced
to enable the capability to let the k8s cluster auto scale based
on its workload.
Loading…
Cancel
Save