diff --git a/doc/source/user/index.rst b/doc/source/user/index.rst index ad40060fc8..2c4fdb4074 100644 --- a/doc/source/user/index.rst +++ b/doc/source/user/index.rst @@ -403,6 +403,10 @@ the table are linked to more details elsewhere in the user guide. | `auto_healing_enabled`_ | - true | false | | | - false | | +---------------------------------------+--------------------+---------------+ +| `auto_healing_controller`_ | see below | "draino" | ++---------------------------------------+--------------------+---------------+ +| `magnum_auto_healer_tag`_ | see below | see below | ++---------------------------------------+--------------------+---------------+ | `auto_scaling_enabled`_ | - true | false | | | - false | | +---------------------------------------+--------------------+---------------+ @@ -1309,6 +1313,14 @@ _`master_lb_floating_ip_enabled` _`auto_healing_enabled` If set to true, auto healing feature will be enabled. Defaults to false. +_`auto_healing_controller` + This label sets the auto-healing service to be used. Currently ``draino`` and + ``magnum-auto-healer`` are supported. The default is ``draino``. For more + details, see + `draino doc <https://github.com/planetlabs/draino>`_ and + `magnum-auto-healer doc + <https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/using-magnum-auto-healer.md>`_. + _`auto_scaling_enabled` If set to true, auto scaling feature will be enabled. Defaults to true. @@ -1319,6 +1331,10 @@ _`node_problem_detector_tag` _`draino_tag` This label allows users to select a specific Draino version. +_`magnum_auto_healer_tag` + This label allows users to select a specific magnum-auto-healer version. + The default value for Train: v1.15.0 + _`autoscaler_tag` This label allows users to select a specific Cluster Autoscaler version. 
diff --git a/magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh b/magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh index 0f4a21e98c..f2b3a4bd85 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh +++ b/magnum/drivers/common/templates/kubernetes/fragments/configure-kubernetes-minion.sh @@ -160,7 +160,9 @@ KUBELET_ARGS="${KUBELET_ARGS} --client-ca-file=${CERT_DIR}/ca.crt --tls-cert-fil # specified cgroup driver KUBELET_ARGS="${KUBELET_ARGS} --cgroup-driver=${CGROUP_DRIVER}" -if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then +auto_healing_enabled=$(echo ${AUTO_HEALING_ENABLED} | tr '[:upper:]' '[:lower:]') +autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]') +if [[ "${auto_healing_enabled}" = "true" && "${autohealing_controller}" = "draino" ]]; then KUBELET_ARGS="${KUBELET_ARGS} --node-labels=draino-enabled=true" fi diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh index 1e95c995a7..04a46bbb25 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh @@ -8,7 +8,7 @@ printf "Starting to run ${step}\n" _gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/} # Either auto scaling or auto healing we need CA to be deployed -if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" || "$(echo $NPD_ENABLED | tr '[:upper:]' '[:lower:]')" = "true"]; then +if [[ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" || "$(echo $NPD_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]]; then # Generate Node Problem Detector manifest file NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml @@ -121,18 +121,15 @@ EOF fi 
-_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/} -step="enable-auto-healing" -printf "Starting to run ${step}\n" +function enable_draino { + echo "Installing draino" + _docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/} + draino_manifest=/srv/magnum/kubernetes/manifests/draino.yaml -if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then - # Generate Draino manifest file - DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml - - [ -f ${DRAINO_DEPLOY} ] || { - echo "Writing File: $DRAINO_DEPLOY" - mkdir -p $(dirname ${DRAINO_DEPLOY}) - cat << EOF > ${DRAINO_DEPLOY} + [ -f ${draino_manifest} ] || { + echo "Writing File: $draino_manifest" + mkdir -p $(dirname ${draino_manifest}) + cat << EOF > ${draino_manifest} --- apiVersion: v1 kind: ServiceAccount @@ -222,7 +219,156 @@ spec: EOF } - kubectl apply -f ${DRAINO_DEPLOY} + kubectl apply -f ${draino_manifest} +} +function enable_magnum_auto_healer { + echo "Installing magnum_auto_healer" + image_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/k8scloudprovider/} + image_prefix=${image_prefix%/} + magnum_auto_healer_manifest=/srv/magnum/kubernetes/manifests/magnum_auto_healer.yaml + + [ -f ${magnum_auto_healer_manifest} ] || { + echo "Writing File: ${magnum_auto_healer_manifest}" + mkdir -p $(dirname ${magnum_auto_healer_manifest}) + cat << EOF > ${magnum_auto_healer_manifest} +--- +kind: ServiceAccount +apiVersion: v1 +metadata: + name: magnum-auto-healer + namespace: kube-system + +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: magnum-auto-healer +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: + - kind: ServiceAccount + name: magnum-auto-healer + namespace: kube-system + +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: magnum-auto-healer-config + namespace: kube-system +data: + config.yaml: | + cluster-name: ${CLUSTER_UUID} + dry-run: false + 
monitor-interval: 30s + check-delay-after-add: 20m + leader-elect: true + healthcheck: + master: + - type: Endpoint + params: + unhealthy-duration: 3m + protocol: HTTPS + port: 6443 + endpoints: ["/healthz"] + ok-codes: [200] + - type: NodeCondition + params: + unhealthy-duration: 3m + types: ["Ready"] + ok-values: ["True"] + worker: + - type: NodeCondition + params: + unhealthy-duration: 3m + types: ["Ready"] + ok-values: ["True"] + openstack: + auth-url: ${AUTH_URL} + user-id: ${TRUSTEE_USER_ID} + password: ${TRUSTEE_PASSWORD} + trust-id: ${TRUST_ID} + region: ${REGION_NAME} + ca-file: /etc/kubernetes/ca-bundle.crt + +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: magnum-auto-healer + namespace: kube-system + labels: + k8s-app: magnum-auto-healer +spec: + selector: + matchLabels: + k8s-app: magnum-auto-healer + template: + metadata: + labels: + k8s-app: magnum-auto-healer + spec: + hostNetwork: true + serviceAccountName: magnum-auto-healer + tolerations: + - effect: NoSchedule + operator: Exists + - key: CriticalAddonsOnly + operator: Exists + - effect: NoExecute + operator: Exists + nodeSelector: + node-role.kubernetes.io/master: "" + containers: + - name: magnum-auto-healer + image: ${image_prefix}/magnum-auto-healer:${MAGNUM_AUTO_HEALER_TAG} + imagePullPolicy: Always + args: + - /bin/magnum-auto-healer + - --config=/etc/magnum-auto-healer/config.yaml + - --v + - "2" + volumeMounts: + - name: config + mountPath: /etc/magnum-auto-healer + - name: kubernetes-config + mountPath: /etc/kubernetes + readOnly: true + volumes: + - name: config + configMap: + name: magnum-auto-healer-config + - name: kubernetes-config + hostPath: + path: /etc/kubernetes +EOF + } + + kubectl apply -f ${magnum_auto_healer_manifest} +} + +step="enable-auto-healing" +printf "Starting to run ${step}\n" + +if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then + autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]') + 
case "${autohealing_controller}" in + "") + echo "No autohealing controller configured." + ;; + "draino") + enable_draino + ;; + "magnum-auto-healer") + enable_magnum_auto_healer + ;; + *) + echo "Autohealing controller ${autohealing_controller} not supported." + ;; + esac fi + printf "Finished running ${step}\n" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh index 132a5b0f28..b32cd4a048 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh +++ b/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-scaling.sh @@ -7,8 +7,11 @@ printf "Starting to run ${step}\n" _docker_ca_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/} -# Either auto scaling or auto healing we need CA to be deployed -if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" || "$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true"]; then +auto_scaling_enabled=$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]') +auto_healing_enabled=$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]') +autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]') + +if [[ "${auto_scaling_enabled}" = "true" || ("${auto_healing_enabled}" = "true" && "${autohealing_controller}" = "draino") ]]; then # Generate Autoscaler manifest file AUTOSCALER_DEPLOY=/srv/magnum/kubernetes/manifests/autoscaler.yaml diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh index 2aa6008498..0fa28514ee 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.sh @@ -96,8 +96,10 @@ HEAT_PARAMS=/etc/sysconfig/heat-params 
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG" NGINX_INGRESS_CONTROLLER_TAG="$NGINX_INGRESS_CONTROLLER_TAG" AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED" + AUTO_HEALING_CONTROLLER="$AUTO_HEALING_CONTROLLER" AUTO_SCALING_ENABLED="$AUTO_SCALING_ENABLED" DRAINO_TAG="$DRAINO_TAG" + MAGNUM_AUTO_HEALER_TAG="$MAGNUM_AUTO_HEALER_TAG" AUTOSCALER_TAG="$AUTOSCALER_TAG" MIN_NODE_COUNT="$MIN_NODE_COUNT" MAX_NODE_COUNT="$MAX_NODE_COUNT" diff --git a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.sh b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.sh index 6d2fc93c72..5a4f50eefa 100644 --- a/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.sh +++ b/magnum/drivers/common/templates/kubernetes/fragments/write-heat-params.sh @@ -55,6 +55,8 @@ KUBELET_OPTIONS="$KUBELET_OPTIONS" KUBEPROXY_OPTIONS="$KUBEPROXY_OPTIONS" OCTAVIA_ENABLED="$OCTAVIA_ENABLED" HEAT_CONTAINER_AGENT_TAG="$HEAT_CONTAINER_AGENT_TAG" +AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED" +AUTO_HEALING_CONTROLLER="$AUTO_HEALING_CONTROLLER" EOF } diff --git a/magnum/drivers/heat/k8s_fedora_template_def.py b/magnum/drivers/heat/k8s_fedora_template_def.py index 4bd399a61d..62f05a1a64 100644 --- a/magnum/drivers/heat/k8s_fedora_template_def.py +++ b/magnum/drivers/heat/k8s_fedora_template_def.py @@ -140,6 +140,7 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition): 'node_problem_detector_tag', 'nginx_ingress_controller_tag', 'auto_healing_enabled', 'auto_scaling_enabled', + 'auto_healing_controller', 'magnum_auto_healer_tag', 'draino_tag', 'autoscaler_tag', 'min_node_count', 'max_node_count', 'npd_enabled'] diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml index 4082c1a305..ea0745edee 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml @@ -610,6 
+610,17 @@ parameters: default: false + auto_healing_controller: + type: string + description: > + The service to be deployed for auto-healing. + default: "draino" + + magnum_auto_healer_tag: + type: string + description: tag of the magnum-auto-healer service. + default: "v1.15.0" + auto_scaling_enabled: type: boolean description: > @@ -949,6 +960,8 @@ resources: node_problem_detector_tag: {get_param: node_problem_detector_tag} nginx_ingress_controller_tag: {get_param: nginx_ingress_controller_tag} auto_healing_enabled: {get_param: auto_healing_enabled} + auto_healing_controller: {get_param: auto_healing_controller} + magnum_auto_healer_tag: {get_param: magnum_auto_healer_tag} auto_scaling_enabled: {get_param: auto_scaling_enabled} draino_tag: {get_param: draino_tag} autoscaler_tag: {get_param: autoscaler_tag} @@ -1090,6 +1103,7 @@ resources: heat_container_agent_tag: {get_param: heat_container_agent_tag} auto_healing_enabled: {get_param: auto_healing_enabled} npd_enabled: {get_param: npd_enabled} + auto_healing_controller: {get_param: auto_healing_controller} outputs: diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml index e4d612e4ea..437ac229df 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml @@ -457,6 +457,17 @@ parameters: description: > true if the auto healing feature should be enabled + auto_healing_controller: + type: string + description: > + The service to be deployed for auto-healing. + default: "draino" + + magnum_auto_healer_tag: + type: string + description: tag of the magnum-auto-healer service. 
+ default: "v1.15.0" + auto_scaling_enabled: type: boolean description: > @@ -634,6 +645,8 @@ resources: "$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag} "$NGINX_INGRESS_CONTROLLER_TAG": {get_param: nginx_ingress_controller_tag} "$AUTO_HEALING_ENABLED": {get_param: auto_healing_enabled} + "$AUTO_HEALING_CONTROLLER": {get_param: auto_healing_controller} + "$MAGNUM_AUTO_HEALER_TAG": {get_param: magnum_auto_healer_tag} "$AUTO_SCALING_ENABLED": {get_param: auto_scaling_enabled} "$DRAINO_TAG": {get_param: draino_tag} "$AUTOSCALER_TAG": {get_param: autoscaler_tag} diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml index 2dd02e0d27..0f1ea8eb77 100644 --- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml +++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml @@ -281,6 +281,12 @@ parameters: description: > true if the auto healing feature should be enabled + auto_healing_controller: + type: string + description: > + The service to be deployed for auto-healing. 
+ default: "draino" + npd_enabled: type: boolean description: > @@ -373,6 +379,7 @@ resources: $OCTAVIA_ENABLED: {get_param: octavia_enabled} $HEAT_CONTAINER_AGENT_TAG: {get_param: heat_container_agent_tag} $AUTO_HEALING_ENABLED: {get_param: auto_healing_enabled} + $AUTO_HEALING_CONTROLLER: {get_param: auto_healing_controller} $NPD_ENABLED: {get_param: npd_enabled} - get_file: ../../common/templates/kubernetes/fragments/write-kube-os-config.sh - get_file: ../../common/templates/kubernetes/fragments/make-cert-client.sh diff --git a/magnum/tests/unit/drivers/test_template_definition.py b/magnum/tests/unit/drivers/test_template_definition.py index 7a9dcdd9f6..1cb87d3ca3 100644 --- a/magnum/tests/unit/drivers/test_template_definition.py +++ b/magnum/tests/unit/drivers/test_template_definition.py @@ -524,6 +524,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'traefik_ingress_controller_tag') auto_healing_enabled = mock_cluster.labels.get( 'auto_healing_enabled') + auto_healing_controller = mock_cluster.labels.get( + 'auto_healing_controller') + magnum_auto_healer_tag = mock_cluster.labels.get( + 'magnum_auto_healer_tag') auto_scaling_enabled = mock_cluster.labels.get( 'auto_scaling_enabled') draino_tag = mock_cluster.labels.get('draino_tag') @@ -600,6 +604,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_namespace': tiller_namespace, 'node_problem_detector_tag': npd_tag, 'auto_healing_enabled': auto_healing_enabled, + 'auto_healing_controller': auto_healing_controller, + 'magnum_auto_healer_tag': magnum_auto_healer_tag, 'auto_scaling_enabled': auto_scaling_enabled, 'draino_tag': draino_tag, 'autoscaler_tag': autoscaler_tag, @@ -934,6 +940,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'traefik_ingress_controller_tag') auto_healing_enabled = mock_cluster.labels.get( 'auto_healing_enabled') + auto_healing_controller = mock_cluster.labels.get( + 
 'auto_healing_controller') + magnum_auto_healer_tag = mock_cluster.labels.get( + 'magnum_auto_healer_tag') auto_scaling_enabled = mock_cluster.labels.get( 'auto_scaling_enabled') draino_tag = mock_cluster.labels.get('draino_tag') @@ -1012,6 +1022,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase): 'tiller_namespace': tiller_namespace, 'node_problem_detector_tag': npd_tag, 'auto_healing_enabled': auto_healing_enabled, + 'auto_healing_controller': auto_healing_controller, + 'magnum_auto_healer_tag': magnum_auto_healer_tag, 'auto_scaling_enabled': auto_scaling_enabled, 'draino_tag': draino_tag, 'autoscaler_tag': autoscaler_tag, diff --git a/releasenotes/notes/support-auto-healing-controller-333d1266918111e9.yaml b/releasenotes/notes/support-auto-healing-controller-333d1266918111e9.yaml new file mode 100644 index 0000000000..7cead46417 --- /dev/null +++ b/releasenotes/notes/support-auto-healing-controller-333d1266918111e9.yaml @@ -0,0 +1,7 @@ +--- +features: + - A new label ``auto_healing_controller`` is introduced to allow the user to + choose the auto-healing service when ``auto_healing_enabled`` is specified + in the labels. ``draino`` and ``magnum-auto-healer`` are supported for now. + Another label ``magnum_auto_healer_tag`` is also added to specify the + ``magnum-auto-healer`` image tag.