[fedora_atomic] Support auto healing for k8s
Support auto healing for Kubernetes clusters using Node Problem Detector, Draino and the Cluster Autoscaler. Users can turn the feature on or off with a new label, "auto_healing_enabled". A second new label, "auto_scaling_enabled", lets the cluster scale automatically based on its workload.

Task: 28923
Story: 2004782
Change-Id: I25af2a72a7a960205929374d2300bd83d4d20960
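For illustration only: both features are driven entirely by labels at cluster creation time. The cluster and template names below are placeholders (only the label keys come from this change), so enabling everything might look roughly like:

    # Hypothetical cluster/template names; only the label keys are defined by this change.
    openstack coe cluster create my-cluster \
        --cluster-template my-fedora-atomic-template \
        --node-count 2 \
        --labels auto_healing_enabled=true,auto_scaling_enabled=true,min_node_count=1,max_node_count=5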
parent f194b5b7fa
commit 75fab6ff37
@ -392,6 +392,22 @@ the table are linked to more details elsewhere in the user guide.
| `master_lb_floating_ip_enabled`_      | - true             | see below     |
|                                       | - false            |               |
+---------------------------------------+--------------------+---------------+
| `auto_healing_enabled`_               | - true             | false         |
|                                       | - false            |               |
+---------------------------------------+--------------------+---------------+
| `auto_scaling_enabled`_               | - true             | true          |
|                                       | - false            |               |
+---------------------------------------+--------------------+---------------+
| `node_problem_detector_tag`_          | see below          | see below     |
+---------------------------------------+--------------------+---------------+
| `draino_tag`_                         | see below          | see below     |
+---------------------------------------+--------------------+---------------+
| `autoscaler_tag`_                     | see below          | see below     |
+---------------------------------------+--------------------+---------------+
| `min_node_count`_                     | see below          | see below     |
+---------------------------------------+--------------------+---------------+
| `max_node_count`_                     | see below          | see below     |
+---------------------------------------+--------------------+---------------+

Cluster
-------
@ -1119,6 +1135,9 @@ _`container_infra_prefix`
  * quay.io/coreos/configmap-reload:v0.0.1
  * quay.io/coreos/prometheus-config-reloader:v0.26.0
  * quay.io/prometheus/prometheus:v2.5.0
  * k8s.gcr.io/node-problem-detector:v0.6.2
  * docker.io/planetlabs/draino:abf028a
  * docker.io/openstackmagnum/cluster-autoscaler:v1.0

_`kube_tag`
  This label allows users to select `a specific Kubernetes release,
@ -1257,6 +1276,31 @@ _`master_lb_floating_ip_enabled`
  ``master_lb_enabled`` is set. If not specified, the default value is the same
  as template property ``floating_ip_enabled``.

_`auto_healing_enabled`
  If set to true, the auto healing feature will be enabled. Defaults to false.

_`auto_scaling_enabled`
  If set to true, the auto scaling feature will be enabled. Defaults to true.

_`node_problem_detector_tag`
  This label allows users to select a specific Node Problem Detector
  version.

_`draino_tag`
  This label allows users to select a specific Draino version.

_`autoscaler_tag`
  This label allows users to select a specific Cluster Autoscaler version.

_`min_node_count`
  The minimum node count of the cluster when doing auto scaling or auto
  healing. Defaults to 1.

_`max_node_count`
  The maximum node count of the cluster when doing auto scaling or auto
  healing.
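As a rough way to confirm the add-ons behind these labels are running (not part of this change; it assumes kubectl is already pointed at the new cluster), the label selectors below match the manifests added by this commit:

    kubectl -n kube-system get pods -l component=draino        # Draino, deployed when auto_healing_enabled=true
    kubectl -n kube-system get pods -l app=cluster-autoscaler  # Cluster Autoscaler, deployed for auto healing or auto scaling
    kubectl -n kube-system get daemonsets                      # Node Problem Detector is expected to run as a DaemonSet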

External load balancer for services
-----------------------------------
@ -154,6 +154,10 @@ KUBELET_ARGS="${KUBELET_ARGS} --client-ca-file=${CERT_DIR}/ca.crt --tls-cert-fil
# specified cgroup driver
KUBELET_ARGS="${KUBELET_ARGS} --cgroup-driver=${CGROUP_DRIVER}"

if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    KUBELET_ARGS="${KUBELET_ARGS} --node-labels=draino-enabled=true"
fi

systemctl disable docker
if cat /usr/lib/systemd/system/docker.service | grep 'native.cgroupdriver'; then
    cp /usr/lib/systemd/system/docker.service /etc/systemd/system/
@ -1,6 +1,6 @@
#!/bin/sh

step="enable-auto-healing"
step="enable-node-problem-detector"
printf "Starting to run ${step}\n"

. /etc/sysconfig/heat-params
@ -68,7 +68,7 @@ spec:
        - "/bin/sh"
        - "-c"
        # Pass both config to support both journald and syslog.
        - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
        - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json 2>&1 | tee /var/log/node-problem-detector.log"
        securityContext:
          privileged: true
        resources:
@ -115,3 +115,109 @@ done
kubectl apply -f ${NPD_DEPLOY}

printf "Finished running ${step}\n"

_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
step="enable-auto-healing"
printf "Starting to run ${step}\n"

if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    # Generate Draino manifest file
    DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml

    [ -f ${DRAINO_DEPLOY} ] || {
        echo "Writing File: $DRAINO_DEPLOY"
        mkdir -p $(dirname ${DRAINO_DEPLOY})
        cat << EOF > ${DRAINO_DEPLOY}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels: {component: draino}
  name: draino
rules:
- apiGroups: ['']
  resources: [events]
  verbs: [create, patch, update]
- apiGroups: ['']
  resources: [nodes]
  verbs: [get, watch, list, update]
- apiGroups: ['']
  resources: [nodes/status]
  verbs: [patch]
- apiGroups: ['']
  resources: [pods]
  verbs: [get, watch, list]
- apiGroups: ['']
  resources: [pods/eviction]
  verbs: [create]
- apiGroups: [extensions]
  resources: [daemonsets]
  verbs: [get, watch, list]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels: {component: draino}
  name: draino
roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
subjects:
- {kind: ServiceAccount, name: draino, namespace: kube-system}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
spec:
  # Draino does not currently support locking/master election, so you should
  # only run one draino at a time. Draino won't start draining nodes immediately
  # so it's usually safe for multiple drainos to exist for a brief period of
  # time.
  replicas: 1
  selector:
    matchLabels: {component: draino}
  template:
    metadata:
      labels: {component: draino}
      name: draino
      namespace: kube-system
    spec:
      nodeSelector:
        node-role.kubernetes.io/master: ""
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
      - key: CriticalAddonsOnly
        operator: Exists
      - effect: NoExecute
        operator: Exists
      - key: node.cloudprovider.kubernetes.io/uninitialized
        value: "true"
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
      containers:
      # You'll want to change these labels and conditions to suit your deployment.
      - command: [/draino, --node-label=draino-enabled=true, --evict-daemonset-pods, --evict-emptydir-pods, NotReady]
        image: ${_docker_draino_prefix}draino:${DRAINO_TAG}
        livenessProbe:
          httpGet: {path: /healthz, port: 10002}
          initialDelaySeconds: 30
        name: draino
      serviceAccountName: draino
EOF
    }

    kubectl apply -f ${DRAINO_DEPLOY}

fi
printf "Finished running ${step}\n"
@ -0,0 +1,185 @@
#!/bin/sh

step="enable-auto-scaling"
printf "Starting to run ${step}\n"

. /etc/sysconfig/heat-params

_docker_ca_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}

# The Cluster Autoscaler needs to be deployed for either auto scaling or auto healing
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ] || [ "$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    # Generate Autoscaler manifest file
    AUTOSCALER_DEPLOY=/srv/magnum/kubernetes/manifests/autoscaler.yaml

    [ -f ${AUTOSCALER_DEPLOY} ] || {
        echo "Writing File: $AUTOSCALER_DEPLOY"
        mkdir -p $(dirname ${AUTOSCALER_DEPLOY})
        cat << EOF > ${AUTOSCALER_DEPLOY}
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: cluster-autoscaler-role
rules:
  - apiGroups: [""]
    resources: ["events", "endpoints"]
    verbs: ["create", "patch"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["pods/status"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["endpoints"]
    resourceNames: ["cluster-autoscaler"]
    verbs: ["get", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["watch", "list", "get", "update"]
  - apiGroups: [""]
    resources:
      - "pods"
      - "services"
      - "replicationcontrollers"
      - "persistentvolumeclaims"
      - "persistentvolumes"
    verbs: ["watch", "list", "get"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["watch", "list"]
  - apiGroups: ["apps"]
    resources: ["daemonsets", "replicasets", "statefulsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["watch", "list", "get"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["cluster-autoscaler-status"]
    verbs: ["delete", "get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler-rolebinding
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler-role
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler-account
    namespace: kube-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-autoscaler-account
  namespace: kube-system
---
kind: Deployment
apiVersion: apps/v1
metadata:
  name: cluster-autoscaler
  namespace: kube-system
  labels:
    app: cluster-autoscaler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      namespace: kube-system
      labels:
        app: cluster-autoscaler
    spec:
      nodeSelector:
        node-role.kubernetes.io/master: ""
      securityContext:
        runAsUser: 1001
      hostNetwork: True
      tolerations:
        - effect: NoSchedule
          operator: Exists
        - key: CriticalAddonsOnly
          operator: Exists
        - effect: NoExecute
          operator: Exists
        - key: node.cloudprovider.kubernetes.io/uninitialized
          value: "true"
          effect: NoSchedule
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
      serviceAccountName: cluster-autoscaler-account
      containers:
        - name: cluster-autoscaler
          image: ${_docker_ca_prefix}cluster-autoscaler:${AUTOSCALER_TAG}
          imagePullPolicy: Always
          command:
            - ./cluster-autoscaler
            - --alsologtostderr
            - --cloud-provider=magnum
            - --cluster-name=${CLUSTER_UUID}
            - --cloud-config=/config/cloud-config
            - --nodes=${MIN_NODE_COUNT}:${MAX_NODE_COUNT}:default-worker
            - --scale-down-unneeded-time=10m
            - --scale-down-delay-after-failure=3m
            - --scale-down-delay-after-add=10m
          volumeMounts:
            - name: ca-bundle
              mountPath: /etc/kubernetes
              readOnly: true
            - name: cloud-config
              mountPath: /config
              readOnly: true
      volumes:
        - name: ca-bundle
          secret:
            secretName: ca-bundle
        - name: cloud-config
          secret:
            secretName: cluster-autoscaler-cloud-config
EOF
    }

    echo "Waiting for Kubernetes API..."
    until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
    do
        sleep 5
    done

    kubectl create secret generic ca-bundle --from-file=/etc/kubernetes/ca-bundle.crt -n kube-system

    cat <<EOF | kubectl apply -f -
---
apiVersion: v1
kind: Secret
metadata:
  name: cluster-autoscaler-cloud-config
  namespace: kube-system
type: Opaque
stringData:
  cloud-config: |-
    [Global]
    auth-url=$AUTH_URL
    user-id=$TRUSTEE_USER_ID
    password=$TRUSTEE_PASSWORD
    trust-id=$TRUST_ID
    region=$REGION_NAME
    ca-file=/etc/kubernetes/ca-bundle.crt
EOF

    kubectl apply -f ${AUTOSCALER_DEPLOY}
fi
printf "Finished running ${step}\n"
@ -90,3 +90,9 @@ write_files:
      TILLER_NAMESPACE="$TILLER_NAMESPACE"
      NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"
      NGINX_INGRESS_CONTROLLER_TAG="$NGINX_INGRESS_CONTROLLER_TAG"
      AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"
      AUTO_SCALING_ENABLED="$AUTO_SCALING_ENABLED"
      DRAINO_TAG="$DRAINO_TAG"
      AUTOSCALER_TAG="$AUTOSCALER_TAG"
      MIN_NODE_COUNT="$MIN_NODE_COUNT"
      MAX_NODE_COUNT="$MAX_NODE_COUNT"

@ -53,3 +53,4 @@ write_files:
      KUBEPROXY_OPTIONS="$KUBEPROXY_OPTIONS"
      OCTAVIA_ENABLED="$OCTAVIA_ENABLED"
      HEAT_CONTAINER_AGENT_TAG="$HEAT_CONTAINER_AGENT_TAG"
      AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"
@ -116,15 +116,12 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
                                  'to be true or unset.'))

        label_list = ['kube_tag', 'container_infra_prefix',
                      'availability_zone',
                      'cgroup_driver',
                      'availability_zone', 'cgroup_driver',
                      'calico_tag', 'calico_cni_tag',
                      'calico_kube_controllers_tag', 'calico_ipv4pool',
                      'etcd_tag', 'flannel_tag', 'flannel_cni_tag',
                      'cloud_provider_enabled',
                      'cloud_provider_tag',
                      'prometheus_tag',
                      'grafana_tag',
                      'cloud_provider_enabled', 'cloud_provider_tag',
                      'prometheus_tag', 'grafana_tag',
                      'heat_container_agent_tag',
                      'keystone_auth_enabled', 'k8s_keystone_auth_tag',
                      'monitoring_enabled',
@ -132,7 +129,10 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
                      'tiller_tag',
                      'tiller_namespace',
                      'node_problem_detector_tag',
                      'nginx_ingress_controller_tag']
                      'nginx_ingress_controller_tag',
                      'auto_healing_enabled', 'auto_scaling_enabled',
                      'draino_tag', 'autoscaler_tag',
                      'min_node_count', 'max_node_count']

        for label in label_list:
            label_value = cluster.labels.get(label)
@ -146,6 +146,19 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
        extra_params['kube_service_account_private_key'] = \
            csr_keys["private_key"].replace("\n", "\\n")

        extra_params['project_id'] = cluster.project_id

        if not extra_params.get('max_node_count'):
            extra_params['max_node_count'] = cluster.node_count + 1

        self._set_cert_manager_params(cluster, extra_params)

        return super(K8sFedoraTemplateDefinition,
                     self).get_params(context, cluster_template, cluster,
                                      extra_params=extra_params,
                                      **kwargs)

    def _set_cert_manager_params(self, cluster, extra_params):
        cert_manager_api = cluster.labels.get('cert_manager_api')
        if strutils.bool_from_string(cert_manager_api):
            extra_params['cert_manager_api'] = cert_manager_api
@ -161,13 +174,6 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
                ca_cert.get_private_key(),
                ca_cert.get_private_key_passphrase()).replace("\n", "\\n")

        extra_params['project_id'] = cluster.project_id

        return super(K8sFedoraTemplateDefinition,
                     self).get_params(context, cluster_template, cluster,
                                      extra_params=extra_params,
                                      **kwargs)

    def get_env_files(self, cluster_template, cluster):
        env_files = []

@ -565,6 +565,20 @@ parameters:
    description: namespace where tiller will be installed.
    default: "magnum-tiller"

  auto_healing_enabled:
    type: boolean
    description: >
      true if the auto healing feature should be enabled
    default:
      false

  auto_scaling_enabled:
    type: boolean
    description: >
      true if the auto scaling feature should be enabled
    default:
      false

  node_problem_detector_tag:
    type: string
    description: tag of the node problem detector container
@ -575,6 +589,27 @@ parameters:
    description: nginx ingress controller docker image tag
    default: 0.23.0

  draino_tag:
    type: string
    description: tag of the draino container
    default: abf028a

  autoscaler_tag:
    type: string
    description: tag of the autoscaler container
    default: v1.0

  min_node_count:
    type: number
    description: >
      minimum node count of cluster workers when doing scale down
    default: 1

  max_node_count:
    type: number
    description: >
      maximum node count of cluster workers when doing scale up

resources:

  ######################################################################
@ -852,6 +887,12 @@ resources:
          tiller_namespace: {get_param: tiller_namespace}
          node_problem_detector_tag: {get_param: node_problem_detector_tag}
          nginx_ingress_controller_tag: {get_param: nginx_ingress_controller_tag}
          auto_healing_enabled: {get_param: auto_healing_enabled}
          auto_scaling_enabled: {get_param: auto_scaling_enabled}
          draino_tag: {get_param: draino_tag}
          autoscaler_tag: {get_param: autoscaler_tag}
          min_node_count: {get_param: min_node_count}
          max_node_count: {get_param: max_node_count}

  kube_cluster_config:
    type: OS::Heat::SoftwareConfig
@ -882,6 +923,7 @@ resources:
          - get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
          - get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
          - get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
          - get_file: ../../common/templates/kubernetes/fragments/enable-auto-scaling.sh
          # Helm Based Installation Configuration Scripts
          - get_file: ../../common/templates/kubernetes/helm/metrics-server.sh
          - str_replace:
@ -979,6 +1021,7 @@ resources:
          kubeproxy_options: {get_param: kubeproxy_options}
          octavia_enabled: {get_param: octavia_enabled}
          heat_container_agent_tag: {get_param: heat_container_agent_tag}
          auto_healing_enabled: {get_param: auto_healing_enabled}

outputs:

@ -439,6 +439,16 @@ parameters:
    type: string
    description: namespace where tiller will be installed

  auto_healing_enabled:
    type: boolean
    description: >
      true if the auto healing feature should be enabled

  auto_scaling_enabled:
    type: boolean
    description: >
      true if the auto scaling feature should be enabled

  node_problem_detector_tag:
    type: string
    description: tag of the node problem detector container
@ -447,6 +457,24 @@
    type: string
    description: nginx ingress controller docker image tag

  draino_tag:
    type: string
    description: tag of the draino container

  autoscaler_tag:
    type: string
    description: tag of the autoscaler container

  min_node_count:
    type: number
    description: >
      minimum node count of cluster workers when doing scale down

  max_node_count:
    type: number
    description: >
      maximum node count of cluster workers when doing scale up

resources:
  ######################################################################
  #
@ -560,6 +588,12 @@ resources:
          "$TILLER_NAMESPACE": {get_param: tiller_namespace}
          "$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
          "$NGINX_INGRESS_CONTROLLER_TAG": {get_param: nginx_ingress_controller_tag}
          "$AUTO_HEALING_ENABLED": {get_param: auto_healing_enabled}
          "$AUTO_SCALING_ENABLED": {get_param: auto_scaling_enabled}
          "$DRAINO_TAG": {get_param: draino_tag}
          "$AUTOSCALER_TAG": {get_param: autoscaler_tag}
          "$MIN_NODE_COUNT": {get_param: min_node_count}
          "$MAX_NODE_COUNT": {get_param: max_node_count}

  install_openstack_ca:
    type: OS::Heat::SoftwareConfig
@ -276,6 +276,11 @@ parameters:
    type: string
    description: tag of the heat_container_agent system container

  auto_healing_enabled:
    type: boolean
    description: >
      true if the auto healing feature should be enabled

resources:

  start_container_agent:
@ -355,6 +360,8 @@ resources:
            $KUBEPROXY_OPTIONS: {get_param: kubeproxy_options}
            $OCTAVIA_ENABLED: {get_param: octavia_enabled}
            $HEAT_CONTAINER_AGENT_TAG: {get_param: heat_container_agent_tag}
            $AUTO_HEALING_ENABLED: {get_param: auto_healing_enabled}

  install_openstack_ca:
    type: OS::Heat::SoftwareConfig
@ -331,11 +331,15 @@ class TestClusterConductorWithK8s(base.TestCase):
            'kube_service_account_key': 'public_key',
            'kube_service_account_private_key': 'private_key',
            'portal_network_cidr': '10.254.0.0/16',
            'project_id': 'project_id'
            'project_id': 'project_id',
            'max_node_count': 2,
        }
        if missing_attr is not None:
            expected.pop(mapping[missing_attr], None)

        if missing_attr == 'node_count':
            expected['max_node_count'] = None

        self.assertEqual(expected, definition)
        self.assertEqual(
            ['../../common/templates/environments/no_private_network.yaml',
@ -459,7 +463,8 @@ class TestClusterConductorWithK8s(base.TestCase):
            'kube_service_account_key': 'public_key',
            'kube_service_account_private_key': 'private_key',
            'portal_network_cidr': '10.254.0.0/16',
            'project_id': 'project_id'
            'project_id': 'project_id',
            'max_node_count': 2,
        }

        self.assertEqual(expected, definition)
@ -574,7 +579,8 @@ class TestClusterConductorWithK8s(base.TestCase):
            'kube_service_account_key': 'public_key',
            'kube_service_account_private_key': 'private_key',
            'portal_network_cidr': '10.254.0.0/16',
            'project_id': 'project_id'
            'project_id': 'project_id',
            'max_node_count': 2,
        }
        self.assertEqual(expected, definition)
        self.assertEqual(
@ -1000,7 +1006,8 @@ class TestClusterConductorWithK8s(base.TestCase):
            'kube_service_account_key': 'public_key',
            'kube_service_account_private_key': 'private_key',
            'portal_network_cidr': '10.254.0.0/16',
            'project_id': 'project_id'
            'project_id': 'project_id',
            'max_node_count': 2,
        }
        self.assertEqual(expected, definition)
        self.assertEqual(
@ -517,6 +517,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
        tiller_namespace = mock_cluster.labels.get(
            'tiller_namespace')
        npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
        auto_healing_enabled = mock_cluster.labels.get(
            'auto_healing_enabled')
        auto_scaling_enabled = mock_cluster.labels.get(
            'auto_scaling_enabled')
        draino_tag = mock_cluster.labels.get('draino_tag')
        autoscaler_tag = mock_cluster.labels.get('autoscaler_tag')
        min_node_count = mock_cluster.labels.get('min_node_count')
        max_node_count = mock_cluster.labels.get('max_node_count')

        k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()

@ -581,6 +589,12 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
            'tiller_tag': tiller_tag,
            'tiller_namespace': tiller_namespace,
            'node_problem_detector_tag': npd_tag,
            'auto_healing_enabled': auto_healing_enabled,
            'auto_scaling_enabled': auto_scaling_enabled,
            'draino_tag': draino_tag,
            'autoscaler_tag': autoscaler_tag,
            'min_node_count': min_node_count,
            'max_node_count': max_node_count,
        }}
        mock_get_params.assert_called_once_with(mock_context,
                                                mock_cluster_template,
@ -893,6 +907,14 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
        tiller_namespace = mock_cluster.labels.get(
            'tiller_namespace')
        npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
        auto_healing_enabled = mock_cluster.labels.get(
            'auto_healing_enabled')
        auto_scaling_enabled = mock_cluster.labels.get(
            'auto_scaling_enabled')
        draino_tag = mock_cluster.labels.get('draino_tag')
        autoscaler_tag = mock_cluster.labels.get('autoscaler_tag')
        min_node_count = mock_cluster.labels.get('min_node_count')
        max_node_count = mock_cluster.labels.get('max_node_count')

        k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()

@ -959,6 +981,12 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
            'tiller_tag': tiller_tag,
            'tiller_namespace': tiller_namespace,
            'node_problem_detector_tag': npd_tag,
            'auto_healing_enabled': auto_healing_enabled,
            'auto_scaling_enabled': auto_scaling_enabled,
            'draino_tag': draino_tag,
            'autoscaler_tag': autoscaler_tag,
            'min_node_count': min_node_count,
            'max_node_count': max_node_count,
        }}
        mock_get_params.assert_called_once_with(mock_context,
                                                mock_cluster_template,
@ -0,0 +1,11 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Using Node Problem Detector, Draino and AutoScaler to support
|
||||
auto healing for K8s cluster, user can use a new label
|
||||
"auto_healing_enabled' to turn on/off it.
|
||||
|
||||
Meanwhile, a new label "auto_scaling_enabled" is also introduced
|
||||
to enable the capability to let the k8s cluster auto scale based
|
||||
its workload.
|
||||
|