Support auto_healing_controller
The current autohealing solution (NPD + Draino + cluster-autoscaler) in Magnum has some problems, e.g. it doesn't take master nodes into consideration, it cannot guarantee the original node count, etc. What's more, the whole solution involves too many components, which brings complexity and maintenance overhead to the operations team, and cluster-autoscaler is not designed to deal with the autohealing scenario. This patch allows the user to choose the auto-healing service by introducing a new label 'auto_healing_controller'. Currently, 'draino' and 'magnum-auto-healer'[1] are supported; 'draino' is the default value for backward compatibility. [1]: https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/using-magnum-auto-healer.md Change-Id: I7ff14837a8d7d360b72c8f40733e84c88c4269d4
This commit is contained in:
parent
77d4408fc4
commit
7ac783c264
|
@ -401,6 +401,10 @@ the table are linked to more details elsewhere in the user guide.
|
|||
| `auto_healing_enabled`_ | - true | false |
|
||||
| | - false | |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `auto_healing_controller`_ | see below | "draino" |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `magnum_auto_healer_tag`_ | see below | see below |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
| `auto_scaling_enabled`_ | - true | true |
|
||||
| | - false | |
|
||||
+---------------------------------------+--------------------+---------------+
|
||||
|
@ -1297,6 +1301,14 @@ _`master_lb_floating_ip_enabled`
|
|||
_`auto_healing_enabled`
|
||||
If set to true, the auto healing feature will be enabled. Defaults to false.
|
||||
|
||||
_`auto_healing_controller`
|
||||
This label sets the auto-healing service to be used. Currently ``draino`` and
|
||||
``magnum-auto-healer`` are supported. The default is ``draino``. For more
|
||||
details, see
|
||||
`draino doc <https://github.com/planetlabs/draino>`_ and
|
||||
`magnum-auto-healer doc
|
||||
<https://github.com/kubernetes/cloud-provider-openstack/blob/master/docs/using-magnum-auto-healer.md>`_.
|
||||
|
||||
_`auto_scaling_enabled`
|
||||
If set to true, the auto scaling feature will be enabled. Defaults to true.
|
||||
|
||||
|
@ -1307,6 +1319,9 @@ _`node_problem_detector_tag`
|
|||
_`draino_tag`
|
||||
This label allows users to select a specific Draino version.
|
||||
|
||||
_`magnum_auto_healer_tag`
|
||||
This label allows users to select a specific magnum-auto-healer version.
|
||||
|
||||
_`autoscaler_tag`
|
||||
This label allows users to select a specific Cluster Autoscaler version.
|
||||
|
||||
|
|
|
@ -156,7 +156,9 @@ KUBELET_ARGS="${KUBELET_ARGS} --client-ca-file=${CERT_DIR}/ca.crt --tls-cert-fil
|
|||
# specified cgroup driver
|
||||
KUBELET_ARGS="${KUBELET_ARGS} --cgroup-driver=${CGROUP_DRIVER}"
|
||||
|
||||
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
|
||||
auto_healing_enabled=$(echo ${AUTO_HEALING_ENABLED} | tr '[:upper:]' '[:lower:]')
|
||||
autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]')
|
||||
if [[ "${auto_healing_enabled}" = "true" && "${autohealing_controller}" = "draino" ]]; then
|
||||
KUBELET_ARGS="${KUBELET_ARGS} --node-labels=draino-enabled=true"
|
||||
fi
|
||||
|
||||
|
|
|
@ -116,18 +116,15 @@ kubectl apply -f ${NPD_DEPLOY}
|
|||
|
||||
printf "Finished running ${step}\n"
|
||||
|
||||
_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
|
||||
step="enable-auto-healing"
|
||||
printf "Starting to run ${step}\n"
|
||||
function enable_draino() {
|
||||
echo "Installing draino"
|
||||
_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
|
||||
draino_manifest=/srv/magnum/kubernetes/manifests/draino.yaml
|
||||
|
||||
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
|
||||
# Generate Draino manifest file
|
||||
DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml
|
||||
|
||||
[ -f ${DRAINO_DEPLOY} ] || {
|
||||
echo "Writing File: $DRAINO_DEPLOY"
|
||||
mkdir -p $(dirname ${DRAINO_DEPLOY})
|
||||
cat << EOF > ${DRAINO_DEPLOY}
|
||||
[ -f ${draino_manifest} ] || {
|
||||
echo "Writing File: $draino_manifest"
|
||||
mkdir -p $(dirname ${draino_manifest})
|
||||
cat << EOF > ${draino_manifest}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
|
@ -217,7 +214,148 @@ spec:
|
|||
EOF
|
||||
}
|
||||
|
||||
kubectl apply -f ${DRAINO_DEPLOY}
|
||||
kubectl apply -f ${draino_manifest}
|
||||
}
|
||||
|
||||
function enable_magnum_auto_healer() {
|
||||
echo "Installing magnum_auto_healer"
|
||||
image_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/k8scloudprovider/}
|
||||
image_prefix=${image_prefix%/}
|
||||
magnum_auto_healer_manifest=/srv/magnum/kubernetes/manifests/magnum_auto_healer.yaml
|
||||
|
||||
[ -f ${magnum_auto_healer_manifest} ] || {
|
||||
echo "Writing File: ${magnum_auto_healer_manifest}"
|
||||
mkdir -p $(dirname ${magnum_auto_healer_manifest})
|
||||
cat << EOF > ${magnum_auto_healer_manifest}
|
||||
---
|
||||
kind: ServiceAccount
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: magnum-auto-healer
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
kind: ClusterRoleBinding
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
metadata:
|
||||
name: magnum-auto-healer
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: cluster-admin
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: magnum-auto-healer
|
||||
namespace: kube-system
|
||||
|
||||
---
|
||||
kind: ConfigMap
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: magnum-auto-healer-config
|
||||
namespace: kube-system
|
||||
data:
|
||||
config.yaml: |
|
||||
cluster-name: ${CLUSTER_UUID}
|
||||
dry-run: false
|
||||
monitor-interval: 30s
|
||||
check-delay-after-add: 20m
|
||||
leader-elect: true
|
||||
healthcheck:
|
||||
master:
|
||||
- type: Endpoint
|
||||
params:
|
||||
unhealthyDuration: 3m
|
||||
protocol: HTTPS
|
||||
port: 6443
|
||||
endpoints: ["/healthz"]
|
||||
okCodes: [200]
|
||||
- type: NodeCondition
|
||||
params:
|
||||
unhealthyDuration: 3m
|
||||
types: ["Ready"]
|
||||
okValues: ["True"]
|
||||
worker:
|
||||
- type: NodeCondition
|
||||
params:
|
||||
unhealthyDuration: 3m
|
||||
types: ["Ready"]
|
||||
okValues: ["True"]
|
||||
openstack:
|
||||
auth-url: ${AUTH_URL}
|
||||
user-id: ${TRUSTEE_USER_ID}
|
||||
password: ${TRUSTEE_PASSWORD}
|
||||
trust-id: ${TRUST_ID}
|
||||
region: ${REGION_NAME}
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: magnum-auto-healer
|
||||
namespace: kube-system
|
||||
labels:
|
||||
k8s-app: magnum-auto-healer
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
k8s-app: magnum-auto-healer
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
k8s-app: magnum-auto-healer
|
||||
spec:
|
||||
serviceAccountName: magnum-auto-healer
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
- key: CriticalAddonsOnly
|
||||
operator: Exists
|
||||
- effect: NoExecute
|
||||
operator: Exists
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/master: ""
|
||||
containers:
|
||||
- name: magnum-auto-healer
|
||||
image: ${image_prefix}/magnum-auto-healer:${MAGNUM_AUTO_HEALER_TAG}
|
||||
imagePullPolicy: Always
|
||||
args:
|
||||
- /bin/magnum-auto-healer
|
||||
- --config=/etc/magnum-auto-healer/config.yaml
|
||||
- --v
|
||||
- "2"
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/magnum-auto-healer
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: magnum-auto-healer-config
|
||||
EOF
|
||||
}
|
||||
|
||||
kubectl apply -f ${magnum_auto_healer_manifest}
|
||||
}
|
||||
|
||||
step="enable-auto-healing"
|
||||
printf "Starting to run ${step}\n"
|
||||
|
||||
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
|
||||
autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]')
|
||||
case "${autohealing_controller}" in
|
||||
"")
|
||||
echo "No autohealing controller configured."
|
||||
;;
|
||||
"draino")
|
||||
enable_draino
|
||||
;;
|
||||
"magnum-auto-healer")
|
||||
enable_magnum_auto_healer
|
||||
;;
|
||||
*)
|
||||
echo "Autohealing controller ${autohealing_controller} not supported."
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
printf "Finished running ${step}\n"
|
||||
|
|
|
@ -7,8 +7,11 @@ printf "Starting to run ${step}\n"
|
|||
|
||||
_docker_ca_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/openstackmagnum/}
|
||||
|
||||
# The cluster autoscaler (CA) must be deployed when auto scaling is enabled, or when auto healing is enabled with the draino controller
|
||||
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" || "$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true"]; then
|
||||
auto_scaling_enabled=$(echo $AUTO_SCALING_ENABLED | tr '[:upper:]' '[:lower:]')
|
||||
auto_healing_enabled=$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')
|
||||
autohealing_controller=$(echo ${AUTO_HEALING_CONTROLLER} | tr '[:upper:]' '[:lower:]')
|
||||
|
||||
if [[ "${auto_scaling_enabled}" = "true" || ("${auto_healing_enabled}" = "true" && "${autohealing_controller}" = "draino") ]]; then
|
||||
# Generate Autoscaler manifest file
|
||||
AUTOSCALER_DEPLOY=/srv/magnum/kubernetes/manifests/autoscaler.yaml
|
||||
|
||||
|
|
|
@ -95,8 +95,10 @@ HEAT_PARAMS=/etc/sysconfig/heat-params
|
|||
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"
|
||||
NGINX_INGRESS_CONTROLLER_TAG="$NGINX_INGRESS_CONTROLLER_TAG"
|
||||
AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"
|
||||
AUTO_HEALING_CONTROLLER="$AUTO_HEALING_CONTROLLER"
|
||||
AUTO_SCALING_ENABLED="$AUTO_SCALING_ENABLED"
|
||||
DRAINO_TAG="$DRAINO_TAG"
|
||||
MAGNUM_AUTO_HEALER_TAG="$MAGNUM_AUTO_HEALER_TAG"
|
||||
AUTOSCALER_TAG="$AUTOSCALER_TAG"
|
||||
MIN_NODE_COUNT="$MIN_NODE_COUNT"
|
||||
MAX_NODE_COUNT="$MAX_NODE_COUNT"
|
||||
|
|
|
@ -55,6 +55,8 @@ KUBELET_OPTIONS="$KUBELET_OPTIONS"
|
|||
KUBEPROXY_OPTIONS="$KUBEPROXY_OPTIONS"
|
||||
OCTAVIA_ENABLED="$OCTAVIA_ENABLED"
|
||||
HEAT_CONTAINER_AGENT_TAG="$HEAT_CONTAINER_AGENT_TAG"
|
||||
AUTO_HEALING_ENABLED="$AUTO_HEALING_ENABLED"
|
||||
AUTO_HEALING_CONTROLLER="$AUTO_HEALING_CONTROLLER"
|
||||
EOF
|
||||
}
|
||||
|
||||
|
|
|
@ -139,6 +139,7 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
|
|||
'node_problem_detector_tag',
|
||||
'nginx_ingress_controller_tag',
|
||||
'auto_healing_enabled', 'auto_scaling_enabled',
|
||||
'auto_healing_controller', 'magnum_auto_healer_tag',
|
||||
'draino_tag', 'autoscaler_tag',
|
||||
'min_node_count', 'max_node_count']
|
||||
|
||||
|
|
|
@ -604,6 +604,17 @@ parameters:
|
|||
default:
|
||||
false
|
||||
|
||||
auto_healing_controller:
|
||||
type: string
|
||||
description: >
|
||||
The service to be deployed for auto-healing.
|
||||
default: "draino"
|
||||
|
||||
magnum_auto_healer_tag:
|
||||
type: string
|
||||
description: tag of the magnum-auto-healer service.
|
||||
default: latest
|
||||
|
||||
auto_scaling_enabled:
|
||||
type: boolean
|
||||
description: >
|
||||
|
@ -936,6 +947,8 @@ resources:
|
|||
node_problem_detector_tag: {get_param: node_problem_detector_tag}
|
||||
nginx_ingress_controller_tag: {get_param: nginx_ingress_controller_tag}
|
||||
auto_healing_enabled: {get_param: auto_healing_enabled}
|
||||
auto_healing_controller: {get_param: auto_healing_controller}
|
||||
magnum_auto_healer_tag: {get_param: magnum_auto_healer_tag}
|
||||
auto_scaling_enabled: {get_param: auto_scaling_enabled}
|
||||
draino_tag: {get_param: draino_tag}
|
||||
autoscaler_tag: {get_param: autoscaler_tag}
|
||||
|
@ -1075,6 +1088,7 @@ resources:
|
|||
octavia_enabled: {get_param: octavia_enabled}
|
||||
heat_container_agent_tag: {get_param: heat_container_agent_tag}
|
||||
auto_healing_enabled: {get_param: auto_healing_enabled}
|
||||
auto_healing_controller: {get_param: auto_healing_controller}
|
||||
|
||||
outputs:
|
||||
|
||||
|
|
|
@ -452,6 +452,17 @@ parameters:
|
|||
description: >
|
||||
true if the auto healing feature should be enabled
|
||||
|
||||
auto_healing_controller:
|
||||
type: string
|
||||
description: >
|
||||
The service to be deployed for auto-healing.
|
||||
default: "draino"
|
||||
|
||||
magnum_auto_healer_tag:
|
||||
type: string
|
||||
description: tag of the magnum-auto-healer service.
|
||||
default: latest
|
||||
|
||||
auto_scaling_enabled:
|
||||
type: boolean
|
||||
description: >
|
||||
|
@ -621,6 +632,8 @@ resources:
|
|||
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
|
||||
"$NGINX_INGRESS_CONTROLLER_TAG": {get_param: nginx_ingress_controller_tag}
|
||||
"$AUTO_HEALING_ENABLED": {get_param: auto_healing_enabled}
|
||||
"$AUTO_HEALING_CONTROLLER": {get_param: auto_healing_controller}
|
||||
"$MAGNUM_AUTO_HEALER_TAG": {get_param: magnum_auto_healer_tag}
|
||||
"$AUTO_SCALING_ENABLED": {get_param: auto_scaling_enabled}
|
||||
"$DRAINO_TAG": {get_param: draino_tag}
|
||||
"$AUTOSCALER_TAG": {get_param: autoscaler_tag}
|
||||
|
|
|
@ -281,6 +281,12 @@ parameters:
|
|||
description: >
|
||||
true if the auto healing feature should be enabled
|
||||
|
||||
auto_healing_controller:
|
||||
type: string
|
||||
description: >
|
||||
The service to be deployed for auto-healing.
|
||||
default: "draino"
|
||||
|
||||
resources:
|
||||
|
||||
agent_config:
|
||||
|
@ -366,6 +372,7 @@ resources:
|
|||
$OCTAVIA_ENABLED: {get_param: octavia_enabled}
|
||||
$HEAT_CONTAINER_AGENT_TAG: {get_param: heat_container_agent_tag}
|
||||
$AUTO_HEALING_ENABLED: {get_param: auto_healing_enabled}
|
||||
$AUTO_HEALING_CONTROLLER: {get_param: auto_healing_controller}
|
||||
- get_file: ../../common/templates/kubernetes/fragments/write-kube-os-config.sh
|
||||
- get_file: ../../common/templates/kubernetes/fragments/make-cert-client.sh
|
||||
- get_file: ../../common/templates/fragments/configure-docker-registry.sh
|
||||
|
|
|
@ -522,6 +522,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'traefik_ingress_controller_tag')
|
||||
auto_healing_enabled = mock_cluster.labels.get(
|
||||
'auto_healing_enabled')
|
||||
auto_healing_controller = mock_cluster.labels.get(
|
||||
'auto_healing_controller')
|
||||
magnum_auto_healer_tag = mock_cluster.labels.get(
|
||||
'magnum_auto_healer_tag')
|
||||
auto_scaling_enabled = mock_cluster.labels.get(
|
||||
'auto_scaling_enabled')
|
||||
draino_tag = mock_cluster.labels.get('draino_tag')
|
||||
|
@ -596,6 +600,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_namespace': tiller_namespace,
|
||||
'node_problem_detector_tag': npd_tag,
|
||||
'auto_healing_enabled': auto_healing_enabled,
|
||||
'auto_healing_controller': auto_healing_controller,
|
||||
'magnum_auto_healer_tag': magnum_auto_healer_tag,
|
||||
'auto_scaling_enabled': auto_scaling_enabled,
|
||||
'draino_tag': draino_tag,
|
||||
'autoscaler_tag': autoscaler_tag,
|
||||
|
@ -925,6 +931,10 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'traefik_ingress_controller_tag')
|
||||
auto_healing_enabled = mock_cluster.labels.get(
|
||||
'auto_healing_enabled')
|
||||
auto_healing_controller = mock_cluster.labels.get(
|
||||
'auto_healing_controller')
|
||||
magnum_auto_healer_tag = mock_cluster.labels.get(
|
||||
'magnum_auto_healer_tag')
|
||||
auto_scaling_enabled = mock_cluster.labels.get(
|
||||
'auto_scaling_enabled')
|
||||
draino_tag = mock_cluster.labels.get('draino_tag')
|
||||
|
@ -1001,6 +1011,8 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
|
|||
'tiller_namespace': tiller_namespace,
|
||||
'node_problem_detector_tag': npd_tag,
|
||||
'auto_healing_enabled': auto_healing_enabled,
|
||||
'auto_healing_controller': auto_healing_controller,
|
||||
'magnum_auto_healer_tag': magnum_auto_healer_tag,
|
||||
'auto_scaling_enabled': auto_scaling_enabled,
|
||||
'draino_tag': draino_tag,
|
||||
'autoscaler_tag': autoscaler_tag,
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
---
|
||||
features:
|
||||
- A new label ``auto_healing_controller`` is introduced to allow the user to
|
||||
choose the auto-healing service when ``auto_healing_enabled`` is specified
|
||||
in the labels. ``draino`` and ``magnum-auto-healer`` are supported for now.
|
||||
Another label ``magnum_auto_healer_tag`` is also added to specify the
|
||||
``magnum-auto-healer`` image tag.
|
Loading…
Reference in New Issue