magnum/magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh

#!/bin/sh
step="enable-node-problem-detector"
printf "Starting to run ${step}\n"
. /etc/sysconfig/heat-params
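# heat-params provides the variables consumed below: CONTAINER_INFRA_PREFIX,
# NODE_PROBLEM_DETECTOR_TAG, AUTO_HEALING_ENABLED and DRAINO_TAG.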
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}
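# Pull node-problem-detector from k8s.gcr.io unless the cluster sets a
# private registry via CONTAINER_INFRA_PREFIX.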
# Generate Node Problem Detector manifest file
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml
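# Only write the manifest if it is not already present on the node.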
[ -f ${NPD_DEPLOY} ] || {
    echo "Writing File: $NPD_DEPLOY"
    mkdir -p $(dirname ${NPD_DEPLOY})
    cat << EOF > ${NPD_DEPLOY}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: npd-binding
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: npd
  namespace: kube-system
  labels:
    k8s-app: node-problem-detector
    version: ${NODE_PROBLEM_DETECTOR_TAG}
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: node-problem-detector
      version: ${NODE_PROBLEM_DETECTOR_TAG}
  template:
    metadata:
      labels:
        k8s-app: node-problem-detector
        version: ${NODE_PROBLEM_DETECTOR_TAG}
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
      - name: node-problem-detector
        image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
        command:
        - "/bin/sh"
        - "-c"
        # Pass both configs to support both journald and syslog.
        - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json 2>&1 | tee /var/log/node-problem-detector.log"
        securityContext:
          privileged: true
        resources:
          limits:
            cpu: "200m"
            memory: "100Mi"
          requests:
            cpu: "20m"
            memory: "20Mi"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: log
          mountPath: /var/log
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
      volumes:
      - name: log
        hostPath:
          path: /var/log/
      - name: localtime
        hostPath:
          path: /etc/localtime
          type: "FileOrCreate"
      serviceAccountName: node-problem-detector
      tolerations:
      - operator: "Exists"
        effect: "NoExecute"
      - key: "CriticalAddonsOnly"
        operator: "Exists"
EOF
}
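# Wait until the local kube-apiserver (insecure port 8080) reports healthy
# before registering the Node Problem Detector DaemonSet.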
echo "Waiting for Kubernetes API..."
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
do
    sleep 5
done
kubectl apply -f ${NPD_DEPLOY}
printf "Finished running ${step}\n"
_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
step="enable-auto-healing"
printf "Starting to run ${step}\n"
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    # Generate Draino manifest file
    DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml

    [ -f ${DRAINO_DEPLOY} ] || {
        echo "Writing File: $DRAINO_DEPLOY"
        mkdir -p $(dirname ${DRAINO_DEPLOY})
        cat << EOF > ${DRAINO_DEPLOY}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels: {component: draino}
  name: draino
rules:
- apiGroups: ['']
  resources: [events]
  verbs: [create, patch, update]
- apiGroups: ['']
  resources: [nodes]
  verbs: [get, watch, list, update]
- apiGroups: ['']
  resources: [nodes/status]
  verbs: [patch]
- apiGroups: ['']
  resources: [pods]
  verbs: [get, watch, list]
- apiGroups: ['']
  resources: [pods/eviction]
  verbs: [create]
- apiGroups: [extensions]
  resources: [daemonsets]
  verbs: [get, watch, list]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels: {component: draino}
  name: draino
roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
subjects:
- {kind: ServiceAccount, name: draino, namespace: kube-system}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
spec:
  # Draino does not currently support locking/master election, so you should
  # only run one draino at a time. Draino won't start draining nodes immediately
  # so it's usually safe for multiple drainos to exist for a brief period of
  # time.
  replicas: 1
  selector:
    matchLabels: {component: draino}
  template:
    metadata:
      labels: {component: draino}
      name: draino
      namespace: kube-system
    spec:
      nodeSelector:
        node-role.kubernetes.io/master: ""
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
      - key: CriticalAddonsOnly
        operator: Exists
      - effect: NoExecute
        operator: Exists
      - key: node.cloudprovider.kubernetes.io/uninitialized
        value: "true"
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
      containers:
      # You'll want to change these labels and conditions to suit your deployment.
      - command: [/draino, --node-label=draino-enabled=true, --evict-daemonset-pods, --evict-emptydir-pods, NotReady]
        image: ${_docker_draino_prefix}draino:${DRAINO_TAG}
        livenessProbe:
          httpGet: {path: /healthz, port: 10002}
          initialDelaySeconds: 30
        name: draino
      serviceAccountName: draino
EOF
    }

    kubectl apply -f ${DRAINO_DEPLOY}
fi
printf "Finished running ${step}\n"