224 lines
5.9 KiB
Bash
224 lines
5.9 KiB
Bash
#!/bin/sh
|
|
|
|
step="enable-node-problem-detector"
|
|
printf "Starting to run ${step}\n"
|
|
|
|
. /etc/sysconfig/heat-params
|
|
|
|
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}
|
|
|
|
# Generate Node Problem Detector manifest file
|
|
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml
|
|
|
|
[ -f ${NPD_DEPLOY} ] || {
|
|
echo "Writing File: $NPD_DEPLOY"
|
|
mkdir -p $(dirname ${NPD_DEPLOY})
|
|
cat << EOF > ${NPD_DEPLOY}
|
|
apiVersion: v1
|
|
kind: ServiceAccount
|
|
metadata:
|
|
name: node-problem-detector
|
|
namespace: kube-system
|
|
labels:
|
|
kubernetes.io/cluster-service: "true"
|
|
addonmanager.kubernetes.io/mode: Reconcile
|
|
---
|
|
apiVersion: rbac.authorization.k8s.io/v1
|
|
kind: ClusterRoleBinding
|
|
metadata:
|
|
name: npd-binding
|
|
labels:
|
|
kubernetes.io/cluster-service: "true"
|
|
addonmanager.kubernetes.io/mode: Reconcile
|
|
roleRef:
|
|
apiGroup: rbac.authorization.k8s.io
|
|
kind: ClusterRole
|
|
name: system:node-problem-detector
|
|
subjects:
|
|
- kind: ServiceAccount
|
|
name: node-problem-detector
|
|
namespace: kube-system
|
|
---
|
|
apiVersion: apps/v1
|
|
kind: DaemonSet
|
|
metadata:
|
|
name: npd
|
|
namespace: kube-system
|
|
labels:
|
|
k8s-app: node-problem-detector
|
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
|
kubernetes.io/cluster-service: "true"
|
|
addonmanager.kubernetes.io/mode: Reconcile
|
|
spec:
|
|
selector:
|
|
matchLabels:
|
|
k8s-app: node-problem-detector
|
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
|
template:
|
|
metadata:
|
|
labels:
|
|
k8s-app: node-problem-detector
|
|
version: ${NODE_PROBLEM_DETECTOR_TAG}
|
|
kubernetes.io/cluster-service: "true"
|
|
spec:
|
|
containers:
|
|
- name: node-problem-detector
|
|
image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
|
|
command:
|
|
- "/bin/sh"
|
|
- "-c"
|
|
# Pass both config to support both journald and syslog.
|
|
- "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json 2>&1 | tee /var/log/node-problem-detector.log"
|
|
securityContext:
|
|
privileged: true
|
|
resources:
|
|
limits:
|
|
cpu: "200m"
|
|
memory: "100Mi"
|
|
requests:
|
|
cpu: "20m"
|
|
memory: "20Mi"
|
|
env:
|
|
- name: NODE_NAME
|
|
valueFrom:
|
|
fieldRef:
|
|
fieldPath: spec.nodeName
|
|
volumeMounts:
|
|
- name: log
|
|
mountPath: /var/log
|
|
- name: localtime
|
|
mountPath: /etc/localtime
|
|
readOnly: true
|
|
volumes:
|
|
- name: log
|
|
hostPath:
|
|
path: /var/log/
|
|
- name: localtime
|
|
hostPath:
|
|
path: /etc/localtime
|
|
type: "FileOrCreate"
|
|
serviceAccountName: node-problem-detector
|
|
tolerations:
|
|
- operator: "Exists"
|
|
effect: "NoExecute"
|
|
- key: "CriticalAddonsOnly"
|
|
operator: "Exists"
|
|
EOF
|
|
}
|
|
|
|
echo "Waiting for Kubernetes API..."
|
|
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
|
|
do
|
|
sleep 5
|
|
done
|
|
|
|
kubectl apply -f ${NPD_DEPLOY}
|
|
|
|
printf "Finished running ${step}\n"
|
|
|
|
_docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
|
|
step="enable-auto-healing"
|
|
printf "Starting to run ${step}\n"
|
|
|
|
if [ "$(echo $AUTO_HEALING_ENABLED | tr '[:upper:]' '[:lower:]')" = "true" ]; then
|
|
# Generate Draino manifest file
|
|
DRAINO_DEPLOY=/srv/magnum/kubernetes/manifests/draino.yaml
|
|
|
|
[ -f ${DRAINO_DEPLOY} ] || {
|
|
echo "Writing File: $DRAINO_DEPLOY"
|
|
mkdir -p $(dirname ${DRAINO_DEPLOY})
|
|
cat << EOF > ${DRAINO_DEPLOY}
|
|
---
|
|
apiVersion: v1
|
|
kind: ServiceAccount
|
|
metadata:
|
|
labels: {component: draino}
|
|
name: draino
|
|
namespace: kube-system
|
|
---
|
|
apiVersion: rbac.authorization.k8s.io/v1
|
|
kind: ClusterRole
|
|
metadata:
|
|
labels: {component: draino}
|
|
name: draino
|
|
rules:
|
|
- apiGroups: ['']
|
|
resources: [events]
|
|
verbs: [create, patch, update]
|
|
- apiGroups: ['']
|
|
resources: [nodes]
|
|
verbs: [get, watch, list, update]
|
|
- apiGroups: ['']
|
|
resources: [nodes/status]
|
|
verbs: [patch]
|
|
- apiGroups: ['']
|
|
resources: [pods]
|
|
verbs: [get, watch, list]
|
|
- apiGroups: ['']
|
|
resources: [pods/eviction]
|
|
verbs: [create]
|
|
- apiGroups: [extensions]
|
|
resources: [daemonsets]
|
|
verbs: [get, watch, list]
|
|
---
|
|
apiVersion: rbac.authorization.k8s.io/v1
|
|
kind: ClusterRoleBinding
|
|
metadata:
|
|
labels: {component: draino}
|
|
name: draino
|
|
roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
|
|
subjects:
|
|
- {kind: ServiceAccount, name: draino, namespace: kube-system}
|
|
---
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
labels: {component: draino}
|
|
name: draino
|
|
namespace: kube-system
|
|
spec:
|
|
# Draino does not currently support locking/master election, so you should
|
|
# only run one draino at a time. Draino won't start draining nodes immediately
|
|
# so it's usually safe for multiple drainos to exist for a brief period of
|
|
# time.
|
|
replicas: 1
|
|
selector:
|
|
matchLabels: {component: draino}
|
|
template:
|
|
metadata:
|
|
labels: {component: draino}
|
|
name: draino
|
|
namespace: kube-system
|
|
spec:
|
|
nodeSelector:
|
|
node-role.kubernetes.io/master: ""
|
|
hostNetwork: true
|
|
tolerations:
|
|
- effect: NoSchedule
|
|
operator: Exists
|
|
- key: CriticalAddonsOnly
|
|
operator: Exists
|
|
- effect: NoExecute
|
|
operator: Exists
|
|
- key: node.cloudprovider.kubernetes.io/uninitialized
|
|
value: "true"
|
|
effect: NoSchedule
|
|
- key: node-role.kubernetes.io/master
|
|
effect: NoSchedule
|
|
containers:
|
|
# You'll want to change these labels and conditions to suit your deployment.
|
|
- command: [/draino, --node-label=draino-enabled=true, --evict-daemonset-pods, --evict-emptydir-pods, NotReady]
|
|
image: ${_docker_draino_prefix}draino:${DRAINO_TAG}
|
|
livenessProbe:
|
|
httpGet: {path: /healthz, port: 10002}
|
|
initialDelaySeconds: 30
|
|
name: draino
|
|
serviceAccountName: draino
|
|
EOF
|
|
}
|
|
|
|
kubectl apply -f ${DRAINO_DEPLOY}
|
|
|
|
fi
|
|
printf "Finished running ${step}\n"
|