# Deploys node-problem-detector (NPD) and the configured auto-healing
# controller (draino or magnum-auto-healer) on a Magnum Kubernetes cluster.
#
# Required env (from /etc/sysconfig/heat-params):
#   AUTO_HEALING_ENABLED, NPD_ENABLED, AUTO_HEALING_CONTROLLER,
#   NODE_PROBLEM_DETECTOR_TAG, DRAINO_TAG, MAGNUM_AUTO_HEALER_TAG,
#   CLUSTER_UUID, AUTH_URL, TRUSTEE_USER_ID, TRUSTEE_PASSWORD, TRUST_ID,
#   REGION_NAME, and optionally CONTAINER_INFRA_PREFIX (registry mirror).

step="enable-node-problem-detector"
printf "Starting to run ${step}\n"

. /etc/sysconfig/heat-params

_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}

# Either auto scaling or auto healing we need CA to be deployed.
# NPD feeds node conditions that both draino and magnum-auto-healer act on,
# so it is deployed whenever auto healing OR NPD itself is enabled.
if [[ "$(echo "$AUTO_HEALING_ENABLED" | tr '[:upper:]' '[:lower:]')" = "true" || "$(echo "$NPD_ENABLED" | tr '[:upper:]' '[:lower:]')" = "true" ]]; then
    # Generate Node Problem Detector manifest file (skip if already written
    # by a previous run of this fragment).
    NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml

    [ -f "${NPD_DEPLOY}" ] || {
    echo "Writing File: $NPD_DEPLOY"
    mkdir -p "$(dirname "${NPD_DEPLOY}")"
    # NOTE: unquoted EOF is intentional — the heredoc must expand
    # ${_gcr_prefix} and ${NODE_PROBLEM_DETECTOR_TAG}.
    cat << EOF > ${NPD_DEPLOY}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: magnum:podsecuritypolicy:node-problem-detector
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/cluster-service: "true"
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: magnum:podsecuritypolicy:privileged
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: npd-binding
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: npd
  namespace: kube-system
  labels:
    k8s-app: node-problem-detector
    version: ${NODE_PROBLEM_DETECTOR_TAG}
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: node-problem-detector
      version: ${NODE_PROBLEM_DETECTOR_TAG}
  template:
    metadata:
      labels:
        k8s-app: node-problem-detector
        version: ${NODE_PROBLEM_DETECTOR_TAG}
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
      - name: node-problem-detector
        image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
        command:
        - "/bin/sh"
        - "-c"
        # Pass both config to support both journald and syslog.
        - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json 2>&1 | tee /var/log/node-problem-detector.log"
        securityContext:
          privileged: true
        resources:
          limits:
            cpu: "200m"
            memory: "100Mi"
          requests:
            cpu: "20m"
            memory: "20Mi"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: log
          mountPath: /var/log
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
      volumes:
      - name: log
        hostPath:
          path: /var/log/
      - name: localtime
        hostPath:
          path: /etc/localtime
          type: "FileOrCreate"
      serviceAccountName: node-problem-detector
      tolerations:
      - operator: "Exists"
        effect: "NoExecute"
      - key: "CriticalAddonsOnly"
        operator: "Exists"
EOF
    }

    # Block until the (insecure, localhost-only) apiserver endpoint answers;
    # no timeout — the heat stack's own timeout bounds this wait.
    echo "Waiting for Kubernetes API..."
    until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
    do
        sleep 5
    done

    kubectl apply -f "${NPD_DEPLOY}"

    printf "Finished running ${step}\n"
fi

# Writes the draino manifest (idempotent) and applies it. Draino drains
# nodes that NPD has flagged, deferring node replacement to the autoscaler.
function enable_draino {
    echo "Installing draino"

    _docker_draino_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/planetlabs/}
    draino_manifest=/srv/magnum/kubernetes/manifests/draino.yaml

    [ -f "${draino_manifest}" ] || {
    echo "Writing File: $draino_manifest"
    mkdir -p "$(dirname "${draino_manifest}")"
    cat << EOF > ${draino_manifest}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels: {component: draino}
  name: draino
rules:
- apiGroups: ['']
  resources: [events]
  verbs: [create, patch, update]
- apiGroups: ['']
  resources: [nodes]
  verbs: [get, watch, list, update]
- apiGroups: ['']
  resources: [nodes/status]
  verbs: [patch]
- apiGroups: ['']
  resources: [pods]
  verbs: [get, watch, list]
- apiGroups: ['']
  resources: [pods/eviction]
  verbs: [create]
- apiGroups: [extensions]
  resources: [daemonsets]
  verbs: [get, watch, list]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels: {component: draino}
  name: draino
roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
subjects:
- {kind: ServiceAccount, name: draino, namespace: kube-system}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels: {component: draino}
  name: draino
  namespace: kube-system
spec:
  # Draino does not currently support locking/master election, so you should
  # only run one draino at a time. Draino won't start draining nodes immediately
  # so it's usually safe for multiple drainos to exist for a brief period of
  # time.
  replicas: 1
  selector:
    matchLabels: {component: draino}
  template:
    metadata:
      labels: {component: draino}
      name: draino
      namespace: kube-system
    spec:
      nodeSelector:
        node-role.kubernetes.io/master: ""
      hostNetwork: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
      - key: CriticalAddonsOnly
        operator: Exists
      - effect: NoExecute
        operator: Exists
      - key: node.cloudprovider.kubernetes.io/uninitialized
        value: "true"
        effect: NoSchedule
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
      containers:
      # You'll want to change these labels and conditions to suit your deployment.
      - command: [/draino, --node-label=draino-enabled=true, --evict-daemonset-pods, --evict-emptydir-pods, NotReady]
        image: ${_docker_draino_prefix}draino:${DRAINO_TAG}
        livenessProbe:
          httpGet: {path: /healthz, port: 10002}
          initialDelaySeconds: 30
        name: draino
      serviceAccountName: draino
EOF
    }

    kubectl apply -f "${draino_manifest}"
}

# Writes the magnum-auto-healer manifest (idempotent) and applies it.
# magnum-auto-healer both detects unhealthy masters/workers and repairs
# them through the Magnum API using the cluster trust credentials.
function enable_magnum_auto_healer {
    echo "Installing magnum_auto_healer"

    image_prefix=${CONTAINER_INFRA_PREFIX:-docker.io/k8scloudprovider/}
    # Strip any trailing slash so the image reference below can join with '/'.
    image_prefix=${image_prefix%/}
    magnum_auto_healer_manifest=/srv/magnum/kubernetes/manifests/magnum_auto_healer.yaml

    [ -f "${magnum_auto_healer_manifest}" ] || {
    echo "Writing File: ${magnum_auto_healer_manifest}"
    mkdir -p "$(dirname "${magnum_auto_healer_manifest}")"
    cat << EOF > ${magnum_auto_healer_manifest}
---
kind: ServiceAccount
apiVersion: v1
metadata:
  name: magnum-auto-healer
  namespace: kube-system
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: magnum-auto-healer
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: magnum-auto-healer
  namespace: kube-system
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: magnum-auto-healer-config
  namespace: kube-system
data:
  config.yaml: |
    cluster-name: ${CLUSTER_UUID}
    dry-run: false
    monitor-interval: 30s
    check-delay-after-add: 20m
    leader-elect: true
    healthcheck:
      master:
        - type: Endpoint
          params:
            unhealthy-duration: 3m
            protocol: HTTPS
            port: 6443
            endpoints: ["/healthz"]
            ok-codes: [200]
        - type: NodeCondition
          params:
            unhealthy-duration: 3m
            types: ["Ready"]
            ok-values: ["True"]
      worker:
        - type: NodeCondition
          params:
            unhealthy-duration: 3m
            types: ["Ready"]
            ok-values: ["True"]
    openstack:
      auth-url: ${AUTH_URL}
      user-id: ${TRUSTEE_USER_ID}
      password: ${TRUSTEE_PASSWORD}
      trust-id: ${TRUST_ID}
      region: ${REGION_NAME}
      ca-file: /etc/kubernetes/ca-bundle.crt
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: magnum-auto-healer
  namespace: kube-system
  labels:
    k8s-app: magnum-auto-healer
spec:
  selector:
    matchLabels:
      k8s-app: magnum-auto-healer
  template:
    metadata:
      labels:
        k8s-app: magnum-auto-healer
    spec:
      hostNetwork: true
      serviceAccountName: magnum-auto-healer
      tolerations:
      - effect: NoSchedule
        operator: Exists
      - key: CriticalAddonsOnly
        operator: Exists
      - effect: NoExecute
        operator: Exists
      nodeSelector:
        node-role.kubernetes.io/master: ""
      containers:
      - name: magnum-auto-healer
        image: ${image_prefix}/magnum-auto-healer:${MAGNUM_AUTO_HEALER_TAG}
        imagePullPolicy: Always
        args:
        - /bin/magnum-auto-healer
        - --config=/etc/magnum-auto-healer/config.yaml
        - --v
        - "2"
        volumeMounts:
        - name: config
          mountPath: /etc/magnum-auto-healer
        - name: kubernetes-config
          mountPath: /etc/kubernetes
          readOnly: true
      volumes:
      - name: config
        configMap:
          name: magnum-auto-healer-config
      - name: kubernetes-config
        hostPath:
          path: /etc/kubernetes
EOF
    }

    kubectl apply -f "${magnum_auto_healer_manifest}"
}

step="enable-auto-healing"
printf "Starting to run ${step}\n"

# Dispatch on the (case-insensitive) controller name; unknown values are
# reported but do not fail the stack.
if [ "$(echo "$AUTO_HEALING_ENABLED" | tr '[:upper:]' '[:lower:]')" = "true" ]; then
    autohealing_controller=$(echo "${AUTO_HEALING_CONTROLLER}" | tr '[:upper:]' '[:lower:]')
    case "${autohealing_controller}" in
    "")
        echo "No autohealing controller configured."
        ;;
    "draino")
        enable_draino
        ;;
    "magnum-auto-healer")
        enable_magnum_auto_healer
        ;;
    *)
        echo "Autohealing controller ${autohealing_controller} not supported."
        ;;
    esac
fi

printf "Finished running ${step}\n"