From df24b1853be5ceac340139978fd882b362fdeaa4 Mon Sep 17 00:00:00 2001
From: Kaspars Skels
Date: Wed, 1 May 2019 14:04:38 -0500
Subject: [PATCH] Stability workarounds for ceph/cgroups issues

- Add stale rbd cleanup fix
- Add cleaning of cgroups mounts

Change-Id: I9950496416fbbb572eb5775032305e18fee1f026
---
 global/scripts/hanging-cgroup-release.yaml |  26 +++++
 global/scripts/rbd-roomba-scanner.yaml     |  32 ++++++
 .../charts/ucp/divingbell/divingbell.yaml  |  24 +++++
 tools/fixes/hanging-cgroup-release.sh      |  96 +++++++++++++++++
 tools/fixes/rbd-roomba-scanner.sh          | 101 ++++++++++++++++++
 5 files changed, 279 insertions(+)
 create mode 100644 global/scripts/hanging-cgroup-release.yaml
 create mode 100644 global/scripts/rbd-roomba-scanner.yaml
 create mode 100755 tools/fixes/hanging-cgroup-release.sh
 create mode 100755 tools/fixes/rbd-roomba-scanner.sh

diff --git a/global/scripts/hanging-cgroup-release.yaml b/global/scripts/hanging-cgroup-release.yaml
new file mode 100644
index 000000000..e199e1372
--- /dev/null
+++ b/global/scripts/hanging-cgroup-release.yaml
@@ -0,0 +1,26 @@
+---
+schema: pegleg/Script/v1
+metadata:
+  schema: metadata/Document/v1
+  name: hanging-cgroup-release
+  storagePolicy: cleartext
+  layeringDefinition:
+    abstract: false
+    layer: global
+data: |-
+  #!/bin/bash
+  set -ex
+
+  cgroup_count() {
+    echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
+  }
+
+  DATE=$(date)
+  echo "$(cgroup_count)"
+  echo # Stop systemd mount unit that isn't actually mounted
+  echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
+  systemctl list-units --state=running| \
+    sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
+    xargs -r -l1 sh -c 'test -d $2 || echo $1' -- | \
+    xargs -r -tl1 systemctl stop |& wc -l
+  echo "$(cgroup_count)"
diff --git a/global/scripts/rbd-roomba-scanner.yaml b/global/scripts/rbd-roomba-scanner.yaml
new file mode 100644
index 000000000..3a4be9cdb
--- /dev/null
+++ b/global/scripts/rbd-roomba-scanner.yaml
@@ -0,0 +1,32 @@
+---
+schema: pegleg/Script/v1
+metadata:
+  schema: metadata/Document/v1
+  name: rbd-roomba-scanner
+  storagePolicy: cleartext
+  layeringDefinition:
+    abstract: false
+    layer: global
+data: |-
+  #!/bin/bash
+  set -ex
+
+  # don't put it in /tmp where it can be p0wned (???)
+  lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list
+
+  # wait a while, so we don't catch rbd devices the kubelet is working on mounting
+  sleep 60
+
+  # finally, examine rbd devices again and if any were seen previously (60s ago) we will
+  # forcefully unmount them if they have no fs mounts
+  DATE=$(date)
+  for rbd in `lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'`; do
+    if grep -q $rbd /var/run/rbd_list; then
+      echo "[${DATE}] Unmapping stale RBD $rbd"
+      /usr/bin/rbd unmap -o force $rbd
+      # NOTE(supamatt): rbd unmap -o force will only succeed if there are NO pending I/O
+    else
+      echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
+    fi
+  done
+  rm -rf /var/run/rbd_list
diff --git a/global/software/charts/ucp/divingbell/divingbell.yaml b/global/software/charts/ucp/divingbell/divingbell.yaml
index b4fafc50c..deeba6920 100644
--- a/global/software/charts/ucp/divingbell/divingbell.yaml
+++ b/global/software/charts/ucp/divingbell/divingbell.yaml
@@ -24,6 +24,19 @@ metadata:
         path: .images.ucp.divingbell
       dest:
         path: .values.images
+
+    - src:
+        schema: pegleg/Script/v1
+        name: rbd-roomba-scanner
+        path: .
+      dest:
+        path: .values.conf.exec.X005-rbd-roomba-scanner.data
+    - src:
+        schema: pegleg/Script/v1
+        name: hanging-cgroup-release
+        path: .
+      dest:
+        path: .values.conf.exec.X005-hanging-cgroup-release.data
 data:
   chart_name: ucp-divingbell
   release: ucp-divingbell
@@ -78,6 +91,17 @@ data:
       # were restarted. "Failed to add /run/systemd/ask-password to directory
       # watch: No space left on device". https://bit.ly/2Mj5qn2 TDP bug 427616
       fs.inotify.max_user_watches: '1048576'
+    exec:
+      X005-rbd-roomba-scanner:
+        rerun_policy: always
+        # 300 = 5 minutes
+        rerun_interval: 300
+        timeout: 300
+      X005-hanging-cgroup-release:
+        rerun_policy: always
+        # 3600 = 1 hour
+        rerun_interval: 3600
+        timeout: 600
 dependencies:
   - ucp-divingbell-htk
 ---
diff --git a/tools/fixes/hanging-cgroup-release.sh b/tools/fixes/hanging-cgroup-release.sh
new file mode 100755
index 000000000..7b3eb1c45
--- /dev/null
+++ b/tools/fixes/hanging-cgroup-release.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+set -ex
+
+CLUSTER_DNS=${CLUSTER_DNS:-10.96.0.10}
+
+KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
+UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}
+
+cat > /tmp/hanging-cgroup-release.yaml << 'EOF'
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: hanging-cgroup-release
+  namespace: kube-system
+  labels:
+    hotfix: 'true'
+data:
+  singleshot.sh: |+
+    #!/bin/bash
+    set -ex
+
+    while [ 1 ];
+    do
+      cgroup_count() {
+        echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
+      }
+
+      DATE=$(date)
+      echo "$(cgroup_count)"
+      echo # Stop systemd mount unit that isn't actually mounted
+      echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
+      systemctl list-units --state=running| \
+        sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
+        xargs -r -l1 sh -c 'test -d $2 || echo $1' -- | \
+        xargs -r -tl1 systemctl stop |& wc -l
+      echo "$(cgroup_count)"
+
+      sleep 3600
+
+    done;
+EOF
+cat >> /tmp/hanging-cgroup-release.yaml << EOF
+---
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  name: hanging-cgroup-release
+  namespace: kube-system
+spec:
+  template:
+    metadata:
+      labels:
+        name: hanging-cgroup-release
+    spec:
+      hostNetwork: true
+      hostPID: true
+      nodeSelector:
+        ucp-control-plane: enabled
+      containers:
+        - resources:
+            requests:
+              cpu: 0.1
+          securityContext:
+            privileged: true
+          image: ${UBUNTU_IMAGE}
+          name: hanging-cgroup-release
+          command: ["/bin/bash", "-cx"]
+          args:
+            - >
+              cp -p /tmp/singleshot.sh /host/tmp;
+              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
+          volumeMounts:
+            - name: host
+              mountPath: /host
+            - name: hanging-cgroup-release
+              subPath: singleshot.sh
+              mountPath: /tmp/singleshot.sh
+      volumes:
+        - name: host
+          hostPath:
+            path: /
+        - name: hanging-cgroup-release
+          configMap:
+            name: hanging-cgroup-release
+            defaultMode: 0555
+EOF
+
+docker run --rm -i \
+  --net host \
+  -v /tmp:/work \
+  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
+  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
+  ${KUBECTL_IMAGE} \
+  /kubectl apply -f /work/hanging-cgroup-release.yaml
+
diff --git a/tools/fixes/rbd-roomba-scanner.sh b/tools/fixes/rbd-roomba-scanner.sh
new file mode 100755
index 000000000..06095b01b
--- /dev/null
+++ b/tools/fixes/rbd-roomba-scanner.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+set -ex
+
+CLUSTER_DNS=${CLUSTER_DNS:-10.96.0.10}
+
+KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
+UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}
+
+cat > /tmp/rbd-roomba-scanner.yaml << 'EOF'
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: rbd-roomba-scanner
+  namespace: ceph
+  labels:
+    hotfix: 'true'
+data:
+  singleshot.sh: |+
+    #!/bin/bash
+    set -ex
+
+    while [ 1 ];
+
+    do
+
+      # don't put it in /tmp where it can be p0wned (???)
+      lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list
+
+      # wait a while, so we don't catch rbd devices the kubelet is working on mounting
+      sleep 60
+
+      # finally, examine rbd devices again and if any were seen previously (60s ago) we will
+      # forcefully unmount them if they have no fs mounts
+      DATE=$(date)
+      for rbd in `lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'`; do
+        if grep -q $rbd /var/run/rbd_list; then
+          echo "[${DATE}] Unmapping stale RBD $rbd"
+          /usr/bin/rbd unmap -o force $rbd
+          # NOTE(supamatt): rbd unmap -o force will only succeed if there are NO pending I/O
+        else
+          echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
+        fi
+      done
+      rm -rf /var/run/rbd_list
+
+    done;
+EOF
+cat >> /tmp/rbd-roomba-scanner.yaml << EOF
+---
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  name: rbd-roomba-scanner
+  namespace: ceph
+spec:
+  template:
+    metadata:
+      labels:
+        name: rbd-roomba-scanner
+    spec:
+      hostNetwork: true
+      hostPID: true
+      nodeSelector:
+        openstack-control-plane: enabled
+      containers:
+        - resources:
+            requests:
+              cpu: 0.1
+          securityContext:
+            privileged: true
+          image: ${UBUNTU_IMAGE}
+          name: rbd-roomba-scanner
+          command: ["/bin/bash", "-cx"]
+          args:
+            - >
+              cp -p /tmp/singleshot.sh /host/tmp;
+              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
+          volumeMounts:
+            - name: host
+              mountPath: /host
+            - name: rbd-roomba-scanner
+              subPath: singleshot.sh
+              mountPath: /tmp/singleshot.sh
+      volumes:
+        - name: host
+          hostPath:
+            path: /
+        - name: rbd-roomba-scanner
+          configMap:
+            name: rbd-roomba-scanner
+            defaultMode: 0555
+EOF
+
+docker run --rm -i \
+  --net host \
+  -v /tmp:/work \
+  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
+  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
+  ${KUBECTL_IMAGE} \
+  /kubectl apply -f /work/rbd-roomba-scanner.yaml