Stability workarounds for ceph/cgroups issues

- Add stale rbd cleanup fix
- Add cleaning of cgroups mounts

Change-Id: I9950496416fbbb572eb5775032305e18fee1f026
commit df24b1853b (parent e1328ed98b)
global/scripts/hanging-cgroup-release.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
---
schema: pegleg/Script/v1
metadata:
  schema: metadata/Document/v1
  name: hanging-cgroup-release
  storagePolicy: cleartext
  layeringDefinition:
    abstract: false
    layer: global
data: |-
  #!/bin/bash
  set -ex

  cgroup_count() {
    echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
  }

  DATE=$(date)
  echo "$(cgroup_count)"
  # Stop systemd mount units that are not actually mounted
  echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
  systemctl list-units --state=running | \
    sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
    xargs -r -l1 sh -c 'test -d $2 || echo $1' -- | \
    xargs -r -tl1 systemctl stop |& wc -l
  echo "$(cgroup_count)"
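The sed stage is the core of this workaround: it reduces each running "Kubernetes transient mount" unit to a "unit path" pair, and the xargs stages then stop any unit whose pod directory no longer exists on disk. The expression can be sanity-checked in isolation with GNU sed against a fabricated unit line (the unit name and pod UID below are made up for illustration):

  echo 'run-rabc123.mount loaded active running Kubernetes transient mount for /var/lib/kubelet/pods/0a1b2c3d/volumes/vol' \
    | sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p'
  # prints: run-rabc123.mount /var/lib/kubelet/pods/0a1b2c3d/volumes/vol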
global/scripts/rbd-roomba-scanner.yaml (new file, 32 lines)
@@ -0,0 +1,32 @@
---
schema: pegleg/Script/v1
metadata:
  schema: metadata/Document/v1
  name: rbd-roomba-scanner
  storagePolicy: cleartext
  layeringDefinition:
    abstract: false
    layer: global
data: |-
  #!/bin/bash
  set -ex

  # write the device list to /var/run rather than /tmp, where it could be tampered with
  lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list

  # wait a while, so we don't catch rbd devices the kubelet is working on mounting
  sleep 60

  # finally, examine rbd devices again; any that were already unmounted 60s ago
  # are considered stale and forcefully unmapped
  DATE=$(date)
  for rbd in $(lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'); do
    if grep -q $rbd /var/run/rbd_list; then
      echo "[${DATE}] Unmapping stale RBD $rbd"
      /usr/bin/rbd unmap -o force $rbd
      # NOTE(supamatt): rbd unmap -o force will only succeed if there is NO pending I/O
    else
      echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
    fi
  done
  rm -f /var/run/rbd_list
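The awk filter leans on lsblk's default seven-column output (NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT): an empty seventh field means the rbd device has no filesystem mounted. A quick illustration with fabricated lsblk lines:

  printf 'rbd0 252:0 0 10G 0 disk /var/lib/kubelet/pods/x/volumes/vol\nrbd1 252:16 0 10G 0 disk\n' \
    | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'
  # prints: /dev/rbd1  (rbd0 has a mountpoint, so it is filtered out)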
@@ -24,6 +24,19 @@ metadata:
        path: .images.ucp.divingbell
      dest:
        path: .values.images
    - src:
        schema: pegleg/Script/v1
        name: rbd-roomba-scanner
        path: .
      dest:
        path: .values.conf.exec.X005-rbd-roomba-scanner.data
    - src:
        schema: pegleg/Script/v1
        name: hanging-cgroup-release
        path: .
      dest:
        path: .values.conf.exec.X005-hanging-cgroup-release.data
data:
  chart_name: ucp-divingbell
  release: ucp-divingbell
@@ -78,6 +91,17 @@ data:
        # were restarted. "Failed to add /run/systemd/ask-password to directory
        # watch: No space left on device". https://bit.ly/2Mj5qn2 TDP bug 427616
        fs.inotify.max_user_watches: '1048576'
      exec:
        X005-rbd-roomba-scanner:
          rerun_policy: always
          # 300 = 5 minutes
          rerun_interval: 300
          timeout: 300
        X005-hanging-cgroup-release:
          rerun_policy: always
          # 3600 = 1 hour
          rerun_interval: 3600
          timeout: 600
  dependencies:
    - ucp-divingbell-htk
---
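For context, divingbell's exec module re-runs each configured script on every targeted host, with rerun_interval expressed in seconds. Conceptually, an entry with rerun_policy: always behaves per host roughly like the loop below (an illustration only, not divingbell's actual implementation; the script path is hypothetical):

  while true; do
    timeout 300 /opt/divingbell/X005-rbd-roomba-scanner || true  # timeout: 300
    sleep 300                                                    # rerun_interval: 300 (5 minutes)
  done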
tools/fixes/hanging-cgroup-release.sh (new executable file, 96 lines)
@@ -0,0 +1,96 @@
#!/bin/bash
set -ex

CLUSTER_DNS=${CLUSTER_DNS:-10.96.0.10}

KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}

cat > /tmp/hanging-cgroup-release.yaml << 'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: hanging-cgroup-release
  namespace: kube-system
  labels:
    hotfix: 'true'
data:
  singleshot.sh: |+
    #!/bin/bash
    set -ex

    while true; do
      cgroup_count() {
        echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
      }

      DATE=$(date)
      echo "$(cgroup_count)"
      # Stop systemd mount units that are not actually mounted
      echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
      systemctl list-units --state=running | \
        sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
        xargs -r -l1 sh -c 'test -d $2 || echo $1' -- | \
        xargs -r -tl1 systemctl stop |& wc -l
      echo "$(cgroup_count)"

      sleep 3600
    done
EOF
cat >> /tmp/hanging-cgroup-release.yaml << EOF
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: hanging-cgroup-release
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: hanging-cgroup-release
    spec:
      hostNetwork: true
      hostPID: true
      nodeSelector:
        ucp-control-plane: enabled
      containers:
        - resources:
            requests:
              cpu: 0.1
          securityContext:
            privileged: true
          image: ${UBUNTU_IMAGE}
          name: hanging-cgroup-release
          command: ["/bin/bash", "-cx"]
          args:
            - >
              cp -p /tmp/singleshot.sh /host/tmp;
              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
          volumeMounts:
            - name: host
              mountPath: /host
            - name: hanging-cgroup-release
              subPath: singleshot.sh
              mountPath: /tmp/singleshot.sh
      volumes:
        - name: host
          hostPath:
            path: /
        - name: hanging-cgroup-release
          configMap:
            name: hanging-cgroup-release
            defaultMode: 0555
EOF

docker run --rm -i \
  --net host \
  -v /tmp:/work \
  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
  ${KUBECTL_IMAGE} \
  /kubectl apply -f /work/hanging-cgroup-release.yaml
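The script assumes it runs on a node with docker available and an admin kubeconfig at /etc/kubernetes/admin/kubeconfig.yaml, as the docker run invocation above expects. A rough usage sketch (the verification commands are illustrative, not part of the commit):

  sudo ./tools/fixes/hanging-cgroup-release.sh
  # then, from any host with kubectl access:
  kubectl -n kube-system get ds hanging-cgroup-release
  kubectl -n kube-system logs -l name=hanging-cgroup-release --tail=20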
tools/fixes/rbd-roomba-scanner.sh (new executable file, 101 lines)
@@ -0,0 +1,101 @@
#!/bin/bash
set -ex

CLUSTER_DNS=${CLUSTER_DNS:-10.96.0.10}

KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}

cat > /tmp/rbd-roomba-scanner.yaml << 'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: rbd-roomba-scanner
  namespace: ceph
  labels:
    hotfix: 'true'
data:
  singleshot.sh: |+
    #!/bin/bash
    set -ex

    while true; do
      # write the device list to /var/run rather than /tmp, where it could be tampered with
      lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list

      # wait a while, so we don't catch rbd devices the kubelet is working on mounting
      sleep 60

      # finally, examine rbd devices again; any that were already unmounted 60s ago
      # are considered stale and forcefully unmapped
      DATE=$(date)
      for rbd in $(lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'); do
        if grep -q $rbd /var/run/rbd_list; then
          echo "[${DATE}] Unmapping stale RBD $rbd"
          /usr/bin/rbd unmap -o force $rbd
          # NOTE(supamatt): rbd unmap -o force will only succeed if there is NO pending I/O
        else
          echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
        fi
      done
      rm -f /var/run/rbd_list
    done
EOF
cat >> /tmp/rbd-roomba-scanner.yaml << EOF
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: rbd-roomba-scanner
  namespace: ceph
spec:
  template:
    metadata:
      labels:
        name: rbd-roomba-scanner
    spec:
      hostNetwork: true
      hostPID: true
      nodeSelector:
        openstack-control-plane: enabled
      containers:
        - resources:
            requests:
              cpu: 0.1
          securityContext:
            privileged: true
          image: ${UBUNTU_IMAGE}
          name: rbd-roomba-scanner
          command: ["/bin/bash", "-cx"]
          args:
            - >
              cp -p /tmp/singleshot.sh /host/tmp;
              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
          volumeMounts:
            - name: host
              mountPath: /host
            - name: rbd-roomba-scanner
              subPath: singleshot.sh
              mountPath: /tmp/singleshot.sh
      volumes:
        - name: host
          hostPath:
            path: /
        - name: rbd-roomba-scanner
          configMap:
            name: rbd-roomba-scanner
            defaultMode: 0555
EOF

docker run --rm -i \
  --net host \
  -v /tmp:/work \
  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
  ${KUBECTL_IMAGE} \
  /kubectl apply -f /work/rbd-roomba-scanner.yaml
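Both DaemonSets rely on the same escape hatch: a privileged, hostPID pod copies the ConfigMap script onto the host and re-executes it in the host's namespaces via nsenter -t 1, so systemctl, lsblk, and rbd unmap act on the node itself rather than inside the container. Assuming standard kubectl access, deployment can be checked along these lines (illustrative commands, not part of the commit):

  sudo ./tools/fixes/rbd-roomba-scanner.sh
  kubectl -n ceph get ds rbd-roomba-scanner
  kubectl -n ceph logs -l name=rbd-roomba-scanner --tail=20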