Stability workarounds for ceph/cgroups issues
- Add stale rbd cleanup fix - Add cleaning of cgroups mounts Change-Id: I9950496416fbbb572eb5775032305e18fee1f026
This commit is contained in:
parent
e1328ed98b
commit
df24b1853b
26
global/scripts/hanging-cgroup-release.yaml
Normal file
26
global/scripts/hanging-cgroup-release.yaml
Normal file
---
schema: pegleg/Script/v1
metadata:
  schema: metadata/Document/v1
  name: hanging-cgroup-release
  storagePolicy: cleartext
  layeringDefinition:
    abstract: false
    layer: global
data: |-
  #!/bin/bash
  set -ex

  # Report how many cgroups the kernel currently tracks under system.slice.
  cgroup_count() {
    echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
  }

  cgroup_count
  # Stop systemd transient mount units whose kubelet pod directory no longer
  # exists -- these linger after pod teardown and leak cgroups over time.
  echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
  systemctl list-units --state=running | \
    sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
    xargs -r -l1 sh -c 'test -d "$2" || echo "$1"' -- | \
    xargs -r -tl1 systemctl stop |& wc -l
  cgroup_count
---
schema: pegleg/Script/v1
metadata:
  schema: metadata/Document/v1
  name: rbd-roomba-scanner
  storagePolicy: cleartext
  layeringDefinition:
    abstract: false
    layer: global
data: |-
  #!/bin/bash
  set -ex

  # Snapshot the RBD devices that currently have no filesystem mountpoint
  # (lsblk column 7 is MOUNTPOINT).  Kept under /var/run rather than /tmp so
  # an unprivileged user cannot pre-create or symlink the list file.
  lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list

  # Wait a while, so we don't catch rbd devices the kubelet is working on mounting.
  sleep 60

  # Re-examine rbd devices; anything that was already unmounted 60s ago is
  # considered stale and gets force-unmapped.
  DATE=$(date)
  for rbd in $(lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'); do
    # -x forces a whole-line match so /dev/rbd1 cannot falsely match the
    # /dev/rbd10 entry (plain 'grep -q' did a substring match).
    if grep -qx "$rbd" /var/run/rbd_list; then
      echo "[${DATE}] Unmapping stale RBD $rbd"
      /usr/bin/rbd unmap -o force "$rbd"
      # NOTE(supamatt): rbd unmap -o force will only succeed if there are NO pending I/O
    else
      echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
    fi
  done
  rm -f /var/run/rbd_list
@ -24,6 +24,19 @@ metadata:
|
|||||||
path: .images.ucp.divingbell
|
path: .images.ucp.divingbell
|
||||||
dest:
|
dest:
|
||||||
path: .values.images
|
path: .values.images
|
||||||
|
|
||||||
|
- src:
|
||||||
|
schema: pegleg/Script/v1
|
||||||
|
name: rbd-roomba-scanner
|
||||||
|
path: .
|
||||||
|
dest:
|
||||||
|
path: .values.conf.exec.X005-rbd-roomba-scanner.data
|
||||||
|
- src:
|
||||||
|
schema: pegleg/Script/v1
|
||||||
|
name: hanging-cgroup-release
|
||||||
|
path: .
|
||||||
|
dest:
|
||||||
|
path: .values.conf.exec.X005-hanging-cgroup-release.data
|
||||||
data:
|
data:
|
||||||
chart_name: ucp-divingbell
|
chart_name: ucp-divingbell
|
||||||
release: ucp-divingbell
|
release: ucp-divingbell
|
||||||
@ -78,6 +91,17 @@ data:
|
|||||||
# were restarted. "Failed to add /run/systemd/ask-password to directory
|
# were restarted. "Failed to add /run/systemd/ask-password to directory
|
||||||
# watch: No space left on device". https://bit.ly/2Mj5qn2 TDP bug 427616
|
# watch: No space left on device". https://bit.ly/2Mj5qn2 TDP bug 427616
|
||||||
fs.inotify.max_user_watches: '1048576'
|
fs.inotify.max_user_watches: '1048576'
|
||||||
|
exec:
|
||||||
|
X005-rbd-roomba-scanner:
|
||||||
|
rerun_policy: always
|
||||||
|
# 300 = 5 minutes
|
||||||
|
rerun_interval: 300
|
||||||
|
timeout: 300
|
||||||
|
X005-hanging-cgroup-release:
|
||||||
|
rerun_policy: always
|
||||||
|
# 3600 = 1 hour
|
||||||
|
rerun_interval: 3600
|
||||||
|
timeout: 600
|
||||||
dependencies:
|
dependencies:
|
||||||
- ucp-divingbell-htk
|
- ucp-divingbell-htk
|
||||||
---
|
---
|
||||||
|
#!/bin/bash
# Deploy a privileged DaemonSet that periodically stops stale Kubernetes
# transient mount units on each control-plane host, releasing the cgroups
# they leak.  Requires docker and an admin kubeconfig on the invoking host.
#
# Env overrides: KUBECTL_IMAGE, UBUNTU_IMAGE.
set -ex

KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}

# Quoted heredoc delimiter: nothing below is expanded by this shell; the
# script body ships verbatim inside the ConfigMap.
cat > /tmp/hanging-cgroup-release.yaml << 'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: hanging-cgroup-release
  namespace: kube-system
  labels:
    hotfix: 'true'
data:
  singleshot.sh: |+
    #!/bin/bash
    set -ex

    # Report how many cgroups the kernel currently tracks under system.slice.
    cgroup_count() {
      echo "Current cgroup count: $(find /sys/fs/cgroup/*/system.slice -name tasks | wc -l)"
    }

    while true; do
      cgroup_count
      # Stop systemd transient mount units whose kubelet pod directory no
      # longer exists -- these linger after pod teardown and leak cgroups.
      echo "Stopping Kubernetes systemd mount units that are not mounted to the system."
      systemctl list-units --state=running | \
        sed -rn '/Kubernetes.transient.mount/s,(run-\S+).+(/var/lib/kubelet/pods/.+),\1 \2,p' | \
        xargs -r -l1 sh -c 'test -d "$2" || echo "$1"' -- | \
        xargs -r -tl1 systemctl stop |& wc -l
      cgroup_count

      sleep 3600
    done
EOF
# Unquoted delimiter here: ${UBUNTU_IMAGE} is substituted into the DaemonSet.
cat >> /tmp/hanging-cgroup-release.yaml << EOF
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: hanging-cgroup-release
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: hanging-cgroup-release
    spec:
      hostNetwork: true
      hostPID: true
      nodeSelector:
        ucp-control-plane: enabled
      containers:
        - resources:
            requests:
              cpu: 0.1
          securityContext:
            privileged: true
          image: ${UBUNTU_IMAGE}
          name: hanging-cgroup-release
          command: ["/bin/bash", "-cx"]
          args:
            - >
              cp -p /tmp/singleshot.sh /host/tmp;
              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
          volumeMounts:
            - name: host
              mountPath: /host
            - name: hanging-cgroup-release
              subPath: singleshot.sh
              mountPath: /tmp/singleshot.sh
      volumes:
        - name: host
          hostPath:
            path: /
        - name: hanging-cgroup-release
          configMap:
            name: hanging-cgroup-release
            defaultMode: 0555
EOF

docker run --rm -i \
  --net host \
  -v /tmp:/work \
  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
  "${KUBECTL_IMAGE}" \
  /kubectl apply -f /work/hanging-cgroup-release.yaml
#!/bin/bash
# Deploy a privileged DaemonSet that periodically force-unmaps stale
# (unmounted) RBD devices left behind by the kubelet on openstack
# control-plane hosts.  Requires docker and an admin kubeconfig.
#
# Env overrides: KUBECTL_IMAGE, UBUNTU_IMAGE.
set -ex

KUBECTL_IMAGE=${KUBECTL_IMAGE:-gcr.io/google-containers/hyperkube-amd64:v1.11.6}
UBUNTU_IMAGE=${UBUNTU_IMAGE:-docker.io/ubuntu:16.04}

# Quoted heredoc delimiter: nothing below is expanded by this shell; the
# script body ships verbatim inside the ConfigMap.
cat > /tmp/rbd-roomba-scanner.yaml << 'EOF'
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: rbd-roomba-scanner
  namespace: ceph
  labels:
    hotfix: 'true'
data:
  singleshot.sh: |+
    #!/bin/bash
    set -ex

    while true; do
      # Snapshot the RBD devices with no filesystem mountpoint (lsblk column 7
      # is MOUNTPOINT).  /var/run rather than /tmp so an unprivileged user
      # cannot pre-create or symlink the list file.
      lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }' > /var/run/rbd_list

      # Wait a while, so we don't catch rbd devices the kubelet is working on mounting.
      sleep 60

      # Re-examine rbd devices; anything that was already unmounted 60s ago
      # is considered stale and gets force-unmapped.
      DATE=$(date)
      for rbd in $(lsblk | awk '/^rbd/ {if($7==""){print $0}}' | awk '{ printf "/dev/%s\n",$1 }'); do
        # -x forces a whole-line match so /dev/rbd1 cannot falsely match the
        # /dev/rbd10 entry (plain 'grep -q' did a substring match).
        if grep -qx "$rbd" /var/run/rbd_list; then
          echo "[${DATE}] Unmapping stale RBD $rbd"
          /usr/bin/rbd unmap -o force "$rbd"
          # NOTE(supamatt): rbd unmap -o force will only succeed if there are NO pending I/O
        else
          echo "[${DATE}] Skipping RBD $rbd as it hasn't been stale for at least 60 seconds"
        fi
      done
      rm -f /var/run/rbd_list
    done
EOF
# Unquoted delimiter here: ${UBUNTU_IMAGE} is substituted into the DaemonSet.
cat >> /tmp/rbd-roomba-scanner.yaml << EOF
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: rbd-roomba-scanner
  namespace: ceph
spec:
  template:
    metadata:
      labels:
        name: rbd-roomba-scanner
    spec:
      hostNetwork: true
      hostPID: true
      nodeSelector:
        openstack-control-plane: enabled
      containers:
        - resources:
            requests:
              cpu: 0.1
          securityContext:
            privileged: true
          image: ${UBUNTU_IMAGE}
          name: rbd-roomba-scanner
          command: ["/bin/bash", "-cx"]
          args:
            - >
              cp -p /tmp/singleshot.sh /host/tmp;
              nsenter -t 1 -m -u -n -i /tmp/singleshot.sh;
          volumeMounts:
            - name: host
              mountPath: /host
            - name: rbd-roomba-scanner
              subPath: singleshot.sh
              mountPath: /tmp/singleshot.sh
      volumes:
        - name: host
          hostPath:
            path: /
        - name: rbd-roomba-scanner
          configMap:
            name: rbd-roomba-scanner
            defaultMode: 0555
EOF

docker run --rm -i \
  --net host \
  -v /tmp:/work \
  -v /etc/kubernetes/admin:/etc/kubernetes/admin \
  -e KUBECONFIG=/etc/kubernetes/admin/kubeconfig.yaml \
  "${KUBECTL_IMAGE}" \
  /kubectl apply -f /work/rbd-roomba-scanner.yaml
Loading…
x
Reference in New Issue
Block a user