Ceph liveness scripts
Replace socket-based liveness checks with scripts. The current TCP socket-based liveness/readiness check for Ceph doesn't accurately reflect when daemons are live, doesn't handle multiple OSDs on a host, and doesn't work when hostNetworking is in use and the Ceph network is different from the one associated with the hostname. This change adds new scripts for checking Ceph monitor and OSD liveness/readiness; they query the Ceph Unix domain sockets to get daemon status and exit 0 iff all sockets report that their daemons are in an "active" state. This isn't perfect: we don't know how many daemons SHOULD be active, so if only a subset is live and the others have no sockets (yet?), we'll still claim the pod is ready. The scripts also don't distinguish between liveness and readiness for OSDs. Change-Id: I5d370b4bc4025fece2e640355c3a29167afca871
This commit is contained in:
parent
f5a6a5dedc
commit
5f3f13cc0a
44
ceph/templates/bin/_ceph-mon-liveness.sh.tpl
Executable file
44
ceph/templates/bin/_ceph-mon-liveness.sh.tpl
Executable file
@ -0,0 +1,44 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Copyright 2017 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# A liveness check for ceph monitors: exit 0 iff the monitor appears to be at
# least alive (but not necessarily in a quorum).
#
# Optional environment overrides:
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding the admin sockets (default: /run/ceph)
#   CEPH_OSD_SOCKET_BASE  admin socket file name prefix (default: ceph-mon)
#   CEPH_SOCKET_SUFFIX    admin socket file name suffix (default: asok)
#
# Exit status:
#   0  the monitor's admin socket reports one of the states in mon_live_state
#   1  otherwise (socket missing, query failed, or any other state)
CEPH=${CEPH_CMD:-/usr/bin/ceph}
SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
# NOTE(review): the override variable is spelled ..._OSD_... even though this
# is the monitor script; the name is kept so existing overrides keep working.
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-mon}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

# States that count as "alive" for the liveness probe.
mon_live_state="probing electing synchronizing leader peon"

# Try to recover the monitor id from the "-i <id>" argument of a running
# ceph-mon process.
# NOTE(review): when the script is run without arguments "$1" is empty, and an
# empty grep pattern matches every line, so `grep -v ""` discards all of the
# ps output and monid ends up empty (the hostname fallback below is then
# used). Confirm the intended filter before relying on the ps parsing.
monid=$(ps auwwx | grep ceph-mon | grep -v "$1" | grep -v grep \
    | sed 's/.*-i\ *//;s/\ *-.*//' | awk '{print $1}')

if [ -z "${monid}" ]; then
    # not really a sensible fallback, but it'll do.
    monid=$(hostname)
fi

# Path of the monitor's admin socket. This must be assigned before it is used
# in the query below; previously "${sock}" was never set, so the ceph CLI was
# handed an empty socket path and the probe could never succeed.
sock="${SOCKDIR}/${SBASE}.${monid}.${SSUFFIX}"

if [ -S "${sock}" ]; then
    # ${CEPH} is deliberately left unquoted so CEPH_CMD may carry extra
    # arguments. The "state" value is extracted from the JSON with grep/sed
    # and reduced to lowercase letters only.
    state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" mon_status \
        | grep state | sed 's/.*://;s/[^a-z]//g')
    echo "MON $monid $state";
    # this might be a stricter check than we actually want.  what are the
    # other values for the "state" field?
    for S in ${mon_live_state}; do
        if [ "${state}" = "${S}" ]; then
            exit 0
        fi
    done
fi

# No socket, no answer, or a state outside mon_live_state: not alive.
exit 1
|
44
ceph/templates/bin/_ceph-mon-readiness.sh.tpl
Executable file
44
ceph/templates/bin/_ceph-mon-readiness.sh.tpl
Executable file
@ -0,0 +1,44 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Copyright 2017 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# A readiness check for ceph monitors: exit 0 iff the monitor's admin socket
# reports the state "leader" or "peon".
# NOTE(review): these are presumably the states of a monitor that has joined
# a quorum, which is what distinguishes this check from the liveness one --
# confirm against the Ceph monitor documentation.
#
# Optional environment overrides:
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding the admin sockets (default: /run/ceph)
#   CEPH_OSD_SOCKET_BASE  admin socket file name prefix (default: ceph-mon)
#   CEPH_SOCKET_SUFFIX    admin socket file name suffix (default: asok)
#
# Exit status:
#   0  the monitor's admin socket reports one of the states in mon_live_state
#   1  otherwise (socket missing, query failed, or any other state)
CEPH=${CEPH_CMD:-/usr/bin/ceph}
SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
# NOTE(review): the override variable is spelled ..._OSD_... even though this
# is the monitor script; the name is kept so existing overrides keep working.
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-mon}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

# States that count as "ready" for the readiness probe.
mon_live_state="leader peon"

# Try to recover the monitor id from the "-i <id>" argument of a running
# ceph-mon process.
# NOTE(review): when the script is run without arguments "$1" is empty, and an
# empty grep pattern matches every line, so `grep -v ""` discards all of the
# ps output and monid ends up empty (the hostname fallback below is then
# used). Confirm the intended filter before relying on the ps parsing.
monid=$(ps auwwx | grep ceph-mon | grep -v "$1" | grep -v grep \
    | sed 's/.*-i\ *//;s/\ *-.*//' | awk '{print $1}')

if [ -z "${monid}" ]; then
    # not really a sensible fallback, but it'll do.
    monid=$(hostname)
fi

# Path of the monitor's admin socket. This must be assigned before it is used
# in the query below; previously "${sock}" was never set, so the ceph CLI was
# handed an empty socket path and the probe could never succeed.
sock="${SOCKDIR}/${SBASE}.${monid}.${SSUFFIX}"

if [ -S "${sock}" ]; then
    # ${CEPH} is deliberately left unquoted so CEPH_CMD may carry extra
    # arguments. The "state" value is extracted from the JSON with grep/sed
    # and reduced to lowercase letters only.
    state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" mon_status \
        | grep state | sed 's/.*://;s/[^a-z]//g')
    echo "MON $monid $state";
    # this might be a stricter check than we actually want.  what are the
    # other values for the "state" field?
    for S in ${mon_live_state}; do
        if [ "${state}" = "${S}" ]; then
            exit 0
        fi
    done
fi

# No socket, no answer, or a state outside mon_live_state: not ready.
exit 1
|
45
ceph/templates/bin/_ceph-osd-liveness-readiness.sh.tpl
Executable file
45
ceph/templates/bin/_ceph-osd-liveness-readiness.sh.tpl
Executable file
@ -0,0 +1,45 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Copyright 2017 The Openstack-Helm Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# A combined liveness/readiness check for ceph OSDs: exit 0 iff at least one
# OSD admin socket exists in the socket directory and every OSD socket found
# there reports the "active" state.
#
# Optional environment overrides:
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding the admin sockets (default: /run/ceph)
#   CEPH_OSD_SOCKET_BASE  admin socket file name prefix (default: ceph-osd)
#   CEPH_SOCKET_SUFFIX    admin socket file name suffix (default: asok)
#
# Exit status:
#   0  one or more sockets were found and all of them report "active"
#   1  no socket was found, or some OSD reports anything other than "active"
CEPH=${CEPH_CMD:-/usr/bin/ceph}

SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-osd}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

# default: no sockets, not live
cond=1
# Only the ".*." part is a glob; the variable parts are quoted so that a
# directory or prefix containing spaces or glob characters is taken literally.
for sock in "${SOCKDIR}/${SBASE}".*."${SSUFFIX}"; do
    if [ -S "${sock}" ]; then
        # The OSD id is the file name with the prefix and suffix removed.
        # Working on the file name (not the whole path) keeps the id correct
        # even when SOCKDIR or SBASE contains a dot.
        osdid=${sock##*/}
        osdid=${osdid#"${SBASE}."}
        osdid=${osdid%".${SSUFFIX}"}
        # ${CEPH} is deliberately left unquoted so CEPH_CMD may carry extra
        # arguments. The "state" value is extracted from the JSON with
        # grep/sed and reduced to lowercase letters only.
        state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" status \
            | grep state | sed 's/.*://;s/[^a-z]//g')
        echo "OSD $osdid $state";
        # this might be a stricter check than we actually want.  what are the
        # other values for the "state" field?
        if [ "${state}" = 'active' ]; then
            cond=0
        else
            # one's not ready, so the whole pod's not ready.
            exit 1
        fi
    else
        # Reached when the glob matched nothing (the pattern is then passed
        # through literally) or when a matching path is not a socket.
        echo "No daemon sockets found in $SOCKDIR"
    fi
done

exit $cond
|
@ -71,5 +71,11 @@ data:
|
||||
{{ tuple "bin/_check_zombie_mons.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
rbd-provisioner.sh: |
|
||||
{{ tuple "bin/_rbd-provisioner.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
ceph-osd-liveness-readiness.sh: |
|
||||
{{ tuple "bin/_ceph-osd-liveness-readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
ceph-mon-liveness.sh: |
|
||||
{{ tuple "bin/_ceph-mon-liveness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
ceph-mon-readiness.sh: |
|
||||
{{ tuple "bin/_ceph-mon-readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
@ -94,14 +94,17 @@ spec:
|
||||
ports:
|
||||
- containerPort: 6789
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 6789
|
||||
initialDelaySeconds: 60
|
||||
timeoutSeconds: 5
|
||||
exec:
|
||||
command:
|
||||
- /ceph-mon-liveness.sh
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 60
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 6789
|
||||
timeoutSeconds: 5
|
||||
exec:
|
||||
command:
|
||||
- /ceph-mon-readiness.sh
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 60
|
||||
volumeMounts:
|
||||
- name: ceph-bin
|
||||
mountPath: /start_mon.sh
|
||||
@ -115,6 +118,14 @@ spec:
|
||||
mountPath: /common_functions.sh
|
||||
subPath: common_functions.sh
|
||||
readOnly: true
|
||||
- name: ceph-bin
|
||||
mountPath: /ceph-mon-liveness.sh
|
||||
subPath: ceph-mon-liveness.sh
|
||||
readOnly: true
|
||||
- name: ceph-bin
|
||||
mountPath: /ceph-mon-readiness.sh
|
||||
subPath: ceph-mon-readiness.sh
|
||||
readOnly: true
|
||||
- name: ceph-etc
|
||||
mountPath: /etc/ceph/ceph.conf
|
||||
subPath: ceph.conf
|
||||
|
@ -72,14 +72,17 @@ spec:
|
||||
ports:
|
||||
- containerPort: 6800
|
||||
livenessProbe:
|
||||
tcpSocket:
|
||||
port: 6800
|
||||
initialDelaySeconds: 60
|
||||
timeoutSeconds: 5
|
||||
exec:
|
||||
command:
|
||||
- /ceph-osd-liveness-readiness.sh
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 60
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 6800
|
||||
timeoutSeconds: 5
|
||||
exec:
|
||||
command:
|
||||
- /ceph-osd-liveness-readiness.sh
|
||||
initialDelaySeconds: 60
|
||||
periodSeconds: 60
|
||||
volumeMounts:
|
||||
- name: devices
|
||||
mountPath: /dev
|
||||
@ -96,6 +99,10 @@ spec:
|
||||
mountPath: /common_functions.sh
|
||||
subPath: common_functions.sh
|
||||
readOnly: true
|
||||
- name: ceph-bin
|
||||
mountPath: /ceph-osd-liveness-readiness.sh
|
||||
subPath: ceph-osd-liveness-readiness.sh
|
||||
readOnly: true
|
||||
- name: ceph-etc
|
||||
mountPath: /etc/ceph/ceph.conf
|
||||
subPath: ceph.conf
|
||||
|
Loading…
Reference in New Issue
Block a user