Ceph liveness scripts

Replace socket-based liveness checks with scripts

The current TCP socket-based liveness/readiness check for Ceph
doesn't accurately reflect when daemons are live, doesn't handle
multiple OSDs on a host, and doesn't work when hostNetworking is
in use and the Ceph network is different from the one associated
with the hostname.  This change adds new scripts for checking
Ceph monitor and OSD liveness/readiness that query the Ceph Unix
domain sockets to get daemon status and exit 0 if and only if all
sockets report that their daemons are in an "active" state.

This isn't perfect: we don't know how many daemons SHOULD be
active, so if only a subset is live and the others have no
sockets (yet?), we'll still claim the pod is ready.  The scripts
also don't distinguish between liveness and readiness for OSDs.

Change-Id: I5d370b4bc4025fece2e640355c3a29167afca871
This commit is contained in:
dave kormann 2017-08-24 10:36:38 -04:00
parent f5a6a5dedc
commit 5f3f13cc0a
6 changed files with 171 additions and 14 deletions

View File

@ -0,0 +1,44 @@
#!/bin/sh
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A liveness check for ceph monitors: exit 0 iff the monitor's admin socket
# reports one of the states listed in mon_live_state, i.e. the daemon appears
# to be at least alive (but not necessarily in a quorum).
#
# Environment (all optional):
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding admin sockets (default: /run/ceph)
#   CEPH_MON_SOCKET_BASE  socket file name prefix (default: ceph-mon);
#                         CEPH_OSD_SOCKET_BASE is still honoured as a legacy
#                         fallback because earlier versions read that name.
#   CEPH_SOCKET_SUFFIX    socket file name suffix (default: asok)
# Arguments:
#   $1 (optional)  pattern; process-list lines matching it are ignored when
#                  looking for the running ceph-mon daemon.
# Output:  "MON <id> <state>" on stdout when the admin socket exists.
# Exit:    0 if the reported state is in mon_live_state, 1 otherwise.
CEPH=${CEPH_CMD:-/usr/bin/ceph}
SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-mon}
SBASE=${CEPH_MON_SOCKET_BASE:-$SBASE}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

mon_live_state="probing electing synchronizing leader peon"

# Candidate process lines for the monitor daemon.
procs=$(ps auwwx | grep ceph-mon | grep -v grep)

# Only apply the caller's exclusion when one was given: an empty pattern
# matches every line, so "grep -v ''" would discard the whole process list.
if [ -n "${1:-}" ]; then
  procs=$(printf '%s\n' "${procs}" | grep -v -- "$1")
fi

# The monitor id is the word following a standalone "-i" argument.  Taking
# the whole word keeps ids that contain hyphens intact, and lines without
# "-i" (such as this script's own entry) are skipped.
monid=$(printf '%s\n' "${procs}" \
  | awk '{ for (i = 1; i < NF; i++) if ($i == "-i") { print $(i + 1); exit } }')

if [ -z "${monid}" ]; then
  # not really a sensible fallback, but it'll do.
  monid=$(hostname)
fi

sock="${SOCKDIR}/${SBASE}.${monid}.${SSUFFIX}"

if [ -S "${sock}" ]; then
  # ${CEPH} is deliberately unquoted so CEPH_CMD may carry extra arguments.
  state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" mon_status \
    | grep '"state"' \
    | sed 's/.*://;s/[^a-z]//g')
  echo "MON $monid $state"
  # this might be a stricter check than we actually want.  what are the
  # other values for the "state" field?
  for S in ${mon_live_state}; do
    if [ "${state}" = "${S}" ]; then
      exit 0
    fi
  done
fi

exit 1

View File

@ -0,0 +1,44 @@
#!/bin/sh
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A readiness check for ceph monitors: exit 0 iff the monitor's admin socket
# reports one of the states listed in mon_live_state ("leader" or "peon").
# This is stricter than the liveness check: a monitor that is still probing,
# electing or synchronizing is not reported as ready.
#
# Environment (all optional):
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding admin sockets (default: /run/ceph)
#   CEPH_MON_SOCKET_BASE  socket file name prefix (default: ceph-mon);
#                         CEPH_OSD_SOCKET_BASE is still honoured as a legacy
#                         fallback because earlier versions read that name.
#   CEPH_SOCKET_SUFFIX    socket file name suffix (default: asok)
# Arguments:
#   $1 (optional)  pattern; process-list lines matching it are ignored when
#                  looking for the running ceph-mon daemon.
# Output:  "MON <id> <state>" on stdout when the admin socket exists.
# Exit:    0 if the reported state is in mon_live_state, 1 otherwise.
CEPH=${CEPH_CMD:-/usr/bin/ceph}
SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-mon}
SBASE=${CEPH_MON_SOCKET_BASE:-$SBASE}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

mon_live_state="leader peon"

# Candidate process lines for the monitor daemon.
procs=$(ps auwwx | grep ceph-mon | grep -v grep)

# Only apply the caller's exclusion when one was given: an empty pattern
# matches every line, so "grep -v ''" would discard the whole process list.
if [ -n "${1:-}" ]; then
  procs=$(printf '%s\n' "${procs}" | grep -v -- "$1")
fi

# The monitor id is the word following a standalone "-i" argument.  Taking
# the whole word keeps ids that contain hyphens intact, and lines without
# "-i" (such as this script's own entry) are skipped.
monid=$(printf '%s\n' "${procs}" \
  | awk '{ for (i = 1; i < NF; i++) if ($i == "-i") { print $(i + 1); exit } }')

if [ -z "${monid}" ]; then
  # not really a sensible fallback, but it'll do.
  monid=$(hostname)
fi

sock="${SOCKDIR}/${SBASE}.${monid}.${SSUFFIX}"

if [ -S "${sock}" ]; then
  # ${CEPH} is deliberately unquoted so CEPH_CMD may carry extra arguments.
  state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" mon_status \
    | grep '"state"' \
    | sed 's/.*://;s/[^a-z]//g')
  echo "MON $monid $state"
  # this might be a stricter check than we actually want.  what are the
  # other values for the "state" field?
  for S in ${mon_live_state}; do
    if [ "${state}" = "${S}" ]; then
      exit 0
    fi
  done
fi

exit 1

View File

@ -0,0 +1,45 @@
#!/bin/sh
# Copyright 2017 The Openstack-Helm Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A liveness/readiness check for ceph OSDs: exit 0 iff at least one OSD admin
# socket exists in the socket directory and every OSD socket found there
# reports the "active" state.
#
# Environment (all optional):
#   CEPH_CMD              ceph CLI to run (default: /usr/bin/ceph)
#   CEPH_SOCKET_DIR       directory holding admin sockets (default: /run/ceph)
#   CEPH_OSD_SOCKET_BASE  socket file name prefix (default: ceph-osd)
#   CEPH_SOCKET_SUFFIX    socket file name suffix (default: asok)
# Output:  "OSD <id> <state>" on stdout for each socket queried, or a
#          "No daemon sockets found" notice for a non-socket match.
# Exit:    0 if all queried OSDs are active, 1 if any is not or none exist.
CEPH=${CEPH_CMD:-/usr/bin/ceph}
SOCKDIR=${CEPH_SOCKET_DIR:-/run/ceph}
SBASE=${CEPH_OSD_SOCKET_BASE:-ceph-osd}
SSUFFIX=${CEPH_SOCKET_SUFFIX:-asok}

# default: no sockets, not live
cond=1

# When nothing matches, the glob stays as a literal string; the -S test below
# rejects it and the "no sockets" notice is printed.
for sock in "${SOCKDIR}/${SBASE}".*."${SSUFFIX}"; do
  if [ -S "${sock}" ]; then
    # Derive the OSD id from the file name alone, so a dot anywhere in the
    # directory path cannot shift which component is picked.
    osdid=${sock##*/}
    osdid=${osdid#"${SBASE}."}
    osdid=${osdid%".${SSUFFIX}"}
    # ${CEPH} is deliberately unquoted so CEPH_CMD may carry extra arguments.
    state=$(${CEPH} -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" status \
      | grep '"state"' \
      | sed 's/.*://;s/[^a-z]//g')
    echo "OSD $osdid $state"
    # this might be a stricter check than we actually want.  what are the
    # other values for the "state" field?
    if [ "${state}" = 'active' ]; then
      cond=0
    else
      # one's not ready, so the whole pod's not ready.
      exit 1
    fi
  else
    echo "No daemon sockets found in $SOCKDIR"
  fi
done

exit $cond

View File

@ -71,5 +71,11 @@ data:
{{ tuple "bin/_check_zombie_mons.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
rbd-provisioner.sh: |
{{ tuple "bin/_rbd-provisioner.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ceph-osd-liveness-readiness.sh: |
{{ tuple "bin/_ceph-osd-liveness-readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ceph-mon-liveness.sh: |
{{ tuple "bin/_ceph-mon-liveness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
ceph-mon-readiness.sh: |
{{ tuple "bin/_ceph-mon-readiness.sh.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
{{- end }}
{{- end }}

View File

@ -94,14 +94,17 @@ spec:
ports:
- containerPort: 6789
livenessProbe:
tcpSocket:
port: 6789
initialDelaySeconds: 60
timeoutSeconds: 5
exec:
command:
- /ceph-mon-liveness.sh
initialDelaySeconds: 60
periodSeconds: 60
readinessProbe:
tcpSocket:
port: 6789
timeoutSeconds: 5
exec:
command:
- /ceph-mon-readiness.sh
initialDelaySeconds: 60
periodSeconds: 60
volumeMounts:
- name: ceph-bin
mountPath: /start_mon.sh
@ -115,6 +118,14 @@ spec:
mountPath: /common_functions.sh
subPath: common_functions.sh
readOnly: true
- name: ceph-bin
mountPath: /ceph-mon-liveness.sh
subPath: ceph-mon-liveness.sh
readOnly: true
- name: ceph-bin
mountPath: /ceph-mon-readiness.sh
subPath: ceph-mon-readiness.sh
readOnly: true
- name: ceph-etc
mountPath: /etc/ceph/ceph.conf
subPath: ceph.conf

View File

@ -72,14 +72,17 @@ spec:
ports:
- containerPort: 6800
livenessProbe:
tcpSocket:
port: 6800
initialDelaySeconds: 60
timeoutSeconds: 5
exec:
command:
- /ceph-osd-liveness-readiness.sh
initialDelaySeconds: 60
periodSeconds: 60
readinessProbe:
tcpSocket:
port: 6800
timeoutSeconds: 5
exec:
command:
- /ceph-osd-liveness-readiness.sh
initialDelaySeconds: 60
periodSeconds: 60
volumeMounts:
- name: devices
mountPath: /dev
@ -96,6 +99,10 @@ spec:
mountPath: /common_functions.sh
subPath: common_functions.sh
readOnly: true
- name: ceph-bin
mountPath: /ceph-osd-liveness-readiness.sh
subPath: ceph-osd-liveness-readiness.sh
readOnly: true
- name: ceph-etc
mountPath: /etc/ceph/ceph.conf
subPath: ceph.conf