From 9a37183b26526086ef577acc365b59e1b54471fc Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Fri, 6 May 2022 10:11:31 -0600 Subject: [PATCH] [ceph-osd] Remove ceph-mon dependency in ceph-osd liveness probe It is possible for misbehaving ceph-mon pods to cause the ceph-osd liveness probe to fail for healthy ceph-osd pods, which can cause healthy pods to get restarted unnecessarily. This change removes the ceph-mon query from the ceph-osd liveness probe so the probe is only dependent on ceph-osd state. Change-Id: I9e1846cfdc5783dbb261583e04ea19df81d143f4 --- ceph-osd/Chart.yaml | 2 +- ceph-osd/templates/bin/osd/_check.sh.tpl | 12 ++++-------- releasenotes/notes/ceph-osd.yaml | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml index 54ff58f8e..f5bd86bb4 100644 --- a/ceph-osd/Chart.yaml +++ b/ceph-osd/Chart.yaml @@ -15,6 +15,6 @@ apiVersion: v1 appVersion: v1.0.0 description: OpenStack-Helm Ceph OSD name: ceph-osd -version: 0.1.40 +version: 0.1.41 home: https://github.com/ceph/ceph ... diff --git a/ceph-osd/templates/bin/osd/_check.sh.tpl b/ceph-osd/templates/bin/osd/_check.sh.tpl index dc321806f..3ed90d01a 100644 --- a/ceph-osd/templates/bin/osd/_check.sh.tpl +++ b/ceph-osd/templates/bin/osd/_check.sh.tpl @@ -25,17 +25,13 @@ cond=1 for sock in $SOCKDIR/$SBASE.*.$SSUFFIX; do if [ -S $sock ]; then OSD_ID=$(echo $sock | awk -F. '{print $2}') - OSD_STATE=$(ceph -f json-pretty --connect-timeout 1 --admin-daemon "${sock}" status|grep state|sed 's/.*://;s/[^a-z]//g') - NOUP_FLAG=$(ceph --name client.bootstrap-osd --keyring /var/lib/ceph/bootstrap-osd/ceph.keyring status | awk '/flags/{print $2}' | grep noup) + OSD_STATE=$(ceph -f json --connect-timeout 1 --admin-daemon "${sock}" status|jq -r '.state') echo "OSD ${OSD_ID} ${OSD_STATE}"; - # this might be a stricter check than we actually want. what are the - # other values for the "state" field? - if [ "x${OSD_STATE}x" = 'xactivex' ]; then - cond=0 - elif [ "${NOUP_FLAG}" ] && [ "x${OSD_STATE}x" = 'xprebootx' ]; then + # Succeed if the OSD state is active (running) or preboot (starting) + if [ "${OSD_STATE}" = "active" ] || [ "${OSD_STATE}" = "preboot" ]; then cond=0 else - # one's not ready, so the whole pod's not ready. + # Any other state is unexpected and the probe fails exit 1 fi else diff --git a/releasenotes/notes/ceph-osd.yaml b/releasenotes/notes/ceph-osd.yaml index 296651613..913a16d4f 100644 --- a/releasenotes/notes/ceph-osd.yaml +++ b/releasenotes/notes/ceph-osd.yaml @@ -41,4 +41,5 @@ ceph-osd: - 0.1.38 Skip pod wait in post-apply job when disruptive - 0.1.39 Allow for unconditional OSD restart - 0.1.40 Remove udev interactions from osd-init + - 0.1.41 Remove ceph-mon dependency in ceph-osd liveness probe ...