From 30b57ba671ab7ce1d8f73c2689a49212fec7c68e Mon Sep 17 00:00:00 2001
From: Matthew Heler
Date: Thu, 20 Dec 2018 14:54:47 -0600
Subject: [PATCH] [CEPH] Fix race conditions with OSD POD initialization

Under POD restart conditions there is a race condition with lsblk that
can cause the helm chart to zap a fully working OSD disk. We refactor
the code so it no longer depends on lsblk.

Additionally, the new automatic journal partitioning code has a race
condition in which the same journal partition could be picked twice for
OSDs on the same node. To resolve this, we share a common tmp directory
from the node with all of the OSD pods on that node.

Change-Id: I807074c4c5e54b953b5c0efa4c169763c5629062
---
 ceph-osd/templates/bin/osd/_init.sh.tpl | 65 +++++++++++++------------
 ceph-osd/templates/daemonset-osd.yaml   | 10 ++++
 2 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/ceph-osd/templates/bin/osd/_init.sh.tpl b/ceph-osd/templates/bin/osd/_init.sh.tpl
index 354905ea9..8891333fe 100644
--- a/ceph-osd/templates/bin/osd/_init.sh.tpl
+++ b/ceph-osd/templates/bin/osd/_init.sh.tpl
@@ -50,6 +50,13 @@ else
   export OSD_JOURNAL=$(readlink -f ${JOURNAL_LOCATION})
 fi
 
+
+function udev_settle {
+  partprobe "${OSD_DEVICE}"
+  # watch the udev event queue, and exit if all current events are handled
+  udevadm settle --timeout=600
+}
+
 # Calculate proper device names, given a device and partition number
 function dev_part {
   local OSD_DEVICE=${1}
@@ -121,46 +128,41 @@ function osd_disk_prepare {
     fi
   fi
 
+  udev_settle
+
   # then search for some ceph metadata on the disk
   if [[ "$(parted --script ${OSD_DEVICE} print | egrep '^ 1.*ceph data')" ]]; then
     if [[ ${OSD_FORCE_ZAP} -eq 1 ]]; then
       if [ -b "${OSD_DEVICE}1" ]; then
-        local fs=`lsblk -fn ${OSD_DEVICE}1`
-        if [ ! -z "${fs}" ]; then
-          local cephFSID=`ceph-conf --lookup fsid`
-          if [ ! -z "${cephFSID}" ]; then
-            local tmpmnt=`mktemp -d`
-            mount ${OSD_DEVICE}1 ${tmpmnt}
-            if [ -f "${tmpmnt}/ceph_fsid" ]; then
-              osdFSID=`cat "${tmpmnt}/ceph_fsid"`
-              umount ${tmpmnt}
-              if [ ${osdFSID} != ${cephFSID} ]; then
-                echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
-                echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
-                echo "Because OSD_FORCE_ZAP was set, we will zap this device."
-                ceph-disk -v zap ${OSD_DEVICE}
-              else
-                echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
-                echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
-                echo "Moving on, trying to activate the OSD now."
-                return
-              fi
-            else
-              umount ${tmpmnt}
-              echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
+        local cephFSID=`ceph-conf --lookup fsid`
+        if [ ! -z "${cephFSID}" ]; then
+          local tmpmnt=`mktemp -d`
+          mount ${OSD_DEVICE}1 ${tmpmnt}
+          if [ -f "${tmpmnt}/ceph_fsid" ]; then
+            osdFSID=`cat "${tmpmnt}/ceph_fsid"`
+            umount ${tmpmnt}
+            if [ ${osdFSID} != ${cephFSID} ]; then
+              echo "It looks like ${OSD_DEVICE} is an OSD belonging to a different (or old) ceph cluster."
+              echo "The OSD FSID is ${osdFSID} while this cluster is ${cephFSID}"
               echo "Because OSD_FORCE_ZAP was set, we will zap this device."
               ceph-disk -v zap ${OSD_DEVICE}
+            else
+              echo "It looks like ${OSD_DEVICE} is an OSD belonging to a this ceph cluster."
+              echo "OSD_FORCE_ZAP is set, but will be ignored and the device will not be zapped."
+              echo "Moving on, trying to activate the OSD now."
+              return
             fi
           else
-            echo "Unable to determine the FSID of the current cluster."
-            echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
-            echo "Moving on, trying to activate the OSD now."
-            return
+            umount ${tmpmnt}
+            echo "It looks like ${OSD_DEVICE} has a ceph data partition but no FSID."
+            echo "Because OSD_FORCE_ZAP was set, we will zap this device."
+            ceph-disk -v zap ${OSD_DEVICE}
           fi
         else
-          echo "It looks like ${OSD_DEVICE} has a ceph data partition but no filesystem."
-          echo "Because OSD_FORCE_ZAP was set, we will zap this device."
-          ceph-disk -v zap ${OSD_DEVICE}
+          echo "Unable to determine the FSID of the current cluster."
+          echo "OSD_FORCE_ZAP is set, but this OSD will not be zapped."
+          echo "Moving on, trying to activate the OSD now."
+          return
         fi
       else
         echo "parted says ${OSD_DEVICE}1 should exist, but we do not see it."
@@ -225,8 +227,7 @@ function osd_disk_prepare {
 
   ceph-disk -v prepare ${CLI_OPTS} --journal-uuid ${OSD_JOURNAL_UUID} ${OSD_DEVICE} ${OSD_JOURNAL}
 
-  # watch the udev event queue, and exit if all current events are handled
-  udevadm settle --timeout=600
+  udev_settle
 }
 
 if ! [ "x${STORAGE_TYPE%-*}" == "xdirectory" ]; then
diff --git a/ceph-osd/templates/daemonset-osd.yaml b/ceph-osd/templates/daemonset-osd.yaml
index f28247a2c..c8b5bd91b 100644
--- a/ceph-osd/templates/daemonset-osd.yaml
+++ b/ceph-osd/templates/daemonset-osd.yaml
@@ -79,6 +79,9 @@ spec:
             - name: pod-var-lib-ceph
               mountPath: /var/lib/ceph
               readOnly: false
+            - name: pod-var-lib-ceph-tmp
+              mountPath: /var/lib/ceph/tmp
+              readOnly: false
             - name: pod-run
               mountPath: /run
               readOnly: false
@@ -160,6 +163,9 @@ spec:
            - name: pod-var-lib-ceph
              mountPath: /var/lib/ceph
              readOnly: false
+            - name: pod-var-lib-ceph-tmp
+              mountPath: /var/lib/ceph/tmp
+              readOnly: false
            - name: pod-run
              mountPath: /run
              readOnly: false
@@ -288,6 +294,10 @@ spec:
             path: /run/lvm
         - name: pod-var-lib-ceph
           emptyDir: {}
+        - name: pod-var-lib-ceph-tmp
+          hostPath:
+            path: /var/lib/openstack-helm/ceph/var-tmp
+            type: DirectoryOrCreate
         - name: pod-var-log
           hostPath:
             path: {{ print "/var/log/ceph/" $envAll.Release.Name }}
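
Note on why the shared hostPath resolves the journal race: every OSD pod on a
node now sees the same host directory at /var/lib/ceph/tmp, so any lock files
or scratch state written there (ceph-disk itself takes its prepare/activate
locks under that path) apply node-wide instead of per-pod. The sketch below
illustrates the coordination pattern only; it is not part of the chart, and the
journal-select.lock name and the select_journal_partition helper are
hypothetical, assumed purely for illustration.

    #!/bin/bash
    # Sketch: serialize journal-partition selection across OSD pods on one node
    # by locking a file inside the host directory they all mount at
    # /var/lib/ceph/tmp. With the lock held, two pods can no longer read the
    # journal disk's partition table at the same moment and both pick the same
    # "free" partition.
    set -e

    LOCK_DIR="/var/lib/ceph/tmp"                  # hostPath shared by all OSD pods on the node
    LOCK_FILE="${LOCK_DIR}/journal-select.lock"   # hypothetical name, for this sketch only

    select_journal_partition() {
      # Placeholder for the real selection logic; runs with the lock held.
      parted --script "${JOURNAL_DEVICE:-/dev/sdb}" print
    }

    (
      # Wait up to 600 seconds for any other pod on this node to finish.
      flock -w 600 9 || { echo "timed out waiting for ${LOCK_FILE}"; exit 1; }
      select_journal_partition
    ) 9>"${LOCK_FILE}"

Mounting the host path at exactly /var/lib/ceph/tmp (rather than a per-pod
emptyDir) is presumably what lets ceph-disk's own file locks do this
serialization without any extra code in the chart.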