From 4ea34edb793ab4bd3d9d5198ea9ac7568930c384 Mon Sep 17 00:00:00 2001 From: Giulio Fidente Date: Tue, 23 Aug 2016 22:24:57 +0200 Subject: [PATCH] Add Ceph cluster health validation on upgrade This will prevent the Ceph Mon upgrade script from starting if the Ceph cluster is in error state. It also adds a parameter to ignore warning states, useful when performing an upgrade of a cluster where the number of healthy OSDs does not guarantee the desired replica size. Closes-Bug: 1618533 Change-Id: I1beb8ad0812f19b1018ba19b5a9fc85fa132d7f7 --- extraconfig/tasks/major_upgrade_ceph_mon.sh | 21 ++++++++++++++++--- .../tasks/major_upgrade_pacemaker.yaml | 15 ++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh index 38befbbf65..b76dd7c3b7 100755 --- a/extraconfig/tasks/major_upgrade_ceph_mon.sh +++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh @@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then exit 0 fi +CEPH_STATUS=$(ceph health | awk '{print $1}') +if [ ${CEPH_STATUS} = HEALTH_ERR ]; do + echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded + exit 1 +fi + +# Useful when upgrading with OSDs num < replica size +if [ $ignore_ceph_upgrade_warnings != "true" ]; then + timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do + echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK; + sleep 30; + CEPH_STATUS=$(ceph health | awk '{print $1}') + done" +fi + MON_PID=$(pidof ceph-mon) MON_ID=$(hostname -s) @@ -37,8 +52,6 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then echo WARNING: Ceph was not upgraded, restarting daemons service ceph start mon.${MON_ID} elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then - echo INFO: Ceph was upgraded to Jewel - # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do chown -R ceph:ceph $d @@ -54,9 +67,11 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # Wait for daemon to be back in the quorum timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do - echo Waiting for mon.${MON_ID} to re-join quorum; + echo WARNING: Waiting for mon.${MON_ID} to re-join quorum; sleep 10; done" + + echo INFO: Ceph was upgraded to Jewel else echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention exit 1 diff --git a/extraconfig/tasks/major_upgrade_pacemaker.yaml b/extraconfig/tasks/major_upgrade_pacemaker.yaml index c2e1488037..598d22d0da 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker.yaml +++ b/extraconfig/tasks/major_upgrade_pacemaker.yaml @@ -26,6 +26,10 @@ parameters: constraints: - allowed_values: ['auto', 'yes', 'no'] default: 'auto' + IgnoreCephUpgradeWarnings: + type: boolean + default: false + description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean resources: # TODO(jistr): for Mitaka->Newton upgrades and further we can use @@ -36,7 +40,16 @@ resources: type: OS::Heat::SoftwareConfig properties: group: script - config: {get_file: major_upgrade_ceph_mon.sh} + config: + list_join: + - '' + - - str_replace: + template: | + #!/bin/bash + ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS' + params: + IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings} + - get_file: major_upgrade_ceph_mon.sh CephMonUpgradeDeployment: type: OS::Heat::SoftwareDeploymentGroup