Add Ceph cluster health validation on upgrade

This prevents the Ceph Mon upgrade script from starting if the
Ceph cluster is in an error state (HEALTH_ERR).

It also adds a parameter, IgnoreCephUpgradeWarnings, to ignore warning
states; this is useful when upgrading a cluster where the number of
healthy OSDs does not guarantee the desired replica size.

Closes-Bug: 1618533
Change-Id: I1beb8ad0812f19b1018ba19b5a9fc85fa132d7f7
This commit is contained in:
Giulio Fidente 2016-08-23 22:24:57 +02:00
parent e5a627e43a
commit 4ea34edb79
2 changed files with 32 additions and 4 deletions

View File

@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
exit 0
fi
# Gate the upgrade on cluster health. Only the first word printed by
# `ceph health` is kept in CEPH_STATUS.
CEPH_STATUS=$(ceph health | awk '{print $1}')
# Quoted so an empty status does not turn the test into a syntax error.
if [ "${CEPH_STATUS}" = HEALTH_ERR ]; then
    echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
    exit 1
fi

# Useful when upgrading with OSDs num < replica size
# Both "true" and "True" enable the flag; an unset flag is treated as false.
# NOTE(review): Heat presumably renders a boolean parameter as "True"/"False"
# when it is substituted into the script -- confirm against the template.
if [[ "${ignore_ceph_upgrade_warnings:-false}" != [Tt]rue ]]; then
    # The command string is single-quoted on purpose: the health query must be
    # expanded by the inner shell on every iteration. With double quotes the
    # outer shell expands it once and the loop can only end on the timeout.
    timeout 300 bash -c 'while [ "$(ceph health | awk "{print \$1}")" != HEALTH_OK ]; do
        echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
        sleep 30;
    done'
fi
MON_PID=$(pidof ceph-mon)
MON_ID=$(hostname -s)
@ -37,8 +52,6 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
echo WARNING: Ceph was not upgraded, restarting daemons
service ceph start mon.${MON_ID}
elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
echo INFO: Ceph was upgraded to Jewel
# RPM could own some of these but we can't take risks on the pre-existing files
for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
chown -R ceph:ceph $d
@ -54,9 +67,11 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
# Wait for daemon to be back in the quorum
timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
echo Waiting for mon.${MON_ID} to re-join quorum;
echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
sleep 10;
done"
echo INFO: Ceph was upgraded to Jewel
else
echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
exit 1

View File

@ -26,6 +26,10 @@ parameters:
constraints:
- allowed_values: ['auto', 'yes', 'no']
default: 'auto'
# Opt-in switch for the Ceph Mon upgrade script. Its value is substituted
# into the script preamble as ignore_ceph_upgrade_warnings; when the flag is
# set, the script skips its wait for the cluster to report HEALTH_OK.
IgnoreCephUpgradeWarnings:
type: boolean
default: false
description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
resources:
# TODO(jistr): for Mitaka->Newton upgrades and further we can use
@ -36,7 +40,16 @@ resources:
type: OS::Heat::SoftwareConfig
properties:
group: script
config: {get_file: major_upgrade_ceph_mon.sh}
config:
list_join:
- ''
- - str_replace:
template: |
#!/bin/bash
ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS'
params:
IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings}
- get_file: major_upgrade_ceph_mon.sh
CephMonUpgradeDeployment:
type: OS::Heat::SoftwareDeploymentGroup