Merge "Add checks in ansible upgrade tasks for CephMon and CephOSD"

This commit is contained in:
Jenkins 2017-02-24 18:05:08 +00:00 committed by Gerrit Code Review
commit ecfb298112
4 changed files with 78 additions and 23 deletions

View File

@ -18,10 +18,6 @@ parameters:
constraints:
- allowed_values: ['auto', 'yes', 'no']
default: 'auto'
IgnoreCephUpgradeWarnings:
type: boolean
default: false
description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
KeepSaharaServicesOnUpgrade:
type: boolean
default: true

View File

@ -59,6 +59,14 @@ parameters:
}
default: {}
type: json
CephValidationRetries:
type: number
default: 5
description: Number of retry attempts for Ceph validation
CephValidationDelay:
type: number
default: 10
description: Interval (in seconds) in between validation checks
MonitoringSubscriptionCephMon:
default: 'overcloud-ceph-mon'
type: string
@ -119,21 +127,32 @@ outputs:
# rolling upgrade of all osd nodes in step1
- name: Check status
tags: step0,validation
shell: ceph health | grep -qv HEALTH_ERR
# FIXME(shardy) I suspect we can use heat or ansible facts here instead?
- name: Get hostname
shell: ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
- name: Stop CephMon
tags: step0
shell: hostname -s
register: mon_id
- name: Stop Ceph Mon
service:
name: ceph-mon@{{ ansible_hostname }}
state: stopped
- name: Update Ceph packages
tags: step0
service: name=ceph-mon@{{mon_id.stdout}} pattern=ceph-mon state=stopped
- name: Update ceph packages
yum:
name: ceph-mon
state: latest
- name: Start CephMon
tags: step0
yum: name=ceph-mon state=latest
- name: Start ceph-mon service
tags: step0
service: name=ceph-mon@{{mon_id.stdout}} state=started
service:
name: ceph-mon@{{ ansible_hostname }}
state: started
# ceph-ansible
# https://github.com/ceph/ceph-ansible/blob/master/infrastructure-playbooks/rolling_update.yml#L149-L157
- name: Wait for the monitor to join the quorum...
tags: step0,ceph_quorum_validation
shell: |
ceph -s | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
register: ceph_quorum_nodecheck
until: ceph_quorum_nodecheck.rc == 0
retries: {get_param: CephValidationRetries}
delay: {get_param: CephValidationDelay}
- name: ceph osd crush tunables default
tags: step0
shell: ceph osd crush tunables default

View File

@ -21,6 +21,24 @@ parameters:
MonitoringSubscriptionCephOsd:
default: 'overcloud-ceph-osd'
type: string
CephValidationRetries:
type: number
default: 40
description: Number of retry attempts for Ceph validation
CephValidationDelay:
type: number
default: 30
description: Interval (in seconds) in between validation checks
IgnoreCephUpgradeWarnings:
type: boolean
default: false
description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
parameter_groups:
- label: deprecated
description: Do not use deprecated params, they will be removed.
parameters:
- IgnoreCephUpgradeWarnings
resources:
CephBase:
@ -66,17 +84,37 @@ outputs:
- name: ceph osd set noscrub
tags: step1
command: ceph osd set noscrub
- name: Stop Ceph OSD
- name: Stop CephOSD
tags: step1
service: name=ceph-osd@{{ item }} state=stopped
service:
name: ceph-osd@{{ item }}
state: stopped
with_items: "{{osd_ids.stdout.strip().split()}}"
- name: Update ceph OSD packages
- name: Update Ceph packages
tags: step1
yum: name=ceph-osd state=latest
- name: Start ceph-osd service
yum:
name: ceph-osd
state: latest
- name: Start CephOSD
tags: step1
service: name=ceph-osd@{{ item }} state=started
service:
name: ceph-osd@{{ item }}
state: started
with_items: "{{osd_ids.stdout.strip().split()}}"
# with awk we are meant to check that $2 and $4 are *the same*, but ($2==$4) evaluates to 1 when
# they are, which as an exit status means failure, so the check is inverted to produce a useful exit code
- name: Wait for clean pgs...
tags: step1,ceph_pgs_clean_validation
vars:
ignore_warnings: {get_param: IgnoreCephUpgradeWarnings}
shell: |
ceph pg stat | awk '{exit($2!=$4)}' && ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
register: ceph_pgs_healthcheck
until: ceph_pgs_healthcheck.rc == 0
retries: {get_param: CephValidationRetries}
delay: {get_param: CephValidationDelay}
when:
- not ignore_warnings
- name: ceph osd unset noout
tags: step1
command: ceph osd unset noout

View File

@ -87,4 +87,6 @@ outputs:
tags: step0,validation
- name: Stop RGW instance
tags: step1
service: name=ceph-radosgw@{{rgw_id.stdout}} state=stopped
service:
name: ceph-radosgw@{{rgw_id.stdout}}
state: stopped