B&R: Recover Ceph monitor data at restore on AIO-SX
This commit implements the recovery procedure for Ceph monitor data as the last step of the restore procedure. The procedure involves the following steps:

1. Recover ceph.conf from the backup tarball.
2. Create a directory for every Ceph OSD under /var/lib/ceph/osd and mount the OSDs there by parsing the 'ceph-disk list' output, e.g.:

       ceph-disk list 2>/dev/null
       /dev/sdc :
        /dev/sdc1 ceph data, active, cluster ceph, osd.1, osd uuid 7ce96cba-ee43-4616-966c-652e1d7da80a, journal /dev/sdc2
        /dev/sdc2 ceph journal, for /dev/sdc1

3. Create a ceph-mon logical volume (ceph-mon-lv) with 'lvcreate'.
4. Format ceph-mon-lv with an ext4 file system.
5. Mount ceph-mon-lv at /var/lib/ceph/mon.
6. Create a ceph-mon fs with: ceph-mon --mkfs -i controller-0
7. Start ceph-mon.
8. Collect data from the OSDs (store.db file) using 'ceph-objectstore-tool' to create a new store.db.
9. Stop ceph-mon.
10. Copy the newly created store.db over the one in /var/lib/ceph/mon.
11. Start ceph-mon, ceph-mgr and the OSDs.

Co-Authored-By: Elena Taivan <elena.taivan@windriver.com>
Change-Id: I96c1c749dfa5a2ce92bce12469ac6ec5e6051e9a
Story: 2004761
Task: 36200
Depends-On: Id56789db11c1fb180608975a962baf19514d8da6
Signed-off-by: Ovidiu Poncea <ovidiu.poncea@windriver.com>
parent 5204feccbd
commit 038ef7ff35
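For reference, step 2 above is implemented by parsing the machine-readable form of the same command, 'ceph-disk list --format=json'. A rough sketch of the structure that prepare_ceph_partitions.py (added below) expects from it, with illustrative values matching the example output in the commit message (the fs_type value is an assumption):

    # Abridged, illustrative shape of `ceph-disk list --format=json` output;
    # only the fields read by mount_osds() are shown.
    config_data = [
        {
            "path": "/dev/sdc",
            "partitions": [
                {
                    "type": "data",      # the ceph data partition comes first
                    "cluster": "ceph",
                    "whoami": "1",       # OSD id as a string => osd.1
                    "path": "/dev/sdc1",
                    "fs_type": "xfs",    # assumed; whatever the OSD was built with
                },
            ],
        },
    ]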
@ -22,6 +22,7 @@
   gather_facts: no
 
   vars_files:
     - host_vars/bootstrap/default.yml
+    - host_vars/backup-restore/default.yml
 
   roles:
@ -0,0 +1,107 @@ (new file: prepare_ceph_partitions.py)
#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import os
import json
import subprocess

from controllerconfig import openstack

OSD_ROOT_DIR = "/var/lib/ceph/osd"
MON_ROOT_DIR = "/var/lib/ceph/mon"
CEPH_LV_PATH = '/dev/mapper/cgts--vg-ceph--mon--lv'
CEPH_MON_VG = 'cgts-vg'
CEPH_MON_LV = 'ceph-mon-lv'


def get_ceph_mon_size():
    with openstack.OpenStack() as client:
        ceph_mons = client.sysinv.ceph_mon.list()
        # All Ceph monitor partitions have the same size, so grab one and return.
        if ceph_mons:
            return ceph_mons[0].ceph_mon_gib
        else:
            raise Exception("No Ceph monitor defined!")


def mount_osds():
    cmd_line = ['ceph-disk', 'list', '--format=json']

    with open(os.devnull, "w") as fnull:
        config_data = json.loads(subprocess.check_output(cmd_line,
                                 stderr=fnull).decode('UTF-8'))

    # Filter the Ceph OSD partitions of our cluster.
    # The ceph data partition is always the first; it is part of the
    # cluster called 'ceph' and it is of type 'data'.
    ceph_parts = [e for e in config_data
                  if 'partitions' in e and 'cluster' in e['partitions'][0] and
                  e['partitions'][0]['cluster'] == 'ceph' and
                  e['partitions'][0]['type'] == 'data']

    for ceph_part in ceph_parts:
        # e.g. 'path': '/dev/sdc1' => the OSD partition that should be mounted
        disk_to_mount = ceph_part['partitions'][0]['path']
        fs_type = ceph_part['partitions'][0]['fs_type']

        # 'whoami' is the OSD number (0, 1, ...)
        osd = ceph_part['partitions'][0]['whoami']
        osd_dir = OSD_ROOT_DIR + "/ceph-" + osd

        if not os.path.exists(osd_dir):
            os.mkdir(osd_dir, 0o751)

        # Mount the OSD at /var/lib/ceph/osd/ceph-(0,1,...)
        if not os.path.ismount(osd_dir):
            print("Mounting partition {} to {}".format(disk_to_mount, osd_dir))
            with open(os.devnull, "w") as fnull:
                subprocess.check_output(["mount", "-t",
                                         fs_type, disk_to_mount,
                                         osd_dir], stderr=fnull)
        else:
            print("Directory {} already mounted, skipping.".format(osd_dir))


def prepare_monitor():
    ceph_mon_gib = get_ceph_mon_size()
    with open(os.devnull, "w") as fnull:
        # Clean up, in case of replay.
        try:
            cmd = ["umount", MON_ROOT_DIR]
            subprocess.check_output(cmd, stderr=fnull)
            print("Unmounted ceph-mon at {}.".format(MON_ROOT_DIR))
        except Exception:
            pass

        try:
            cmd = ["lvremove", "{}/{}".format(CEPH_MON_VG, CEPH_MON_LV), "-y"]
            subprocess.check_output(cmd, stderr=fnull)
            print("Removed Ceph mon logical volume.")
        except Exception:
            pass

        print("Creating ceph-mon lv with size {}GB.".format(ceph_mon_gib))
        cmd = ['timeout', '20', 'lvcreate', '-n', CEPH_MON_LV, '-L',
               '{}G'.format(ceph_mon_gib), CEPH_MON_VG]
        subprocess.check_output(cmd, stderr=fnull)

        print("Formatting ceph-mon lv as ext4.")
        subprocess.check_output(["mkfs.ext4", CEPH_LV_PATH], stderr=fnull)

        print("Mounting ceph-mon lv {} at {}.".format(CEPH_LV_PATH, MON_ROOT_DIR))
        if not os.path.exists(MON_ROOT_DIR):
            os.mkdir(MON_ROOT_DIR, 0o751)
        subprocess.check_output(['mount', "-t", "ext4", CEPH_LV_PATH, MON_ROOT_DIR],
                                stderr=fnull)

        print("Populating Ceph mon fs structure for controller-0.")
        subprocess.check_output(["ceph-mon", "--mkfs", "-i", "controller-0"], stderr=fnull)


if __name__ == '__main__':
    mount_osds()
    prepare_monitor()
@ -0,0 +1,37 @@ (new file: recover_ceph_data.py)
#!/usr/bin/python
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import os
import shutil
import subprocess


def recover_ceph_data():
    ceph_osds = '/var/lib/ceph/osd/'
    mon_store = '/tmp/mon-store'

    if os.path.exists(mon_store):
        print("Removing {}.".format(mon_store))
        shutil.rmtree(mon_store)

    os.mkdir(mon_store, 0o751)

    with open(os.devnull, "w") as fnull:
        # Gather the cluster map data from every OSD into the temporary mon store.
        for osd in os.listdir(ceph_osds):
            osd = ceph_osds + osd
            print("Scanning {}.".format(osd))
            subprocess.check_output(["ceph-objectstore-tool", "--data-path",
                                     osd, "--op", "update-mon-db",
                                     "--mon-store-path",
                                     mon_store], stderr=fnull)
        print("Rebuilding monitor data.")
        subprocess.check_output(["ceph-monstore-tool", mon_store, "rebuild"],
                                stderr=fnull)


if __name__ == '__main__':
    recover_ceph_data()
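For context, recover_ceph_data.py automates the usual Ceph monitor-store rebuild flow; per OSD it is roughly equivalent to running the following commands by hand (a sketch using the paths from the scripts above):

    ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-0 \
        --op update-mon-db --mon-store-path /tmp/mon-store
    ceph-monstore-tool /tmp/mon-store rebuild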
@ -0,0 +1,93 @@ (new file: recover-ceph-data role tasks)
---
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# ROLE DESCRIPTION:
#   This role restores the Ceph monitor data.

- name: Restore CEPH Monitor data
  block:
    - name: Restore ceph.conf file
      command: >-
        tar -C / -xpf {{ target_backup_dir }}/{{ backup_filename }}
        'etc/ceph/ceph.conf'
      args:
        warn: false

    - name: Check if ceph-mon processes are running
      command: pgrep ceph-mon
      register: ceph_mons
      failed_when: false

    - name: Shut down Ceph monitor and OSDs if they are running
      command: "{{ item }}"
      with_items:
        - /etc/init.d/ceph stop osd
        - /etc/init.d/ceph stop mon
      when: ceph_mons.stdout != ""

    # On a partial restore the Ceph OSDs are not wiped, and the
    # 'ceph-disk list' command returns the list of Ceph OSDs.
    # This task:
    # 1. parses the output of 'ceph-disk list', extracts the Ceph OSDs,
    #    creates a directory for every Ceph OSD under /var/lib/ceph/osd
    #    and mounts each OSD in there;
    # 2. gets the ceph-mon size from sysinv, creates ceph-mon-lv,
    #    formats and mounts it under /var/lib/ceph/mon, then populates
    #    the data structure for the controller-0 monitor so that Ceph
    #    can be started.
    - name: Mount ceph-osds and format ceph-mon
      script: prepare_ceph_partitions.py
      register: prepare_ceph_partitions

    - debug: var=prepare_ceph_partitions.stdout_lines

    - name: Bring up ceph-mon
      command: /etc/init.d/ceph start mon

    # Recover Ceph data from every OSD with ceph-objectstore-tool.
    - name: Recover ceph-data
      script: recover_ceph_data.py
      register: ceph_data_out

    - debug: var=ceph_data_out.stdout_lines

    - name: Bring down ceph-mon
      command: /etc/init.d/ceph stop mon

    - name: Delete store.db file from ceph-mon
      file:
        path: /var/lib/ceph/mon/ceph-controller-0/store.db
        state: absent

    # Cannot use the 'copy' module with 'remote_src: yes' for a
    # recursive copy until Ansible 2.8.
    - name: Restore store.db from mon-store
      shell: cp -ar /tmp/mon-store/store.db /var/lib/ceph/mon/ceph-controller-0
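    # For reference: on Ansible >= 2.8 the same copy could be done with the
    # copy module natively. A sketch of the equivalent task (an assumption,
    # not exercised here, since this playbook must also run on older Ansible):
    #
    # - name: Restore store.db from mon-store
    #   copy:
    #     src: /tmp/mon-store/store.db
    #     dest: /var/lib/ceph/mon/ceph-controller-0
    #     remote_src: yes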

    - name: Bring up Ceph monitor and OSDs
      command: /etc/init.d/ceph start

    - name: Wait for ceph monitor to be up
      shell: ceph -s
      register: ceph_status
      until: ceph_status.rc == 0
      retries: 5
      delay: 2

    # start ceph-mgr
    - name: Start Ceph manager
      command: /usr/bin/ceph-mgr --cluster ceph --id controller-0

    - name: Wait for ceph-mgr to detect Ceph's pools
      shell: ceph -s
      register: result
      until: "'0 pools' not in result.stdout"
      retries: 30
      delay: 10

    - name: Restart ceph one more time to pick up the latest changes
      command: /etc/init.d/ceph restart

  become: yes
  become_user: root
@ -338,21 +338,28 @@
       retries: 30
       delay: 10
 
-    - name: Remove {{ ansible_remote_tmp }} directory
-      file:
-        path: "{{ ansible_remote_tmp }}"
-        state: absent
-
-    - name: Inform user that restore_platform is run successfully
-      debug:
-        msg: >-
-          Controller-0 is now online. The next step is to unlock this controller.
-          Please refer to the system administration guide for more details.
-      when: check_online.rc == 0
-
     - name: Inform user that restore_platform is not successful
       debug:
         msg: >-
           Platform restore was unsuccessful. Please refer to the system administration
           guide for next step.
       when: check_online.rc != 0
+
+    # Restore ceph-mon data
+    - block:
+        - include_role:
+            name: recover-ceph-data
+      when: not wipe_ceph_osds|bool
+
+    - name: Inform user that restore_platform is run successfully
+      debug:
+        msg: >-
+          Controller-0 is now online. The next step is to unlock this controller.
+          Please refer to the system administration guide for more details.
+      when: check_online.rc == 0
+
+    # Remove temporary staging area used by the copy module
+    - name: Remove {{ ansible_remote_tmp }} directory
+      file:
+        path: "{{ ansible_remote_tmp }}"
+        state: absent
playbookconfig/src/playbooks/test/tc_recover_ceph_data.yml (new file, 34 lines)
@ -0,0 +1,34 @@
---
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# The role of this playbook is to allow easy testing of the Ceph recovery
# process. To run it, make sure that the normal platform restore playbook
# was executed with wipe_ceph_osds=false. Then run this playbook as the
# sysadmin user with the same params as the platform restore. E.g.:
#   ansible-playbook
#   /usr/share/ansible/stx-ansible/playbooks/bootstrap/tc_recover_ceph_data.yml
#   -e "wipe_ceph_osds=false ansible_become_pass=<password> admin_password=<password>
#   backup_filename=<backup.tgz>"

- hosts: localhost
  gather_facts: no

  vars_files:
    - host_vars/default.yml

  pre_tasks:
    - name: Fail if backup_filename is not defined or set
      fail:
        msg: "Mandatory configuration parameter backup_filename is not defined or set."
      when: backup_filename is not defined or backup_filename is none

    # Put the backup tarball in /scratch
    - name: Set staging and target backup dirs
      set_fact:
        staging_dir: /scratch
        target_backup_dir: /scratch

  roles:
    - { role: recover-ceph-data, become: yes }