diff --git a/playbookconfig/src/playbooks/restore_platform.yml b/playbookconfig/src/playbooks/restore_platform.yml
index ef0c9ab4b..26d4fa608 100644
--- a/playbookconfig/src/playbooks/restore_platform.yml
+++ b/playbookconfig/src/playbooks/restore_platform.yml
@@ -23,6 +23,7 @@
   gather_facts: no
 
   vars_files:
+    - host_vars/bootstrap/default.yml
     - host_vars/backup-restore/default.yml
 
   roles:
diff --git a/playbookconfig/src/playbooks/roles/recover-ceph-data/files/prepare_ceph_partitions.py b/playbookconfig/src/playbooks/roles/recover-ceph-data/files/prepare_ceph_partitions.py
new file mode 100644
index 000000000..9c432dd98
--- /dev/null
+++ b/playbookconfig/src/playbooks/roles/recover-ceph-data/files/prepare_ceph_partitions.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import os
+import json
+import subprocess
+
+from controllerconfig import openstack
+
+OSD_ROOT_DIR = "/var/lib/ceph/osd"
+MON_ROOT_DIR = "/var/lib/ceph/mon"
+CEPH_LV_PATH = '/dev/mapper/cgts--vg-ceph--mon--lv'
+CEPH_MON_VG = 'cgts-vg'
+CEPH_MON_LV = 'ceph-mon-lv'
+
+
+def get_ceph_mon_size():
+    with openstack.OpenStack() as client:
+        ceph_mons = client.sysinv.ceph_mon.list()
+        # All Ceph monitor partitions have the same size, so grab one and return.
+        if ceph_mons:
+            return ceph_mons[0].ceph_mon_gib
+        else:
+            raise Exception("No ceph monitor defined!")
+
+
+def mount_osds():
+    cmd_line = ['ceph-disk', 'list', '--format=json']
+
+    with open(os.devnull, "w") as fnull:
+        config_data = json.loads(subprocess.check_output(cmd_line,
+                                 stderr=fnull).decode('UTF-8'))
+
+    # Filter the Ceph OSD partitions of our cluster.
+    # The ceph data partition is always the first; it is part of the
+    # cluster called 'ceph' and it is of type 'data'.
+    ceph_parts = [e for e in config_data
+                  if 'partitions' in e and 'cluster' in e['partitions'][0] and
+                  e['partitions'][0]['cluster'] == 'ceph' and
+                  e['partitions'][0]['type'] == 'data']
+
+    for ceph_part in ceph_parts:
+        # e.g. 'path': '/dev/sdc1' => the OSD partition that should be mounted
+        disk_to_mount = ceph_part['partitions'][0]['path']
+        fs_type = ceph_part['partitions'][0]['fs_type']
+
+        # 'whoami' - the osd number (0,1...)
+        osd = ceph_part['partitions'][0]['whoami']
+        osd_dir = OSD_ROOT_DIR + "/ceph-" + osd
+
+        if not os.path.exists(osd_dir):
+            os.mkdir(osd_dir, 0o751)
+
+        # mount the osd in /var/lib/ceph/osd/ceph-(0,1..)
+        if not os.path.ismount(osd_dir):
+            print("Mounting partition {} to {}".format(disk_to_mount, osd_dir))
+            with open(os.devnull, "w") as fnull:
+                subprocess.check_output(["mount", "-t",
+                                         fs_type, disk_to_mount,
+                                         osd_dir], stderr=fnull)
+        else:
+            print("Directory {} already mounted, skipping.".format(osd_dir))
+
+
+def prepare_monitor():
+    ceph_mon_gib = get_ceph_mon_size()
+    with open(os.devnull, "w") as fnull:
+        # Cleaning up, in case of replay
+        try:
+            cmd = ["umount", MON_ROOT_DIR]
+            subprocess.check_output(cmd, stderr=fnull)
+            print("Unmounted ceph-mon at {}.".format(MON_ROOT_DIR))
+        except Exception:
+            pass
+
+        try:
+            cmd = ["lvremove", "{}/{}".format(CEPH_MON_VG, CEPH_MON_LV), "-y"]
+            subprocess.check_output(cmd, stderr=fnull)
+            print("Removed Ceph mon logical volume.")
+        except Exception:
+            pass
+
+        print("Creating ceph-mon lv with size {}GB.".format(ceph_mon_gib))
+        cmd = ['timeout', '20', 'lvcreate', '-n', CEPH_MON_LV, '-L',
+               '{}G'.format(ceph_mon_gib), CEPH_MON_VG]
+        subprocess.check_output(cmd, stderr=fnull)
+
+        print("Formatting ceph-mon lv as ext4.")
+        subprocess.check_output(["mkfs.ext4", CEPH_LV_PATH], stderr=fnull)
+
+        print("Mounting ceph-mon lv at {} to {}.".format(CEPH_LV_PATH, MON_ROOT_DIR))
+        if not os.path.exists(MON_ROOT_DIR):
+            os.mkdir(MON_ROOT_DIR, 0o751)
+        subprocess.check_output(['mount', "-t", "ext4", CEPH_LV_PATH, MON_ROOT_DIR],
+                                stderr=fnull)
+
+        print("Populating Ceph mon fs structure for controller-0.")
+        subprocess.check_output(["ceph-mon", "--mkfs", "-i", "controller-0"], stderr=fnull)
+
+
+if __name__ == '__main__':
+    mount_osds()
+    prepare_monitor()
diff --git a/playbookconfig/src/playbooks/roles/recover-ceph-data/files/recover_ceph_data.py b/playbookconfig/src/playbooks/roles/recover-ceph-data/files/recover_ceph_data.py
new file mode 100644
index 000000000..092fbf166
--- /dev/null
+++ b/playbookconfig/src/playbooks/roles/recover-ceph-data/files/recover_ceph_data.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+import os
+import shutil
+import subprocess
+
+
+def recover_ceph_data():
+    ceph_osds = '/var/lib/ceph/osd/'
+    mon_store = '/tmp/mon-store'
+
+    if os.path.exists(mon_store):
+        print("Removing {}.".format(mon_store))
+        shutil.rmtree(mon_store)
+
+    os.mkdir(mon_store, 0o751)
+
+    with open(os.devnull, "w") as fnull:
+        for osd in os.listdir(ceph_osds):
+            osd = ceph_osds + osd
+            print("Scanning {}.".format(osd))
+            subprocess.check_output(["ceph-objectstore-tool", "--data-path",
+                                     osd, "--op", "update-mon-db",
+                                     "--mon-store-path",
+                                     mon_store], stderr=fnull)
+        print("Rebuilding monitor data.")
+        subprocess.check_output(["ceph-monstore-tool", mon_store, "rebuild"],
+                                stderr=fnull)
+
+
+if __name__ == '__main__':
+    recover_ceph_data()
diff --git a/playbookconfig/src/playbooks/roles/recover-ceph-data/tasks/main.yml b/playbookconfig/src/playbooks/roles/recover-ceph-data/tasks/main.yml
new file mode 100644
index 000000000..da833187b
--- /dev/null
+++ b/playbookconfig/src/playbooks/roles/recover-ceph-data/tasks/main.yml
@@ -0,0 +1,93 @@
+---
+#
+# Copyright (c) 2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# ROLE DESCRIPTION:
+#   This role restores the Ceph Monitor data.
+
+- name: Restore CEPH Monitor data
+  block:
+    - name: Restore ceph.conf file
+      command: >-
+        tar -C / -xpf {{ target_backup_dir }}/{{ backup_filename }}
+        'etc/ceph/ceph.conf'
+      args:
+        warn: false
+
+    - name: Check if ceph-mon processes are running
+      command: pgrep ceph-mon
+      register: ceph_mons
+      failed_when: false
+
+    - name: Shut down Ceph monitor and OSDs if they are running
+      command: "{{ item }}"
+      with_items:
+        - /etc/init.d/ceph stop osd
+        - /etc/init.d/ceph stop mon
+      when: ceph_mons.stdout != ""
+
+    # On a partial restore, ceph-osds are not wiped.
+    # The 'ceph-disk list' command returns the list of Ceph OSDs.
+    # This task:
+    # 1. Parses the output of 'ceph-disk list', extracts the Ceph OSDs,
+    #    creates a folder under /var/lib/ceph/osd for every OSD and
+    #    mounts the OSD in there.
+    # 2. Gets the ceph-mon size from sysinv, creates ceph-mon-lv,
+    #    formats and mounts it under /var/lib/ceph/mon, then populates
+    #    the data structure for the controller-0 monitor so that Ceph
+    #    can be started.
+    - name: Mount ceph-osds and format ceph-mon
+      script: prepare_ceph_partitions.py
+      register: prepare_ceph_partitions
+
+    - debug: var=prepare_ceph_partitions.stdout_lines
+
+    - name: Bring up ceph-mon
+      command: /etc/init.d/ceph start mon
+
+    # Recover ceph data from every OSD with ceph-objectstore-tool
+    - name: Recover ceph-data
+      script: recover_ceph_data.py
+      register: ceph_data_out
+
+    - debug: var=ceph_data_out.stdout_lines
+
+    - name: Bring down ceph-mon
+      command: /etc/init.d/ceph stop mon
+
+    - name: Delete store.db file from ceph-mon
+      file:
+        path: /var/lib/ceph/mon/ceph-controller-0/store.db
+        state: absent
+
+    # Cannot use the 'copy' module with 'remote_src: yes' for
+    # recursive copy till Ansible 2.8.
+    - name: Restore store.db from mon-store
+      shell: cp -ar /tmp/mon-store/store.db /var/lib/ceph/mon/ceph-controller-0
+
+    - name: Bring up ceph Monitor and OSDs
+      command: /etc/init.d/ceph start
+
+    - name: Wait for ceph monitor to be up
+      shell: ceph -s
+      register: ceph_status
+      until: ceph_status.rc == 0
+      retries: 5
+      delay: 2
+
+    - name: Start Ceph manager
+      command: /usr/bin/ceph-mgr --cluster ceph --id controller-0
+
+    - name: Wait for ceph-mgr to detect Ceph's pools
+      shell: ceph -s
+      register: result
+      until: "'0 pools' not in result.stdout"
+      retries: 30
+      delay: 10
+
+    - name: Restart ceph one more time to pick up the latest changes
+      command: /etc/init.d/ceph restart
+
+  become: yes
+  become_user: root
diff --git a/playbookconfig/src/playbooks/roles/restore-platform/restore-more-data/tasks/main.yml b/playbookconfig/src/playbooks/roles/restore-platform/restore-more-data/tasks/main.yml
index 54b569862..b964c15ba 100644
--- a/playbookconfig/src/playbooks/roles/restore-platform/restore-more-data/tasks/main.yml
+++ b/playbookconfig/src/playbooks/roles/restore-platform/restore-more-data/tasks/main.yml
@@ -338,21 +338,28 @@
   retries: 30
   delay: 10
 
-- name: Remove {{ ansible_remote_tmp }} directory
-  file:
-    path: "{{ ansible_remote_tmp }}"
-    state: absent
-
-- name: Inform user that restore_platform is run successfully
-  debug:
-    msg: >-
-      Controller-0 is now online. The next step is to unlock this controller.
-      Please refer to the system administration guide for more details.
-  when: check_online.rc == 0
-
 - name: Inform user that restore_platform is not successful
   debug:
     msg: >-
       Platform restore was unsuccessful. Please refer to the system
       administration guide for next step.
   when: check_online.rc != 0
+
+# Restore ceph-mon data
+- block:
+    - include_role:
+        name: recover-ceph-data
+  when: not wipe_ceph_osds|bool
+
+- name: Inform user that restore_platform is run successfully
+  debug:
+    msg: >-
+      Controller-0 is now online. The next step is to unlock this controller.
+      Please refer to the system administration guide for more details.
+  when: check_online.rc == 0
+
+# Remove temporary staging area used by the copy module
+- name: Remove {{ ansible_remote_tmp }} directory
+  file:
+    path: "{{ ansible_remote_tmp }}"
+    state: absent
diff --git a/playbookconfig/src/playbooks/test/tc_recover_ceph_data.yml b/playbookconfig/src/playbooks/test/tc_recover_ceph_data.yml
new file mode 100644
index 000000000..001aed94d
--- /dev/null
+++ b/playbookconfig/src/playbooks/test/tc_recover_ceph_data.yml
@@ -0,0 +1,34 @@
+---
+#
+# Copyright (c) 2019 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# The purpose of this playbook is to allow easy testing of the Ceph recovery
+# process. To run it, make sure the normal platform restore playbook was
+# executed with wipe_ceph_osds=false. Then run this playbook as the sysadmin
+# user with the same parameters as the platform restore. E.g.: ansible-playbook
+# /usr/share/ansible/stx-ansible/playbooks/bootstrap/tc_recover_ceph_data.yml
+# -e "wipe_ceph_osds=false ansible_become_pass= admin_password=
+# backup_filename="
+
+- hosts: localhost
+  gather_facts: no
+
+  vars_files:
+    - host_vars/default.yml
+
+  pre_tasks:
+    - name: Fail if backup_filename is not defined or not set
+      fail:
+        msg: "Mandatory configuration parameter backup_filename is not defined or not set."
+      when: backup_filename is not defined or backup_filename is none
+
+    # Put the backup tarball in /scratch
+    - name: Set staging and target backup dirs
+      set_fact:
+        staging_dir: /scratch
+        target_backup_dir: /scratch
+
+  roles:
+    - { role: recover-ceph-data, become: yes }
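
Taken together, prepare_ceph_partitions.py and recover_ceph_data.py mount every Ceph OSD data partition and then rebuild the monitor store from the copies held by the OSDs. Below is a minimal dry-run sketch, not part of the change above, that prints the equivalent command sequence for the current node: it only executes the read-only 'ceph-disk list --format=json' query already used by prepare_ceph_partitions.py and assumes ceph-disk is available, as it is on a restored controller-0. The helper name and constants are illustrative only.

#!/usr/bin/python

import json
import subprocess

OSD_ROOT_DIR = "/var/lib/ceph/osd"
MON_STORE = "/tmp/mon-store"


def osd_data_partitions():
    # Same query and filter as prepare_ceph_partitions.py: keep entries whose
    # first partition belongs to the 'ceph' cluster and is of type 'data'.
    output = subprocess.check_output(
        ['ceph-disk', 'list', '--format=json']).decode('UTF-8')
    return [e['partitions'][0] for e in json.loads(output)
            if 'partitions' in e and 'cluster' in e['partitions'][0] and
            e['partitions'][0]['cluster'] == 'ceph' and
            e['partitions'][0]['type'] == 'data']


if __name__ == '__main__':
    for part in osd_data_partitions():
        osd_dir = "{}/ceph-{}".format(OSD_ROOT_DIR, part['whoami'])
        # Mount step performed by prepare_ceph_partitions.py
        print("mount -t {} {} {}".format(part['fs_type'], part['path'], osd_dir))
        # Per-OSD monitor DB update performed by recover_ceph_data.py
        print("ceph-objectstore-tool --data-path {} --op update-mon-db "
              "--mon-store-path {}".format(osd_dir, MON_STORE))
    # Final rebuild of the monitor store, also from recover_ceph_data.py
    print("ceph-monstore-tool {} rebuild".format(MON_STORE))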