
---

# Copyright (c) 2019 Wind River Systems, Inc.
# SPDX-License-Identifier: Apache-2.0
# This role is to restore the remaining data in the backup tarball
# during platform restore.
# These hieradata files were generated after the persist-config role was run.
# They will be re-generated when sysinv is restarted after the postgres db is
# restored, so the freshly generated copies are deleted here.
- name: Remove newly generated hieradata data
  file:
    path: "{{ item }}"
    state: absent
  with_items:
    - "{{ puppet_permdir }}/hieradata/{{ controller_floating_address|ipmath(1) }}.yaml"
    - "{{ puppet_permdir }}/hieradata/system.yaml"
    - "{{ puppet_permdir }}/hieradata/secure_system.yaml"
# Work around an Ansible quirk: the regex_replace filter is ignored when it is
# applied to variables inside the command module, so the stripped paths are
# computed once here and stored as facts for the tar commands that follow.
- name: Remove leading '/' from dir name
  set_fact:
    short_platform_conf_path: "{{ platform_conf_path | regex_replace('^\\/', '') }}"
    short_config_permdir: "{{ config_permdir | regex_replace('^\\/', '') }}"
# Stage the backed-up platform.conf. The --transform expression strips the
# directory prefix so the file lands directly in the staging directory.
- name: Extract platform.conf from the backup tarball
  command: >-
    tar -C {{ staging_dir }} -xpf {{ target_backup_dir }}/{{ backup_filename }} --transform='s,.*/,,'
    {{ short_platform_conf_path }}/platform.conf
  args:
    warn: false

# The INSTALL_UUID line of the currently installed platform.conf replaces the
# one that came from the backup.
- name: Search for the new INSTALL_UUID in /etc/platform/platform.conf
  shell: grep INSTALL_UUID {{ platform_conf_path }}/platform.conf
  register: result

- name: Replace INSTALL_UUID with the new one
  lineinfile:
    dest: "{{ staging_dir }}/platform.conf"
    regexp: 'INSTALL_UUID'
    line: "{{ result.stdout }}"

- name: Strip out entries that are host specific
  lineinfile:
    dest: "{{ staging_dir }}/platform.conf"
    regexp: "{{ item }}"
    state: absent
  with_items:
    - '^oam_interface='
    - '^cluster_host_interface='
    - '^UUID='

# grep exits non-zero when nothing matches; failed_when keeps that from
# failing the play, and the replacement below only runs when rc is 0.
- name: Search for the management_interface in /etc/platform/platform.conf
  shell: grep management_interface {{ platform_conf_path }}/platform.conf
  failed_when: false
  register: result

- name: Replace management_interface with the new one
  lineinfile:
    dest: "{{ staging_dir }}/platform.conf"
    regexp: '^management_interface='
    line: "{{ result.stdout }}"
  when: result.rc == 0

# Move the edited staging copy over the live platform.conf.
- name: Replace platform config file
  command: mv -f {{ staging_dir }}/platform.conf {{ platform_conf_path }}/platform.conf
# Restore resolv.conf and dnsmasq.
# Each tar call uses --transform to drop the archived directory prefix so the
# extracted files land directly in the directory given with -C.
- name: Extract resolv.conf from backup tarball
  command: >-
    tar -C /etc -xpf {{ target_backup_dir }}/{{ backup_filename }} --overwrite
    --transform='s,.*/,,' etc/resolv.conf
  args:
    warn: false

- name: Restore resolv.conf in config permdir (/opt/platform/config/...)
  command: >-
    tar -C {{ config_permdir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --overwrite --transform='s,.*/,,' '{{ short_config_permdir }}/resolv.conf'
  args:
    warn: false

- name: Restore dnsmaq in config permdir (/opt/platform/config/...)
  command: >-
    tar -C {{ config_permdir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --overwrite --transform='s,.*/,,' '{{ short_config_permdir }}/dnsmasq*'
  args:
    warn: false
# The tar member pattern must be relative, so the leading '/' is stripped from
# the pxe config directory before it is used in the command below.
- name: Remove leading '/' from directory name
  set_fact:
    short_pxe_config_permdir: "{{ pxe_config_permdir | regex_replace('^\\/', '') }}"

# Only archive members matching '*-*-*' under the pxe config directory are
# extracted; --transform flattens them into pxe_config_permdir.
- name: Restore boot files in pxelinux.cfg dir
  command: >-
    tar -C {{ pxe_config_permdir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --overwrite --transform='s,.*/,,' '{{ short_pxe_config_permdir }}/*-*-*'
  args:
    warn: false
# Restore the LDAP database: stage ldap.db, stop openldap, recreate an empty
# ldap directory, load the dump with slapadd, start openldap again and remove
# the staged dump.
- name: Extract ldap.db to staging directory
  command: >-
    tar -C {{ staging_dir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --transform='s,.*/,,' '*/ldap.db'
  args:
    warn: false

- name: Stop openldap service
  shell: "export SYSTEMCTL_SKIP_REDIRECT=1; /etc/init.d/openldap stop"

- name: Delete ldap directory
  file:
    path: "{{ ldap_permdir }}"
    state: absent

- name: Recreate ldap directory
  file:
    path: "{{ ldap_permdir }}"
    state: directory
    recurse: true
    owner: root
    group: root
    # Quoted so the mode is always interpreted as octal.
    mode: "0755"

- name: Restore ldap
  shell: slapadd -F /etc/openldap/schema -l {{ staging_dir }}/ldap.db

- name: Start openldap service
  shell: "export SYSTEMCTL_SKIP_REDIRECT=1; /etc/init.d/openldap start"

- name: Delete file from staging dir
  file:
    path: "{{ staging_dir }}/ldap.db"
    state: absent
# Archive members are stored without a leading '/', so they are extracted
# relative to '/' and end up at their original absolute locations.
- name: Restore home directory
  shell: tar -C / --overwrite -xpf {{ target_backup_dir }}/{{ backup_filename }} 'home/*'
  args:
    warn: false
  become_user: root

# One tar invocation per directory; the leading '/' is stripped from each
# directory so it matches the relative member names in the archive.
- name: Restore Helm charts, armada manifests and extension filesystem
  command: tar -C / --overwrite -xpf {{ target_backup_dir }}/{{ backup_filename }} {{ item }}
  args:
    warn: false
  become_user: root
  with_items:
    - "{{ helm_charts_permdir | regex_replace('^\\/', '') }}"
    - "{{ armada_permdir | regex_replace('^\\/', '') }}"
    - "{{ extension_permdir | regex_replace('^\\/', '') }}"
# Extract sysinv.conf.default from wherever it sits in the archive and place
# it directly in sysinv_config_permdir (the path prefix is stripped).
- name: Restore sysinv default configuration file
  command: >-
    tar -C {{ sysinv_config_permdir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --transform='s,.*/,,' '*/sysinv.conf.default'
  args:
    warn: false
# Can't store ceph crushmap at sysinv_config_permdir (/opt/platform/sysinv/)
# for AIO systems because when unlocking controller-0 for the first time,
# the crushmap is set through ceph puppet when /opt/platform is not mounted yet.
# So for AIO systems store the crushmap at /etc/sysinv.
- name: Set ceph crushmap directory to /etc/sysinv if it is AIO system
  set_fact:
    ceph_crushmap_dir: /etc/sysinv
  when: system_type == 'All-in-one'

- name: Set ceph crushmap directory to /opt/platform/sysinv if it is non-AIO system
  set_fact:
    ceph_crushmap_dir: "{{ sysinv_config_permdir }}"
  when: system_type != 'All-in-one'

# The crushmap backup is extracted into whichever directory was selected above.
- name: Restore ceph crush map
  command: >-
    tar -C {{ ceph_crushmap_dir }} -xpf {{ target_backup_dir }}/{{ backup_filename }}
    --transform='s,.*/,,' '*/crushmap.bin.backup'
  args:
    warn: false
# Need to remove osd info from the crushmap before it is loaded into ceph.
# When osds are created they will be inserted into the crushmap by ceph.
# TODO: There might be a better command to do this, like the rebuild option
# with the ceph-monstore-tool.
#
# The pipeline prints the crushmap tree, takes the last field of every line
# containing "osd", and runs crushtool --remove-item once per name, rewriting
# the crushmap file in place each time.
- name: Remove osds from the crushmap
  shell: >-
    crushtool -i {{ ceph_crushmap_dir }}/{{ crushmap_file }} --tree |
    awk /osd/'{print $NF}' |
    xargs -i crushtool -i {{ ceph_crushmap_dir }}/{{ crushmap_file }} --remove-item {}
    -o {{ ceph_crushmap_dir }}/{{ crushmap_file }}
- name: Remove leading '/' from patch-vault directory
  set_fact:
    short_patch_vault_permdir: "{{ patch_vault_permdir | regex_replace('^\\/', '') }}"

# The archive listing is grepped for 'patch-vault'. grep exits non-zero when
# nothing matches, so failed_when keeps the play going and the restore below
# is gated on the recorded return code.
- name: Look for patch-vault filesystem
  shell: "tar -tf {{ target_backup_dir }}/{{ backup_filename }} | grep 'patch-vault'"
  args:
    warn: false
  failed_when: false
  register: search_result

- name: Restore patch-vault filesystem
  command: >-
    tar -C / --overwrite -xpf {{ target_backup_dir }}/{{ backup_filename }}
    {{ short_patch_vault_permdir }}
  args:
    warn: false
  when: search_result.rc == 0
# TODO: Restore ceph_external when it is supported
# Ensure the Helm overrides directory exists and is owned by root.
- name: Create Helm overrides directory
  file:
    path: "{{ helm_overrides_permdir }}"
    state: directory
    recurse: true
    owner: root
    group: root
    # Quoted so the mode is always interpreted as octal.
    mode: "0755"
# Stop maintenance and the database-backed services, load the postgres dumps
# from the backup, reset host/application state in the sysinv database, then
# restart everything and wait for controller-0 to report "online".
- block:
    # The OCF_* variables are passed to the mtcAgent resource script through
    # the task environment.
    - name: Shutdown mtce
      command: /usr/lib/ocf/resource.d/platform/mtcAgent stop
      environment:
        OCF_ROOT: "/usr/lib/ocf"
        OCF_RESKEY_state: "active"

    - name: Stop services
      systemd:
        name: "{{ item }}"
        state: stopped
      with_items:
        - openstack-keystone
        - fminit
        - fm-api
        - sysinv-api
        - sysinv-conductor
        - sysinv-agent
        - openstack-barbican-api

    - name: Create staging directory for postgres data
      file:
        path: "{{ staging_dir }}/postgres"
        state: directory
        recurse: true
        owner: root
        group: root
        # Quoted so the mode is always interpreted as octal.
        mode: "0755"

    - name: Extract postgres db to staging directory
      command: >-
        tar -C {{ staging_dir }}/postgres -xpf {{ target_backup_dir }}/{{ backup_filename }}
        --transform='s,.*/,,' '*/*\.postgreSql\.*'
      args:
        warn: false

    # The target database passed to psql is the first dot-separated component
    # of each dump file name, so every item must be a file, not a directory.
    # NOTE(review): the six *.postgreSql.data names below were rebuilt from the
    # <db>.postgreSql.<kind> pattern and the services handled in this block;
    # confirm them against the contents of the backup tarball.
    - name: Restore postgres db
      shell: "psql -f {{ item }} {{ (item|basename).split('.')[0] }}"
      become_user: postgres
      with_items:
        - "{{ staging_dir }}/postgres/postgres.postgreSql.config"
        - "{{ staging_dir }}/postgres/postgres.postgreSql.data"
        - "{{ staging_dir }}/postgres/template1.postgreSql.data"
        - "{{ staging_dir }}/postgres/sysinv.postgreSql.data"
        - "{{ staging_dir }}/postgres/keystone.postgreSql.data"
        - "{{ staging_dir }}/postgres/fm.postgreSql.data"
        - "{{ staging_dir }}/postgres/barbican.postgreSql.data"

    - name: Remove postgres staging directory
      file:
        path: "{{ staging_dir }}/postgres"
        state: absent

    # Set all the hosts including controller-0 to locked/disabled/offline state.
    # After the services are restarted, mtce will update controller-0 to
    # locked/disabled/online state. Setting controller-0 to offline state now
    # will ensure that keystone, sysinv and mtcAgent are indeed in-service
    # after being restarted.
    - name: Set all the hosts to locked/disabled/offline state
      shell: >-
        psql -c "update i_host set administrative='locked', operational='disabled',
        availability='offline'" sysinv
      become_user: postgres
      when: wipe_ceph_osds|bool

    # When the OSDs are kept, storage hosts are left untouched by the update.
    - name: Set all the hosts, except storage nodes to locked/disabled/offline state
      shell: >-
        psql -c "update i_host set administrative='locked', operational='disabled',
        availability='offline' where personality!='storage'" sysinv
      become_user: postgres
      when: not wipe_ceph_osds|bool

    # Set platform-integ-apps to "uploaded" state, so that once ceph is up after
    # controller-0 is unlocked for the first time, the manifest will be applied.
    - name: Set platform-integ-apps to "uploaded" state
      shell: psql -c "update kube_app set status='uploaded' where name='platform-integ-apps'" sysinv
      become_user: postgres

    # If stx-openstack app is in "applied" state, set it to "uploaded" state to
    # avoid confusion. stx-openstack app will be brought up in stages after the
    # platform is restored.
    - name: Check stx-openstack app state
      shell: psql -c "select status from kube_app where name='stx-openstack'" sysinv
      become_user: postgres
      register: app_res

    - name: Set stx-openstack app to "uploaded" state
      shell: psql -c "update kube_app set status='uploaded' where name='stx-openstack'" sysinv
      become_user: postgres
      when: app_res.stdout is search('applied')

    - name: Restart services
      systemd:
        name: "{{ item }}"
        state: restarted
      with_items:
        - openstack-keystone
        - fminit
        - fm-api
        - sysinv-api
        - sysinv-conductor
        - sysinv-agent
        - openstack-barbican-api

    - name: Bring up Maintenance Agent
      command: /usr/lib/ocf/resource.d/platform/mtcAgent start
      environment:
        OCF_ROOT: "/usr/lib/ocf"
        OCF_RESKEY_state: "active"

    - name: Wait for 90 secs before check if services come up
      wait_for:
        timeout: 90

    # admin-keystone is always the very last to be ready,
    # so we just wait and check for admin-keystone to come up.
    - name: Make sure admin-keystone is ready
      shell: "ps -ef | grep admin-keystone | grep -v grep"
      register: result
      until: result.stdout.find("keystone") != -1
      retries: 6
      delay: 10

    # Run "system host-show" to verify that controller-0 is in
    # "online" state. This will ensure that keystone, sysinv and
    # mtcAgent are indeed in-service after being restarted.
    # failed_when is disabled so that exhausting the retries does not abort
    # the play; the outcome is reported by the task that follows.
    - name: Check controller-0 is in online state
      shell: source /etc/platform/openrc; system host-show controller-0 --column availability --format value
      register: check_online
      failed_when: false
      retries: 30
      delay: 10
      until: check_online.stdout == "online"

    - name: Inform user that restore_platform is not successful
      debug:
        msg: >-
          Platform restore was unsuccessful. Please refer to the system administration
          guide for next step.
      when: check_online.stdout != "online"
# Restore ceph-mon data
# The outer block runs only when controller-0 came up "online"; the inner
# block additionally requires that the ceph OSDs are not being wiped.
# NOTE(review): the attachment of the two `when` conditions to the two blocks
# was inferred from the order of the statements; confirm against the role.
- when: check_online.stdout == "online"
  block:
    - when: not wipe_ceph_osds|bool
      block:
        # Recover procedure for systems with storage nodes is different from
        # that of systems with controller storage:
        # - For controller storage we recover ceph-mon data by scanning OSDs.
        # - For systems with storage nodes we get ceph-mon data from storage-0
        #   ceph-mon that is already up and will not be reinstalled.
        - name: Check if setup has storage nodes
          shell: source /etc/platform/openrc; system host-list --format value --column personality
          register: node_personalities
          failed_when: false

        # Get system_mode after restore and create flag file to skip wiping OSDs
        - name: Retrieve system mode
          shell: source /etc/platform/platform.conf; echo $system_mode
          register: restore_system_mode_result

        - name: Fail if system mode is not defined
          fail:
            msg: "system_mode is missing in /etc/platform/platform.conf"
          when: restore_system_mode_result.stdout_lines|length == 0

        - name: Set system mode fact
          set_fact:
            restore_system_mode: "{{ restore_system_mode_result.stdout_lines[0] }}"

        - name: Create flag file in /etc/platform to skip wiping OSDs
          file:
            path: "{{ skip_ceph_osds_wipe_flag }}"
            state: touch
          when: restore_system_mode != 'simplex'

        # Recover ceph data for systems with controller storage
        - include_role:
            name: recover-ceph-data
          when: node_personalities.stdout is not search('storage')

        - name: Mark crushmap as restored
          file:
            path: "{{ sysinv_config_permdir }}/.crushmap_applied"
            owner: root
            group: root
            # Must be a quoted octal string: a bare 644 is read as a decimal
            # integer and yields the wrong permission bits.
            mode: "0644"
            state: touch

    - name: Inform user that restore_platform is run successfully
      debug:
        msg: >-
          Controller-0 is now online. The next step is to unlock this controller.
          Please refer to the system administration guide for more details.
# Remove temporary staging area used by the copy module
- name: Remove {{ ansible_remote_tmp }} directory
  file:
    path: "{{ ansible_remote_tmp }}"
    state: absent