Improve and fix ceph data restore

When restoring with the flag wipe_ceph_osds=false, in some cases
the kube-cephfs filesystem is not recovered from the backup. The
recovery steps were made more robust and additional debugging
information was added.
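
For reference only (not part of this change), a quick way to check
whether the kube-cephfs filesystem came back after such a restore;
the filesystem name is the platform default quoted above and may
differ on a given deployment:

    # List the Ceph filesystems; kube-cephfs should be present with
    # its metadata and data pools once the recovery succeeded
    ceph fs ls

    # MDS and pool status of the recovered filesystem
    ceph fs status kube-cephfs

    # Overall cluster health, useful when collecting debugging info
    ceph -s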

When restoring with the flag wipe_ceph_osds=true, the default
pools were not being recreated, because the platform-integ-apps
application does not recreate them when it is simply reapplied.
To solve this, the application is now removed during the restore
and is automatically applied again by the conductor after the
restore is complete.
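
As an illustration of the mechanism (the exact steps are in the
playbooks below), the behaviour is equivalent to removing the
application manually and letting the conductor reapply it once the
ceph backend is configured again:

    # Remove the application so its charts are reinstalled from scratch
    source /etc/platform/openrc
    system application-remove platform-integ-apps

    # After the restore completes and the host is unlocked, the conductor
    # reapplies the app automatically; this only checks its current state
    system application-list | grep platform-integ-apps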

Test-Plan:
  PASS: B&R AIO-SX without ceph configured
  PASS: Optimized B&R AIO-SX without ceph configured
  PASS: B&R AIO-SX with wipe_ceph_osds=true
  PASS: B&R AIO-SX with wipe_ceph_osds=false
  PASS: Optimized B&R AIO-SX with wipe_ceph_osds=true
  PASS: Optimized B&R AIO-SX with wipe_ceph_osds=false
  PASS: Upgrade AIO-SX from stx-7.0 to stx-8.0
  PASS: Upgrade AIO-SX from stx-6.0 to stx-8.0

Closes-Bug: 2016328

Change-Id: Ie09c4bf9c74b2e0bf0dde9e7f41cf85002177525
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>

View File

@@ -1,6 +1,6 @@
---
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -157,3 +157,14 @@
state: absent
when: not wipe_ceph_osds|bool and ceph_backend.rc == 0
# The platform-integ-apps application is removed when the flag
# wipe_ceph_osds is set to true because it needs to be reapplied,
# but helm will not reapply the charts if the version is not bumped.
#
# The application is removed here so that it can be applied again after
# the host is unlocked and ceph is correctly configured after the wipe.
# The app is applied automatically by the conductor when a ceph backend
# is configured.
- name: Remove platform-integ-apps application when asked to wipe ceph osd disks
  shell: source /etc/platform/openrc; system application-remove platform-integ-apps
  when: wipe_ceph_osds|bool and ceph_backend.rc == 0

View File

@@ -1,6 +1,6 @@
---
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -45,6 +45,13 @@
shell: psql -c "update kube_app set status='restore-requested' where status='applied'" sysinv
become_user: postgres
- name: Set platform-integ-apps to applied state when wiping osd disks, to remove the app later
  shell: >-
    psql -c "update kube_app set status='applied' where name='platform-integ-apps'
    and status='restore-requested'" sysinv
  become_user: postgres
  when: wipe_ceph_osds|bool
- name: Bringup flock services
  systemd:
    name: "{{ item }}"

View File

@@ -1,6 +1,6 @@
#!/bin/sh
#
# Copyright (c) 2021 Wind River Systems, Inc.
# Copyright (c) 2021-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -31,12 +31,18 @@ set -x
# Check if the filesystem for the system RWX provisioner is present
ceph fs ls | grep ${FS_NAME}
if [ $? -ne 0 ]; then
# If we have existing metadata/data pools, use them
# Use existing metadata/data pools to recover cephfs
ceph fs new ${FS_NAME} ${METADATA_POOL_NAME} ${DATA_POOL_NAME} --force
# Reset the filesystem and journal
# Recover MDS state from filesystem
ceph fs reset ${FS_NAME} --yes-i-really-mean-it
# Try to recover from some common errors
cephfs-journal-tool --rank=${FS_NAME}:0 event recover_dentries summary
cephfs-journal-tool --rank=${FS_NAME}:0 journal reset
cephfs-table-tool ${FS_NAME}:0 reset session
cephfs-table-tool ${FS_NAME}:0 reset snap
cephfs-table-tool ${FS_NAME}:0 reset inode
fi
# Start the Ceph MDS
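
The journal and table resets above discard potentially inconsistent
MDS state so the existing metadata and data pools can be reattached.
A minimal way to verify the result once the MDS is started, reusing
the script's FS_NAME variable (not part of the change):

    # The filesystem should be listed again with its original pools
    ceph fs ls | grep ${FS_NAME}

    # An MDS should come up and reach the active state for the filesystem
    ceph mds stat
    ceph fs status ${FS_NAME}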

View File

@@ -1,6 +1,6 @@
---
#
# Copyright (c) 2019-2022 Wind River Systems, Inc.
# Copyright (c) 2019-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -310,10 +310,18 @@
script: recover_cephfs.sh
register: cephfs_recovery_out
- name: Create ceph.client.guest.keyring to allow ceph mount again
  command: touch /etc/ceph/ceph.client.guest.keyring
- name: Display cephfs recovery script stdout output
  debug:
    var: cephfs_recovery_out.stdout_lines
- debug: var=cephfs_recovery_out.stdout_lines
- name: Display cephfs recovery script stderr output
  debug:
    var: cephfs_recovery_out.stderr_lines
- name: Create ceph.client.guest.keyring to allow ceph mount again
  file:
    path: "/etc/ceph/ceph.client.guest.keyring"
    state: touch
- name: Restart ceph one more time to pick latest changes
  command: /etc/init.d/ceph restart

View File

@@ -1,6 +1,6 @@
---
#
# Copyright (c) 2019-2022 Wind River Systems, Inc.
# Copyright (c) 2019-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -493,6 +493,13 @@
shell: psql -c "update kube_app set status='restore-requested' where status='applied'" sysinv
become_user: postgres
- name: Set platform-integ-apps to applied state when wiping osd disks
  shell: >-
    psql -c "update kube_app set status='applied' where name='platform-integ-apps'
    and status='restore-requested'" sysinv
  become_user: postgres
  when: wipe_ceph_osds|bool and ceph_backend.rc == 0
- name: Restart services
  systemd:
    name: "{{ item }}"
@@ -599,6 +606,17 @@
when: not wipe_ceph_osds|bool
# The platform-integ-apps application is removed when the flag
# wipe_ceph_osds is set to true because it needs to be reapplied,
# but helm will not reapply the charts if the version is not bumped.
#
# The application is removed here so that it can be applied again after
# the host is unlocked and ceph is correctly configured after the wipe.
# The app is applied automatically by the conductor when a ceph backend
# is configured.
- name: Remove platform-integ-apps
  shell: source /etc/platform/openrc; system application-remove platform-integ-apps
  when: wipe_ceph_osds|bool
when: check_online.stdout == "online" and ceph_backend.rc == 0
- name: Apply kube-apiserver parameters