Fix rook-ceph recovery (B&R)

In some scenarios, after recovering rook-ceph, the ceph status
remained as if the cluster was empty:

  cluster:
    id:     1c6688fc-58b8-4055-abc7-c1c730a88608
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum a (age 4m)
    mgr: no daemons active
    osd: 0 osds: 0 up, 0 in

  data:
    pools:   0 pools, 0 pgs
    objects: 0 objects, 0 B
    usage:   0 B used, 0 B / 0 B avail
    pgs:

After analyzing the code and logs, it was possible to observe
changes caused by race conditions in rook-ceph secrets and
configmaps. To prevent this from happening, the rook-ceph
operator is paused while the recovery process patches secrets
and configmaps.

Furthermore, it was observed that when the OSD and the monitor
are not on the same host, the mgr and mds ended up on the host
where the recovery was performed (which has only an OSD). To fix
this, the respective deployments are restarted, thus placing
them on the correct hosts.

Finally, the logs of all jobs/pods used during recovery are
saved to /var/log/ceph/restore.log on the target host, which
is where the recovery job runs.

Test Plan:
- PASS: Configure Standard with OSDs on controller-1
        and worker-0, and monitors on the other hosts.
- PASS: Perform B&R
- PASS: After restore, unlock controller-1 only after
        all hosts have been unlocked.
- PASS: Check ceph status after recovery

Closes-Bug: 2086473

Change-Id: If2cd7186eaa4510ce55bcf9b10a9536d76e366c4
Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
This commit is contained in:
Erickson Silva de Oliveira 2024-10-31 20:45:41 -03:00
parent 20f2f20b01
commit 75b5bc17e3
2 changed files with 183 additions and 91 deletions

View File

@ -16,7 +16,7 @@ from string import Template
CEPH_TMP = '/tmp/ceph'
MONMAP_FILENAME = 'monmap.bin'
RECOVERY_JOB_RESOURCE_FILENAME = 'recovery-job.yaml'
KEYRING_JOB_RESOURCE_FILENAME = 'keyring-update-{}-job.yaml'
UPDATE_OSD_KEYRING_JOB_RESOURCE_FILENAME = 'update-osd-keyring-{}-job.yaml'
MOVE_MON_JOB_RESOURCE_FILENAME = 'move-mon-{}-job.yaml'
CLEAN_MON_JOB_RESOURCE_FILENAME = 'clean-mon-{}-job.yaml'
MONITOR_JOB_RESOURCE_FILENAME = 'monitor-job.yaml'
@ -92,17 +92,18 @@ def recover_cluster():
if structure == "ONLY_OSD":
move_mon_job_template = get_move_mon_job_template()
move_mon_job_resource = move_mon_job_template.safe_substitute({'TARGET_HOSTNAME': target_mon_hostname,
'TARGET_MON': target_mon})
'TARGET_MON': target_mon,
'RECOVERY_HOSTNAME': target_hostname})
move_mon_job_resource_path = create_job_resource(move_mon_job_resource,
MOVE_MON_JOB_RESOURCE_FILENAME.format(target_mon))
subprocess.run(["kubectl", "apply", "-f", move_mon_job_resource_path])
for hostname in hosts_to_update_keyring:
keyring_job_template = get_keyring_job_template()
keyring_job_resource = keyring_job_template.safe_substitute({'TARGET_HOSTNAME': hostname})
keyring_job_resource_path = create_job_resource(keyring_job_resource,
KEYRING_JOB_RESOURCE_FILENAME.format(hostname))
subprocess.run(["kubectl", "apply", "-f", keyring_job_resource_path])
update_osd_keyring_job_template = get_update_osd_keyring_job_template()
update_osd_keyring_job_resource = update_osd_keyring_job_template.safe_substitute({'TARGET_HOSTNAME': hostname})
update_osd_keyring_job_resource_path = create_job_resource(update_osd_keyring_job_resource,
UPDATE_OSD_KEYRING_JOB_RESOURCE_FILENAME.format(hostname))
subprocess.run(["kubectl", "apply", "-f", update_osd_keyring_job_resource_path])
for name, hostname in mons_to_clean.items():
clean_mon_job_template = get_clean_mon_job_template()
@ -112,11 +113,13 @@ def recover_cluster():
CLEAN_MON_JOB_RESOURCE_FILENAME.format(name))
subprocess.run(["kubectl", "apply", "-f", clean_mon_job_resource_path])
monitor_job_template = get_monitor_job_template()
monitor_job_resource = monitor_job_template.safe_substitute({'MON_FLOAT_ENABLED': mon_float})
monitor_job_resource_path = create_job_resource(monitor_job_resource,
MONITOR_JOB_RESOURCE_FILENAME)
subprocess.run(["kubectl", "apply", "-f", monitor_job_resource_path])
monitor_job_template = get_monitor_job_template()
monitor_job_resource = monitor_job_template.safe_substitute({'TARGET_HOSTNAME': target_hostname,
'MON_FLOAT_ENABLED': mon_float,
'STRUCTURE': structure})
monitor_job_resource_path = create_job_resource(monitor_job_resource,
MONITOR_JOB_RESOURCE_FILENAME)
subprocess.run(["kubectl", "apply", "-f", monitor_job_resource_path])
def create_job_resource(content, filename):
@ -205,8 +208,6 @@ data:
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints -p '{"data": {"data": "'"${MON_NAME}"'='"${mon_host_addr}"':6789"}}'
kubectl -n rook-ceph patch secret rook-ceph-config -p '{"stringData": {"mon_host": "[v2:'"${mon_host_addr}"':3300,v1:'"${mon_host_addr}"':6789]", "mon_initial_members": "'"${MON_NAME}"'"}}'
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile=""
if [ $STRUCT == 'ONLY_OSD' ]; then
kubectl label nodes ${HOSTNAME} ceph-mgr-placement=enabled
kubectl label nodes ${HOSTNAME} ceph-mon-placement=enabled
@ -292,20 +293,23 @@ data:
kubectl -n rook-ceph delete pod -l osd=${osd_id} --grace-period=0 --force
fi
NEW_OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
cat > /tmp/osd.${osd_id}.keyring << EOF
[osd.${osd_id}]
key = ${NEW_OSD_KEYRING}
key = ${OSD_KEYRING}
caps mgr = "allow profile osd"
caps mon = "allow profile osd"
caps osd = "allow *"
EOF
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
while [ "$(ceph auth get-key osd.${osd_id})" != "${OSD_KEYRING}" ]
do
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
done
ceph-objectstore-tool --type bluestore --data-path /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid} --op update-mon-db --mon-store-path /tmp/monstore
if [ $? -ne 0 ]; then
@ -364,7 +368,7 @@ data:
kubectl -n rook-ceph scale deployment -l app=rook-ceph-osd --replicas 1
sleep ${TIME_AFTER_SCALE}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l app=rook-ceph-osd --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l "app=rook-ceph-osd,topology-location-host=${HOSTNAME}" --timeout=${TIME_WAIT_READY}
ceph -s
if [ $? -ne 0 ]; then
@ -428,6 +432,12 @@ data:
echo "Waiting for cluster recovery"
done
kubectl -n rook-ceph scale deployment rook-ceph-operator --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l app=rook-ceph-operator --timeout=${TIME_WAIT_DELETE}
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l app=rook-ceph-operator --grace-period=0 --force
fi
kubectl -n rook-ceph scale deployment -l app=rook-ceph-mon --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l app=rook-ceph-mon --timeout=${TIME_WAIT_DELETE}
if [ $? -ne 0 ]; then
@ -439,27 +449,31 @@ data:
kubectl -n rook-ceph patch secret rook-ceph-config -p '{"data": {"mon_initial_members": "'"${DATA_MON_INIT}"'"}}'
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-operator --replicas 1
sleep ${TIME_AFTER_SCALE}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l app=rook-ceph-operator --timeout=${TIME_WAIT_READY}
fi
ceph config set mgr mgr/crash/warn_recent_interval 0
kubectl -n rook-ceph patch configmap rook-ceph-recovery -p '{"data": {"status": "completed"}}'
exit 0
update_keyring.sh: |-
update_osd_keyring.sh: |-
#!/bin/bash
set -x
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
break
else
sleep 10
fi
sleep 10
done
if [ "${MON_HOST}"x == ""x ]; then
@ -503,10 +517,10 @@ data:
sleep 60
done
NEW_OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
cat > /tmp/osd.${osd_id}.keyring << EOF
[osd.${osd_id}]
key = ${NEW_OSD_KEYRING}
key = ${OSD_KEYRING}
caps mgr = "allow profile osd"
caps mon = "allow profile osd"
caps osd = "allow *"
@ -516,11 +530,14 @@ data:
sleep 60
done
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
while [ "$(ceph auth get-key osd.${osd_id})" != "${OSD_KEYRING}" ]
do
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
done
done
exit 0
@ -530,30 +547,24 @@ data:
set -x
if [ ${MON_NAME} == "float" ]; then
data_dir"/var/lib/rook/mon-${MON_NAME}/mon-${MON_NAME}"
else
data_dir="/var/lib/rook/data/mon-${MON_NAME}"
fi
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mon=${MON_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-recovery-update-osd-keyring --timeout=30s
if [ $? -eq 0 ]; then
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mon=${MON_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
fi
rm -rf /var/lib/rook/mon-${MON_NAME}
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
break
fi
rm -rf $data_dir
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
break
else
sleep 10
fi
sleep 10
done
exit 0
@ -580,15 +591,14 @@ data:
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
sleep 10
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
if [ "$(ceph health)" == "HEALTH_OK" ]; then
PODS=$(kubectl -n rook-ceph get pods -l app=rook-ceph-clean-mon)
if echo "$PODS" | grep rook-ceph-clean; then
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-clean-mon --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-recovery-clean-mon --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
@ -597,25 +607,40 @@ data:
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
fi
MGR_NAME=$(kubectl -n rook-ceph get pods -l app=rook-ceph-mgr --field-selector spec.nodeName=${RECOVERY_HOSTNAME} --no-headers -o custom-columns=":metadata.labels.mgr")
MDS_NAME=$(kubectl -n rook-ceph get pods -l app=rook-ceph-mds --field-selector spec.nodeName=${RECOVERY_HOSTNAME} --no-headers -o custom-columns=":metadata.labels.mds")
kubectl -n rook-ceph scale deployment rook-ceph-mgr-${MGR_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mgr=${MGR_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mgr=${MGR_NAME} --grace-period=0 --force
fi
kubectl -n rook-ceph scale deployment rook-ceph-mds-${MDS_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mds=${MDS_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mds=${MDS_NAME} --grace-period=0 --force
fi
rm -rf /var/lib/rook/mon-${MON_NAME}
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile=""
kubectl -n rook-ceph patch deployment rook-ceph-mon-${MON_NAME} -p '{"spec": {"template": {"spec": {"nodeSelector": {"kubernetes.io/hostname": "'"${HOSTNAME}"'"}}}}}'
kubectl label nodes ${HOSTNAME} ceph-mgr-placement-
kubectl label nodes ${HOSTNAME} ceph-mon-placement-
kubectl label nodes ${RECOVERY_HOSTNAME} ceph-mgr-placement-
kubectl label nodes ${RECOVERY_HOSTNAME} ceph-mon-placement-
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-mgr-${MGR_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-mds-${MDS_NAME} --replicas 1
sleep 10
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=60s
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mgr=${MGR_NAME} --timeout=60s
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mds=${MDS_NAME} --timeout=60s
echo "rook-ceph mon moved successfully."
break
fi
fi
sleep 30
done
exit 0
monitor.sh: |-
@ -625,26 +650,50 @@ data:
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
sleep 30
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -eq 0 ]; then
if [ "${HAS_MON_FLOAT}" == false ]; then
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile-
fi
if [ "${STRUCT}" == 'ONE_HOST' ]; then
break
fi
else
sleep 5m
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
if [ "${HAS_MON_FLOAT}" == false ]; then
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile-
fi
if [ "${STRUCT}" == 'ONLY_OSD' ]; then
rm -rf /var/lib/rook/mon-*
fi
break
fi
done
set +x
PODS=$(kubectl -n rook-ceph get pods -l app.kubernetes.io/part-of=rook-ceph-recovery --no-headers -o custom-columns=":metadata.name")
for pod in $PODS; do
echo -e "\\n##############################\\n$pod\\n##############################" >> /var/log/ceph/restore.log
kubectl -n rook-ceph logs $pod >> /var/log/ceph/restore.log
done
set -x
kubectl -n rook-ceph delete jobs -l app.kubernetes.io/part-of=rook-ceph-recovery
kubectl -n rook-ceph wait --for=delete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete jobs -l app.kubernetes.io/part-of=rook-ceph-recovery --grace-period=0 --force
fi
set +x
echo -e "\\n##############################\\nrook-ceph-recovery-monitor\\n##############################" >> /var/log/ceph/restore.log
kubectl -n rook-ceph logs $(kubectl get pod -n rook-ceph -l app=rook-ceph-recovery-monitor -o name) >> /var/log/ceph/restore.log
set -x
exit 0
monmap_b64: |-
@ -662,6 +711,12 @@ rules:
- apiGroups: [""]
resources: ["services"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/exec"]
verbs: ["create"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
@ -708,13 +763,16 @@ metadata:
name: rook-ceph-recovery
namespace: rook-ceph
labels:
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-recovery
namespace: rook-ceph
labels:
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -812,23 +870,26 @@ spec:
""")
def get_keyring_job_template():
def get_update_osd_keyring_job_template():
return Template(
"""
---
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-keyring-update-$TARGET_HOSTNAME
name: rook-ceph-recovery-update-osd-keyring-$TARGET_HOSTNAME
namespace: rook-ceph
labels:
app: rook-ceph-keyring-update
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-update-osd-keyring
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-keyring-update-$TARGET_HOSTNAME
name: rook-ceph-recovery-update-osd-keyring-$TARGET_HOSTNAME
namespace: rook-ceph
labels:
app: rook-ceph-recovery-update-osd-keyring
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -883,9 +944,9 @@ spec:
- mountPath: /run/udev
name: run-udev
containers:
- name: update
- name: update-osd-keyring
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/update_keyring.sh" ]
command: [ "/bin/bash", "/tmp/mount/update_osd_keyring.sh" ]
env:
- name: ROOK_MONS
valueFrom:
@ -925,16 +986,19 @@ def get_clean_mon_job_template():
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-clean-mon-$TARGET_MON
name: rook-ceph-recovery-clean-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-clean-mon-$TARGET_MON
name: rook-ceph-recovery-clean-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-recovery-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -949,7 +1013,7 @@ spec:
restartPolicy: OnFailure
volumes:
- hostPath:
path: /var/lib/ceph
path: /var/lib/ceph/data
type: ""
name: rook-data
- name: rook-ceph-recovery
@ -960,7 +1024,7 @@ spec:
hostPath:
path: /etc/kubernetes/admin.conf
containers:
- name: clean
- name: clean-mon
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/clean_mon.sh" ]
env:
@ -988,16 +1052,19 @@ def get_move_mon_job_template():
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-move-mon-$TARGET_MON
name: rook-ceph-recovery-move-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-move-mon-$TARGET_MON
name: rook-ceph-recovery-move-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-recovery-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -1023,7 +1090,7 @@ spec:
hostPath:
path: /etc/kubernetes/admin.conf
containers:
- name: update
- name: move-mon
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/move_mon.sh" ]
env:
@ -1037,6 +1104,8 @@ spec:
secretKeyRef:
name: rook-ceph-admin-keyring
key: keyring
- name: RECOVERY_HOSTNAME
value: $RECOVERY_HOSTNAME
- name: HOSTNAME
value: $TARGET_HOSTNAME
- name: MON_NAME
@ -1066,15 +1135,19 @@ metadata:
name: rook-ceph-recovery-monitor
namespace: rook-ceph
labels:
app: rook-ceph-recovery-monitor
app: rook-ceph-recovery-monitor
spec:
ttlSecondsAfterFinished: 300
ttlSecondsAfterFinished: 30
template:
metadata:
name: rook-ceph-recovery-monitor
namespace: rook-ceph
labels:
app: rook-ceph-recovery-monitor
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
kubernetes.io/hostname: $TARGET_HOSTNAME
tolerations:
- effect: NoSchedule
operator: Exists
@ -1084,6 +1157,14 @@ spec:
key: node-role.kubernetes.io/control-plane
restartPolicy: OnFailure
volumes:
- hostPath:
path: /var/lib/ceph/data
type: ""
name: rook-data
- hostPath:
path: /var/log/ceph
type: ""
name: ceph-log
- name: rook-ceph-recovery
configMap:
name: rook-ceph-recovery
@ -1096,9 +1177,19 @@ spec:
image: registry.local:9001/docker.io/bitnami/kubectl:1.29
command: [ "/bin/bash", "/tmp/mount/monitor.sh" ]
env:
- name: STRUCT
value: $STRUCTURE
- name: HAS_MON_FLOAT
value: "$MON_FLOAT_ENABLED"
securityContext:
privileged: true
readOnlyRootFilesystem: false
runAsUser: 0
volumeMounts:
- mountPath: /var/lib/rook
name: rook-data
- mountPath: /var/log/ceph
name: ceph-log
- mountPath: /tmp/mount
name: rook-ceph-recovery
- name: kube-config

View File

@ -802,6 +802,7 @@
kubectl delete -n rook-ceph helmrepository stx-platform --force
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
ignore_errors: true
when: rook_backend.rc == 0
# When controller unlock occurs there is a chance platform-integ-apps or rook-ceph