Fix rook-ceph recovery (B&R)

In some scenarios, after recovering rook-ceph, the ceph status
remained as if the cluster was empty:

  cluster:
    id:     1c6688fc-58b8-4055-abc7-c1c730a88608
    health: HEALTH_OK

  services:
    mon: 1 daemons, quorum a (age 4m)
    mgr: no daemons active
    osd: 0 osds: 0 up, 0 in

  data:
    pools:   0 pools, 0 pgs
    objects: 0 objects, 0 B
    usage:   0 B used, 0 B / 0 B avail
    pgs:

After analyzing the code and logs, it was possible to observe
changes caused by race conditions in rook-ceph secrets and
configmaps. To prevent this from happening, the rook-ceph
operator is paused while the recovery process patches secrets
and configmaps.

Furthermore, it was observed that when the OSD and the monitor
are not on the same host, the mgr and mds ended up on the host
where the recovery was performed (which has only an OSD). To fix
this, the respective deployments are restarted, thus placing
them on the correct hosts.

Finally, the logs of all jobs/pods used during recovery are
saved to /var/log/ceph/restore.log on the target host, which
is where the recovery job runs.

Test Plan:
- PASS: Configure Standard with OSDs on controller-1
        and worker-0, and monitors on the other hosts.
- PASS: Perform B&R
- PASS: After restore, unlock controller-1 only after
        all hosts have been unlocked.
- PASS: Check ceph status after recovery

Closes-Bug: 2086473

Change-Id: If2cd7186eaa4510ce55bcf9b10a9536d76e366c4
Signed-off-by: Erickson Silva de Oliveira <Erickson.SilvadeOliveira@windriver.com>
This commit is contained in:
Erickson Silva de Oliveira 2024-10-31 20:45:41 -03:00
parent 20f2f20b01
commit 75b5bc17e3
2 changed files with 183 additions and 91 deletions

View File

@ -16,7 +16,7 @@ from string import Template
CEPH_TMP = '/tmp/ceph'
MONMAP_FILENAME = 'monmap.bin'
RECOVERY_JOB_RESOURCE_FILENAME = 'recovery-job.yaml'
KEYRING_JOB_RESOURCE_FILENAME = 'keyring-update-{}-job.yaml'
UPDATE_OSD_KEYRING_JOB_RESOURCE_FILENAME = 'update-osd-keyring-{}-job.yaml'
MOVE_MON_JOB_RESOURCE_FILENAME = 'move-mon-{}-job.yaml'
CLEAN_MON_JOB_RESOURCE_FILENAME = 'clean-mon-{}-job.yaml'
MONITOR_JOB_RESOURCE_FILENAME = 'monitor-job.yaml'
@ -92,17 +92,18 @@ def recover_cluster():
if structure == "ONLY_OSD":
move_mon_job_template = get_move_mon_job_template()
move_mon_job_resource = move_mon_job_template.safe_substitute({'TARGET_HOSTNAME': target_mon_hostname,
'TARGET_MON': target_mon})
'TARGET_MON': target_mon,
'RECOVERY_HOSTNAME': target_hostname})
move_mon_job_resource_path = create_job_resource(move_mon_job_resource,
MOVE_MON_JOB_RESOURCE_FILENAME.format(target_mon))
subprocess.run(["kubectl", "apply", "-f", move_mon_job_resource_path])
for hostname in hosts_to_update_keyring:
keyring_job_template = get_keyring_job_template()
keyring_job_resource = keyring_job_template.safe_substitute({'TARGET_HOSTNAME': hostname})
keyring_job_resource_path = create_job_resource(keyring_job_resource,
KEYRING_JOB_RESOURCE_FILENAME.format(hostname))
subprocess.run(["kubectl", "apply", "-f", keyring_job_resource_path])
update_osd_keyring_job_template = get_update_osd_keyring_job_template()
update_osd_keyring_job_resource = update_osd_keyring_job_template.safe_substitute({'TARGET_HOSTNAME': hostname})
update_osd_keyring_job_resource_path = create_job_resource(update_osd_keyring_job_resource,
UPDATE_OSD_KEYRING_JOB_RESOURCE_FILENAME.format(hostname))
subprocess.run(["kubectl", "apply", "-f", update_osd_keyring_job_resource_path])
for name, hostname in mons_to_clean.items():
clean_mon_job_template = get_clean_mon_job_template()
@ -112,11 +113,13 @@ def recover_cluster():
CLEAN_MON_JOB_RESOURCE_FILENAME.format(name))
subprocess.run(["kubectl", "apply", "-f", clean_mon_job_resource_path])
monitor_job_template = get_monitor_job_template()
monitor_job_resource = monitor_job_template.safe_substitute({'MON_FLOAT_ENABLED': mon_float})
monitor_job_resource_path = create_job_resource(monitor_job_resource,
MONITOR_JOB_RESOURCE_FILENAME)
subprocess.run(["kubectl", "apply", "-f", monitor_job_resource_path])
monitor_job_template = get_monitor_job_template()
monitor_job_resource = monitor_job_template.safe_substitute({'TARGET_HOSTNAME': target_hostname,
'MON_FLOAT_ENABLED': mon_float,
'STRUCTURE': structure})
monitor_job_resource_path = create_job_resource(monitor_job_resource,
MONITOR_JOB_RESOURCE_FILENAME)
subprocess.run(["kubectl", "apply", "-f", monitor_job_resource_path])
def create_job_resource(content, filename):
@ -205,8 +208,6 @@ data:
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints -p '{"data": {"data": "'"${MON_NAME}"'='"${mon_host_addr}"':6789"}}'
kubectl -n rook-ceph patch secret rook-ceph-config -p '{"stringData": {"mon_host": "[v2:'"${mon_host_addr}"':3300,v1:'"${mon_host_addr}"':6789]", "mon_initial_members": "'"${MON_NAME}"'"}}'
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile=""
if [ $STRUCT == 'ONLY_OSD' ]; then
kubectl label nodes ${HOSTNAME} ceph-mgr-placement=enabled
kubectl label nodes ${HOSTNAME} ceph-mon-placement=enabled
@ -292,20 +293,23 @@ data:
kubectl -n rook-ceph delete pod -l osd=${osd_id} --grace-period=0 --force
fi
NEW_OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
cat > /tmp/osd.${osd_id}.keyring << EOF
[osd.${osd_id}]
key = ${NEW_OSD_KEYRING}
key = ${OSD_KEYRING}
caps mgr = "allow profile osd"
caps mon = "allow profile osd"
caps osd = "allow *"
EOF
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
while [ "$(ceph auth get-key osd.${osd_id})" != "${OSD_KEYRING}" ]
do
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
done
ceph-objectstore-tool --type bluestore --data-path /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid} --op update-mon-db --mon-store-path /tmp/monstore
if [ $? -ne 0 ]; then
@ -364,7 +368,7 @@ data:
kubectl -n rook-ceph scale deployment -l app=rook-ceph-osd --replicas 1
sleep ${TIME_AFTER_SCALE}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l app=rook-ceph-osd --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l "app=rook-ceph-osd,topology-location-host=${HOSTNAME}" --timeout=${TIME_WAIT_READY}
ceph -s
if [ $? -ne 0 ]; then
@ -428,6 +432,12 @@ data:
echo "Waiting for cluster recovery"
done
kubectl -n rook-ceph scale deployment rook-ceph-operator --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l app=rook-ceph-operator --timeout=${TIME_WAIT_DELETE}
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l app=rook-ceph-operator --grace-period=0 --force
fi
kubectl -n rook-ceph scale deployment -l app=rook-ceph-mon --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l app=rook-ceph-mon --timeout=${TIME_WAIT_DELETE}
if [ $? -ne 0 ]; then
@ -439,27 +449,31 @@ data:
kubectl -n rook-ceph patch secret rook-ceph-config -p '{"data": {"mon_initial_members": "'"${DATA_MON_INIT}"'"}}'
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-operator --replicas 1
sleep ${TIME_AFTER_SCALE}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=${TIME_WAIT_READY}
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l app=rook-ceph-operator --timeout=${TIME_WAIT_READY}
fi
ceph config set mgr mgr/crash/warn_recent_interval 0
kubectl -n rook-ceph patch configmap rook-ceph-recovery -p '{"data": {"status": "completed"}}'
exit 0
update_keyring.sh: |-
update_osd_keyring.sh: |-
#!/bin/bash
set -x
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
break
else
sleep 10
fi
sleep 10
done
if [ "${MON_HOST}"x == ""x ]; then
@ -503,10 +517,10 @@ data:
sleep 60
done
NEW_OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
OSD_KEYRING=(`cat /var/lib/rook/rook-ceph/${ceph_fsid}_${osd_uuid}/keyring | sed -n -e 's/^.*key = //p'`)
cat > /tmp/osd.${osd_id}.keyring << EOF
[osd.${osd_id}]
key = ${NEW_OSD_KEYRING}
key = ${OSD_KEYRING}
caps mgr = "allow profile osd"
caps mon = "allow profile osd"
caps osd = "allow *"
@ -516,11 +530,14 @@ data:
sleep 60
done
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
while [ "$(ceph auth get-key osd.${osd_id})" != "${OSD_KEYRING}" ]
do
ceph auth import -i /tmp/osd.${osd_id}.keyring
if [ $? -ne 0 ]; then
echo "ceph timeout exceeded, exit"
exit 1
fi
done
done
exit 0
@ -530,30 +547,24 @@ data:
set -x
if [ ${MON_NAME} == "float" ]; then
data_dir"/var/lib/rook/mon-${MON_NAME}/mon-${MON_NAME}"
else
data_dir="/var/lib/rook/data/mon-${MON_NAME}"
fi
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mon=${MON_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-recovery-update-osd-keyring --timeout=30s
if [ $? -eq 0 ]; then
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mon=${MON_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
fi
rm -rf /var/lib/rook/mon-${MON_NAME}
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
break
fi
rm -rf $data_dir
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
break
else
sleep 10
fi
sleep 10
done
exit 0
@ -580,15 +591,14 @@ data:
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
sleep 10
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
if [ "$(ceph health)" == "HEALTH_OK" ]; then
PODS=$(kubectl -n rook-ceph get pods -l app=rook-ceph-clean-mon)
if echo "$PODS" | grep rook-ceph-clean; then
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-clean-mon --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app=rook-ceph-recovery-clean-mon --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 0
@ -597,25 +607,40 @@ data:
kubectl -n rook-ceph delete pod -l mon=${MON_NAME} --grace-period=0 --force
fi
MGR_NAME=$(kubectl -n rook-ceph get pods -l app=rook-ceph-mgr --field-selector spec.nodeName=${RECOVERY_HOSTNAME} --no-headers -o custom-columns=":metadata.labels.mgr")
MDS_NAME=$(kubectl -n rook-ceph get pods -l app=rook-ceph-mds --field-selector spec.nodeName=${RECOVERY_HOSTNAME} --no-headers -o custom-columns=":metadata.labels.mds")
kubectl -n rook-ceph scale deployment rook-ceph-mgr-${MGR_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mgr=${MGR_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mgr=${MGR_NAME} --grace-period=0 --force
fi
kubectl -n rook-ceph scale deployment rook-ceph-mds-${MDS_NAME} --replicas 0
kubectl -n rook-ceph wait --for=delete pod --all=true -l mds=${MDS_NAME} --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete pod -l mds=${MDS_NAME} --grace-period=0 --force
fi
rm -rf /var/lib/rook/mon-${MON_NAME}
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile=""
kubectl -n rook-ceph patch deployment rook-ceph-mon-${MON_NAME} -p '{"spec": {"template": {"spec": {"nodeSelector": {"kubernetes.io/hostname": "'"${HOSTNAME}"'"}}}}}'
kubectl label nodes ${HOSTNAME} ceph-mgr-placement-
kubectl label nodes ${HOSTNAME} ceph-mon-placement-
kubectl label nodes ${RECOVERY_HOSTNAME} ceph-mgr-placement-
kubectl label nodes ${RECOVERY_HOSTNAME} ceph-mon-placement-
kubectl -n rook-ceph scale deployment rook-ceph-mon-${MON_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-mgr-${MGR_NAME} --replicas 1
kubectl -n rook-ceph scale deployment rook-ceph-mds-${MDS_NAME} --replicas 1
sleep 10
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mon=${MON_NAME} --timeout=60s
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mgr=${MGR_NAME} --timeout=60s
kubectl -n rook-ceph wait --for=condition=Ready pod --all=true -l mds=${MDS_NAME} --timeout=60s
echo "rook-ceph mon moved successfully."
break
fi
fi
sleep 30
done
exit 0
monitor.sh: |-
@ -625,26 +650,50 @@ data:
while true
do
# TODO: Instead of sleep, use 'kubectl wait'
sleep 30
status=$(kubectl -n rook-ceph get configmap rook-ceph-recovery -o jsonpath='{.data.status}')
if [ "$status" == "completed" ]; then
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -eq 0 ]; then
if [ "${HAS_MON_FLOAT}" == false ]; then
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile-
fi
if [ "${STRUCT}" == 'ONE_HOST' ]; then
break
fi
else
sleep 5m
kubectl -n rook-ceph wait --for=condition=complete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -ne 0 ]; then
continue
fi
if [ "${HAS_MON_FLOAT}" == false ]; then
kubectl -n rook-ceph label deployment -l app=rook-ceph-mon ceph.rook.io/do-not-reconcile-
fi
if [ "${STRUCT}" == 'ONLY_OSD' ]; then
rm -rf /var/lib/rook/mon-*
fi
break
fi
done
set +x
PODS=$(kubectl -n rook-ceph get pods -l app.kubernetes.io/part-of=rook-ceph-recovery --no-headers -o custom-columns=":metadata.name")
for pod in $PODS; do
echo -e "\\n##############################\\n$pod\\n##############################" >> /var/log/ceph/restore.log
kubectl -n rook-ceph logs $pod >> /var/log/ceph/restore.log
done
set -x
kubectl -n rook-ceph delete jobs -l app.kubernetes.io/part-of=rook-ceph-recovery
kubectl -n rook-ceph wait --for=delete job --all=true -l app.kubernetes.io/part-of=rook-ceph-recovery --timeout=30s
if [ $? -ne 0 ]; then
kubectl -n rook-ceph delete jobs -l app.kubernetes.io/part-of=rook-ceph-recovery --grace-period=0 --force
fi
set +x
echo -e "\\n##############################\\nrook-ceph-recovery-monitor\\n##############################" >> /var/log/ceph/restore.log
kubectl -n rook-ceph logs $(kubectl get pod -n rook-ceph -l app=rook-ceph-recovery-monitor -o name) >> /var/log/ceph/restore.log
set -x
exit 0
monmap_b64: |-
@ -662,6 +711,12 @@ rules:
- apiGroups: [""]
resources: ["services"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
- apiGroups: [""]
resources: ["pods/exec"]
verbs: ["create"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "patch"]
@ -708,13 +763,16 @@ metadata:
name: rook-ceph-recovery
namespace: rook-ceph
labels:
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-recovery
namespace: rook-ceph
labels:
app: rook-ceph-recovery
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -812,23 +870,26 @@ spec:
""")
def get_keyring_job_template():
def get_update_osd_keyring_job_template():
return Template(
"""
---
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-keyring-update-$TARGET_HOSTNAME
name: rook-ceph-recovery-update-osd-keyring-$TARGET_HOSTNAME
namespace: rook-ceph
labels:
app: rook-ceph-keyring-update
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-update-osd-keyring
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-keyring-update-$TARGET_HOSTNAME
name: rook-ceph-recovery-update-osd-keyring-$TARGET_HOSTNAME
namespace: rook-ceph
labels:
app: rook-ceph-recovery-update-osd-keyring
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -883,9 +944,9 @@ spec:
- mountPath: /run/udev
name: run-udev
containers:
- name: update
- name: update-osd-keyring
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/update_keyring.sh" ]
command: [ "/bin/bash", "/tmp/mount/update_osd_keyring.sh" ]
env:
- name: ROOK_MONS
valueFrom:
@ -925,16 +986,19 @@ def get_clean_mon_job_template():
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-clean-mon-$TARGET_MON
name: rook-ceph-recovery-clean-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-clean-mon-$TARGET_MON
name: rook-ceph-recovery-clean-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-recovery-clean-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -949,7 +1013,7 @@ spec:
restartPolicy: OnFailure
volumes:
- hostPath:
path: /var/lib/ceph
path: /var/lib/ceph/data
type: ""
name: rook-data
- name: rook-ceph-recovery
@ -960,7 +1024,7 @@ spec:
hostPath:
path: /etc/kubernetes/admin.conf
containers:
- name: clean
- name: clean-mon
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/clean_mon.sh" ]
env:
@ -988,16 +1052,19 @@ def get_move_mon_job_template():
apiVersion: batch/v1
kind: Job
metadata:
name: rook-ceph-move-mon-$TARGET_MON
name: rook-ceph-recovery-move-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
app: rook-ceph-recovery-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
template:
metadata:
name: rook-ceph-move-mon-$TARGET_MON
name: rook-ceph-recovery-move-mon-$TARGET_MON
namespace: rook-ceph
labels:
app: rook-ceph-recovery-move-mon
app.kubernetes.io/part-of: rook-ceph-recovery
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
@ -1023,7 +1090,7 @@ spec:
hostPath:
path: /etc/kubernetes/admin.conf
containers:
- name: update
- name: move-mon
image: registry.local:9001/docker.io/openstackhelm/ceph-config-helper:ubuntu_jammy_18.2.2-1-20240312
command: [ "/bin/bash", "/tmp/mount/move_mon.sh" ]
env:
@ -1037,6 +1104,8 @@ spec:
secretKeyRef:
name: rook-ceph-admin-keyring
key: keyring
- name: RECOVERY_HOSTNAME
value: $RECOVERY_HOSTNAME
- name: HOSTNAME
value: $TARGET_HOSTNAME
- name: MON_NAME
@ -1066,15 +1135,19 @@ metadata:
name: rook-ceph-recovery-monitor
namespace: rook-ceph
labels:
app: rook-ceph-recovery-monitor
app: rook-ceph-recovery-monitor
spec:
ttlSecondsAfterFinished: 300
ttlSecondsAfterFinished: 30
template:
metadata:
name: rook-ceph-recovery-monitor
namespace: rook-ceph
labels:
app: rook-ceph-recovery-monitor
spec:
serviceAccountName: rook-ceph-recovery
nodeSelector:
kubernetes.io/hostname: $TARGET_HOSTNAME
tolerations:
- effect: NoSchedule
operator: Exists
@ -1084,6 +1157,14 @@ spec:
key: node-role.kubernetes.io/control-plane
restartPolicy: OnFailure
volumes:
- hostPath:
path: /var/lib/ceph/data
type: ""
name: rook-data
- hostPath:
path: /var/log/ceph
type: ""
name: ceph-log
- name: rook-ceph-recovery
configMap:
name: rook-ceph-recovery
@ -1096,9 +1177,19 @@ spec:
image: registry.local:9001/docker.io/bitnami/kubectl:1.29
command: [ "/bin/bash", "/tmp/mount/monitor.sh" ]
env:
- name: STRUCT
value: $STRUCTURE
- name: HAS_MON_FLOAT
value: "$MON_FLOAT_ENABLED"
securityContext:
privileged: true
readOnlyRootFilesystem: false
runAsUser: 0
volumeMounts:
- mountPath: /var/lib/rook
name: rook-data
- mountPath: /var/log/ceph
name: ceph-log
- mountPath: /tmp/mount
name: rook-ceph-recovery
- name: kube-config

View File

@ -802,6 +802,7 @@
kubectl delete -n rook-ceph helmrepository stx-platform --force
environment:
KUBECONFIG: /etc/kubernetes/admin.conf
ignore_errors: true
when: rook_backend.rc == 0
# When controller unlock occurs there is a chance platform-integ-apps or rook-ceph