From 087091663f2077214ae7efd5c80fbdee9bb10697 Mon Sep 17 00:00:00 2001 From: Vladimir Kozhukalov Date: Thu, 10 Jul 2025 16:25:56 -0500 Subject: [PATCH] Improve stability of Ceph cluster deployment script Do not fail, but retry instead, when some of the mon pods are not found while checking their status. This is to avoid situations like the following: ``` 2025-07-10 14:53:17.670728 | primary | + MON_PODS='rook-ceph-mon-a-canary-6d7bf54997-mtzmt 2025-07-10 14:53:17.670767 | primary | rook-ceph-mon-b-canary-7ff47b6fc6-sbtjh 2025-07-10 14:53:17.670781 | primary | rook-ceph-mon-c-canary-68cf8fb595-4jptf' 2025-07-10 14:53:17.670786 | primary | + for MON_POD in $MON_PODS 2025-07-10 14:53:17.670791 | primary | + kubectl get pod --namespace=ceph rook-ceph-mon-a-canary-6d7bf54997-mtzmt 2025-07-10 14:53:17.824501 | primary | + kubectl wait --namespace=ceph --for=condition=ready pod/rook-ceph-mon-a-canary-6d7bf54997-mtzmt --timeout=600s 2025-07-10 14:53:17.897216 | primary | Error from server (NotFound): pods "rook-ceph-mon-a-canary-6d7bf54997-mtzmt" not found ``` Change-Id: I7f10df4d9b395a5775aa3afd42e17dbd09855304 Signed-off-by: Vladimir Kozhukalov --- tools/deployment/ceph/ceph-rook.sh | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/deployment/ceph/ceph-rook.sh b/tools/deployment/ceph/ceph-rook.sh index b56f2046de..3d07865c4d 100755 --- a/tools/deployment/ceph/ceph-rook.sh +++ b/tools/deployment/ceph/ceph-rook.sh @@ -371,13 +371,25 @@ helm osh wait-for-pods rook-ceph kubectl wait --namespace=ceph --for=condition=ready pod --selector=app=rook-ceph-tools --timeout=600s # Wait for all monitor pods to be ready -MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }') -for MON_POD in $MON_PODS; do - if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then - kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=600s - else - echo "Pod $MON_POD not found, 
skipping..." - fi +wait_start_time=$(date +%s) +while [[ $(($(date +%s) - $wait_start_time)) -lt 1800 ]]; do + sleep 30 + MON_PODS=$(kubectl get pods --namespace=ceph --selector=app=rook-ceph-mon --no-headers | awk '{ print $1 }') + MON_PODS_NUM=$(echo $MON_PODS | wc -w) + MON_PODS_READY=0 + for MON_POD in $MON_PODS; do + if kubectl get pod --namespace=ceph "$MON_POD" > /dev/null 2>&1; then + kubectl wait --namespace=ceph --for=condition=ready "pod/$MON_POD" --timeout=60s && \ + { MON_PODS_READY=$(($MON_PODS_READY+1)); } || \ + echo "Pod $MON_POD not ready, skipping..." + else + echo "Pod $MON_POD not found, skipping..." + fi + done + if [[ ${MON_PODS_READY} == ${MON_PODS_NUM} ]]; then + echo "Monitor pods are ready. Moving on." + break; + fi done echo "=========== CEPH K8S PODS LIST ============"