Improved MAAS helm test reliability and logging

This PS adds more robust checks to the helm test
script to ensure that the MAAS services are
properly deployed and functional before marking
the test as successful. It includes additional
logging for easier debugging in case of failures.

In addition, this PS updates import-boot-resources.sh
to add stall detection and restart logic to
handle cases where the boot image import process
may hang indefinitely.

Change-Id: Ic1ccf8ec60bb0b25988bc82c589537933f9f74c1
Signed-off-by: Sergiy Markin <smarkin@mirantis.com>
This commit is contained in:
Sergiy Markin
2025-11-19 14:23:44 -06:00
parent 1598cbd4df
commit ae64294f93
3 changed files with 323 additions and 24 deletions

View File

@@ -56,25 +56,240 @@ function import_resources {
}
function start_import {
    # Drive the boot-resource import with progress-based stall detection.
    # Polls import_resources until it succeeds, JOB_TIMEOUT is exhausted,
    # or no progress is seen for stall_check_interval seconds, in which
    # case ONE automatic restart is attempted (restart_stalled_import).
    # Globals: JOB_TIMEOUT (read & decremented), RETRY_TIMER (poll period).
    # Returns: 0 on successful import, 124 on timeout.
    # Note: the old unconditional `timer "$RETRY_TIMER" import_resources`
    # call was removed — it would block here forever and the stall
    # detection below would never run.
    local import_start_time=$(date +%s)
    local stall_check_time=$import_start_time
    local last_progress_time=$import_start_time
    local stall_restart_done=false
    local last_synced_count=$(get_synced_count)
    local stall_check_interval=300 # Check for stalls every 5 minutes
    local rc
    echo "Starting import at $(date) (timeout: ${JOB_TIMEOUT}s)"
    echo "Initial synced count: ${last_synced_count}"
    # Custom loop with stall detection based on progress
    while [[ ${JOB_TIMEOUT} -gt 0 ]]; do
        import_resources
        rc=$?
        if [ $rc -eq 0 ]; then
            echo "Import completed successfully!"
            return 0
        fi
        local current_time=$(date +%s)
        local elapsed=$((current_time - import_start_time))
        local current_synced_count=$(get_synced_count)
        # Track progress - if synced count increased, update progress time
        if [[ $current_synced_count -gt $last_synced_count ]]; then
            echo "Progress detected: synced count increased from ${last_synced_count} to ${current_synced_count}"
            last_synced_count=$current_synced_count
            last_progress_time=$current_time
        fi
        # Check if import appears stalled (no progress for 5 minutes);
        # only one restart is ever attempted per job run.
        if [[ $stall_restart_done == false ]]; then
            if check_import_stalled "$last_synced_count" "$current_synced_count" "$stall_check_interval" "$last_progress_time"; then
                echo "Stalled import detected - attempting restart..."
                restart_stalled_import
                import_start_time=$(date +%s)
                last_progress_time=$import_start_time
                stall_restart_done=true
                stall_check_time=$import_start_time
                # Don't update last_synced_count - we want to see if restart helps
            fi
        fi
        # Progress update every 2 minutes
        if [[ $((current_time - stall_check_time)) -gt 120 ]]; then
            echo "Import still in progress... (elapsed: ${elapsed}s, synced: ${current_synced_count}, timeout remaining: ${JOB_TIMEOUT}s)"
            stall_check_time=$current_time
        fi
        JOB_TIMEOUT=$((JOB_TIMEOUT - RETRY_TIMER))
        sleep $RETRY_TIMER
    done
    echo "ERROR: Import timed out after reaching JOB_TIMEOUT"
    echo "Last known state before timeout:"
    check_for_download || true
    return 124
}
function check_for_download {
    # Report the boot-resource import status for MAAS_DEFAULT_DISTRO.
    # Returns 0 once at least one matching image reaches type "Synced",
    # 1 while an import is still running or no synced image exists.
    # Globals read: ADMIN_USERNAME, MAAS_DEFAULT_DISTRO.
    local is_importing
    is_importing=$(maas ${ADMIN_USERNAME} boot-resources is-importing 2>/dev/null || echo "false")
    # NOTE: `grep -c` already prints "0" when nothing matches but exits
    # non-zero, so appending `|| echo "0"` produced TWO lines ("0\n0"),
    # which breaks the [[ -gt ]] arithmetic below. Capture + default instead.
    local synced_imgs
    synced_imgs=$(maas ${ADMIN_USERNAME} boot-resources read 2>/dev/null | tail -n +1 | jq '.[] | select( .type | contains("Synced")) | .name ' 2>/dev/null | grep -c "$MAAS_DEFAULT_DISTRO") || true
    synced_imgs=${synced_imgs:-0}
    if echo "$is_importing" | grep -q 'true'; then
        echo -e "\nBoot resources currently importing (synced: ${synced_imgs})\n"
        return 1
    elif [[ $synced_imgs -gt 0 ]]; then
        echo "Boot resources have completed importing (synced: ${synced_imgs})"
        return 0
    else
        echo 'Import failed - no synced images found!'
        return 1
    fi
}
function get_synced_count {
    # Print the number of "Synced" boot resources matching MAAS_DEFAULT_DISTRO.
    # Always emits exactly ONE line (an integer) so callers can use it in
    # arithmetic comparisons.
    # Fix: `grep -c` prints "0" AND exits non-zero when nothing matches,
    # so the previous `|| echo "0"` emitted a second line ("0\n0") and
    # broke every [[ -gt ]] comparison on the result.
    local count
    count=$(maas ${ADMIN_USERNAME} boot-resources read 2>/dev/null | tail -n +1 | jq '.[] | select( .type | contains("Synced")) | .name ' 2>/dev/null | grep -c "$MAAS_DEFAULT_DISTRO") || true
    echo "${count:-0}"
}
function check_import_stalled {
    # Decide whether the boot-resource import has stopped making progress.
    # Detects both stuck downloads and imports that are queued but never start.
    # Arguments:
    #   $1 - synced count at the previous observation (default 0)
    #   $2 - synced count now (default 0)
    #   $3 - seconds of no progress that count as a stall (default 300)
    #   $4 - epoch of the last observed progress; 0 means "no baseline yet"
    # Returns: 0 if stalled, 1 otherwise.
    local prev_count=${1:-0}
    local curr_count=${2:-0}
    local stall_window=${3:-300} # How long to wait for progress (5 minutes)
    local progress_ts=${4:-0}
    local now
    now=$(date +%s)
    # Any increase in the synced count means we are still moving.
    if [[ $curr_count -gt $prev_count ]]; then
        return 1
    fi
    # Without a progress baseline we cannot call this a stall yet.
    if [[ $progress_ts -eq 0 ]]; then
        return 1
    fi
    local idle=$((now - progress_ts))
    if [[ $idle -le $stall_window ]]; then
        return 1 # Not stalled yet
    fi
    echo "WARNING: No import progress for ${idle}s (synced count stuck at ${curr_count})"
    echo "Checking import status details..."
    # Show what's in the queue (best effort — diagnostics only)
    maas ${ADMIN_USERNAME} boot-resources read 2>/dev/null | jq -r '.[] | "\(.name): \(.type)"' | head -20 || true
    return 0 # Import appears stalled
}
function clean_boot_resources {
    # Purge all boot-resource metadata (resources, sets, files) from the
    # MAAS region database so the next import starts from a clean slate.
    # The heredoc delimiter is quoted ('PYTHON_EOF'), so the Python below
    # is passed to `maas-region shell` verbatim with no shell expansion.
    # Fix: the Python body had lost its indentation in this copy and was
    # syntactically invalid; indentation restored to match the evident
    # if/with structure.
    echo "Cleaning boot resource metadata from database..."
    # Use maas-region command to access the database
    maas-region shell << 'PYTHON_EOF'
from maasserver.models import BootResource, BootResourceFile, BootResourceSet
from django.db import transaction
print("Checking for boot resources in database...")
resources = BootResource.objects.all()
print(f"Found {resources.count()} boot resources in database")
if resources.count() > 0:
    print("Deleting all boot resource metadata to force fresh download...")
    with transaction.atomic():
        deleted_files = BootResourceFile.objects.all().delete()
        deleted_sets = BootResourceSet.objects.all().delete()
        deleted_resources = BootResource.objects.all().delete()
    print(f"Deleted: {deleted_resources[0]} resources, {deleted_sets[0]} sets, {deleted_files[0]} files")
    print("Database cleaned successfully")
else:
    print("No boot resources found")
PYTHON_EOF
    echo "Boot resource cleanup completed"
}
# Trigger a rollout restart of the maas-region statefulset by PATCHing the
# kubectl.kubernetes.io/restartedAt annotation through the Kubernetes API
# (the same annotation `kubectl rollout restart` sets), then poll the
# statefulset until all replicas are updated and ready.
# Uses wget with the in-pod serviceaccount token/CA — presumably because
# kubectl is not available in this image; TODO confirm.
# NOTE(review): "{{ .Release.Namespace }}" is rendered by Helm at install
# time — this script is a chart template, not plain shell.
# Requires RBAC: patch AND get on apps/statefulsets "maas-region".
# Returns: 0 when the rollout completes, 1 after a 300s timeout.
function restart_region_statefulset {
echo "Triggering rollout restart of maas-region statefulset via Kubernetes API..."
PATCH_DATA='{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'"}}}}}'
wget \
--server-response \
--ca-certificate=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \
--header='Content-Type: application/strategic-merge-patch+json' \
--header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
--method=PATCH \
--body-data="$PATCH_DATA" \
https://kubernetes.default.svc.cluster.local/apis/apps/v1/namespaces/{{ .Release.Namespace }}/statefulsets/maas-region \
2>&1 | grep -E '200 OK|error|Error' || true
echo "Restart command sent, waiting for statefulset rollout to complete..."
# Wait for statefulset to be ready (equivalent to kubectl rollout status)
local max_wait=300 # 5 minutes timeout
local waited=0
while [[ $waited -lt $max_wait ]]; do
# Get statefulset status via K8s API
local sts_status=$(wget -qO- \
--ca-certificate=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \
--header="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
https://kubernetes.default.svc.cluster.local/apis/apps/v1/namespaces/{{ .Release.Namespace }}/statefulsets/maas-region 2>/dev/null)
# jq defaults (// 0) keep the comparisons below sane when a poll fails
# and sts_status is empty.
local replicas=$(echo "$sts_status" | jq -r '.spec.replicas // 0')
local ready_replicas=$(echo "$sts_status" | jq -r '.status.readyReplicas // 0')
local current_replicas=$(echo "$sts_status" | jq -r '.status.currentReplicas // 0')
local updated_replicas=$(echo "$sts_status" | jq -r '.status.updatedReplicas // 0')
echo "Statefulset status: replicas=${replicas}, ready=${ready_replicas}, current=${current_replicas}, updated=${updated_replicas}"
# Check if rollout is complete; the != 0 guard rejects the "no data yet"
# case where every field defaulted to 0.
if [[ "$ready_replicas" == "$replicas" ]] && [[ "$updated_replicas" == "$replicas" ]] && [[ "$current_replicas" == "$replicas" ]] && [[ "$replicas" != "0" ]]; then
echo "Statefulset rollout completed successfully!"
return 0
fi
sleep 5
waited=$((waited + 5))
if [[ $((waited % 30)) -eq 0 ]]; then
echo "Still waiting for rollout... (${waited}s elapsed)"
fi
done
echo "WARNING: Timeout waiting for statefulset rollout after ${max_wait}s"
return 1
}
function restart_stalled_import {
    # Recover a stalled boot-resource import:
    #   1. stop the current import (best effort),
    #   2. purge stale boot-resource metadata from the region DB,
    #   3. rollout-restart the maas-region statefulset,
    #   4. wait for the region API, re-login, and start a fresh import.
    # Globals read: ADMIN_USERNAME, MAAS_ENDPOINT.
    echo "Attempting to restart stalled import..."
    echo "Stopping current import..."
    maas ${ADMIN_USERNAME} boot-resources stop-import || true
    sleep 15
    # Clean boot resource metadata that may be causing issues
    clean_boot_resources
    # Restart the region statefulset to clear stuck state
    restart_region_statefulset
    echo "Waiting for region to be ready after restart..."
    # Fix: counters are now `local` so they no longer leak into the
    # caller's scope (every sibling function scopes its loop counters).
    local max_wait=90
    local waited=0
    until curl -sf ${MAAS_ENDPOINT}/MAAS/ > /dev/null 2>&1; do
        sleep 3
        waited=$((waited + 3))
        if [[ $waited -gt $max_wait ]]; then
            echo "WARNING: Region may not be fully ready yet, proceeding anyway"
            break
        fi
        if [[ $((waited % 15)) -eq 0 ]]; then
            echo "Still waiting for region API... (${waited}s elapsed)"
        fi
    done
    echo "Re-establishing MAAS session after restart..."
    maas_login
    echo "Restarting import after region restart and cleanup..."
    maas ${ADMIN_USERNAME} boot-resources import
    sleep 10
    echo "Import restarted at $(date)"
}
function check_then_set_single {
option="$1"
value="$2"

View File

@@ -18,15 +18,18 @@
set -ex
function check_boot_images {
    # Helm-test helper: report whether ubuntu boot images have finished
    # syncing on this MAAS ("local" profile).
    # Returns 0 when at least one ubuntu image shows "Synced", 1 while an
    # import is running or no synced image exists yet.
    local is_importing
    is_importing=$(maas local boot-resources is-importing 2>/dev/null || echo "false")
    # NOTE: `grep -c` prints "0" AND exits non-zero on no match, so the
    # previous `|| echo "0"` produced two lines ("0\n0") and broke the
    # [[ -gt ]] arithmetic below. Capture + default instead.
    local synced_imgs
    synced_imgs=$(maas local boot-resources read 2>/dev/null | tr -d '\n' | grep -oE '{[^}]+}' | grep ubuntu | grep -c Synced) || true
    synced_imgs=${synced_imgs:-0}
    if echo "$is_importing" | grep -q 'true'; then
        echo "Boot resources currently importing... (synced: $synced_imgs)"
        return 1
    elif [[ $synced_imgs -gt 0 ]]; then
        echo "Boot resources have completed importing ($synced_imgs images synced)"
        return 0
    else
        echo "No synced boot images found yet (import status: not importing)"
        return 1
    fi
}
@@ -52,8 +55,20 @@ function check_admin_api {
}
function establish_session {
    # Log in to the local MAAS endpoint, retrying transient failures
    # (e.g. region API not fully up yet) up to 10 times, 5s apart.
    # Globals read: MAAS_URL, MAAS_API_KEY.
    # Returns: 0 on success, 1 after exhausting retries.
    # Fix: stray pre-retry-loop residue removed and the retry counters are
    # now `local` so they do not leak into the calling script's scope.
    echo "Attempting to establish MAAS session at ${MAAS_URL}..."
    local retry_count=0
    local max_retries=10
    until maas login local ${MAAS_URL} ${MAAS_API_KEY}; do
        retry_count=$((retry_count + 1))
        if [[ $retry_count -ge $max_retries ]]; then
            echo "Failed to establish MAAS session after $max_retries attempts"
            return 1
        fi
        echo "Session login failed, retrying... (attempt $retry_count/$max_retries)"
        sleep 5
    done
    echo "MAAS session established successfully"
    return 0
}
# Import CA Certificate
@@ -68,23 +83,48 @@ if [[ $? -ne 0 ]]; then
exit 1
fi
# Wait for rack controllers to register first (max 10 minutes).
# Fix: removed interleaved old-revision residue (stray check_boot_images /
# check_rack_controllers / check_admin_api calls and `$?` checks that
# tested the wrong command), which left the script checking exit codes of
# unrelated statements.
echo "Waiting for rack controllers to register..."
retry_count=0
max_retries=60 # 60 * 10 seconds = 10 minutes
until check_rack_controllers; do
    retry_count=$((retry_count + 1))
    if [[ $retry_count -ge $max_retries ]]; then
        echo "Rack controller query FAILED! Timeout after 10 minutes."
        echo "This usually means the rack controller pods are not running or cannot connect to the region."
        exit 1
    fi
    if [[ $((retry_count % 6)) -eq 0 ]]; then
        echo "Rack controllers not ready yet, waiting... (attempt $retry_count/$max_retries, elapsed: $((retry_count * 10 / 60)) minutes)"
    fi
    sleep 10
done
# Wait for boot images to complete importing (max 20 minutes)
# The import job should handle any stalls, we just verify the result
echo "Waiting for boot images to complete importing..."
retry_count=0
max_retries=120 # 120 * 10 seconds = 20 minutes
until check_boot_images; do
    retry_count=$((retry_count + 1))
    if [[ $retry_count -ge $max_retries ]]; then
        echo "Image import test FAILED! Timeout after 20 minutes."
        echo "The import job may have failed or is still running."
        echo "Check the 'maas-import-resources' job logs for details."
        exit 1
    fi
    if [[ $((retry_count % 6)) -eq 0 ]]; then
        echo "Boot images not ready yet, waiting... (attempt $retry_count/$max_retries, elapsed: $((retry_count * 10 / 60)) minutes)"
    fi
    sleep 10
done
# Verify admin API is still responding
if ! check_admin_api; then
    echo "Admin API response FAILED!"
    exit 1
fi

View File

@@ -19,6 +19,50 @@ limitations under the License.
{{- $serviceAccountName := "maas-import-resources" }}
{{ tuple $envAll "import_resources" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-job
  namespace: {{ $envAll.Release.Namespace }}
  annotations:
{{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: {{ $envAll.Release.Name }}-{{ $envAll.Release.Namespace }}-{{ $serviceAccountName }}-job
subjects:
  - kind: ServiceAccount
    name: {{ $serviceAccountName }}
    namespace: {{ $envAll.Release.Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: {{ $envAll.Release.Name }}-{{ $envAll.Release.Namespace }}-{{ $serviceAccountName }}-job
  namespace: {{ $envAll.Release.Namespace }}
rules:
  - apiGroups:
      - ""
      - extensions
      - batch
      - apps
    verbs:
      - get
      - list
    resources:
      - services
      - endpoints
      - jobs
      - pods
  # The import job both PATCHes the maas-region statefulset (rollout
  # restart annotation) and GETs it to poll rollout status; without
  # 'get' the wait loop in restart_region_statefulset receives 403s.
  - apiGroups:
      - apps
    verbs:
      - get
      - patch
    resources:
      - statefulsets
    resourceNames:
      - maas-region
---
apiVersion: batch/v1
kind: Job
metadata: