Files
distcloud/distributedcloud/scripts/execute-restore.sh
Gustavo Herzmann 5ae0ba1407 Optimize IPMI SEL event monitoring
This commit optimizes the SEL event monitoring script used during
subcloud enrollment. The script now runs a single ipmitool sel list -v
command instead of invoking ipmitool sel list followed by multiple
ipmitool sel get calls.

A new mode has been added to the monitoring script that monitors for
all success events, returning the last one, allowing the caller to
detect if multiple events were sent and skip directly to the latest
completed phase. If a failure event is detected, the script exits
immediately.

The IPMI codes used by auto-restore and enrollment have been updated
from OEM-reserved codes to the “System Event” sensor type, which the
BMC never interprets as an error.

Additionally, the script now stops sending the expected_region_name
restore override for factory restores, since the factory-installed
subcloud region name is not expected to match the system controller's
region name.

Test Plan:
01. PASS – Run subcloud enrollment and verify the following:
    - Events sent sequentially with delays are returned individually.
    - Events sent quickly without delays return only the latest event
      captured in the polling interval.
    - A failure event is returned immediately when detected.
    - Success events followed by a failure event return the failure
      event as soon as it appears.
    - If no events are detected, the script exits with a timeout error.
02. PASS – Run subcloud auto-restore and factory restore to verify the
    new codes are used and that both operations complete successfully.
03. PASS – Time the full enroll-init operation on a slow BMC and verify
    it completes faster than the previous implementation.

Story: 2011455
Task: 52858

Change-Id: Ib379e4e4c8308361357914f21fd8d73ab5bafbb4
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
2025-09-29 17:16:53 -03:00

473 lines
16 KiB
Bash

#!/usr/bin/env bash
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Subcloud auto-restore script that orchestrates platform restoration across
# system reboots. Executed by systemd when /opt/platform-backup/auto-restore
# directory is present after subcloud installation.
#
# Restore workflow:
# - First boot: Send install success IPMI event, discover backup file,
# execute restore playbook, and unlock controller
# - Second boot: Run system restore-complete and send restore complete IPMI event
#
# The auto-restore directory is removed after the script is executed to stop
# the systemd service from triggering again after reboot.
# Strict mode: abort on unhandled command failures, on expansion of unset
# variables, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Fixed paths used throughout the script. All of them are read-only.
# Directory holding the auto-restore configuration and state flag.
declare -r CONFIG_DIR="/opt/platform-backup/auto-restore"
# Restore override values passed to the restore playbook.
declare -r RESTORE_CONFIG="${CONFIG_DIR}/backup_restore_values.yml"
# Marker created once the restore playbook has finished.
declare -r RESTORE_PLAYBOOK_COMPLETE_FLAG="${CONFIG_DIR}/.restore_playbook_complete"
# File that receives every log line and captured command output.
declare -r LOG_FILE="/var/log/auto-restore.log"
# Credentials file sourced before running `system` commands.
declare -r OPENRC_FILE="/etc/platform/openrc"
# Playbook that performs the platform restore.
declare -r ANSIBLE_PLAYBOOK="/usr/share/ansible/stx-ansible/playbooks/restore_platform.yml"
# Write a timestamped log line to stdout and append it to $LOG_FILE.
#
# Arguments:
#   $1 - message text
#   $2 - severity label (optional, defaults to INFO)
log() {
  local message="$1"
  local severity="${2:-INFO}"
  local entry
  # -1 asks printf's %(...)T conversion for the current time.
  printf -v entry '%(%F %T)T [%s] %s' -1 "$severity" "$message"
  printf '%s\n' "$entry" | tee -a "$LOG_FILE"
}
# Send one of the predefined auto-restore events to the IPMI System Event Log.
#
# The event record is written to a temporary file which _send_ipmi_command
# hands to `ipmitool sel add`; sending is retried up to 3 times, 5s apart.
#
# Arguments:
#   $1 - event name: install_success, restore_complete, restore_failed,
#        restore_failed_backup_missing, restore_failed_images_missing or
#        restore_failed_both_missing
# Returns:
#   0 if the event was sent; 1 for an unknown event name, if the temporary
#   event file could not be created/written, or if every attempt failed.
send_ipmi_event() {
  local event_type="$1"
  local event_code event_description
  case "$event_type" in
    "install_success")
      event_code="0xF0"
      event_description="Install Completed"
      ;;
    "restore_complete")
      event_code="0xF1"
      event_description="Restore Completed"
      ;;
    "restore_failed")
      event_code="0xF2"
      event_description="Restore Failed"
      ;;
    "restore_failed_backup_missing")
      event_code="0xF3"
      event_description="Restore Failed: missing backup file"
      ;;
    "restore_failed_images_missing")
      event_code="0xF4"
      event_description="Restore Failed: missing container images backup file"
      ;;
    "restore_failed_both_missing")
      event_code="0xF5"
      event_description="Restore Failed: missing backup and container images backup files"
      ;;
    *)
      log "Unknown IPMI event type: $event_type" "ERROR"
      return 1
      ;;
  esac

  # All events share the same leading bytes; only the last byte and the
  # trailing human-readable comment differ.
  local event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 ${event_code} # \"${event_description}\""

  # temp_file is local so it does not leak into the global scope; it is still
  # visible to _send_ipmi_command (called through retry) because bash locals
  # are dynamically scoped.
  local temp_file
  if ! temp_file=$(mktemp /tmp/ipmi_event_XXXXXX.txt); then
    log "Failed to create temporary file for IPMI event: $event_type" "ERROR"
    return 1
  fi
  if ! printf '%s\n' "$event_data" > "$temp_file"; then
    log "Failed to write temporary file for IPMI event: $event_type" "ERROR"
    rm -f "$temp_file"
    return 1
  fi

  local send_status=0
  retry "send IPMI event ($event_type)" 3 5 "_send_ipmi_command" || send_status=1
  if (( send_status == 0 )); then
    log "IPMI event sent successfully: $event_type"
  else
    log "Failed to send IPMI event after retries: $event_type" "ERROR"
  fi
  rm -f "$temp_file"
  return "$send_status"
}
# Delete the auto-restore directory ($CONFIG_DIR) so the service does not
# trigger again on the next boot. A removal failure is only logged.
cleanup() {
  log "Removing auto-restore directory to prevent future triggers"
  if ! rm -rf "$CONFIG_DIR"; then
    log "Failed to remove auto-restore directory" "WARN"
  fi
}
# Run a function repeatedly until it succeeds or the attempt budget is spent.
#
# Arguments:
#   $1 - human-readable operation name used in log messages
#   $2 - maximum number of attempts
#   $3 - delay in seconds between attempts
#   $4 - name of the function/command to run (invoked without arguments)
# Returns:
#   0 as soon as one attempt succeeds, 1 if all attempts fail.
retry() {
  local -r operation="$1" max_attempts="$2" delay="$3"
  local -r command_func="$4"
  local attempt
  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    log "Attempting $operation (attempt $attempt/$max_attempts)"
    if "$command_func"; then
      log "$operation completed successfully"
      return 0
    fi
    log "$operation failed (attempt $attempt)" "WARN"
    # No point in waiting after the final attempt.
    if (( attempt == max_attempts )); then
      break
    fi
    log "Retrying in ${delay}s..."
    sleep "$delay"
  done
  log "$operation failed after $max_attempts attempts" "ERROR"
  return 1
}
# Source $OPENRC_FILE into the current shell.
# Returns 1 (after logging an error) when sourcing fails.
_source_openrc() {
  # shellcheck disable=SC1090
  if ! source "$OPENRC_FILE"; then
    log "Failed to source openrc" "ERROR"
    return 1
  fi
}
# Unlock controller-0 via `system host-unlock`, appending the command's
# stdout and stderr to $LOG_FILE.
# Returns 0 on success; logs an error and returns 1 otherwise.
_unlock_host() {
  system host-unlock controller-0 >> "$LOG_FILE" 2>&1 && return 0
  log "Host unlock failed" "ERROR"
  return 1
}
# Run `system restore-complete` once and report whether the restore finished.
#
# Any output of the command is appended to $LOG_FILE.
# Returns:
#   0 only when the command exits 0 AND its output contains
#   "Restore procedure completed"; 1 otherwise.
_restore_complete() {
  local output
  local exit_code=0
  # restore-complete doesn't return a non-zero exit code if the command fails
  # because the restore is still in progress, so we need to parse its output
  # instead. The "|| exit_code=$?" keeps the status capture safe even if this
  # function is ever called with errexit in effect.
  output=$(system restore-complete 2>&1) || exit_code=$?
  if [[ -n "$output" ]]; then
    printf '%s\n' "$output" >> "$LOG_FILE"
  fi
  if (( exit_code != 0 )); then
    log "system restore-complete command failed with exit code $exit_code" "ERROR"
    return 1
  fi
  # Feed grep from a here-string rather than a pipe: with pipefail enabled,
  # `echo ... | grep -q` can report failure when grep exits on the first match
  # and the writer is killed by SIGPIPE (large output).
  if grep -q "Restore procedure completed" <<< "$output"; then
    return 0
  fi
  return 1
}
# Add the event described in $temp_file to the IPMI SEL using
# `ipmitool sel add`, appending the tool's stdout/stderr to $LOG_FILE.
#
# $temp_file is not an argument: it is set by the caller (send_ipmi_event)
# because retry invokes this function without arguments.
# Returns ipmitool's exit status, or 1 if $temp_file is not set.
_send_ipmi_command() {
  if [[ -z "${temp_file:-}" ]]; then
    log "No IPMI event file set; nothing to send" "ERROR"
    return 1
  fi
  # Redirect straight into the log (as _unlock_host does) so the result is
  # ipmitool's own status and does not depend on pipefail being enabled.
  ipmitool sel add "$temp_file" >> "$LOG_FILE" 2>&1
}
# Run the restore playbook with the values in $RESTORE_CONFIG.
#
# HOME is exported as /home/sysadmin and also passed to the playbook as
# override_files_dir. Playbook output is appended to $LOG_FILE.
# Returns 0 on success; on failure logs an error, sends the restore_failed
# IPMI event and returns 1.
run_restore_playbook() {
  log "Starting automatic restore process"
  export HOME=/home/sysadmin
  local -a playbook_args=(
    "$ANSIBLE_PLAYBOOK"
    -e "@${RESTORE_CONFIG}"
    -e "override_files_dir=${HOME}"
  )
  if ansible-playbook "${playbook_args[@]}" >> "$LOG_FILE" 2>&1; then
    return 0
  fi
  log "Restore playbook failed" "ERROR"
  send_ipmi_event "restore_failed"
  return 1
}
# Make sure backup_filename is present in $RESTORE_CONFIG.
#
# If the key is already there nothing is changed. Otherwise the backup
# directory is scanned (non-recursively) for exactly one file matching
#   *factory_backup*.tgz      when auto_restore_mode is "factory"
#   *_platform_backup_*.tgz   in every other case
# and its basename is written, double-quoted, as backup_filename.
#
# Returns:
#   0 if backup_filename was already set or has been set; 1 if the backup
#   directory is unavailable, no or several candidates were found, or the
#   config could not be updated.
find_and_set_backup_filename() {
  log "Checking if backup_filename is already set in config..."
  if grep -q "^backup_filename:" "$RESTORE_CONFIG"; then
    local existing_filename
    existing_filename=$(grep "^backup_filename:" "$RESTORE_CONFIG" | sed 's/backup_filename: *//' | tr -d '"' | tr -d "'")
    log "backup_filename already set to: $existing_filename"
    return 0
  fi
  log "backup_filename not found in config, scanning for backup file..."

  # Get and validate backup directory
  local backup_dir
  if ! backup_dir=$(get_backup_directory); then
    return 1
  fi

  # auto_restore_mode is optional; "|| true" keeps a missing key from turning
  # into a pipeline failure under errexit/pipefail.
  local auto_restore_mode
  auto_restore_mode=$(grep "^auto_restore_mode:" "$RESTORE_CONFIG" 2>/dev/null | sed 's/auto_restore_mode: *//' | tr -d '"' | tr -d "'") || true

  # For factory auto-restore, we need to look for a backup file matching
  # the *factory_backup*.tgz pattern
  local backup_pattern
  if [[ "$auto_restore_mode" == "factory" ]]; then
    backup_pattern="*factory_backup*.tgz"
    log "Factory auto-restore mode, searching for pattern: $backup_pattern"
  else
    backup_pattern="*_platform_backup_*.tgz"
    log "Standard auto-restore mode, searching for pattern: $backup_pattern"
  fi

  log "Scanning backup directory: $backup_dir"
  local -a backup_files
  mapfile -t backup_files < <(find "$backup_dir" -maxdepth 1 -name "$backup_pattern" -type f)
  if [[ ${#backup_files[@]} -eq 0 ]]; then
    log "No backup files found matching pattern $backup_pattern in $backup_dir" "ERROR"
    return 1
  elif [[ ${#backup_files[@]} -gt 1 ]]; then
    log "Multiple backup files found in $backup_dir:" "ERROR"
    # Loop variable is local so it does not leak into the global scope.
    local file
    for file in "${backup_files[@]}"; do
      log " - $(basename "$file")" "ERROR"
    done
    return 1
  fi

  # Set backup_filename in the config file
  local backup_filename
  backup_filename=$(basename "${backup_files[0]}")
  log "Found backup file: $backup_filename"
  if ! set_config_value "backup_filename" "\"$backup_filename\"" "$RESTORE_CONFIG"; then
    log "Failed to set backup_filename in config" "ERROR"
    return 1
  fi
  return 0
}
# Set "key: value" in a flat YAML-style config file.
#
# An existing line starting with "<key>:" is rewritten in place; otherwise a
# new "<key>: <value>" line is appended.
#
# Arguments:
#   $1 - key (used verbatim inside a sed pattern; expected to be a plain word)
#   $2 - value, written as given (the caller adds any quoting it wants)
#   $3 - path of the config file
# Returns:
#   0 on success, 1 if the file could not be updated.
set_config_value() {
  local key="$1"
  local value="$2"
  local config_file="$3"
  if grep -q "^${key}:" "$config_file"; then
    # Update existing value. Escape the characters that are special in a sed
    # replacement (backslash, the "/" delimiter and "&") so values such as
    # paths are written literally instead of breaking the substitution.
    local escaped_value
    escaped_value=$(printf '%s\n' "$value" | sed -e 's|[\\/&]|\\&|g')
    if ! sed -i "s/^${key}:.*/${key}: ${escaped_value}/" "$config_file"; then
      log "Failed to update ${key} in config" "ERROR"
      return 1
    fi
    log "Updated ${key} to ${value} in config"
  else
    # Add new value. If the file does not end with a newline, add one first
    # so the new entry is not glued onto the last existing line.
    if [[ -s "$config_file" && -n "$(tail -c 1 "$config_file")" ]]; then
      printf '\n' >> "$config_file"
    fi
    if ! printf '%s\n' "${key}: ${value}" >> "$config_file"; then
      log "Failed to add ${key} to config" "ERROR"
      return 1
    fi
    log "Added ${key}: ${value} to config"
  fi
}
# Print the backup directory configured as initial_backup_dir in
# $RESTORE_CONFIG (surrounding quotes stripped).
#
# Outputs:
#   stdout - the directory path, and nothing else
#   stderr - error log lines (they are still appended to the log file by log)
# Returns:
#   0 if the key is set and names an existing directory, 1 otherwise.
get_backup_directory() {
  # Extract initial_backup_dir from the config. "|| true" lets a missing key
  # fall through to the explicit check below instead of failing the pipeline.
  local backup_dir
  backup_dir=$(grep "^initial_backup_dir:" "$RESTORE_CONFIG" | sed 's/initial_backup_dir: *//' | tr -d '"' | tr -d "'") || true

  # Callers capture this function's stdout with $(...), so diagnostics are
  # sent to stderr to keep them out of the captured value.
  if [[ -z "$backup_dir" ]]; then
    log "initial_backup_dir not found in config" "ERROR" >&2
    return 1
  fi
  if [[ ! -d "$backup_dir" ]]; then
    log "Backup directory does not exist: $backup_dir" "ERROR" >&2
    return 1
  fi
  printf '%s\n' "$backup_dir"
  return 0
}
# Print the SW_VERSION value from /etc/build.info with double quotes removed.
# Prints an empty line when no SW_VERSION entry can be read.
get_software_version() {
  local build_info="/etc/build.info"
  local raw_value
  raw_value=$(grep "^SW_VERSION=" "$build_info" | cut -d'=' -f2)
  # Drop every double quote from the extracted field.
  echo "${raw_value//\"/}"
}
# Look for an image registry backup in the backup directory and record the
# outcome in $RESTORE_CONFIG.
#
# - exactly one *_image_registry_backup_*.tgz file:
#     restore_registry_filesystem: true, registry_backup_filename: <basename>
# - no such file:
#     restore_registry_filesystem: false
# - several such files: error, config left untouched
#
# Returns:
#   0 when the config was updated (backup found or not); 1 if the backup
#   directory is unavailable, several backups exist, or the config could not
#   be updated.
check_and_set_registry_restore() {
  log "Checking for image registry backup file..."

  # Get and validate backup directory
  local backup_dir
  if ! backup_dir=$(get_backup_directory); then
    return 1
  fi

  log "Scanning backup directory for image registry backup: $backup_dir"
  # Find image registry backup files matching the pattern *_image_registry_backup_*.tgz
  local -a registry_backup_files
  mapfile -t registry_backup_files < <(find "$backup_dir" -maxdepth 1 -name "*_image_registry_backup_*.tgz" -type f)

  if [[ ${#registry_backup_files[@]} -eq 1 ]]; then
    local registry_backup_filename
    registry_backup_filename=$(basename "${registry_backup_files[0]}")
    log "Found image registry backup file: $registry_backup_filename"
    set_config_value "restore_registry_filesystem" "true" "$RESTORE_CONFIG" || return 1
    set_config_value "registry_backup_filename" "$registry_backup_filename" "$RESTORE_CONFIG" || return 1
  elif [[ ${#registry_backup_files[@]} -gt 1 ]]; then
    log "Multiple image registry backup files found in $backup_dir:" "ERROR"
    # Loop variable is local so it does not leak into the global scope.
    local file
    for file in "${registry_backup_files[@]}"; do
      log " - $(basename "$file")" "ERROR"
    done
    return 1
  else
    log "No image registry backup files found matching pattern *_image_registry_backup_*.tgz"
    # We set restore_registry_filesystem to false so the restore playbook attempts
    # to use the prestaged registry data instead of the registry backup file.
    set_config_value "restore_registry_filesystem" "false" "$RESTORE_CONFIG" || return 1
  fi
  return 0
}
# Check whether prestaged container image data is available.
#
# The directory inspected is the backup directory when auto_restore_mode is
# "factory", and /opt/platform-backup/<software version> otherwise. Data
# counts as available when that directory contains
# local_registry_filesystem.tgz or at least one container-image*.tar.gz file.
#
# Returns:
#   0 if prestaged data was found; 1 if none was found or (factory mode) the
#   backup directory is unavailable.
check_prestaged_images() {
  log "Checking for prestaged container images..."

  # auto_restore_mode is optional; "|| true" keeps a missing key from turning
  # into a pipeline failure under errexit/pipefail.
  local auto_restore_mode
  auto_restore_mode=$(grep "^auto_restore_mode:" "$RESTORE_CONFIG" 2>/dev/null | sed 's/auto_restore_mode: *//' | tr -d '"' | tr -d "'") || true

  # Factory auto-restore prestaged data is stored in the factory backup directory
  local prestage_dir
  if [[ "$auto_restore_mode" == "factory" ]]; then
    if ! prestage_dir=$(get_backup_directory); then
      return 1
    fi
    log "Factory auto-restore mode: checking for prestaged images in: $prestage_dir"
  else
    local software_version
    software_version=$(get_software_version)
    prestage_dir="/opt/platform-backup/${software_version}"
    log "Standard auto-restore mode: checking for prestaged images in: $prestage_dir"
  fi

  # Check for prestaged registry filesystem file
  local prestaged_registry_file="${prestage_dir}/local_registry_filesystem.tgz"
  local registry_found=false
  if [[ -f "$prestaged_registry_file" ]]; then
    log "Found prestaged registry file: $prestaged_registry_file"
    registry_found=true
  fi

  # Check for container image files (find errors, e.g. a missing directory,
  # are deliberately silenced and simply yield an empty list)
  local -a container_images
  mapfile -t container_images < <(find "$prestage_dir" -maxdepth 1 -name "container-image*.tar.gz" -type f 2>/dev/null)
  local containers_found=false
  if [[ ${#container_images[@]} -gt 0 ]]; then
    log "Found ${#container_images[@]} container image file(s):"
    # Loop variable is local so it does not leak into the global scope.
    local file
    for file in "${container_images[@]}"; do
      log " - $(basename "$file")"
    done
    containers_found=true
  fi

  if [[ "$registry_found" == true || "$containers_found" == true ]]; then
    return 0
  else
    log "No prestaged images found in: $prestage_dir"
    return 1
  fi
}
# Verify that everything needed for the restore is present.
#
# Two things are checked: the platform backup file, and a source of container
# images (either a registry backup file or prestaged images). When something
# is missing, the matching restore_failed_* IPMI event is sent.
#
# Returns 0 if both are available, 1 otherwise.
validate_restore_prerequisites() {
  log "Validating restore prerequisites..."

  # Platform backup file
  local backup_available=true
  if find_and_set_backup_filename; then
    log "Platform backup file validation: PASSED"
  else
    backup_available=false
    log "Platform backup file validation: FAILED" "ERROR"
  fi

  # Container images: registry backup first, prestaged data as fallback
  local registry_available=false
  if ! check_and_set_registry_restore; then
    # check_and_set_registry_restore returned 1: it could not determine
    # whether a registry backup exists.
    log "Registry restore validation: FAILED" "ERROR"
  elif grep -q "^restore_registry_filesystem: true" "$RESTORE_CONFIG"; then
    # check_and_set_registry_restore writes restore_registry_filesystem: true
    # when it finds the container images backup file.
    registry_available=true
    log "Registry backup file validation: PASSED"
  elif check_prestaged_images; then
    # No registry backup, but prestaged registry data is available.
    registry_available=true
    log "Prestaged images validation: PASSED"
  else
    log "Registry backup and prestaged images validation: FAILED" "ERROR"
  fi

  # Report the failure that matches what is missing
  case "${backup_available}/${registry_available}" in
    false/false)
      log "Both platform backup and container images are missing" "ERROR"
      send_ipmi_event "restore_failed_both_missing"
      return 1
      ;;
    false/true)
      log "Platform backup file is missing" "ERROR"
      send_ipmi_event "restore_failed_backup_missing"
      return 1
      ;;
    true/false)
      log "Container images (backup file and prestaged) are missing" "ERROR"
      send_ipmi_event "restore_failed_images_missing"
      return 1
      ;;
  esac

  log "All restore prerequisites validated successfully"
  return 0
}
# First-boot phase: announce the install, validate prerequisites, run the
# restore playbook, then unlock the controller.
#
# Exits the script with status 1 if validation, the playbook, sourcing openrc
# or the unlock fails. The auto-restore directory is cleaned up only for
# validation and playbook failures.
handle_first_boot() {
  send_ipmi_event "install_success"

  # The IPMI monitor script polls every 30s, and initially it's looking for
  # the install_success event, so we add a 60s pause so the system controller
  # has time to detect the install_success and start to look for the restore
  # events. Otherwise, if the restore events are sent too soon, there's a
  # possibility the system controller would miss the event.
  log "Waiting 60 seconds for IPMI monitoring transition..."
  sleep 60

  validate_restore_prerequisites || {
    cleanup
    exit 1
  }
  run_restore_playbook || {
    cleanup
    exit 1
  }
  log "Restore playbook completed successfully"

  # Swap the config file for the completion flag so the next boot is treated
  # as the second phase. Failures here are logged but not fatal.
  if ! rm -f "$RESTORE_CONFIG"; then
    log "Failed to remove config file" "WARN"
  fi
  if ! touch "$RESTORE_PLAYBOOK_COMPLETE_FLAG"; then
    log "Failed to create flag" "WARN"
  fi

  if ! retry "source openrc" 10 10 "_source_openrc" ||
    ! retry "host unlock" 10 30 "_unlock_host"; then
    exit 1
  fi
  log "Host unlock process completed successfully"
}
# Second-boot phase: finish the restore with `system restore-complete`.
#
# On success sends the restore_complete IPMI event, removes the auto-restore
# directory and disables dc-auto-restore.service. On failure sends
# restore_failed and exits the script with status 1.
handle_second_boot() {
  log "Detected post-unlock boot, running 'system restore-complete'..."

  if ! retry "source openrc" 10 10 "_source_openrc" ||
    ! retry "system restore-complete" 15 10 "_restore_complete"; then
    send_ipmi_event "restore_failed"
    exit 1
  fi

  log "System restore-complete executed successfully"
  send_ipmi_event "restore_complete"
  cleanup
  systemctl disable dc-auto-restore.service
}
# Script entry point: pick the restore phase from the files present.
#
# - $RESTORE_CONFIG exists: first boot (takes precedence)
# - otherwise $RESTORE_PLAYBOOK_COMPLETE_FLAG exists: second boot
# - neither: nothing to do, just clean up
main() {
  # Always log the exit code on exit, and the line number on ERR.
  trap 'log "Script exited with code $?."' EXIT
  trap 'log "An error occurred on line $LINENO." "ERROR"' ERR

  log "===== Starting auto-restore script ====="

  local phase="none"
  if [[ -f "$RESTORE_PLAYBOOK_COMPLETE_FLAG" ]]; then
    phase="second"
  fi
  # The config file wins over the flag when both are present.
  if [[ -f "$RESTORE_CONFIG" ]]; then
    phase="first"
  fi

  case "$phase" in
    first)
      handle_first_boot
      ;;
    second)
      handle_second_boot
      ;;
    *)
      log "No auto-restore config or flag found - nothing to do"
      cleanup
      ;;
  esac

  log "===== Auto-restore script completed successfully ====="
}
# Run the restore workflow, forwarding any command-line arguments to main.
main "$@"