This commit optimizes the SEL event monitoring script used during
subcloud enrollment. The script now runs a single ipmitool sel list -v
command instead of invoking ipmitool sel list followed by multiple
ipmitool sel get calls.
A new mode has been added to the monitoring script that monitors for
all success events, returning the last one, allowing the caller to
detect if multiple events were sent and skip directly to the latest
completed phase. If a failure event is detected, the script exits
immediately.
The IPMI codes used by auto-restore and enrollment have been updated
from OEM-reserved codes to the “System Event” sensor type, which the
BMC never interprets as an error.
Additionally, the script now stops sending the expected_region_name
restore override for factory restores, since the factory-installed
subcloud region name is not expected to match the system controller's
region name.
Test Plan:
01. PASS – Run subcloud enrollment and verify the following:
- Events sent sequentially with delays are returned individually.
- Events sent quickly without delays return only the latest event
captured in the polling interval.
- A failure event is returned immediately when detected.
- Success events followed by a failure event return the failure
event as soon as it appears.
- If no events are detected, the script exits with a timeout error.
02. PASS – Run subcloud auto-restore and factory restore to verify the
new codes are used and that both operations complete successfully.
03. PASS – Time the full enroll-init operation on a slow BMC and verify
it completes faster than the previous implementation.
Story: 2011455
Task: 52858
Change-Id: Ib379e4e4c8308361357914f21fd8d73ab5bafbb4
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
473 lines
16 KiB
Bash
473 lines
16 KiB
Bash
#!/usr/bin/env bash
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

# Subcloud auto-restore script that orchestrates platform restoration across
# system reboots. Executed by systemd when /opt/platform-backup/auto-restore
# directory is present after subcloud installation.
#
# Restore workflow:
# - First boot: Send install success IPMI event, discover backup file,
#   execute restore playbook, and unlock controller
# - Second boot: Run system restore-complete and send restore complete IPMI event
#
# The auto-restore directory is removed after the script is executed to stop
# the systemd service from triggering again after reboot.
|
|
|
|
# Strict mode: exit on unhandled errors, on use of unset variables, and when
# any stage of a pipeline fails.
set -euo pipefail

# Directory holding the auto-restore configuration; removed by cleanup().
readonly CONFIG_DIR="/opt/platform-backup/auto-restore"
# Override values passed to the restore playbook.
readonly RESTORE_CONFIG="${CONFIG_DIR}/backup_restore_values.yml"
# Marker created once the restore playbook has finished (first boot done).
readonly RESTORE_PLAYBOOK_COMPLETE_FLAG="${CONFIG_DIR}/.restore_playbook_complete"
# File every log() message and most command output is appended to.
readonly LOG_FILE="/var/log/auto-restore.log"
# Credentials file sourced before running `system` commands.
readonly OPENRC_FILE="/etc/platform/openrc"
# Playbook executed by run_restore_playbook().
readonly ANSIBLE_PLAYBOOK="/usr/share/ansible/stx-ansible/playbooks/restore_platform.yml"
|
|
|
|
# Print a timestamped message to stdout and append it to $LOG_FILE.
#
# Arguments:
#   $1 - message text (an omitted message is logged as an empty string)
#   $2 - severity label, defaults to INFO
# Returns:
#   Always 0. Logging is best-effort: an unwritable log file or closed stdout
#   must not abort the restore under `set -e` / `pipefail`.
log() {
  local message="${1-}"
  local level="${2:-INFO}"
  local line

  # %(...)T with argument -1 formats the current time without forking `date`.
  printf -v line '%(%F %T)T [%s] %s' -1 "$level" "$message"

  printf '%s\n' "$line" || true
  # The group lets 2>/dev/null also swallow a failed redirection message.
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null || true
}
|
|
|
|
# Add a System Event Log record for the given auto-restore milestone.
#
# The record is written to a temporary file and handed to
# _send_ipmi_command through retry (3 attempts, 5s apart).
#
# Arguments:
#   $1 - event name: install_success, restore_complete, restore_failed,
#        restore_failed_backup_missing, restore_failed_images_missing or
#        restore_failed_both_missing
# Returns:
#   0 if the event was sent; 1 on an unknown event name, on a temp-file
#   failure, or when every attempt failed.
send_ipmi_event() {
  local event_type="$1"
  local event_data
  local status=0
  # Kept local so it does not leak into the global scope. Bash's dynamic
  # scoping still exposes it to _send_ipmi_command, which is called (through
  # retry) while this function is on the call stack.
  local temp_file

  # Each value is one record line for `ipmitool sel add`; the text after '#'
  # is a human-readable label.
  # NOTE(review): only the last byte differs between events - presumably
  # the monitoring side keys on it; confirm against the monitor script.
  case "$event_type" in
    "install_success")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF0 # \"Install Completed\""
      ;;
    "restore_complete")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF1 # \"Restore Completed\""
      ;;
    "restore_failed")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF2 # \"Restore Failed\""
      ;;
    "restore_failed_backup_missing")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF3 # \"Restore Failed: missing backup file\""
      ;;
    "restore_failed_images_missing")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF4 # \"Restore Failed: missing container images backup file\""
      ;;
    "restore_failed_both_missing")
      event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF5 # \"Restore Failed: missing backup and container images backup files\""
      ;;
    *)
      log "Unknown IPMI event type: $event_type" "ERROR"
      return 1
      ;;
  esac

  if ! temp_file=$(mktemp /tmp/ipmi_event_XXXXXX.txt); then
    log "Failed to create temporary file for IPMI event: $event_type" "ERROR"
    return 1
  fi

  if ! printf '%s\n' "$event_data" > "$temp_file"; then
    log "Failed to write IPMI event data for: $event_type" "ERROR"
    rm -f "$temp_file"
    return 1
  fi

  if retry "send IPMI event ($event_type)" 3 5 "_send_ipmi_command"; then
    log "IPMI event sent successfully: $event_type"
  else
    log "Failed to send IPMI event after retries: $event_type" "ERROR"
    status=1
  fi

  # Single removal point for both outcomes.
  rm -f "$temp_file"
  return "$status"
}
|
|
|
|
# Remove the auto-restore directory so the script finds neither its config
# nor its completion flag on a later run.
#
# A removal failure is logged as a warning; the function then returns the
# status of that log call.
cleanup() {
  log "Removing auto-restore directory to prevent future triggers"
  # ${VAR:?} refuses to run `rm -rf` on an empty path; `--` stops option
  # parsing should the path ever start with a dash.
  rm -rf -- "${CONFIG_DIR:?}" || log "Failed to remove auto-restore directory" "WARN"
}
|
|
|
|
# Run a command repeatedly until it succeeds or the attempts are exhausted.
#
# Arguments:
#   $1 - human-readable operation name used in log messages
#   $2 - maximum number of attempts
#   $3 - delay between attempts, passed to sleep
#   $4 - command or function to run
#   $5... - optional arguments forwarded to the command
# Returns:
#   0 as soon as one attempt succeeds, 1 if every attempt failed.
retry() {
  local -r operation="$1" max_attempts="$2" delay="$3"
  local -r command_func="$4"
  local attempt

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    log "Attempting $operation (attempt $attempt/$max_attempts)"

    # "${@:5}" expands to nothing when no extra arguments were given.
    if "$command_func" "${@:5}"; then
      log "$operation completed successfully"
      return 0
    fi

    log "$operation failed (attempt $attempt)" "WARN"

    # No point in sleeping after the final attempt.
    if (( attempt < max_attempts )); then
      log "Retrying in ${delay}s..."
      sleep "$delay"
    fi
  done

  log "$operation failed after $max_attempts attempts" "ERROR"
  return 1
}
|
|
|
|
# Source $OPENRC_FILE into the current shell.
#
# Takes no arguments so it can be used as a retry() command.
# Returns 0 on success, 1 if the file is unreadable or sourcing fails.
_source_openrc() {
  # Checked first so the log names the real cause, rather than relying on
  # the shell's own "No such file" message on every retry.
  if [[ ! -r "$OPENRC_FILE" ]]; then
    log "openrc file is not readable: $OPENRC_FILE" "ERROR"
    return 1
  fi

  # shellcheck disable=SC1090
  if ! source "$OPENRC_FILE"; then
    log "Failed to source openrc" "ERROR"
    return 1
  fi

  return 0
}
|
|
|
|
# Unlock controller-0 with `system host-unlock`, appending the command's
# stdout and stderr to $LOG_FILE.
#
# Takes no arguments so it can be used as a retry() command.
# Returns 0 if the command succeeded, 1 otherwise.
_unlock_host() {
  if ! system host-unlock controller-0 >> "$LOG_FILE" 2>&1; then
    log "Host unlock failed" "ERROR"
    return 1
  fi

  return 0
}
|
|
|
|
# Run `system restore-complete` and decide from its output whether the
# restore has actually finished.
#
# Takes no arguments so it can be used as a retry() command. Any command
# output is appended to $LOG_FILE.
# Returns 0 only if the command exited 0 AND printed
# "Restore procedure completed"; 1 otherwise.
_restore_complete() {
  local output
  local exit_code=0

  # restore-complete doesn't return a non-zero exit code if the command fails
  # because the restore is still in progress, so we need to parse its output
  # instead. `|| exit_code=$?` captures the status without tripping `set -e`
  # when this function is called outside a conditional.
  output=$(system restore-complete 2>&1) || exit_code=$?

  if [[ -n "$output" ]]; then
    printf '%s\n' "$output" >> "$LOG_FILE"
  fi

  if (( exit_code != 0 )); then
    log "system restore-complete command failed with exit code $exit_code" "ERROR"
    return 1
  fi

  # Substring match in the shell, not `echo | grep -q`: under pipefail that
  # pipeline can report failure when grep exits early and echo gets SIGPIPE.
  if [[ "$output" == *"Restore procedure completed"* ]]; then
    return 0
  fi

  return 1
}
|
|
|
|
# Add the record stored in $temp_file to the SEL with `ipmitool sel add`.
#
# Takes no arguments so it can be used as a retry() command; $temp_file is
# set by the caller (send_ipmi_event). ipmitool's stdout and stderr are
# appended to $LOG_FILE and not shown on the console.
# Returns 0 if ipmitool succeeded, 1 otherwise.
_send_ipmi_command() {
  # A direct redirect makes the result ipmitool's own exit status. The old
  # `| tee -a ... > /dev/null` form was only correct with `pipefail` set,
  # and also reported failure when the log file could not be written.
  if ipmitool sel add "$temp_file" >> "$LOG_FILE" 2>&1; then
    return 0
  fi

  return 1
}
|
|
|
|
# Run the platform restore playbook with the auto-restore overrides.
#
# Exports HOME=/home/sysadmin and passes it to the playbook as
# override_files_dir. Playbook output is appended to $LOG_FILE.
# Returns 0 if the playbook succeeded; on failure sends the restore_failed
# IPMI event and returns 1.
run_restore_playbook() {
  log "Starting automatic restore process"
  export HOME=/home/sysadmin

  if ansible-playbook "$ANSIBLE_PLAYBOOK" \
    -e "@${RESTORE_CONFIG}" \
    -e "override_files_dir=${HOME}" >> "$LOG_FILE" 2>&1; then
    return 0
  fi

  log "Restore playbook failed" "ERROR"
  # Best-effort report (send_ipmi_event logs its own failures); the
  # playbook failure must reach the caller either way.
  send_ipmi_event "restore_failed" || true
  return 1
}
|
|
|
|
# Make sure backup_filename is present in $RESTORE_CONFIG, discovering the
# backup file on disk when the config does not already name one.
#
# Search pattern inside the backup directory (top level only):
#   auto_restore_mode == factory -> *factory_backup*.tgz
#   anything else                -> *_platform_backup_*.tgz
# Returns:
#   0 if backup_filename was already set or exactly one file was found and
#   recorded; 1 if the backup directory is invalid, if zero or several files
#   match, or if the config could not be updated.
find_and_set_backup_filename() {
  local existing_filename backup_dir auto_restore_mode backup_pattern
  local backup_filename file
  local -a backup_files=()

  log "Checking if backup_filename is already set in config..."

  if grep -q "^backup_filename:" "$RESTORE_CONFIG"; then
    existing_filename=$(sed -n 's/^backup_filename: *//p' "$RESTORE_CONFIG" | tr -d "\"'")
    log "backup_filename already set to: $existing_filename"
    return 0
  fi

  log "backup_filename not found in config, scanning for backup file..."

  # Get and validate backup directory
  if ! backup_dir=$(get_backup_directory); then
    return 1
  fi

  # auto_restore_mode is optional: `sed -n` exits 0 on no match, and
  # `|| true` covers an unreadable file, so a missing key never trips
  # pipefail/errexit.
  auto_restore_mode=$(sed -n 's/^auto_restore_mode: *//p' "$RESTORE_CONFIG" 2>/dev/null | tr -d "\"'") || true

  # For factory auto-restore, we need to look for a backup file matching
  # the *factory_backup*.tgz pattern
  if [[ "$auto_restore_mode" == "factory" ]]; then
    backup_pattern="*factory_backup*.tgz"
    log "Factory auto-restore mode, searching for pattern: $backup_pattern"
  else
    backup_pattern="*_platform_backup_*.tgz"
    log "Standard auto-restore mode, searching for pattern: $backup_pattern"
  fi

  log "Scanning backup directory: $backup_dir"

  mapfile -t backup_files < <(
    find "$backup_dir" -maxdepth 1 -name "$backup_pattern" -type f
  )

  if (( ${#backup_files[@]} == 0 )); then
    log "No backup files found matching pattern $backup_pattern in $backup_dir" "ERROR"
    return 1
  fi

  # More than one candidate is ambiguous; refuse to guess.
  if (( ${#backup_files[@]} > 1 )); then
    log "Multiple backup files found in $backup_dir:" "ERROR"
    for file in "${backup_files[@]}"; do
      log "  - $(basename "$file")" "ERROR"
    done
    return 1
  fi

  # Set backup_filename in the config file
  backup_filename=$(basename "${backup_files[0]}")
  log "Found backup file: $backup_filename"

  # This function runs in an `if` condition, where `set -e` is inactive, so
  # a failed config write has to be propagated by hand.
  if ! set_config_value "backup_filename" "\"$backup_filename\"" "$RESTORE_CONFIG"; then
    log "Failed to record backup_filename in config" "ERROR"
    return 1
  fi

  return 0
}
|
|
|
|
# Set "key: value" in a flat YAML-style config file.
#
# If a line starting with "<key>:" exists it is rewritten in place, otherwise
# a new line is appended.
# Arguments:
#   $1 - key (used verbatim inside a regular expression; callers pass plain
#        identifiers)
#   $2 - value, written verbatim (callers add any quoting they want)
#   $3 - path of the config file
# Returns:
#   0 on success, 1 if the file could not be updated.
set_config_value() {
  local key="$1"
  local value="$2"
  local config_file="$3"
  local escaped_value

  if grep -q "^${key}:" "$config_file"; then
    # Update existing value. '/', '&' and '\' are special in a sed
    # replacement, so escape them or a value such as "a/b&c" would break
    # or corrupt the substitution.
    escaped_value=$(printf '%s' "$value" | sed -e 's/[\/&\\]/\\&/g') || return 1
    sed -i "s/^${key}:.*/${key}: ${escaped_value}/" "$config_file" || return 1
    log "Updated ${key} to ${value} in config"
  else
    # Add new value. If the file does not end with a newline, add one first
    # so the new entry is not glued onto the last existing line.
    if [[ -s "$config_file" && -n "$(tail -c 1 "$config_file")" ]]; then
      echo >> "$config_file" || return 1
    fi
    echo "${key}: ${value}" >> "$config_file" || return 1
    log "Added ${key}: ${value} to config"
  fi
}
|
|
|
|
# Print the directory named by initial_backup_dir in $RESTORE_CONFIG.
#
# Outputs:
#   stdout - the directory path, with surrounding quotes and trailing
#            whitespace removed (success only)
#   stderr - log messages; callers capture stdout with $(...), so they are
#            kept off it
# Returns:
#   0 if the key is set and names an existing directory, 1 otherwise.
get_backup_directory() {
  local backup_dir

  # Extract initial_backup_dir from the config
  backup_dir=$(sed -n 's/^initial_backup_dir: *//p' "$RESTORE_CONFIG" | tr -d "\"'") || true
  # Trim trailing whitespace (e.g. a stray CR) that would fail the -d test.
  backup_dir="${backup_dir%"${backup_dir##*[![:space:]]}"}"

  if [[ -z "$backup_dir" ]]; then
    log "initial_backup_dir not found in config" "ERROR" >&2
    return 1
  fi

  if [[ ! -d "$backup_dir" ]]; then
    log "Backup directory does not exist: $backup_dir" "ERROR" >&2
    return 1
  fi

  printf '%s\n' "$backup_dir"
  return 0
}
|
|
|
|
# Print the SW_VERSION value from a build-info file, double quotes removed.
#
# Arguments:
#   $1 - optional path of the build-info file (default: /etc/build.info)
# Outputs:
#   The version on stdout; an empty line if the key or file is missing.
# Returns:
#   Always 0.
get_software_version() {
  local build_info_file="${1:-/etc/build.info}"
  local version

  # `|| true`: a missing key makes grep exit 1, which under pipefail/errexit
  # would otherwise abort instead of yielding an empty version.
  version=$(grep "^SW_VERSION=" "$build_info_file" | cut -d'=' -f2 | tr -d '"') || true
  printf '%s\n' "$version"
}
|
|
|
|
# Look for an image registry backup (*_image_registry_backup_*.tgz) at the
# top level of the backup directory and record the result in $RESTORE_CONFIG.
#
#   exactly one file -> restore_registry_filesystem: true and
#                       registry_backup_filename: <file name>
#   no file          -> restore_registry_filesystem: false
#   several files    -> error, config left untouched
# Returns:
#   0 when the check ran (file found or not); 1 if the backup directory is
#   invalid, several files match, or the config could not be updated.
check_and_set_registry_restore() {
  local backup_dir registry_backup_filename file
  local -a registry_backup_files=()

  log "Checking for image registry backup file..."

  # Get and validate backup directory
  if ! backup_dir=$(get_backup_directory); then
    return 1
  fi

  log "Scanning backup directory for image registry backup: $backup_dir"

  # Find image registry backup files matching the pattern *_image_registry_backup_*.tgz
  mapfile -t registry_backup_files < <(
    find "$backup_dir" -maxdepth 1 -name "*_image_registry_backup_*.tgz" -type f
  )

  # Config writes are checked explicitly: this function is called from an
  # `if` condition, where `set -e` would not catch them.
  case "${#registry_backup_files[@]}" in
    0)
      log "No image registry backup files found matching pattern *_image_registry_backup_*.tgz"
      # We set restore_registry_filesystem to false so the restore playbook attempts
      # to use the prestaged registry data instead of the registry backup file.
      set_config_value "restore_registry_filesystem" "false" "$RESTORE_CONFIG" || return 1
      ;;
    1)
      registry_backup_filename=$(basename "${registry_backup_files[0]}")
      log "Found image registry backup file: $registry_backup_filename"
      set_config_value "restore_registry_filesystem" "true" "$RESTORE_CONFIG" || return 1
      set_config_value "registry_backup_filename" "$registry_backup_filename" "$RESTORE_CONFIG" || return 1
      ;;
    *)
      log "Multiple image registry backup files found in $backup_dir:" "ERROR"
      for file in "${registry_backup_files[@]}"; do
        log "  - $(basename "$file")" "ERROR"
      done
      return 1
      ;;
  esac

  return 0
}
|
|
|
|
# Check whether prestaged container image data is available.
#
# Directory searched (top level only):
#   auto_restore_mode == factory -> the configured backup directory
#   anything else                -> /opt/platform-backup/<software version>
# Prestaged data counts as present if local_registry_filesystem.tgz exists
# or at least one container-image*.tar.gz file is found.
# Returns:
#   0 if prestaged data was found; 1 if not, or if the factory backup
#   directory is invalid.
check_prestaged_images() {
  local auto_restore_mode prestage_dir software_version file
  local registry_found=false containers_found=false
  local -a container_images=()

  log "Checking for prestaged container images..."

  # auto_restore_mode is optional; never fail on a missing key or file.
  auto_restore_mode=$(sed -n 's/^auto_restore_mode: *//p' "$RESTORE_CONFIG" 2>/dev/null | tr -d "\"'") || true

  # Factory auto-restore prestaged data is stored in the factory backup directory
  if [[ "$auto_restore_mode" == "factory" ]]; then
    if ! prestage_dir=$(get_backup_directory); then
      return 1
    fi
    log "Factory auto-restore mode: checking for prestaged images in: $prestage_dir"
  else
    software_version=$(get_software_version)
    prestage_dir="/opt/platform-backup/${software_version}"
    log "Standard auto-restore mode: checking for prestaged images in: $prestage_dir"
  fi

  # Check for prestaged registry filesystem file
  local prestaged_registry_file="${prestage_dir}/local_registry_filesystem.tgz"
  if [[ -f "$prestaged_registry_file" ]]; then
    log "Found prestaged registry file: $prestaged_registry_file"
    registry_found=true
  fi

  # Check for container image files. stderr is discarded because the
  # prestage directory may not exist.
  mapfile -t container_images < <(
    find "$prestage_dir" -maxdepth 1 -name "container-image*.tar.gz" -type f 2>/dev/null
  )
  if (( ${#container_images[@]} > 0 )); then
    log "Found ${#container_images[@]} container image file(s):"
    for file in "${container_images[@]}"; do
      log "  - $(basename "$file")"
    done
    containers_found=true
  fi

  if [[ "$registry_found" == true || "$containers_found" == true ]]; then
    return 0
  fi

  log "No prestaged images found in: $prestage_dir"
  return 1
}
|
|
|
|
# Verify that everything needed by the restore playbook is available, and
# report what is missing over IPMI.
#
# Two things are checked:
#   - platform backup: find_and_set_backup_filename succeeds
#   - container images: a registry backup file was recorded in the config
#     (restore_registry_filesystem: true) or prestaged images exist
# Returns:
#   0 if both are available. Otherwise sends one of
#   restore_failed_both_missing / restore_failed_backup_missing /
#   restore_failed_images_missing and returns 1.
validate_restore_prerequisites() {
  local backup_available=false
  local registry_available=false

  log "Validating restore prerequisites..."

  # Check backup file availability
  if find_and_set_backup_filename; then
    backup_available=true
    log "Platform backup file validation: PASSED"
  else
    log "Platform backup file validation: FAILED" "ERROR"
  fi

  # Check registry restore options.
  # First, we check that check_and_set_registry_restore returns 0, indicating
  # that backup_dir exists and the registry backup was either found or not.
  if ! check_and_set_registry_restore; then
    # This means check_and_set_registry_restore exited with a return code of 1,
    # indicating it failed to check if registry backup exists or not.
    log "Registry restore validation: FAILED" "ERROR"
  elif grep -q "^restore_registry_filesystem: true" "$RESTORE_CONFIG"; then
    # check_and_set_registry_restore sets restore_registry_filesystem
    # to true if it finds the container images backup file
    registry_available=true
    log "Registry backup file validation: PASSED"
  elif check_prestaged_images; then
    # if the registry backup was not found, we check if the prestaged registry data is available
    registry_available=true
    log "Prestaged images validation: PASSED"
  else
    log "Registry backup and prestaged images validation: FAILED" "ERROR"
  fi

  # Send the correct failure event. Sending is best-effort: the validation
  # result is returned even if the event could not be delivered.
  case "${backup_available}:${registry_available}" in
    false:false)
      log "Both platform backup and container images are missing" "ERROR"
      send_ipmi_event "restore_failed_both_missing" || true
      return 1
      ;;
    false:true)
      log "Platform backup file is missing" "ERROR"
      send_ipmi_event "restore_failed_backup_missing" || true
      return 1
      ;;
    true:false)
      log "Container images (backup file and prestaged) are missing" "ERROR"
      send_ipmi_event "restore_failed_images_missing" || true
      return 1
      ;;
  esac

  log "All restore prerequisites validated successfully"
  return 0
}
|
|
|
|
# First boot after installation: report install success, validate the
# prerequisites, run the restore playbook and unlock the controller.
#
# Exits the script with status 1 if the install event cannot be sent, if
# validation or the playbook fails (after cleanup), or if the host cannot be
# unlocked. On success the restore config is removed and the
# playbook-complete flag is created for the next boot.
handle_first_boot() {
  # Made explicit so the reason is logged: previously a failure here ended
  # the script through `set -e` with no message.
  if ! send_ipmi_event "install_success"; then
    log "Could not report install success over IPMI, aborting auto-restore" "ERROR"
    exit 1
  fi

  # The IPMI monitor script polls every 30s, and initially it is looking for
  # the install_success event, so we pause for 60s to give the system
  # controller time to detect install_success and start looking for the
  # restore events. If the restore events were sent too soon, the system
  # controller could miss them.
  log "Waiting 60 seconds for IPMI monitoring transition..."
  sleep 60

  if ! validate_restore_prerequisites; then
    cleanup
    exit 1
  fi

  if ! run_restore_playbook; then
    cleanup
    exit 1
  fi

  log "Restore playbook completed successfully"
  # With the config gone and the flag present, main() takes the second-boot
  # path on the next run.
  rm -f "$RESTORE_CONFIG" || log "Failed to remove config file" "WARN"
  touch "$RESTORE_PLAYBOOK_COMPLETE_FLAG" || log "Failed to create flag" "WARN"

  if retry "source openrc" 10 10 "_source_openrc" &&
    retry "host unlock" 10 30 "_unlock_host"; then
    log "Host unlock process completed successfully"
  else
    log "Host unlock process failed" "ERROR"
    exit 1
  fi
}
|
|
|
|
# Second boot (after the host unlock): finalize the restore with
# `system restore-complete` and report the outcome over IPMI.
#
# On success the restore_complete event is sent, the auto-restore directory
# is removed and dc-auto-restore.service is disabled. If restore-complete
# (or sourcing openrc) fails, restore_failed is sent, nothing is cleaned up,
# and the script exits with status 1.
handle_second_boot() {
  local failed=false

  log "Detected post-unlock boot, running 'system restore-complete'..."

  if ! { retry "source openrc" 10 10 "_source_openrc" &&
    retry "system restore-complete" 15 10 "_restore_complete"; }; then
    # Best-effort report; the script fails either way.
    send_ipmi_event "restore_failed" || true
    exit 1
  fi

  log "System restore-complete executed successfully"

  # The restore itself is done, so cleanup and service disabling must run
  # even if the event cannot be delivered. Otherwise a later boot would
  # re-enter this path and could report a finished restore as failed.
  if ! send_ipmi_event "restore_complete"; then
    failed=true
  fi

  cleanup

  if ! systemctl disable dc-auto-restore.service; then
    log "Failed to disable dc-auto-restore.service" "ERROR"
    failed=true
  fi

  # Still surface the problem through the exit status, as before.
  if [[ "$failed" == true ]]; then
    exit 1
  fi
}
|
|
|
|
# Entry point: pick the restore phase from the files present in the
# auto-restore directory.
#
#   restore config present        -> first boot  (handle_first_boot)
#   playbook-complete flag present -> second boot (handle_second_boot)
#   neither                       -> nothing to do, just clean up
main() {
  # Log the final exit status on every exit path.
  trap 'log "Script exited with code $?."' EXIT
  # NOTE: without `set -E` (errtrace) bash does not inherit the ERR trap
  # into functions, so this only fires for failures at this call level.
  trap 'log "An error occurred on line $LINENO." "ERROR"' ERR

  log "===== Starting auto-restore script ====="

  # The config is checked first: handle_first_boot removes it and creates
  # the flag once the playbook has run.
  if [[ -f "$RESTORE_CONFIG" ]]; then
    handle_first_boot
  elif [[ -f "$RESTORE_PLAYBOOK_COMPLETE_FLAG" ]]; then
    handle_second_boot
  else
    log "No auto-restore config or flag found - nothing to do"
    cleanup
  fi

  log "===== Auto-restore script completed successfully ====="
}
|
|
|
|
# Script entry point: run main, forwarding all command-line arguments.
main "$@"
|