#!/usr/bin/env bash
#
# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Subcloud auto-restore script that orchestrates platform restoration across
# system reboots. Executed by systemd when /opt/platform-backup/auto-restore
# directory is present after subcloud installation.
#
# Restore workflow:
# - First boot: Send install success IPMI event, discover backup file,
#   execute restore playbook, and unlock controller
# - Second boot: Run system restore-complete and send restore complete IPMI event
#
# The auto-restore directory is removed after the script is executed to stop
# the systemd service from triggering again after reboot.

set -euo pipefail

readonly CONFIG_DIR="/opt/platform-backup/auto-restore"
readonly RESTORE_CONFIG="${CONFIG_DIR}/backup_restore_values.yml"
readonly RESTORE_PLAYBOOK_COMPLETE_FLAG="${CONFIG_DIR}/.restore_playbook_complete"
readonly LOG_FILE="/var/log/auto-restore.log"
readonly OPENRC_FILE="/etc/platform/openrc"
readonly ANSIBLE_PLAYBOOK="/usr/share/ansible/stx-ansible/playbooks/restore_platform.yml"

# Write a timestamped message to stdout and append it to LOG_FILE.
# Arguments:
#   $1 - message text
#   $2 - level tag (optional, defaults to INFO)
log() {
    local level="${2:-INFO}"
    printf "%(%F %T)T [%s] %s\n" -1 "$level" "$1" | tee -a "$LOG_FILE"
}

# Send one of the predefined IPMI SEL events, retrying up to 3 times.
# Arguments:
#   $1 - event type (install_success, restore_complete, restore_failed,
#        restore_failed_backup_missing, restore_failed_images_missing,
#        restore_failed_both_missing)
# Returns:
#   0 if the event was sent, 1 on unknown type, temp file failure or
#   exhausted retries.
send_ipmi_event() {
    local event_type="$1"
    local event_data
    # Kept local so it does not leak into the global scope; it is still
    # visible to _send_ipmi_command (called through retry) because bash
    # locals are dynamically scoped.
    local temp_file
    local rc=0

    case "$event_type" in
        "install_success")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF0 # \"Install Completed\""
            ;;
        "restore_complete")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF1 # \"Restore Completed\""
            ;;
        "restore_failed")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF2 # \"Restore Failed\""
            ;;
        "restore_failed_backup_missing")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF3 # \"Restore Failed: missing backup file\""
            ;;
        "restore_failed_images_missing")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF4 # \"Restore Failed: missing container images backup file\""
            ;;
        "restore_failed_both_missing")
            event_data="0x04 0x12 0xCC 0x63 0xCC 0x10 0xF5 # \"Restore Failed: missing backup and container images backup files\""
            ;;
        *)
            log "Unknown IPMI event type: $event_type" "ERROR"
            return 1
            ;;
    esac

    if ! temp_file=$(mktemp /tmp/ipmi_event_XXXXXX.txt); then
        log "Failed to create temporary file for IPMI event: $event_type" "ERROR"
        return 1
    fi
    printf '%s\n' "$event_data" > "$temp_file"

    if retry "send IPMI event ($event_type)" 3 5 "_send_ipmi_command"; then
        log "IPMI event sent successfully: $event_type"
    else
        log "Failed to send IPMI event after retries: $event_type" "ERROR"
        rc=1
    fi

    # Single exit path so the temp file is always removed.
    rm -f -- "$temp_file"
    return "$rc"
}

# Remove the auto-restore directory so the systemd service does not
# trigger this script again on the next boot. Failure is only logged.
cleanup() {
    log "Removing auto-restore directory to prevent future triggers"
    rm -rf -- "${CONFIG_DIR:?}" || log "Failed to remove auto-restore directory" "WARN"
}

# Run a command function until it succeeds or attempts are exhausted.
# Arguments:
#   $1 - human-readable operation name used in log messages
#   $2 - maximum number of attempts
#   $3 - delay in seconds between attempts
#   $4 - name of the function/command to run (called without arguments)
# Returns:
#   0 on the first success, 1 if every attempt failed.
retry() {
    local -r operation="$1" max_attempts="$2" delay="$3"
    local -r command_func="$4"
    local attempt=1

    while (( attempt <= max_attempts )); do
        log "Attempting $operation (attempt $attempt/$max_attempts)"

        if "$command_func"; then
            log "$operation completed successfully"
            return 0
        fi

        log "$operation failed (attempt $attempt)" "WARN"

        if (( attempt < max_attempts )); then
            log "Retrying in ${delay}s..."
            sleep "$delay"
        fi

        # Plain assignment: unlike ((attempt++)) it can never return a
        # non-zero status and trip `set -e`.
        attempt=$((attempt + 1))
    done

    log "$operation failed after $max_attempts attempts" "ERROR"
    return 1
}

# Source OPENRC_FILE into the current shell. Returns 1 (and logs) on failure.
_source_openrc() {
    # shellcheck disable=SC1090
    source "$OPENRC_FILE" || {
        log "Failed to source openrc" "ERROR"
        return 1
    }
}

# Unlock controller-0, appending the command output to LOG_FILE.
_unlock_host() {
    if system host-unlock controller-0 >> "$LOG_FILE" 2>&1; then
        return 0
    fi
    log "Host unlock failed" "ERROR"
    return 1
}

# Run 'system restore-complete' and decide success from its output.
# Returns:
#   0 only if the command exits 0 AND its output contains
#   "Restore procedure completed"; 1 otherwise.
_restore_complete() {
    local output
    local exit_code=0

    # restore-complete doesn't return a non-zero exit code if the command fails
    # because the restore is still in progress, so we need to parse its output instead
    output=$(system restore-complete 2>&1) || exit_code=$?

    if [[ -n "$output" ]]; then
        printf '%s\n' "$output" >> "$LOG_FILE"
    fi

    if (( exit_code != 0 )); then
        log "system restore-complete command failed with exit code $exit_code" "ERROR"
        return 1
    fi

    # In-shell substring match instead of `echo | grep -q`: with pipefail,
    # grep -q exiting early can make the pipeline report failure on a match.
    if [[ "$output" == *"Restore procedure completed"* ]]; then
        return 0
    fi
    return 1
}

# Add the SEL entry described by $temp_file (set by send_ipmi_event).
# ipmitool output is appended to LOG_FILE and hidden from stdout.
_send_ipmi_command() {
    if ipmitool sel add "$temp_file" 2>&1 | tee -a "$LOG_FILE" > /dev/null; then
        return 0
    fi
    return 1
}

# Run the platform restore playbook with the auto-restore config.
# On failure, sends the restore_failed IPMI event and returns 1.
run_restore_playbook() {
    log "Starting automatic restore process"

    export HOME=/home/sysadmin

    if ! ansible-playbook "$ANSIBLE_PLAYBOOK" \
        -e "@${RESTORE_CONFIG}" \
        -e "override_files_dir=${HOME}" >> "$LOG_FILE" 2>&1; then
        log "Restore playbook failed" "ERROR"
        send_ipmi_event "restore_failed"
        return 1
    fi

    return 0
}

# Print the value of a top-level "key: value" entry in RESTORE_CONFIG,
# with single and double quotes stripped. Prints nothing (and returns
# non-zero because of pipefail) when the key is absent.
# Arguments:
#   $1 - key name
_get_config_value() {
    local key="$1"
    grep "^${key}:" "$RESTORE_CONFIG" \
        | sed "s/^${key}: *//" \
        | tr -d "\"'"
}

# Ensure backup_filename is present in RESTORE_CONFIG. If it is not already
# set, look for exactly one matching backup file in the backup directory and
# record its basename in the config.
# Returns:
#   0 if backup_filename was already set or has been set, 1 otherwise.
find_and_set_backup_filename() {
    log "Checking if backup_filename is already set in config..."

    if grep -q "^backup_filename:" "$RESTORE_CONFIG"; then
        local existing_filename
        existing_filename=$(_get_config_value "backup_filename")
        log "backup_filename already set to: $existing_filename"
        return 0
    fi

    log "backup_filename not found in config, scanning for backup file..."

    # Get and validate backup directory
    local backup_dir
    if ! backup_dir=$(get_backup_directory); then
        return 1
    fi

    local auto_restore_mode
    auto_restore_mode=$(_get_config_value "auto_restore_mode" 2>/dev/null)

    # For factory auto-restore, we need to look for a backup file matching
    # the *factory_backup*.tgz pattern
    local backup_pattern
    if [[ "$auto_restore_mode" == "factory" ]]; then
        backup_pattern="*factory_backup*.tgz"
        log "Factory auto-restore mode, searching for pattern: $backup_pattern"
    else
        backup_pattern="*_platform_backup_*.tgz"
        log "Standard auto-restore mode, searching for pattern: $backup_pattern"
    fi

    log "Scanning backup directory: $backup_dir"

    local backup_files
    mapfile -t backup_files < <(find "$backup_dir" -maxdepth 1 -name "$backup_pattern" -type f)

    if [[ ${#backup_files[@]} -eq 0 ]]; then
        log "No backup files found matching pattern $backup_pattern in $backup_dir" "ERROR"
        return 1
    elif [[ ${#backup_files[@]} -gt 1 ]]; then
        log "Multiple backup files found in $backup_dir:" "ERROR"
        local file
        for file in "${backup_files[@]}"; do
            log "  - $(basename "$file")" "ERROR"
        done
        return 1
    fi

    # Set backup_filename in the config file
    local backup_filename
    backup_filename=$(basename "${backup_files[0]}")
    log "Found backup file: $backup_filename"

    set_config_value "backup_filename" "\"$backup_filename\"" "$RESTORE_CONFIG"
    return 0
}

# Set "key: value" in a config file, replacing an existing line that starts
# with "key:" or appending a new line otherwise.
# Arguments:
#   $1 - key name
#   $2 - value, written verbatim (caller adds any YAML quoting)
#   $3 - path to the config file
set_config_value() {
    local key="$1"
    local value="$2"
    local config_file="$3"

    if grep -q "^${key}:" "$config_file"; then
        # Update existing value. Escape the characters that are special in
        # a sed replacement (backslash, the '/' delimiter and '&') so the
        # value is written literally.
        local escaped_value
        escaped_value=$(printf '%s' "$value" | sed -e 's|[\\/&]|\\&|g')
        sed -i "s/^${key}:.*/${key}: ${escaped_value}/" "$config_file"
        log "Updated ${key} to ${value} in config"
    else
        # Add new value
        printf '%s\n' "${key}: ${value}" >> "$config_file"
        log "Added ${key}: ${value} to config"
    fi
}

# Print the initial_backup_dir configured in RESTORE_CONFIG.
# Outputs:
#   The directory path on stdout (callers capture it with $(...)).
# Returns:
#   0 if the key is set and the directory exists, 1 otherwise.
get_backup_directory() {
    # Extract initial_backup_dir from the config
    local backup_dir
    backup_dir=$(_get_config_value "initial_backup_dir")

    # Errors are logged to stderr: stdout is captured by the callers, so
    # logging there would hide the message from the console.
    if [[ -z "$backup_dir" ]]; then
        log "initial_backup_dir not found in config" "ERROR" >&2
        return 1
    fi

    if [[ ! -d "$backup_dir" ]]; then
        log "Backup directory does not exist: $backup_dir" "ERROR" >&2
        return 1
    fi

    printf '%s\n' "$backup_dir"
    return 0
}

# Print the SW_VERSION value from /etc/build.info with double quotes removed.
get_software_version() {
    local version
    version=$(grep "^SW_VERSION=" /etc/build.info | cut -d'=' -f2 | tr -d '"')
    printf '%s\n' "$version"
}

# Look for an image registry backup file in the backup directory and record
# the outcome in RESTORE_CONFIG (restore_registry_filesystem true/false and,
# when found, registry_backup_filename).
# Returns:
#   0 if the check ran (file found or not), 1 if the backup directory is
#   invalid or more than one registry backup file exists.
check_and_set_registry_restore() {
    log "Checking for image registry backup file..."

    # Get and validate backup directory
    local backup_dir
    if ! backup_dir=$(get_backup_directory); then
        return 1
    fi

    log "Scanning backup directory for image registry backup: $backup_dir"

    # Find image registry backup files matching the pattern *_image_registry_backup_*.tgz
    local registry_backup_files
    mapfile -t registry_backup_files < <(find "$backup_dir" -maxdepth 1 -name "*_image_registry_backup_*.tgz" -type f)

    if [[ ${#registry_backup_files[@]} -eq 1 ]]; then
        local registry_backup_filename
        registry_backup_filename=$(basename "${registry_backup_files[0]}")
        log "Found image registry backup file: $registry_backup_filename"

        set_config_value "restore_registry_filesystem" "true" "$RESTORE_CONFIG"
        set_config_value "registry_backup_filename" "$registry_backup_filename" "$RESTORE_CONFIG"
    elif [[ ${#registry_backup_files[@]} -gt 1 ]]; then
        log "Multiple image registry backup files found in $backup_dir:" "ERROR"
        local file
        for file in "${registry_backup_files[@]}"; do
            log "  - $(basename "$file")" "ERROR"
        done
        return 1
    else
        log "No image registry backup files found matching pattern *_image_registry_backup_*.tgz"
        # We set restore_registry_filesystem to false so the restore playbook attempts
        # to use the prestaged registry data instead of the registry backup file.
        set_config_value "restore_registry_filesystem" "false" "$RESTORE_CONFIG"
    fi

    return 0
}

# Check whether prestaged container images are available, either as
# local_registry_filesystem.tgz or as container-image*.tar.gz files.
# Returns:
#   0 if at least one of them is present, 1 otherwise.
check_prestaged_images() {
    log "Checking for prestaged container images..."

    local auto_restore_mode
    auto_restore_mode=$(_get_config_value "auto_restore_mode" 2>/dev/null)

    # Factory auto-restore prestaged data is stored in the factory backup directory
    local prestage_dir
    if [[ "$auto_restore_mode" == "factory" ]]; then
        if ! prestage_dir=$(get_backup_directory); then
            return 1
        fi
        log "Factory auto-restore mode: checking for prestaged images in: $prestage_dir"
    else
        local software_version
        software_version=$(get_software_version)
        prestage_dir="/opt/platform-backup/${software_version}"
        log "Standard auto-restore mode: checking for prestaged images in: $prestage_dir"
    fi

    # Check for prestaged registry filesystem file
    local prestaged_registry_file="${prestage_dir}/local_registry_filesystem.tgz"
    local registry_found=false

    if [[ -f "$prestaged_registry_file" ]]; then
        log "Found prestaged registry file: $prestaged_registry_file"
        registry_found=true
    fi

    # Check for container image files
    local container_images
    mapfile -t container_images < <(find "$prestage_dir" -maxdepth 1 -name "container-image*.tar.gz" -type f 2>/dev/null)
    local containers_found=false

    if [[ ${#container_images[@]} -gt 0 ]]; then
        log "Found ${#container_images[@]} container image file(s):"
        local file
        for file in "${container_images[@]}"; do
            log "  - $(basename "$file")"
        done
        containers_found=true
    fi

    if [[ "$registry_found" == true || "$containers_found" == true ]]; then
        return 0
    else
        log "No prestaged images found in: $prestage_dir"
        return 1
    fi
}

# Verify that a platform backup and a source of container images are
# available, and send the matching failure IPMI event when they are not.
# Returns:
#   0 if both are available, 1 otherwise.
validate_restore_prerequisites() {
    log "Validating restore prerequisites..."

    # Check backup file availability
    local backup_available=false
    if find_and_set_backup_filename; then
        backup_available=true
        log "Platform backup file validation: PASSED"
    else
        log "Platform backup file validation: FAILED" "ERROR"
    fi

    # Check registry restore options
    local registry_available=false

    # First, we check that check_and_set_registry_restore returns 0, indicating
    # that backup_dir exists and the registry backup was either found or not.
    if check_and_set_registry_restore; then
        if grep -q "^restore_registry_filesystem: true" "$RESTORE_CONFIG"; then
            # check_and_set_registry_restore sets restore_registry_filesystem
            # to true if it finds the container images backup file
            registry_available=true
            log "Registry backup file validation: PASSED"
        elif check_prestaged_images; then
            # if the registry backup was not found, we check if the prestaged registry data is available
            registry_available=true
            log "Prestaged images validation: PASSED"
        else
            log "Registry backup and prestaged images validation: FAILED" "ERROR"
        fi
    else
        # This means check_and_set_registry_restore exited with a return code of 1,
        # indicating it failed to check if registry backup exists or not.
        log "Registry restore validation: FAILED" "ERROR"
    fi

    # Send the correct failure event
    if [[ "$backup_available" == false && "$registry_available" == false ]]; then
        log "Both platform backup and container images are missing" "ERROR"
        send_ipmi_event "restore_failed_both_missing"
        return 1
    elif [[ "$backup_available" == false ]]; then
        log "Platform backup file is missing" "ERROR"
        send_ipmi_event "restore_failed_backup_missing"
        return 1
    elif [[ "$registry_available" == false ]]; then
        log "Container images (backup file and prestaged) are missing" "ERROR"
        send_ipmi_event "restore_failed_images_missing"
        return 1
    fi

    log "All restore prerequisites validated successfully"
    return 0
}

# First boot after install: report install success, validate prerequisites,
# run the restore playbook, replace the config file with the "playbook
# complete" flag and unlock the host. Exits the script with 1 on failure.
handle_first_boot() {
    # NOTE(review): this is a plain statement under `set -e`, so if the event
    # cannot be sent the script exits here, before the restore starts and
    # without cleanup - confirm this is the intended behaviour.
    send_ipmi_event "install_success"

    # The IPMI monitor script polls every 30s, and initially it's looking for
    # the install_success event, so we add a 60s pause so the system controller
    # has time to detect the install_success and start to look for the restore
    # events, otherwise, if the restore events are sent too soon, there's a
    # possibility the system controller would miss the event.
    log "Waiting 60 seconds for IPMI monitoring transition..."
    sleep 60

    if ! validate_restore_prerequisites; then
        cleanup
        exit 1
    fi

    if ! run_restore_playbook; then
        cleanup
        exit 1
    fi

    log "Restore playbook completed successfully"

    # Swap the config for the flag so the next boot takes the second-boot path.
    rm -f "$RESTORE_CONFIG" || log "Failed to remove config file" "WARN"
    touch "$RESTORE_PLAYBOOK_COMPLETE_FLAG" || log "Failed to create flag" "WARN"

    if retry "source openrc" 10 10 "_source_openrc" &&
        retry "host unlock" 10 30 "_unlock_host"; then
        log "Host unlock process completed successfully"
    else
        exit 1
    fi
}

# Second boot (after unlock): run 'system restore-complete', report the
# result over IPMI and, on success, clean up and disable the service.
handle_second_boot() {
    log "Detected post-unlock boot, running 'system restore-complete'..."

    if retry "source openrc" 10 10 "_source_openrc" &&
        retry "system restore-complete" 15 10 "_restore_complete"; then
        log "System restore-complete executed successfully"
        # NOTE(review): under `set -e` a failed send here exits the script
        # before cleanup and `systemctl disable` run, so the flag file stays
        # in place for the next boot - confirm this is intended.
        send_ipmi_event "restore_complete"
        cleanup
        systemctl disable dc-auto-restore.service
    else
        send_ipmi_event "restore_failed"
        exit 1
    fi
}

# Entry point: pick the boot phase from the files present in CONFIG_DIR.
main() {
    trap 'log "Script exited with code $?."' EXIT
    trap 'log "An error occurred on line $LINENO." "ERROR"' ERR

    log "===== Starting auto-restore script ====="

    if [[ -f "$RESTORE_CONFIG" ]]; then
        handle_first_boot
    elif [[ -f "$RESTORE_PLAYBOOK_COMPLETE_FLAG" ]]; then
        handle_second_boot
    else
        log "No auto-restore config or flag found - nothing to do"
        cleanup
    fi

    log "===== Auto-restore script completed successfully ====="
}

main "$@"