#
# Copyright (c) 2014-2017 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
Backup & Restore
"""
from __future__ import print_function
import copy
import filecmp
import fileinput
import os
import glob
import shutil
import stat
import subprocess
import tarfile
import tempfile
import textwrap
import time
from fm_api import constants as fm_constants
from fm_api import fm_api
from sysinv.common import constants as sysinv_constants
from controllerconfig.common import log
from controllerconfig.common import constants
from controllerconfig.common.exceptions import BackupFail
from controllerconfig.common.exceptions import RestoreFail
from controllerconfig.common.exceptions import KeystoneFail
from controllerconfig.common.exceptions import SysInvFail
from controllerconfig import openstack
import tsconfig.tsconfig as tsconfig
from controllerconfig import utils
from controllerconfig import sysinv_api as sysinv
from six.moves import input
from os import environ
LOG = log.get_logger(__name__)
DEVNULL = open(os.devnull, 'w')
RESTORE_COMPLETE = "restore-complete"
RESTORE_RERUN_REQUIRED = "restore-rerun-required"
# Backup/restore related constants
backup_in_progress = tsconfig.BACKUP_IN_PROGRESS_FLAG
restore_in_progress = tsconfig.RESTORE_IN_PROGRESS_FLAG
restore_patching_complete = '/etc/platform/.restore_patching_complete'
node_is_patched = '/var/run/node_is_patched'
keyring_permdir = os.path.join('/opt/platform/.keyring', tsconfig.SW_VERSION)
ceph_permdir = os.path.join(tsconfig.CONFIG_PATH, 'ceph-config')
ldap_permdir = '/var/lib/openldap-data'
patching_permdir = '/opt/patching'
patching_repo_permdir = '/www/pages/updates'
home_permdir = '/home'
extension_permdir = '/opt/extension'
patch_vault_permdir = '/opt/patch-vault'
mariadb_pod = 'mariadb-server-0'
kube_config = environ.get('KUBECONFIG')
if kube_config is None:
kube_config = '/etc/kubernetes/admin.conf'
kube_cmd_prefix = 'kubectl --kubeconfig=%s ' % kube_config
kube_cmd_prefix += 'exec -i %s -n openstack -- bash -c ' % mariadb_pod
mysql_prefix = '\'exec mysql -uroot -p"$MYSQL_ROOT_PASSWORD" '
mysqldump_prefix = '\'exec mysqldump -uroot -p"$MYSQL_ROOT_PASSWORD" '
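# For illustration, a fully composed command (with KUBECONFIG unset) looks
# roughly like:
#   kubectl --kubeconfig=/etc/kubernetes/admin.conf exec -i mariadb-server-0
#       -n openstack -- bash -c
#       'exec mysql -uroot -p"$MYSQL_ROOT_PASSWORD" -e"show databases" '
# The callers below append the database-specific arguments and the closing
# quote.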
def get_backup_databases():
"""
Retrieve database lists for backup.
:return: backup_databases and backup_database_skip_tables
"""
# Databases common to all configurations
REGION_LOCAL_DATABASES = ('postgres', 'template1', 'sysinv',
'fm', 'barbican')
REGION_SHARED_DATABASES = ('keystone',)
# Indicates which tables have to be dropped for a certain database.
DB_TABLE_SKIP_MAPPING = {
'fm': ('alarm',),
'dcorch': ('orch_job',
'orch_request',
'resource',
'subcloud_resource'), }
if tsconfig.region_config == 'yes':
BACKUP_DATABASES = REGION_LOCAL_DATABASES
else:
# Add additional databases for non-region configuration and for the
# primary region in region deployments.
BACKUP_DATABASES = REGION_LOCAL_DATABASES + REGION_SHARED_DATABASES
# Add distributed cloud databases
if tsconfig.distributed_cloud_role == \
sysinv_constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
BACKUP_DATABASES += ('dcmanager', 'dcorch')
# We generate the tables to be skipped for each database
# mentioned in BACKUP_DATABASES. We explicitly list
# skip tables in DB_TABLE_SKIP_MAPPING
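    # (e.g. {'postgres': (), 'fm': ('alarm',), ...})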
BACKUP_DB_SKIP_TABLES = dict(
[[x, DB_TABLE_SKIP_MAPPING.get(x, ())] for x in BACKUP_DATABASES])
return BACKUP_DATABASES, BACKUP_DB_SKIP_TABLES
def get_os_backup_databases():
"""
Retrieve openstack database lists from MariaDB for backup.
:return: os_backup_databases
"""
skip_dbs = ("Database", "information_schema", "performance_schema",
"mysql", "horizon", "panko", "gnocchi")
try:
db_cmd = kube_cmd_prefix + mysql_prefix + '-e"show databases" \''
proc = subprocess.Popen([db_cmd], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
os_backup_dbs = set(line[:-1] for line in proc.stdout
if line[:-1] not in skip_dbs)
proc.communicate()
return os_backup_dbs
except subprocess.CalledProcessError:
raise BackupFail("Failed to get openstack databases from MariaDB.")
def check_load_versions(archive, staging_dir):
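    """ Verify the backup load version matches the installed load """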
match = False
try:
member = archive.getmember('etc/build.info')
archive.extract(member, path=staging_dir)
match = filecmp.cmp('/etc/build.info', staging_dir + '/etc/build.info')
shutil.rmtree(staging_dir + '/etc')
except Exception as e:
LOG.exception(e)
raise RestoreFail("Unable to verify load version in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Load version mismatch.")
raise RestoreFail("Load version of backup does not match the "
"version of the installed load.")
def get_subfunctions(filename):
"""
Retrieves the subfunctions from a platform.conf file.
:param filename: file to retrieve subfunctions from
:return: a list of the subfunctions or None if no subfunctions exist
"""
matchstr = 'subfunction='
with open(filename, 'r') as f:
for line in f:
if matchstr in line:
parsed = line.split('=')
return parsed[1].rstrip().split(",")
return
def check_load_subfunctions(archive, staging_dir):
"""
Verify that the subfunctions in the backup match the installed load.
:param archive: backup archive
:param staging_dir: staging directory
:return: raises exception if the subfunctions do not match
"""
match = False
backup_subfunctions = None
try:
member = archive.getmember('etc/platform/platform.conf')
archive.extract(member, path=staging_dir)
backup_subfunctions = get_subfunctions(staging_dir +
'/etc/platform/platform.conf')
shutil.rmtree(staging_dir + '/etc')
if set(backup_subfunctions) ^ set(tsconfig.subfunctions):
# The set of subfunctions do not match
match = False
else:
match = True
except Exception:
LOG.exception("Unable to verify subfunctions in backup file")
raise RestoreFail("Unable to verify subfunctions in backup file. "
"Invalid backup file.")
if not match:
LOG.error("Subfunction mismatch - backup: %s, installed: %s" %
(str(backup_subfunctions), str(tsconfig.subfunctions)))
raise RestoreFail("Subfunctions in backup load (%s) do not match the "
"subfunctions of the installed load (%s)." %
(str(backup_subfunctions),
str(tsconfig.subfunctions)))
def file_exists_in_archive(archive, file_path):
""" Check if file exists in archive """
try:
archive.getmember(file_path)
return True
except KeyError:
LOG.info("File %s is not in archive." % file_path)
return False
def filter_directory(archive, directory):
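    """ Yield archive members whose top-level path component is directory """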
for tarinfo in archive:
if tarinfo.name.split('/')[0] == directory:
yield tarinfo
def backup_etc_size():
""" Backup etc size estimate """
try:
total_size = utils.directory_get_size('/etc')
return total_size
except OSError:
LOG.error("Failed to estimate backup etc size.")
raise BackupFail("Failed to estimate backup etc size")
def backup_etc(archive):
""" Backup etc """
try:
archive.add('/etc', arcname='etc')
except tarfile.TarError:
LOG.error("Failed to backup etc.")
raise BackupFail("Failed to backup etc")
def restore_etc_file(archive, dest_dir, etc_file):
""" Restore etc file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('etc/' + etc_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore etc file.")
raise RestoreFail("Failed to restore etc file")
def restore_etc_ssl_dir(archive, configpath=constants.CONFIG_WORKDIR):
""" Restore the etc SSL dir """
def filter_etc_ssl_private(members):
for tarinfo in members:
if 'etc/ssl/private' in tarinfo.name:
yield tarinfo
if file_exists_in_archive(archive, 'config/server-cert.pem'):
restore_config_file(
archive, configpath, 'server-cert.pem')
if file_exists_in_archive(archive, 'etc/ssl/private'):
# NOTE: This will include all TPM certificate files if TPM was
# enabled on the backed up system. However in that case, this
# restoration is only done for the first controller and TPM
# will need to be reconfigured once duplex controller (if any)
# is restored.
archive.extractall(path='/',
members=filter_etc_ssl_private(archive))
def restore_ceph_external_config_files(archive, staging_dir):
# Restore ceph-config.
if file_exists_in_archive(archive, "config/ceph-config"):
restore_config_dir(archive, staging_dir, 'ceph-config', ceph_permdir)
# Copy the file to /etc/ceph.
# There might be no files to copy, so don't check the return code.
cp_command = ('cp -Rp ' + os.path.join(ceph_permdir, '*') +
' /etc/ceph/')
subprocess.call(cp_command, shell=True)
def backup_config_size(config_permdir):
""" Backup configuration size estimate """
try:
        return utils.directory_get_size(config_permdir)
except OSError:
LOG.error("Failed to estimate backup configuration size.")
raise BackupFail("Failed to estimate backup configuration size")
def backup_config(archive, config_permdir):
""" Backup configuration """
try:
# The config dir is versioned, but we're only grabbing the current
# release
archive.add(config_permdir, arcname='config')
except tarfile.TarError:
LOG.error("Failed to backup config.")
raise BackupFail("Failed to backup configuration")
def restore_config_file(archive, dest_dir, config_file):
""" Restore configuration file """
try:
# Change the name of this file to remove the leading path
member = archive.getmember('config/' + config_file)
# Copy the member to avoid changing the name for future operations on
# this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member, path=dest_dir)
except tarfile.TarError:
LOG.error("Failed to restore config file %s." % config_file)
raise RestoreFail("Failed to restore configuration")
def restore_configuration(archive, staging_dir):
""" Restore configuration """
try:
os.makedirs(constants.CONFIG_WORKDIR, stat.S_IRWXU | stat.S_IRGRP |
stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
except OSError:
LOG.error("Failed to create config directory: %s",
constants.CONFIG_WORKDIR)
raise RestoreFail("Failed to restore configuration files")
# Restore cgcs_config file from original installation for historical
# purposes. Not used to restore the system as the information in this
# file is out of date (not updated after original installation).
restore_config_file(archive, constants.CONFIG_WORKDIR, 'cgcs_config')
# Restore platform.conf file and update as necessary. The file will be
# created in a temporary location and then moved into place when it is
# complete to prevent access to a partially created file.
restore_etc_file(archive, staging_dir, 'platform/platform.conf')
temp_platform_conf_file = os.path.join(tsconfig.PLATFORM_CONF_PATH,
'platform.conf.temp')
shutil.copyfile(os.path.join(staging_dir, 'platform.conf'),
temp_platform_conf_file)
install_uuid = utils.get_install_uuid()
for line in fileinput.FileInput(temp_platform_conf_file, inplace=1):
if line.startswith("INSTALL_UUID="):
# The INSTALL_UUID must be updated to match the new INSTALL_UUID
# which was generated when this controller was installed prior to
# doing the restore.
print("INSTALL_UUID=%s" % install_uuid)
elif line.startswith("management_interface=") or \
line.startswith("oam_interface=") or \
line.startswith("cluster_host_interface=") or \
line.startswith("UUID="):
# Strip out any entries that are host specific as the backup can
# be done on either controller. The application of the
# platform_conf manifest will add these back in.
pass
else:
print(line, end='')
fileinput.close()
# Move updated platform.conf file into place.
os.rename(temp_platform_conf_file, tsconfig.PLATFORM_CONF_FILE)
# Kick tsconfig to reload the platform.conf file
tsconfig._load()
# Restore branding
restore_config_dir(archive, staging_dir, 'branding', '/opt/branding/')
# Restore banner customization
restore_config_dir(archive, staging_dir, 'banner/etc', '/opt/banner')
# Restore ssh configuration
restore_config_dir(archive, staging_dir, 'ssh_config',
constants.CONFIG_WORKDIR + '/ssh_config')
# Configure hostname
utils.configure_hostname('controller-0')
# Restore hosts file
restore_etc_file(archive, '/etc', 'hosts')
restore_etc_file(archive, constants.CONFIG_WORKDIR, 'hosts')
# Restore certificate files
restore_etc_ssl_dir(archive)
def filter_pxelinux(archive):
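    """ Yield archive members under config/pxelinux.cfg """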
for tarinfo in archive:
if tarinfo.name.find('config/pxelinux.cfg') == 0:
yield tarinfo
def restore_dnsmasq(archive, config_permdir):
""" Restore dnsmasq """
try:
etc_files = ['hosts']
perm_files = ['hosts',
'dnsmasq.hosts', 'dnsmasq.leases',
'dnsmasq.addn_hosts']
for etc_file in etc_files:
restore_config_file(archive, '/etc', etc_file)
for perm_file in perm_files:
restore_config_file(archive, config_permdir, perm_file)
# Extract distributed cloud addn_hosts file if present in archive.
if file_exists_in_archive(
archive, 'config/dnsmasq.addn_hosts_dc'):
restore_config_file(archive, config_permdir,
'dnsmasq.addn_hosts_dc')
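        # Restore the pxeboot configuration: extract config/pxelinux.cfg
        # into a temporary directory, then move it into place under the
        # permanent config directory.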
tmpdir = tempfile.mkdtemp(prefix="pxerestore_")
archive.extractall(tmpdir,
members=filter_pxelinux(archive))
if os.path.exists(tmpdir + '/config/pxelinux.cfg'):
shutil.rmtree(config_permdir + 'pxelinux.cfg', ignore_errors=True)
shutil.move(tmpdir + '/config/pxelinux.cfg', config_permdir)
shutil.rmtree(tmpdir, ignore_errors=True)
except (shutil.Error, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to restore dnsmasq config.")
raise RestoreFail("Failed to restore dnsmasq files")
def backup_puppet_data_size(puppet_permdir):
""" Backup puppet data size estimate """
try:
        return utils.directory_get_size(puppet_permdir)
except OSError:
LOG.error("Failed to estimate backup puppet data size.")
raise BackupFail("Failed to estimate backup puppet data size")
def backup_puppet_data(archive, puppet_permdir):
""" Backup puppet data """
try:
# The puppet dir is versioned, but we're only grabbing the current
# release
archive.add(puppet_permdir, arcname='hieradata')
except tarfile.TarError:
LOG.error("Failed to backup puppet data.")
raise BackupFail("Failed to backup puppet data")
def restore_static_puppet_data(archive, puppet_workdir):
""" Restore static puppet data """
try:
member = archive.getmember('hieradata/static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
member = archive.getmember('hieradata/secure_static.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
except tarfile.TarError:
LOG.error("Failed to restore static puppet data.")
raise RestoreFail("Failed to restore static puppet data")
except OSError:
pass
def restore_puppet_data(archive, puppet_workdir, controller_0_address):
""" Restore puppet data """
try:
member = archive.getmember('hieradata/system.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
member = archive.getmember('hieradata/secure_system.yaml')
archive.extract(member, path=os.path.dirname(puppet_workdir))
# Only restore controller-0 hieradata
controller_0_hieradata = 'hieradata/%s.yaml' % controller_0_address
member = archive.getmember(controller_0_hieradata)
archive.extract(member, path=os.path.dirname(puppet_workdir))
except tarfile.TarError:
LOG.error("Failed to restore puppet data.")
raise RestoreFail("Failed to restore puppet data")
except OSError:
pass
def backup_armada_manifest_size(armada_permdir):
""" Backup armada manifest size estimate """
try:
        return utils.directory_get_size(armada_permdir)
except OSError:
LOG.error("Failed to estimate backup armada manifest size.")
raise BackupFail("Failed to estimate backup armada manifest size")
def backup_armada_manifest_data(archive, armada_permdir):
""" Backup armada manifest data """
try:
archive.add(armada_permdir, arcname='armada')
except tarfile.TarError:
LOG.error("Failed to backup armada manifest data.")
raise BackupFail("Failed to backup armada manifest data")
def restore_armada_manifest_data(archive, armada_permdir):
""" Restore armada manifest data """
try:
shutil.rmtree(armada_permdir, ignore_errors=True)
members = filter_directory(archive, 'armada')
temp_members = list()
# remove armada and armada/ from the member path since they are
# extracted to armada_permdir: /opt/platform/armada/release
for m in members:
temp_member = copy.copy(m)
lst = temp_member.name.split('armada/')
if len(lst) > 1:
temp_member.name = lst[1]
temp_members.append(temp_member)
archive.extractall(path=armada_permdir, members=temp_members)
except (tarfile.TarError, OSError):
LOG.error("Failed to restore armada manifest.")
shutil.rmtree(armada_permdir, ignore_errors=True)
raise RestoreFail("Failed to restore armada manifest")
def backup_keyring_size(keyring_permdir):
""" Backup keyring size estimate """
try:
        return utils.directory_get_size(keyring_permdir)
except OSError:
LOG.error("Failed to estimate backup keyring size.")
raise BackupFail("Failed to estimate backup keyring size")
def backup_keyring(archive, keyring_permdir):
""" Backup keyring configuration """
try:
archive.add(keyring_permdir, arcname='.keyring')
except tarfile.TarError:
LOG.error("Failed to backup keyring.")
raise BackupFail("Failed to backup keyring configuration")
def restore_keyring(archive, keyring_permdir):
""" Restore keyring configuration """
try:
shutil.rmtree(keyring_permdir, ignore_errors=False)
members = filter_directory(archive, '.keyring')
temp_members = list()
# remove .keyring and .keyring/ from the member path since they are
# extracted to keyring_permdir: /opt/platform/.keyring/release
for m in members:
temp_member = copy.copy(m)
lst = temp_member.name.split('.keyring/')
if len(lst) > 1:
temp_member.name = lst[1]
temp_members.append(temp_member)
archive.extractall(path=keyring_permdir, members=temp_members)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_permdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def prefetch_keyring(archive):
""" Prefetch keyring configuration for manifest use """
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
try:
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
archive.extractall(
path=os.path.dirname(keyring_tmpdir),
members=filter_directory(archive,
os.path.basename(keyring_tmpdir)))
shutil.move(keyring_tmpdir + '/python_keyring', python_keyring_tmpdir)
except (tarfile.TarError, shutil.Error):
LOG.error("Failed to restore keyring.")
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
raise RestoreFail("Failed to restore keyring configuration")
def cleanup_prefetched_keyring():
""" Cleanup fetched keyring """
try:
keyring_tmpdir = '/tmp/.keyring'
python_keyring_tmpdir = '/tmp/python_keyring'
shutil.rmtree(keyring_tmpdir, ignore_errors=True)
shutil.rmtree(python_keyring_tmpdir, ignore_errors=True)
except shutil.Error:
LOG.error("Failed to cleanup keyring.")
raise RestoreFail("Failed to cleanup fetched keyring")
def backup_ldap_size():
""" Backup ldap size estimate """
try:
total_size = 0
proc = subprocess.Popen(
['slapcat -d 0 -F /etc/openldap/schema | wc -c'],
shell=True, stdout=subprocess.PIPE)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup ldap size.")
raise BackupFail("Failed to estimate backup ldap size")
def backup_ldap(archive, staging_dir):
""" Backup ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
os.mkdir(ldap_staging_dir, 0o655)
subprocess.check_call([
'slapcat', '-d', '0', '-F', '/etc/openldap/schema',
'-l', (ldap_staging_dir + '/ldap.db')], stdout=DEVNULL)
archive.add(ldap_staging_dir + '/ldap.db', arcname='ldap.db')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup ldap database.")
raise BackupFail("Failed to backup ldap configuration")
def restore_ldap(archive, ldap_permdir, staging_dir):
""" Restore ldap configuration """
try:
ldap_staging_dir = staging_dir + '/ldap'
archive.extract('ldap.db', path=ldap_staging_dir)
utils.stop_lsb_service('openldap')
subprocess.call(['rm', '-rf', ldap_permdir], stdout=DEVNULL)
os.mkdir(ldap_permdir, 0o755)
subprocess.check_call(['slapadd', '-F', '/etc/openldap/schema',
'-l', ldap_staging_dir + '/ldap.db'],
stdout=DEVNULL, stderr=DEVNULL)
except (subprocess.CalledProcessError, OSError, tarfile.TarError):
LOG.error("Failed to restore ldap database.")
raise RestoreFail("Failed to restore ldap configuration")
finally:
utils.start_lsb_service('openldap')
def backup_mariadb_size():
""" Backup MariaDB size estimate """
try:
total_size = 0
os_backup_dbs = get_os_backup_databases()
        # Estimate the dump size for each database.
for db_elem in os_backup_dbs:
db_cmd = kube_cmd_prefix + mysqldump_prefix
db_cmd += ' %s\' | wc -c' % db_elem
proc = subprocess.Popen([db_cmd], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
total_size += int(proc.stdout.readline())
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate MariaDB database size.")
raise BackupFail("Failed to estimate MariaDB database size")
def backup_mariadb(archive, staging_dir):
""" Backup MariaDB data """
try:
mariadb_staging_dir = staging_dir + '/mariadb'
os.mkdir(mariadb_staging_dir, 0o655)
os_backup_dbs = get_os_backup_databases()
# Backup data for databases.
for db_elem in os_backup_dbs:
db_cmd = kube_cmd_prefix + mysqldump_prefix
db_cmd += ' %s\' > %s/%s.sql.data' % (db_elem,
mariadb_staging_dir, db_elem)
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
archive.add(mariadb_staging_dir, arcname='mariadb')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup MariaDB databases.")
raise BackupFail("Failed to backup MariaDB database.")
def extract_mariadb_data(archive):
""" Extract and store MariaDB data """
try:
# We store MariaDB data in /opt/backups/mariadb for now.
# After MariaDB service is up, we will populate the
# database using these data.
archive.extractall(path=constants.BACKUPS_PATH,
members=filter_directory(archive, 'mariadb'))
except (OSError, tarfile.TarError) as e:
LOG.error("Failed to extract and store MariaDB data. Error: %s", e)
raise RestoreFail("Failed to extract and store MariaDB data.")
def create_helm_overrides_directory():
"""
Create helm overrides directory
During restore, application-apply will be done without
first running application-upload where the helm overrides
directory is created. So we need to create the helm overrides
directory before running application-apply.
"""
try:
os.mkdir(constants.HELM_OVERRIDES_PERMDIR, 0o755)
except OSError:
LOG.error("Failed to create helm overrides directory")
        raise RestoreFail("Failed to create helm overrides directory")
def restore_mariadb():
"""
Restore MariaDB
This function is called after MariaDB service is up
"""
try:
mariadb_staging_dir = constants.BACKUPS_PATH + '/mariadb'
# Restore data for databases.
for data in glob.glob(mariadb_staging_dir + '/*.sql.data'):
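            # The database name is the file name up to the first dot,
            # e.g. 'nova.sql.data' -> 'nova'.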
db_elem = data.split('/')[-1].split('.')[0]
create_db = "create database %s" % db_elem
# Create the database
db_cmd = kube_cmd_prefix + mysql_prefix + '-e"%s" \'' % create_db
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
# Populate data
db_cmd = 'cat %s | ' % data
db_cmd = db_cmd + kube_cmd_prefix + mysql_prefix
db_cmd += '%s\' ' % db_elem
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
shutil.rmtree(mariadb_staging_dir, ignore_errors=True)
except (OSError, subprocess.CalledProcessError) as e:
LOG.error("Failed to restore MariaDB data. Error: %s", e)
raise RestoreFail("Failed to restore MariaDB data.")
def backup_postgres_size():
""" Backup postgres size estimate """
try:
total_size = 0
# Backup roles, table spaces and schemas for databases.
proc = subprocess.Popen([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only | wc -c')], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size = int(line)
break
proc.communicate()
        # Get the list of databases to back up.
backup_databases, backup_db_skip_tables = get_backup_databases()
        # Estimate the data dump size for each database.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '| wc -c'
proc = subprocess.Popen([db_cmd], shell=True,
stdout=subprocess.PIPE, stderr=DEVNULL)
for line in proc.stdout:
total_size += int(line)
break
proc.communicate()
return total_size
except subprocess.CalledProcessError:
LOG.error("Failed to estimate backup database size.")
raise BackupFail("Failed to estimate backup database size")
def backup_postgres(archive, staging_dir):
""" Backup postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
os.mkdir(postgres_staging_dir, 0o655)
# Backup roles, table spaces and schemas for databases.
subprocess.check_call([('sudo -u postgres pg_dumpall --clean ' +
'--schema-only' +
'> %s/%s' % (postgres_staging_dir,
'postgres.sql.config'))],
shell=True, stderr=DEVNULL)
        # Get the list of databases to back up.
backup_databases, backup_db_skip_tables = get_backup_databases()
# Backup data for databases.
for _, db_elem in enumerate(backup_databases):
db_cmd = 'sudo -u postgres pg_dump --format=plain --inserts '
db_cmd += '--disable-triggers --data-only %s ' % db_elem
for _, table_elem in enumerate(backup_db_skip_tables[db_elem]):
db_cmd += '--exclude-table=%s ' % table_elem
db_cmd += '> %s/%s.sql.data' % (postgres_staging_dir, db_elem)
subprocess.check_call([db_cmd], shell=True, stderr=DEVNULL)
archive.add(postgres_staging_dir, arcname='postgres')
except (OSError, subprocess.CalledProcessError, tarfile.TarError):
LOG.error("Failed to backup postgres databases.")
raise BackupFail("Failed to backup database configuration")
def restore_postgres(archive, staging_dir):
""" Restore postgres configuration """
try:
postgres_staging_dir = staging_dir + '/postgres'
archive.extractall(path=staging_dir,
members=filter_directory(archive, 'postgres'))
utils.start_service("postgresql")
# Restore roles, table spaces and schemas for databases.
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
postgres_staging_dir +
'/postgres.sql.config', "postgres"],
stdout=DEVNULL, stderr=DEVNULL)
# Restore data for databases.
for data in glob.glob(postgres_staging_dir + '/*.sql.data'):
db_elem = data.split('/')[-1].split('.')[0]
subprocess.check_call(["sudo", "-u", "postgres", "psql", "-f",
data, db_elem],
stdout=DEVNULL)
except (OSError, subprocess.CalledProcessError, tarfile.TarError) as e:
LOG.error("Failed to restore postgres databases. Error: %s", e)
raise RestoreFail("Failed to restore database configuration")
finally:
utils.stop_service('postgresql')
def filter_config_dir(archive, directory):
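    """ Yield archive members under config/<directory> """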
for tarinfo in archive:
if tarinfo.name.find('config/' + directory) == 0:
yield tarinfo
def restore_config_dir(archive, staging_dir, config_dir, dest_dir):
""" Restore configuration directory if it exists """
try:
archive.extractall(staging_dir,
members=filter_config_dir(archive, config_dir))
# Copy files from backup to dest dir
if (os.path.exists(staging_dir + '/config/' + config_dir) and
os.listdir(staging_dir + '/config/' + config_dir)):
subprocess.call(["mkdir", "-p", dest_dir])
try:
for f in glob.glob(
staging_dir + '/config/' + config_dir + '/*'):
subprocess.check_call(["cp", "-p", f, dest_dir])
except IOError:
LOG.warning("Failed to copy %s files" % config_dir)
except (subprocess.CalledProcessError, tarfile.TarError):
LOG.info("No custom %s config was found during restore." % config_dir)
def backup_std_dir_size(directory):
""" Backup standard directory size estimate """
try:
return utils.directory_get_size(directory)
except OSError:
LOG.error("Failed to estimate backup size for %s" % directory)
raise BackupFail("Failed to estimate backup size for %s" % directory)
def backup_std_dir(archive, directory):
""" Backup standard directory """
try:
archive.add(directory, arcname=os.path.basename(directory))
except tarfile.TarError:
LOG.error("Failed to backup %s" % directory)
raise BackupFail("Failed to backup %s" % directory)
def restore_std_dir(archive, directory):
""" Restore standard directory """
try:
shutil.rmtree(directory, ignore_errors=True)
# Verify that archive contains this directory
try:
archive.getmember(os.path.basename(directory))
except KeyError:
LOG.error("Archive does not contain directory %s" % directory)
raise RestoreFail("Invalid backup file - missing directory %s" %
directory)
archive.extractall(
path=os.path.dirname(directory),
members=filter_directory(archive, os.path.basename(directory)))
except (shutil.Error, tarfile.TarError):
LOG.error("Failed to restore %s" % directory)
raise RestoreFail("Failed to restore %s" % directory)
def configure_loopback_interface(archive):
""" Restore and apply configuration for loopback interface """
utils.remove_interface_config_files()
restore_etc_file(
archive, utils.NETWORK_SCRIPTS_PATH,
'sysconfig/network-scripts/' + utils.NETWORK_SCRIPTS_LOOPBACK)
utils.restart_networking()
def backup_ceph_crush_map(archive, staging_dir):
""" Backup ceph crush map """
try:
ceph_staging_dir = os.path.join(staging_dir, 'ceph')
os.mkdir(ceph_staging_dir, 0o655)
crushmap_file = os.path.join(ceph_staging_dir,
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
subprocess.check_call(['ceph', 'osd', 'getcrushmap',
'-o', crushmap_file], stdout=DEVNULL,
stderr=DEVNULL)
archive.add(crushmap_file, arcname='ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP)
except Exception as e:
LOG.error('Failed to backup ceph crush map. Reason: {}'.format(e))
raise BackupFail('Failed to backup ceph crush map')
def restore_ceph_crush_map(archive):
""" Restore ceph crush map """
if not file_exists_in_archive(archive, 'ceph/' +
sysinv_constants.CEPH_CRUSH_MAP_BACKUP):
return
try:
crush_map_file = 'ceph/' + sysinv_constants.CEPH_CRUSH_MAP_BACKUP
if file_exists_in_archive(archive, crush_map_file):
member = archive.getmember(crush_map_file)
# Copy the member to avoid changing the name for future
# operations on this member.
temp_member = copy.copy(member)
temp_member.name = os.path.basename(temp_member.name)
archive.extract(temp_member,
path=sysinv_constants.SYSINV_CONFIG_PATH)
except tarfile.TarError as e:
LOG.error('Failed to restore crush map file. Reason: {}'.format(e))
raise RestoreFail('Failed to restore crush map file')
def check_size(archive_dir):
"""Check if there is enough space to create backup."""
backup_overhead_bytes = 1024 ** 3 # extra GB for staging directory
backup_size = (backup_overhead_bytes +
backup_etc_size() +
backup_config_size(tsconfig.CONFIG_PATH) +
backup_puppet_data_size(constants.HIERADATA_PERMDIR) +
backup_keyring_size(keyring_permdir) +
backup_ldap_size() +
backup_postgres_size() +
backup_std_dir_size(home_permdir) +
backup_std_dir_size(patching_permdir) +
backup_std_dir_size(patching_repo_permdir) +
backup_std_dir_size(extension_permdir) +
backup_std_dir_size(patch_vault_permdir) +
backup_armada_manifest_size(constants.ARMADA_PERMDIR) +
backup_std_dir_size(constants.HELM_CHARTS_PERMDIR) +
backup_mariadb_size()
)
archive_dir_free_space = \
utils.filesystem_get_free_space(archive_dir)
if backup_size > archive_dir_free_space:
print("Archive directory (%s) does not have enough free "
"space (%s), estimated backup size is %s." %
(archive_dir, utils.print_bytes(archive_dir_free_space),
utils.print_bytes(backup_size)))
raise BackupFail("Not enough free space for backup.")
def backup(backup_name, archive_dir, clone=False):
"""Backup configuration."""
if not os.path.isdir(archive_dir):
raise BackupFail("Archive directory (%s) not found." % archive_dir)
if not utils.is_active("management-ip"):
raise BackupFail(
"Backups can only be performed from the active controller.")
if os.path.isfile(backup_in_progress):
raise BackupFail("Backup already in progress.")
else:
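        # Create the flag file that marks a backup as in progress;
        # it is removed in the finally block below.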
open(backup_in_progress, 'w')
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_SET,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Backup in progress."),
# operational
alarm_type=fm_constants.FM_ALARM_TYPE_7,
# congestion
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8,
proposed_repair_action=("No action required."),
service_affecting=False)
fmApi.set_fault(fault)
staging_dir = None
system_tar_path = None
warnings = ''
try:
os.chdir('/')
if not clone:
check_size(archive_dir)
        print("\nPerforming backup (this might take several minutes):")
staging_dir = tempfile.mkdtemp(dir=archive_dir)
system_tar_path = os.path.join(archive_dir,
backup_name + '_system.tgz')
system_archive = tarfile.open(system_tar_path, "w:gz")
step = 1
total_steps = 16
# Step 1: Backup etc
backup_etc(system_archive)
utils.progress(total_steps, step, 'backup etc', 'DONE')
step += 1
# Step 2: Backup configuration
backup_config(system_archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'backup configuration', 'DONE')
step += 1
# Step 3: Backup puppet data
backup_puppet_data(system_archive, constants.HIERADATA_PERMDIR)
utils.progress(total_steps, step, 'backup puppet data', 'DONE')
step += 1
# Step 4: Backup armada data
backup_armada_manifest_data(system_archive, constants.ARMADA_PERMDIR)
utils.progress(total_steps, step, 'backup armada data', 'DONE')
step += 1
# Step 5: Backup helm charts data
backup_std_dir(system_archive, constants.HELM_CHARTS_PERMDIR)
utils.progress(total_steps, step, 'backup helm charts', 'DONE')
step += 1
# Step 6: Backup keyring
backup_keyring(system_archive, keyring_permdir)
utils.progress(total_steps, step, 'backup keyring', 'DONE')
step += 1
# Step 7: Backup ldap
backup_ldap(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ldap', 'DONE')
step += 1
# Step 8: Backup postgres
backup_postgres(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup postgres', 'DONE')
step += 1
# Step 9: Backup mariadb
backup_mariadb(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup mariadb', 'DONE')
step += 1
# Step 10: Backup home
backup_std_dir(system_archive, home_permdir)
utils.progress(total_steps, step, 'backup home directory', 'DONE')
step += 1
# Step 11: Backup patching
if not clone:
backup_std_dir(system_archive, patching_permdir)
utils.progress(total_steps, step, 'backup patching', 'DONE')
step += 1
# Step 12: Backup patching repo
if not clone:
backup_std_dir(system_archive, patching_repo_permdir)
utils.progress(total_steps, step, 'backup patching repo', 'DONE')
step += 1
# Step 13: Backup extension filesystem
backup_std_dir(system_archive, extension_permdir)
utils.progress(total_steps, step, 'backup extension filesystem '
'directory', 'DONE')
step += 1
# Step 14: Backup patch-vault filesystem
if os.path.exists(patch_vault_permdir):
backup_std_dir(system_archive, patch_vault_permdir)
utils.progress(total_steps, step, 'backup patch-vault filesystem '
'directory', 'DONE')
step += 1
# Step 15: Backup ceph crush map
backup_ceph_crush_map(system_archive, staging_dir)
utils.progress(total_steps, step, 'backup ceph crush map', 'DONE')
step += 1
# Step 16: Create archive
system_archive.close()
utils.progress(total_steps, step, 'create archive', 'DONE')
step += 1
except Exception:
if system_tar_path and os.path.isfile(system_tar_path):
os.remove(system_tar_path)
raise
finally:
fmApi.clear_fault(fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
entity_instance_id)
os.remove(backup_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
system_msg = "System backup file created"
if not clone:
system_msg += ": " + system_tar_path
print(system_msg)
if warnings != '':
print("WARNING: The following problems occurred:")
print(textwrap.fill(warnings, 80))
def create_restore_runtime_config(filename):
""" Create any runtime parameters needed for Restore."""
config = {}
# We need to re-enable Openstack password rules, which
# were previously disabled while the controller manifests
# were applying during a Restore
config['classes'] = ['keystone::security_compliance']
utils.create_manifest_runtime_config(filename, config)
def restore_system(backup_file, include_storage_reinstall=False, clone=False):
"""Restoring system configuration."""
if (os.path.exists(constants.CGCS_CONFIG_FILE) or
os.path.exists(tsconfig.CONFIG_PATH) or
os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE)):
print(textwrap.fill(
"Configuration has already been done. "
"A system restore operation can only be done "
"immediately after the load has been installed.", 80))
print('')
raise RestoreFail("System configuration already completed")
if not os.path.isabs(backup_file):
raise RestoreFail("Backup file (%s) not found. Full path is "
"required." % backup_file)
if os.path.isfile(restore_in_progress):
raise RestoreFail("Restore already in progress.")
else:
open(restore_in_progress, 'w')
# Add newline to console log for install-clone scenario
newline = clone
staging_dir = None
try:
try:
with open(os.devnull, "w") as fnull:
subprocess.check_call(["vgdisplay", "cgts-vg"],
stdout=fnull,
stderr=fnull)
except subprocess.CalledProcessError:
LOG.error("The cgts-vg volume group was not found")
raise RestoreFail("Volume groups not configured")
print("\nRestoring system (this will take several minutes):")
# Use /scratch for the staging dir for now,
# until /opt/backups is available
staging_dir = tempfile.mkdtemp(dir='/scratch')
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
os.chdir('/')
step = 1
total_steps = 26
# Step 1: Open archive and verify installed load matches backup
try:
archive = tarfile.open(backup_file)
except tarfile.TarError as e:
LOG.exception(e)
raise RestoreFail("Error opening backup file. Invalid backup "
"file.")
check_load_versions(archive, staging_dir)
check_load_subfunctions(archive, staging_dir)
utils.progress(total_steps, step, 'open archive', 'DONE', newline)
step += 1
# Patching is potentially a multi-phase step.
# If the controller is impacted by patches from the backup,
# it must be rebooted before continuing the restore.
# If this is the second pass through, we can skip over this.
if not os.path.isfile(restore_patching_complete) and not clone:
# Step 2: Restore patching
restore_std_dir(archive, patching_permdir)
utils.progress(total_steps, step, 'restore patching', 'DONE',
newline)
step += 1
# Step 3: Restore patching repo
restore_std_dir(archive, patching_repo_permdir)
utils.progress(total_steps, step, 'restore patching repo', 'DONE',
newline)
step += 1
# Step 4: Apply patches
try:
subprocess.check_output(["sw-patch", "install-local"])
except subprocess.CalledProcessError:
LOG.error("Failed to install patches")
raise RestoreFail("Failed to install patches")
utils.progress(total_steps, step, 'install patches', 'DONE',
newline)
step += 1
open(restore_patching_complete, 'w')
# If the controller was impacted by patches, we need to reboot.
if os.path.isfile(node_is_patched):
if not clone:
print("\nThis controller has been patched. " +
"A reboot is required.")
print("After the reboot is complete, " +
"re-execute the restore command.")
while True:
user_input = input(
"Enter 'reboot' to reboot controller: ")
if user_input == 'reboot':
break
LOG.info("This controller has been patched. Rebooting now")
print("\nThis controller has been patched. Rebooting now\n\n")
time.sleep(5)
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
subprocess.call("reboot")
else:
# We need to restart the patch controller and agent, since
# we setup the repo and patch store outside its control
with open(os.devnull, "w") as devnull:
subprocess.call(
["systemctl",
"restart",
"sw-patch-controller-daemon.service"],
stdout=devnull, stderr=devnull)
subprocess.call(
["systemctl",
"restart",
"sw-patch-agent.service"],
stdout=devnull, stderr=devnull)
if clone:
# No patches were applied, return to cloning code
# to run validation code.
return RESTORE_RERUN_REQUIRED
else:
# Add the skipped steps
step += 3
if os.path.isfile(node_is_patched):
# If we get here, it means the node was patched by the user
# AFTER the restore applied patches and rebooted, but didn't
# reboot.
# This means the patch lineup no longer matches what's in the
# backup, but we can't (and probably shouldn't) prevent that.
# However, since this will ultimately cause the node to fail
# the goenabled step, we can fail immediately and force the
# user to reboot.
            print("\nThis controller has been patched, but not rebooted.")
            print("Please reboot before continuing the restore process.")
raise RestoreFail("Controller node patched without rebooting")
# Flag can now be cleared
if os.path.exists(restore_patching_complete):
os.remove(restore_patching_complete)
# Prefetch keyring
prefetch_keyring(archive)
# Step 5: Restore configuration
restore_configuration(archive, staging_dir)
# In AIO SX systems, the loopback interface is used as the management
# interface. However, the application of the interface manifest will
# not configure the necessary addresses on the loopback interface (see
# apply_network_config.sh for details). So, we need to configure the
# loopback interface here.
if tsconfig.system_mode == sysinv_constants.SYSTEM_MODE_SIMPLEX:
configure_loopback_interface(archive)
# Write the simplex flag
utils.write_simplex_flag()
utils.progress(total_steps, step, 'restore configuration', 'DONE',
newline)
step += 1
# Step 6: Apply restore bootstrap manifest
controller_0_address = utils.get_address_from_hosts_file(
'controller-0')
restore_static_puppet_data(archive, constants.HIERADATA_WORKDIR)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'bootstrap',
constants.HIERADATA_WORKDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply bootstrap manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply bootstrap manifest', 'DONE',
newline)
step += 1
# Step 7: Restore puppet data
restore_puppet_data(archive, constants.HIERADATA_WORKDIR,
controller_0_address)
utils.progress(total_steps, step, 'restore puppet data', 'DONE',
newline)
step += 1
# Step 8: Persist configuration
utils.persist_config()
utils.progress(total_steps, step, 'persist configuration', 'DONE',
newline)
step += 1
# Step 9: Apply controller manifest
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'controller',
constants.HIERADATA_PERMDIR)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step, 'apply controller manifest', 'DONE',
newline)
step += 1
# Step 10: Apply runtime controller manifests
restore_filename = os.path.join(staging_dir, 'restore.yaml')
create_restore_runtime_config(restore_filename)
try:
utils.apply_manifest(controller_0_address,
sysinv_constants.CONTROLLER,
'runtime',
constants.HIERADATA_PERMDIR,
runtime_filename=restore_filename)
except Exception as e:
LOG.exception(e)
raise RestoreFail(
'Failed to apply runtime controller manifest. '
'See /var/log/puppet/latest/puppet.log for details.')
utils.progress(total_steps, step,
'apply runtime controller manifest', 'DONE',
newline)
step += 1
# Move the staging dir under /opt/backups, now that it's setup
shutil.rmtree(staging_dir, ignore_errors=True)
staging_dir = tempfile.mkdtemp(dir=constants.BACKUPS_PATH)
# Permission change required or postgres restore fails
subprocess.call(['chmod', 'a+rx', staging_dir], stdout=DEVNULL)
# Step 11: Apply banner customization
utils.apply_banner_customization()
utils.progress(total_steps, step, 'apply banner customization', 'DONE',
newline)
step += 1
# Step 12: Restore dnsmasq and pxeboot config
restore_dnsmasq(archive, tsconfig.CONFIG_PATH)
utils.progress(total_steps, step, 'restore dnsmasq', 'DONE', newline)
step += 1
# Step 13: Restore keyring
restore_keyring(archive, keyring_permdir)
utils.progress(total_steps, step, 'restore keyring', 'DONE', newline)
step += 1
# Step 14: Restore ldap
restore_ldap(archive, ldap_permdir, staging_dir)
utils.progress(total_steps, step, 'restore ldap', 'DONE', newline)
step += 1
# Step 15: Restore postgres
restore_postgres(archive, staging_dir)
utils.progress(total_steps, step, 'restore postgres', 'DONE', newline)
step += 1
# Step 16: Extract and store mariadb data
extract_mariadb_data(archive)
utils.progress(total_steps, step, 'extract mariadb', 'DONE', newline)
step += 1
# Step 17: Restore ceph crush map
restore_ceph_crush_map(archive)
utils.progress(total_steps, step, 'restore ceph crush map', 'DONE',
newline)
step += 1
# Step 18: Restore home
restore_std_dir(archive, home_permdir)
utils.progress(total_steps, step, 'restore home directory', 'DONE',
newline)
step += 1
# Step 19: Restore extension filesystem
restore_std_dir(archive, extension_permdir)
utils.progress(total_steps, step, 'restore extension filesystem '
'directory', 'DONE', newline)
step += 1
# Step 20: Restore patch-vault filesystem
if file_exists_in_archive(archive,
os.path.basename(patch_vault_permdir)):
restore_std_dir(archive, patch_vault_permdir)
utils.progress(total_steps, step, 'restore patch-vault filesystem '
'directory', 'DONE', newline)
step += 1
# Step 21: Restore external ceph configuration files.
restore_ceph_external_config_files(archive, staging_dir)
utils.progress(total_steps, step, 'restore CEPH external config',
'DONE', newline)
step += 1
# Step 22: Restore Armada manifest
restore_armada_manifest_data(archive, constants.ARMADA_PERMDIR)
utils.progress(total_steps, step, 'restore armada manifest',
'DONE', newline)
step += 1
# Step 23: Restore Helm charts
restore_std_dir(archive, constants.HELM_CHARTS_PERMDIR)
utils.progress(total_steps, step, 'restore helm charts',
'DONE', newline)
step += 1
# Step 24: Create Helm overrides directory
create_helm_overrides_directory()
utils.progress(total_steps, step, 'create helm overrides directory',
'DONE', newline)
step += 1
# Step 25: Shutdown file systems
archive.close()
shutil.rmtree(staging_dir, ignore_errors=True)
utils.shutdown_file_systems()
utils.progress(total_steps, step, 'shutdown file systems', 'DONE',
newline)
step += 1
# Step 26: Recover services
utils.mtce_restart()
utils.mark_config_complete()
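        # Allow time for maintenance and service management to bring
        # services up before polling the sysinv services below.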
time.sleep(120)
for service in ['sysinv-conductor', 'sysinv-inv']:
if not utils.wait_sm_service(service):
raise RestoreFail("Services have failed to initialize.")
utils.progress(total_steps, step, 'recover services', 'DONE', newline)
step += 1
if tsconfig.system_mode != sysinv_constants.SYSTEM_MODE_SIMPLEX:
print("\nRestoring node states (this will take several minutes):")
with openstack.OpenStack() as client:
# On ceph setups storage nodes take about 90 seconds
# to become locked. Setting the timeout to 120 seconds
# for such setups
lock_timeout = 60
storage_hosts = sysinv.get_hosts(client.admin_token,
client.conf['region_name'],
personality='storage')
if storage_hosts:
lock_timeout = 120
failed_lock_host = False
skip_hosts = ['controller-0']
if not include_storage_reinstall:
if storage_hosts:
install_uuid = utils.get_install_uuid()
for h in storage_hosts:
skip_hosts.append(h.name)
# Update install_uuid on the storage node
client.sysinv.ihost.update_install_uuid(
h.uuid,
install_uuid)
skip_hosts_count = len(skip_hosts)
# Wait for nodes to be identified as disabled before attempting
# to lock hosts. Even if after 3 minute nodes are still not
# identified as disabled, we still continue the restore.
if not client.wait_for_hosts_disabled(
exempt_hostnames=skip_hosts,
timeout=180):
LOG.info("At least one node is not in a disabling state. "
"Continuing.")
print("\nLocking nodes:")
try:
failed_hosts = client.lock_hosts(skip_hosts,
utils.progress,
timeout=lock_timeout)
# Don't power off nodes that could not be locked
if len(failed_hosts) > 0:
                        skip_hosts.extend(failed_hosts)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
failed_lock_host = True
if not failed_lock_host:
print("\nPowering-off nodes:")
try:
client.power_off_hosts(skip_hosts,
utils.progress,
timeout=60)
except (KeystoneFail, SysInvFail) as e:
LOG.exception(e)
                        # Power-off failures are tolerated here;
                        # the restore continues regardless.
if failed_lock_host or len(skip_hosts) > skip_hosts_count:
if include_storage_reinstall:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked nodes manually.", 80
))
else:
print(textwrap.fill(
"Failed to lock at least one node. " +
"Please lock the unlocked controller-1 or " +
"worker nodes manually.", 80
))
if not clone:
print(textwrap.fill(
"Before continuing to the next step in the restore, " +
"please ensure all nodes other than controller-0 " +
"and storage nodes, if they are not being " +
"reinstalled, are powered off. Please refer to the " +
"system administration guide for more details.", 80
))
finally:
os.remove(restore_in_progress)
if staging_dir:
shutil.rmtree(staging_dir, ignore_errors=True)
cleanup_prefetched_keyring()
fmApi = fm_api.FaultAPIs()
entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST,
sysinv_constants.CONTROLLER_HOSTNAME)
fault = fm_api.Fault(
alarm_id=fm_constants.FM_ALARM_ID_BACKUP_IN_PROGRESS,
alarm_state=fm_constants.FM_ALARM_STATE_MSG,
entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST,
entity_instance_id=entity_instance_id,
severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
reason_text=("System Restore complete."),
# other
alarm_type=fm_constants.FM_ALARM_TYPE_0,
# unknown
probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_UNKNOWN,
proposed_repair_action=(""),
service_affecting=False)
fmApi.set_fault(fault)
if utils.get_system_type() == sysinv_constants.TIS_AIO_BUILD:
print("\nApplying worker manifests for %s. " %
(utils.get_controller_hostname()))
print("Node will reboot on completion.")
sysinv.do_worker_config_complete(utils.get_controller_hostname())
# show in-progress log on console every 30 seconds
# until self reboot or timeout
time.sleep(30)
for i in range(1, 10):
print("worker manifest apply in progress ... ")
time.sleep(30)
raise RestoreFail("Timeout running worker manifests, "
"reboot did not occur")
return RESTORE_COMPLETE