Support background runtime manifests during upgrade-activate

In distributed cloud environments runtime manifests can be applied in
the background. This can cause hosts to become config out-of-date after
the upgrade-activate completes. This is due to the large window between
setting the host's config_target and updating the config_applied. If a
manifest is run in this window the host will remain config out-of-date
until a lock/unlock is performed.

To address this the config_target changes will be limited to hosts that
apply a runtime manifest as part of the upgrade-activate process.
Further the config_target will be updated immediately before the
_config_apply_runtime_manifest is called.

Story: 2008055
Task: 41917
Change-Id: I2e60c7557e8d398eeef2a407a0552f5e8f4a1f18
Signed-off-by: David Sullivan <david.sullivan@windriver.com>
This commit is contained in:
David Sullivan 2021-02-19 09:21:32 -06:00
parent 85c31c46ab
commit 6add4f2dfb
4 changed files with 65 additions and 54 deletions

View File

@ -318,6 +318,7 @@ class UpgradeController(rest.RestController):
elif updates['state'] == constants.UPGRADE_ACTIVATION_REQUESTED:
if upgrade.state in [constants.UPGRADE_ACTIVATING,
constants.UPGRADE_ACTIVATING_HOSTS,
constants.UPGRADE_ACTIVATION_COMPLETE]:
raise wsme.exc.ClientSideError(_(
"upgrade-activate rejected: "

View File

@ -1165,6 +1165,7 @@ UPGRADE_UPGRADING_CONTROLLERS = 'upgrading-controllers'
UPGRADE_UPGRADING_HOSTS = 'upgrading-hosts'
UPGRADE_ACTIVATION_REQUESTED = 'activation-requested'
UPGRADE_ACTIVATING = 'activating'
UPGRADE_ACTIVATING_HOSTS = 'activating-hosts'
UPGRADE_ACTIVATION_FAILED = 'activation-failed'
UPGRADE_ACTIVATION_COMPLETE = 'activation-complete'
UPGRADE_COMPLETING = 'completing'

View File

@ -1051,41 +1051,34 @@ class ConductorManager(service.PeriodicService):
raise exception.SysinvException(_(
"Failed to create pxelinux.cfg file"))
def _enable_etcd_security_config(self, context, config_uuid):
def _enable_etcd_security_config(self, context):
"""Update the manifests for etcd security
Note: this can be removed in the release after STX5.0
returns True if runtime manifests were applied
"""
controllers = self.dbapi.ihost_get_by_personality(constants.CONTROLLER)
for host in controllers:
if not utils.is_host_active_controller(host):
# Just enable etcd security on the standby controller.
# Etcd security was enabled on the active controller with a
# migration script.
personalities = [constants.CONTROLLER]
host_uuids = [host.uuid]
config_uuid = self._config_update_hosts(
context, personalities, host_uuids)
config_dict = {
"personalities": personalities,
"host_uuids": host_uuids,
"classes": ['platform::etcd::upgrade::runtime'],
puppet_common.REPORT_STATUS_CFG:
puppet_common.REPORT_UPGRADE_ACTIONS
}
self._config_apply_runtime_manifest(context,
config_uuid=config_uuid,
config_dict=config_dict)
return True
personalities = [constants.CONTROLLER]
hostname = socket.gethostname()
ctrls = self.dbapi.ihost_get_by_personality(constants.CONTROLLER)
valid_ctrls = [ctrl for ctrl in ctrls if
ctrl.administrative == constants.ADMIN_UNLOCKED and
ctrl.availability in [constants.AVAILABILITY_AVAILABLE,
constants.AVAILABILITY_DEGRADED]]
active_ctrl_host = None
standby_ctrl_host = None
for controller_host in valid_ctrls:
if controller_host.hostname != hostname:
standby_ctrl_host = controller_host
else:
active_ctrl_host = controller_host
# Applied etcd security puppet manifest in migration script for active controller.
self._update_host_config_applied(context, active_ctrl_host, config_uuid)
# Just enable etcd security in standby controller, as it has already been
# enabled in active controller side in migration script.
if standby_ctrl_host:
config_dict = {"personalities": personalities,
"host_uuids": [standby_ctrl_host.uuid],
"classes": ['platform::etcd::upgrade::runtime'],
}
self._config_apply_runtime_manifest(context,
config_uuid=config_uuid,
config_dict=config_dict)
return False
def _remove_pxe_config(self, host):
"""Delete the PXE config file for this host.
@ -5146,17 +5139,11 @@ class ConductorManager(service.PeriodicService):
# Not upgrading. No need to update status
return
if upgrade.state == constants.UPGRADE_ACTIVATING:
personalities = [constants.CONTROLLER, constants.WORKER]
all_manifests_applied = True
if upgrade.state == constants.UPGRADE_ACTIVATING_HOSTS:
hosts = self.dbapi.ihost_get_list()
for host in hosts:
if host.personality in personalities and \
host.config_target != host.config_applied:
all_manifests_applied = False
break
if all_manifests_applied:
out_of_date_hosts = [host for host in hosts if host.config_target != host.config_applied]
if not out_of_date_hosts:
LOG.info("Manifests applied. Upgrade activation complete.")
self.dbapi.software_upgrade_update(
upgrade.uuid,
{'state': constants.UPGRADE_ACTIVATION_COMPLETE})
@ -6960,7 +6947,14 @@ class ConductorManager(service.PeriodicService):
# Identify the executed set of manifests executed
success = False
if reported_cfg == puppet_common.REPORT_DISK_PARTITON_CONFIG:
if reported_cfg == puppet_common.REPORT_UPGRADE_ACTIONS:
if status == puppet_common.REPORT_SUCCESS:
success = True
else:
host_uuid = iconfig['host_uuid']
LOG.info("Upgrade manifest failed for host: %s" % host_uuid)
self.report_upgrade_config_failure()
elif reported_cfg == puppet_common.REPORT_DISK_PARTITON_CONFIG:
partition_uuid = iconfig['partition_uuid']
host_uuid = iconfig['host_uuid']
idisk_uuid = iconfig['idisk_uuid']
@ -7867,6 +7861,19 @@ class ConductorManager(service.PeriodicService):
values = {'state': constants.SB_STATE_CONFIG_ERR}
self.dbapi.istor_update(stor_uuid, values)
def report_upgrade_config_failure(self):
"""
Callback for Sysinv Agent on upgrade manifest failure
"""
try:
upgrade = self.dbapi.software_upgrade_get_one()
except exception.NotFound:
LOG.error("Upgrade record not found during config failure")
return
self.dbapi.software_upgrade_update(
upgrade.uuid,
{'state': constants.UPGRADE_ACTIVATION_FAILED})
def create_controller_filesystems(self, context, rootfs_device):
""" Create the storage config based on disk size for database, platform,
extension, rabbit, etcd, docker-distribution, dc-vault(SC)
@ -9973,11 +9980,6 @@ class ConductorManager(service.PeriodicService):
to_load = self.dbapi.load_get(upgrade.to_load)
to_version = to_load.software_version
# Update the config target of the controllers. This prevents the audit
# from changing the upgrade state before we're ready.
personalities = [constants.CONTROLLER]
config_uuid = self._config_update_hosts(context, personalities)
self.dbapi.software_upgrade_update(
upgrade.uuid, {'state': constants.UPGRADE_ACTIVATING})
@ -9996,14 +9998,20 @@ class ConductorManager(service.PeriodicService):
self.dbapi.software_upgrade_update(
upgrade.uuid,
{'state': constants.UPGRADE_ACTIVATION_FAILED})
manifests_applied = False
if from_version == tsc.SW_VERSION_20_06:
# Apply etcd security puppet manifest to the standby controller.
manifests_applied = self._enable_etcd_security_config(context)
if manifests_applied:
LOG.info("Running upgrade activation manifests")
self.dbapi.software_upgrade_update(
upgrade.uuid, {'state': constants.UPGRADE_ACTIVATING_HOSTS})
else:
if from_version == tsc.SW_VERSION_20_06:
# Apply etcd security puppet manifest here for standby controller.
self._enable_etcd_security_config(context, config_uuid)
else:
hosts = self.dbapi.ihost_get_by_personality(constants.CONTROLLER)
for host in hosts:
self._update_host_config_applied(context, host, config_uuid)
LOG.info("Upgrade activation complete")
self.dbapi.software_upgrade_update(
upgrade.uuid, {'state': constants.UPGRADE_ACTIVATION_COMPLETE})
def complete_upgrade(self, context, upgrade, state):
""" Complete the upgrade"""

View File

@ -27,6 +27,7 @@ REPORT_FAILURE = 'report_failure'
REPORT_INVENTORY_UPDATE = 'inventory_update'
# name of manifest config operations to report back to sysinv conductor
REPORT_UPGRADE_ACTIONS = 'upgrade_actions'
REPORT_AIO_CINDER_CONFIG = 'aio_cinder_config'
REPORT_DISK_PARTITON_CONFIG = 'manage_disk_partitions'
REPORT_LVM_BACKEND_CONFIG = 'lvm_config'