#
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import copy
import re

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
from dcmanager.common import consts
from dcmanager.common.exceptions import ManualRecoveryRequiredException
from dcmanager.common.exceptions import PreCheckFailedException
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
    REGION_ONE_SYSTEM_INFO_CACHE_TYPE

# These deploy states should transition to the 'upgrading' state
VALID_UPGRADE_STATES = [consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_INSTALL_FAILED,
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED, ]

# These deploy states should transition to the 'migrating_data' state
VALID_MIGRATE_DATA_STATES = [consts.DEPLOY_STATE_INSTALLED, ]

# These deploy states should transition to the 'activating_upgrade' state
VALID_ACTIVATION_STATES = [consts.DEPLOY_STATE_MIGRATED, ]

MIN_SCRATCH_SIZE_REQUIRED_GB = 16

UPGRADE_IN_PROGRESS_ALARM = '900.005'
HOST_ADMINISTRATIVELY_LOCKED_ALARM = '200.001'

ALARM_IGNORE_LIST = [UPGRADE_IN_PROGRESS_ALARM, ]


class PreCheckState(BaseState):
    """This State performs entry checks and skips to the appropriate state"""

    def __init__(self, region_name):
        super(PreCheckState, self).__init__(
            next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE,
            region_name=region_name)

    def _check_health(self, strategy_step, subcloud_sysinv_client,
                      subcloud_fm_client, host, upgrades):
        """Check the subcloud's upgrade health and raise on failure.

        Parses the text report from 'system health-query-upgrade' on the
        subcloud. Persists a truncated error description on the subcloud DB
        record and raises PreCheckFailedException when the subcloud is not
        healthy enough to upgrade.

        :param strategy_step: the strategy step being processed
        :param subcloud_sysinv_client: sysinv client for the subcloud
        :param subcloud_fm_client: FM (fault management) client for the subcloud
        :param host: controller-0 host object of the subcloud
        :param upgrades: list of in-progress upgrades on the subcloud (may be
            empty if the upgrade has not started)
        :raises PreCheckFailedException: when a health check failure is not
            covered by the acceptable conditions below
        """
        # Check system upgrade health
        #
        # Sample output #1
        # ================
        # Some non-management affecting alarms, all other checks passed
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [OK]
        # No alarms: [Fail]
        # [1] alarms found, [0] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further
        #
        # Sample output #2
        # ================
        # Multiple failed checks, management affecting alarms
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [Fail]
        # No alarms: [Fail]
        # [7] alarms found, [2] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        # Active kubernetes version is the latest supported version: [OK]
        # No imported load found. Unable to test further

        # TODO(teewrs): Update the sysinv API to allow a list of ignored alarms
        # to be passed to the health check API. This would be much more efficient
        # than having to retrieve the alarms in a separate step.
        system_health = subcloud_sysinv_client.get_system_health_upgrade()
        # Raw strings: the patterns contain literal bracket characters which
        # must be escaped for the regex engine, not the string parser.
        fails = re.findall(r"\[Fail\]", system_health)
        failed_alarm_check = re.findall(r"No alarms: \[Fail\]", system_health)
        no_mgmt_alarms = re.findall(r"\[0\] of which are management affecting",
                                    system_health)

        alarm_ignore_list = copy.copy(ALARM_IGNORE_LIST)
        if (host.administrative == consts.ADMIN_LOCKED and upgrades):
            # Once the upgrade has started with the host locked, the locked
            # alarm is expected and must not fail the health check.
            alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)

        # Clean old error messages
        db_api.subcloud_update(
            self.context, strategy_step.subcloud_id,
            error_description=consts.ERROR_DESC_EMPTY)

        # The health conditions acceptable for upgrade are:
        # a) subcloud is completely healthy (i.e. no failed checks)
        # b) subcloud only fails alarm check and it only has non-management
        #    affecting alarm(s)
        # c) the management alarm(s) that subcloud has once upgrade has started
        #    are upgrade alarm itself and host locked alarm
        if ((len(fails) == 0) or
                (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
            self.info_log(strategy_step, "Health check passed.")
            return

        if not failed_alarm_check:
            # Health check failure: no alarms involved
            #
            # These could be Kubernetes or other related failure(s) which has
            # not been converted into an alarm condition.
            error_desc_msg = ("System upgrade health check failed. \n %s" %
                              fails)
            db_api.subcloud_update(
                self.context, strategy_step.subcloud_id,
                error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
            details = ("System upgrade health check failed. Please run "
                       "'system health-query-upgrade' command on the subcloud or %s "
                       "on central for details" % (consts.ERROR_DESC_CMD))
            self.error_log(strategy_step, "\n" + system_health)
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
            )
        else:
            # Health check failure: one or more alarms
            if (upgrades and (len(fails) == len(alarm_ignore_list))):
                # Upgrade has started, previous try failed either before or
                # after host lock.
                return
            elif len(fails) == 1:
                # Healthy check failure: exclusively alarms related
                alarms = subcloud_fm_client.get_alarms()
                for alarm in alarms:
                    if alarm.alarm_id not in alarm_ignore_list:
                        # NOTE: mgmt_affecting is a string, not a bool
                        if alarm.mgmt_affecting == "True":
                            error_desc_msg = (
                                "System upgrade health check failed due to "
                                "alarm %s. System upgrade health: \n %s" %
                                (alarm.alarm_id, system_health))
                            db_api.subcloud_update(
                                self.context, strategy_step.subcloud_id,
                                error_description=error_desc_msg[
                                    0:consts.ERROR_DESCRIPTION_LENGTH])
                            details = (
                                "System upgrade health check failed due to "
                                "alarm %s. Please run 'system health-query-upgrade' "
                                "command on the subcloud or %s on central for "
                                "details." % (alarm.alarm_id,
                                              consts.ERROR_DESC_CMD))
                            self.error_log(strategy_step, "\n" + system_health)
                            raise PreCheckFailedException(
                                subcloud=strategy_step.subcloud.name,
                                details=details,
                            )
            else:
                # Multiple failures
                error_desc_msg = (
                    "System upgrade health check failed due to multiple failures. "
                    "Health: \n %s" % system_health)
                db_api.subcloud_update(
                    self.context, strategy_step.subcloud_id,
                    error_description=error_desc_msg[
                        0:consts.ERROR_DESCRIPTION_LENGTH])
                details = (
                    "System upgrade health check failed due to multiple failures. "
                    "Please run 'system health-query-upgrade' command on the "
                    "subcloud or %s on central for details." %
                    consts.ERROR_DESC_CMD)
                self.error_log(strategy_step, "\n" + system_health)
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details,
                )

    def _check_scratch(self, strategy_step, subcloud_sysinv_client, host):
        """Verify the host scratch filesystem meets the minimum upgrade size.

        :raises PreCheckFailedException: when the scratch filesystem is
            smaller than MIN_SCRATCH_SIZE_REQUIRED_GB
        """
        scratch_fs = subcloud_sysinv_client.get_host_filesystem(
            host.uuid, HOST_FS_NAME_SCRATCH)
        if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
            details = ("Scratch filesystem size of %s does not meet "
                       "minimum required %s" %
                       (scratch_fs.size, MIN_SCRATCH_SIZE_REQUIRED_GB))
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
            )

    def _perform_subcloud_online_checks(self, strategy_step,
                                        subcloud_sysinv_client,
                                        subcloud_fm_client, host, upgrades):
        """Run the online-subcloud pre-checks: health and scratch size."""
        self._check_health(strategy_step, subcloud_sysinv_client,
                           subcloud_fm_client, host, upgrades)
        self._check_scratch(strategy_step, subcloud_sysinv_client, host)

    def perform_state_action(self, strategy_step):
        """This state will check if the subcloud is offline:

        Check the deploy_status and transfer to the correct state.
        if an unsupported deploy_status is encountered, fail the upgrade
        """
        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)

        if subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE:
            subcloud_sysinv_client = None
            try:
                subcloud_sysinv_client = \
                    self.get_sysinv_client(strategy_step.subcloud.region_name)
                subcloud_fm_client = \
                    self.get_fm_client(strategy_step.subcloud.region_name)
            except Exception:
                # if getting the token times out, the orchestrator may have
                # restarted and subcloud may be offline; so will attempt
                # to use the persisted values
                message = ("Subcloud %s failed to get subcloud client" %
                           strategy_step.subcloud.name)
                self.error_log(strategy_step, message)
                error_message = "deploy state: %s" % subcloud.deploy_status
                raise ManualRecoveryRequiredException(
                    subcloud=strategy_step.subcloud.name,
                    error_message=error_message)

            host = subcloud_sysinv_client.get_host("controller-0")
            subcloud_type = self.get_sysinv_client(
                strategy_step.subcloud.region_name).get_system().system_mode

            upgrades = subcloud_sysinv_client.get_upgrades()
            if subcloud_type == consts.SYSTEM_MODE_SIMPLEX:
                # Check presence of data_install values. These are managed
                # semantically on subcloud add or update
                if not subcloud.data_install:
                    details = ("Data install values are missing and must be updated "
                               "via dcmanager subcloud update")
                    raise PreCheckFailedException(
                        subcloud=strategy_step.subcloud.name,
                        details=details)

                sc_status = subcloud.deploy_status
                if (host.administrative == consts.ADMIN_LOCKED and
                        (sc_status == consts.DEPLOY_STATE_INSTALL_FAILED or
                         sc_status == consts.DEPLOY_STATE_PRE_INSTALL_FAILED)):
                    # If the subcloud is online but its deploy state is
                    # pre-install-failed or install-failed and the subcloud
                    # host is locked, the upgrading simplex step must have
                    # failed early in the previous upgrade attempt. The
                    # pre-check should transition directly to upgrading
                    # simplex step in the retry.
                    self.override_next_state(
                        consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
                    return self.next_state

                # Skip subcloud online checks if the subcloud deploy status is
                # either "migrated" or "upgrade-activated".
                if subcloud.deploy_status in [
                        consts.DEPLOY_STATE_MIGRATED,
                        consts.DEPLOY_STATE_UPGRADE_ACTIVATED]:
                    self.info_log(strategy_step,
                                  "Online subcloud checks skipped.")
                else:
                    self._perform_subcloud_online_checks(
                        strategy_step,
                        subcloud_sysinv_client,
                        subcloud_fm_client,
                        host,
                        upgrades)

                if subcloud.deploy_status == \
                        consts.DEPLOY_STATE_UPGRADE_ACTIVATED:
                    # If the subcloud has completed upgrade activation,
                    # advance directly to completing step.
                    self.override_next_state(
                        consts.STRATEGY_STATE_COMPLETING_UPGRADE)
                elif subcloud.deploy_status == \
                        consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
                    # If the subcloud deploy status is data-migration-failed
                    # but it is online and has passed subcloud online checks,
                    # it must have timed out while waiting for the subcloud to
                    # unlock previously and has successfully been unlocked
                    # since. Update the subcloud deploy status and advance to
                    # activating upgrade step.
                    db_api.subcloud_update(
                        self.context, strategy_step.subcloud_id,
                        deploy_status=consts.DEPLOY_STATE_MIGRATED)
                    self.override_next_state(
                        consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
                    # If the subcloud deploy status is migrated but it is
                    # online, it must have undergone 2 upgrade attempts:
                    #   - in 1st upgrade attempt: strategy timed out while
                    #     waiting for the subcloud to unlock
                    #   - in 2nd upgrade attempt: the subcloud was unlocked
                    #     successfully (with or without manual interventions)
                    #     but failed to activate.
                    # Advance to activating upgrade step so activation can be
                    # retried after the manual intervention.
                    self.override_next_state(
                        consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            else:
                # Duplex case
                if upgrades:
                    # If upgrade has started, skip subcloud online checks
                    self.info_log(strategy_step,
                                  "Online subcloud checks skipped.")
                    upgrade_state = upgrades[0].state
                    controllers_state = \
                        consts.UPGRADE_STATE_UPGRADING_CONTROLLERS
                    migration_complete = \
                        consts.UPGRADE_STATE_DATA_MIGRATION_COMPLETE
                    if (upgrade_state ==
                            consts.UPGRADE_STATE_DATA_MIGRATION_FAILED or
                            upgrade_state ==
                            consts.UPGRADE_STATE_DATA_MIGRATION):
                        error_message = "upgrade state: %s" % upgrade_state
                        raise ManualRecoveryRequiredException(
                            subcloud=strategy_step.subcloud.name,
                            error_message=error_message)
                    elif (upgrade_state == controllers_state or
                            upgrade_state == migration_complete):
                        # At this point the subcloud is duplex, deploy state is
                        # complete and "system upgrade-show" on the subcloud
                        # indicates that the upgrade state is
                        # "upgrading-controllers".
                        # If controller-1 is locked then we unlock it,
                        # if controller-0 is active we need to swact
                        # else we can proceed to create the VIM strategy.
                        controller_1_host = subcloud_sysinv_client.get_host(
                            "controller-1")
                        if controller_1_host.administrative == \
                                consts.ADMIN_LOCKED:
                            self.override_next_state(
                                consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_1)
                        elif host.capabilities.get('Personality') == \
                                consts.PERSONALITY_CONTROLLER_ACTIVE:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_1)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY)
                    elif upgrade_state == \
                            consts.UPGRADE_STATE_UPGRADING_HOSTS:
                        # At this point the subcloud is duplex, deploy state is
                        # complete and "system upgrade-show" on the subcloud
                        # indicates that the upgrade state is
                        # "upgrading-hosts".
                        # If both subcloud hosts are upgraded to the newer
                        # load, we resume the state machine from activate
                        # upgrade state. Otherwise, we resume from create the
                        # VIM strategy state.

                        # determine the version of the system controller in
                        # regionone
                        target_version = self._read_from_cache(
                            REGION_ONE_SYSTEM_INFO_CACHE_TYPE).software_version

                        all_hosts_upgraded = True
                        subcloud_hosts = self.get_sysinv_client(
                            strategy_step.subcloud.region_name).get_hosts()
                        for subcloud_host in subcloud_hosts:
                            is_locked = (subcloud_host.administrative ==
                                         consts.ADMIN_LOCKED)
                            is_disabled = (subcloud_host.operational ==
                                           consts.OPERATIONAL_DISABLED)
                            create_vim_state = \
                                consts.STRATEGY_STATE_CREATING_VIM_UPGRADE_STRATEGY
                            if (subcloud_host.software_load != target_version
                                    or is_locked or is_disabled):
                                all_hosts_upgraded = False
                                self.override_next_state(create_vim_state)

                        if all_hosts_upgraded:
                            if host.capabilities.get('Personality') == \
                                    consts.PERSONALITY_CONTROLLER_ACTIVE:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                            else:
                                self.override_next_state(
                                    consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif upgrade_state == \
                            consts.UPGRADE_STATE_ACTIVATION_FAILED:
                        if (host.capabilities.get('Personality') ==
                                consts.PERSONALITY_CONTROLLER_ACTIVE):
                            self.override_next_state(
                                consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
                        else:
                            self.override_next_state(
                                consts.STRATEGY_STATE_SWACTING_TO_CONTROLLER_0)
                    elif upgrade_state == \
                            consts.UPGRADE_STATE_ACTIVATION_COMPLETE:
                        self.override_next_state(
                            consts.STRATEGY_STATE_COMPLETING_UPGRADE)
                else:
                    # Perform subcloud online check for duplex and proceed to
                    # the next step (i.e. installing license)
                    self._perform_subcloud_online_checks(
                        strategy_step,
                        subcloud_sysinv_client,
                        subcloud_fm_client,
                        host,
                        upgrades)
            return self.next_state

        # If it gets here, the subcloud must be offline and is a simplex
        if subcloud.deploy_status in VALID_UPGRADE_STATES:
            if not subcloud.data_install:
                details = ("Data install values are missing and must be updated "
                           "via dcmanager subcloud update")
                raise PreCheckFailedException(
                    subcloud=strategy_step.subcloud.name,
                    details=details)

            self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
            return self.next_state
        elif subcloud.deploy_status in VALID_MIGRATE_DATA_STATES:
            self.override_next_state(consts.STRATEGY_STATE_MIGRATING_DATA)
            return self.next_state
        elif subcloud.deploy_status in VALID_ACTIVATION_STATES:
            self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            return self.next_state

        # FAIL: We are offline and encountered an un-recoverable deploy status
        self.info_log(strategy_step,
                      "Un-handled deploy_status: %s" % subcloud.deploy_status)
        error_message = "deploy state: %s" % subcloud.deploy_status
        raise ManualRecoveryRequiredException(
            subcloud=strategy_step.subcloud.name,
            error_message=error_message)