Enhance error reporting on subcloud upgrade
This commit updates the subcloud error reporting command 'dcmanager subcloud errors' in order to provide information in some upgrade error scenarios. Some sysinv error responses are also added to the command output, the same as for the strategy_step details, and could be improved in the future. Test Plan: PASS: Generate two management-affecting alarms on the subcloud. Create and apply an upgrade strategy. Verify that after the strategy fails, the 'dcmanager subcloud errors' command returns the error message and the 'system health-query-upgrade' information from the subcloud. PASS: Upgrade dcmanager with an invalid license. Modify the license file to make it invalid but readable. When the upgrade strategy fails, run 'dcmanager subcloud errors <subcloud>' and check that the output displays information related to the error. PASS: On the System Controller, change the compatible_version of the load to an invalid one. When the upgrade strategy fails, run 'dcmanager subcloud errors <subcloud>' and check that the output displays information related to the error. Check that the stack trace is not lost and is available in /var/log/dcmanager/orchestrator.log. PASS: Modify the subcloud rvmc information to an invalid one. When the upgrade strategy fails, run 'dcmanager subcloud errors <subcloud>' and check that the output displays information about the failed installation. PASS: Modify the subcloud bootstrap information to cause a migration failure. When the upgrade strategy fails, run 'dcmanager subcloud errors <subcloud>' and check that the output displays information related to the error. PASS: Bypass the activating upgrade step actions to make completing the upgrade fail. Create and apply the strategy. Verify that completing the upgrade fails and the 'dcmanager subcloud errors' command shows the exception from sysinv. PASS: Apply an upgrade strategy after a failed upgrade strategy. Ensure the subcloud is healthy for upgrade. Check that 'dcmanager subcloud errors' returns 'no errors present' after the pre_check step.
Story: 2010271 Task: 46914 Signed-off-by: fperez <fabrizio.perez@windriver.com> Change-Id: I5e2fa855778556d772bb29611604f9cd02a507ac
This commit is contained in:
parent
2c8cf582df
commit
df97792652
@ -31,6 +31,7 @@ from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
|
||||
from dccommon import exceptions
|
||||
from dccommon import install_consts
|
||||
from dccommon import utils as common_utils
|
||||
from dcmanager.common import consts as common_consts
|
||||
from dcmanager.common import utils
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -618,7 +619,7 @@ class SubcloudInstall(object):
|
||||
# for cleanup on process restart/SWACT.
|
||||
common_utils.run_playbook(log_file, install_command)
|
||||
except exceptions.PlaybookExecutionFailed:
|
||||
msg = ("Failed to install the subcloud %s, check individual "
|
||||
"log at %s for detailed output."
|
||||
% (self.name, log_file))
|
||||
msg = ("Failed to install %s, check individual "
|
||||
"log at %s or run %s for details"
|
||||
% (self.name, log_file, common_consts.ERROR_DESC_CMD))
|
||||
raise Exception(msg)
|
||||
|
@ -182,6 +182,7 @@ DEPLOY_STATE_DONE = 'complete'
|
||||
|
||||
# Subcloud errors
|
||||
ERROR_DESC_EMPTY = 'No errors present'
|
||||
ERROR_DESC_CMD = 'dcmanager subcloud errors <subcloud-name>'
|
||||
|
||||
# error_description max length
|
||||
ERROR_DESCRIPTION_LENGTH = 2048
|
||||
|
@ -178,7 +178,7 @@ class CertificateUploadError(DCManagerException):
|
||||
|
||||
|
||||
class LicenseInstallError(DCManagerException):
|
||||
message = _("Error while installing license on subcloud: %(subcloud_id)s")
|
||||
message = _("Error while installing license on subcloud: %(subcloud_id)s. %(error_message)s")
|
||||
|
||||
|
||||
class LicenseMissingError(DCManagerException):
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -97,8 +97,15 @@ class CompletingUpgradeState(BaseState):
|
||||
# invoke the API 'upgrade-complete'
|
||||
# This is a partially blocking call that raises exception on failure.
|
||||
# We will re-attempt even if that failure is encountered
|
||||
self._upgrade_complete(strategy_step)
|
||||
|
||||
try:
|
||||
message = self._upgrade_complete(strategy_step)
|
||||
except Exception as e:
|
||||
msg = ("Failed to complete upgrade. %s" %
|
||||
str(e))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise
|
||||
# 'completion' deletes the upgrade. Need to loop until it is deleted
|
||||
counter = 0
|
||||
while True:
|
||||
@ -112,7 +119,12 @@ class CompletingUpgradeState(BaseState):
|
||||
break
|
||||
counter += 1
|
||||
if counter >= self.max_queries:
|
||||
raise Exception("Timeout waiting for completion to complete")
|
||||
msg = ("Timeout waiting for completion to complete: %s:" %
|
||||
message)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise Exception(msg)
|
||||
time.sleep(self.sleep_duration)
|
||||
|
||||
# When we return from this method without throwing an exception, the
|
||||
|
@ -11,6 +11,7 @@ from dcmanager.common import utils
|
||||
|
||||
from dcmanager.common.exceptions import StrategyStoppedException
|
||||
from dcmanager.common.exceptions import VaultLoadMissingError
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
|
||||
REGION_ONE_SYSTEM_INFO_CACHE_TYPE
|
||||
@ -155,11 +156,20 @@ class ImportingLoadState(BaseState):
|
||||
# Send only the required fields
|
||||
creation_keys = ['software_version', 'compatible_version', 'required_patches']
|
||||
target_load = {key: target_load[key] for key in creation_keys}
|
||||
load = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).import_load_metadata(target_load)
|
||||
self.info_log(strategy_step,
|
||||
"Load: %s is now: %s" % (
|
||||
load.software_version, load.state))
|
||||
try:
|
||||
load = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).import_load_metadata(target_load)
|
||||
self.info_log(strategy_step,
|
||||
"Load: %s is now: %s" % (
|
||||
load.software_version, load.state))
|
||||
except Exception as e:
|
||||
msg = ("Failed to import load metadata. %s" %
|
||||
str(e))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg)
|
||||
raise
|
||||
else:
|
||||
while True:
|
||||
# If event handler stop has been triggered, fail the state
|
||||
|
@ -6,6 +6,7 @@
|
||||
from dccommon import consts as dccommon_consts
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common import exceptions
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
|
||||
REGION_ONE_LICENSE_CACHE_TYPE
|
||||
@ -51,8 +52,15 @@ class InstallingLicenseState(BaseState):
|
||||
return self.next_state
|
||||
else:
|
||||
# An unexpected error occurred querying the license
|
||||
message = ('An unexpected error occurred querying the license %s. Detail: %s' %
|
||||
(dccommon_consts.SYSTEM_CONTROLLER_NAME,
|
||||
target_error))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise exceptions.LicenseInstallError(
|
||||
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME)
|
||||
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME,
|
||||
error_message=target_error)
|
||||
|
||||
# retrieve the keystone session for the subcloud and query its license
|
||||
subcloud_sysinv_client = \
|
||||
@ -76,8 +84,17 @@ class InstallingLicenseState(BaseState):
|
||||
install_rc = subcloud_sysinv_client.install_license(target_license)
|
||||
install_error = install_rc.get('error')
|
||||
if len(install_error) != 0:
|
||||
# Save error response from sysinv into subcloud error description.
|
||||
# Provide exception with sysinv error response to strategy_step details
|
||||
message = ('Error installing license on subcloud %s. Detail: %s' %
|
||||
(strategy_step.subcloud.name,
|
||||
install_error))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise exceptions.LicenseInstallError(
|
||||
subcloud_id=strategy_step.subcloud_id)
|
||||
subcloud_id=strategy_step.subcloud_id,
|
||||
error_message=install_error)
|
||||
|
||||
# The license has been successfully installed. Move to the next stage
|
||||
self.info_log(strategy_step, "License installed.")
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -10,6 +10,7 @@ from dccommon.exceptions import PlaybookExecutionFailed
|
||||
from dccommon.utils import run_playbook
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common.exceptions import StrategyStoppedException
|
||||
from dcmanager.common import utils
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
|
||||
@ -32,16 +33,14 @@ DEFAULT_API_SLEEP = 60
|
||||
DEFAULT_ANSIBLE_SLEEP = 180
|
||||
|
||||
|
||||
def migrate_subcloud_data(subcloud_name, migrate_command):
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name) + \
|
||||
'_playbook_output.log'
|
||||
def migrate_subcloud_data(migrate_command, log_file):
|
||||
try:
|
||||
run_playbook(log_file, migrate_command)
|
||||
except PlaybookExecutionFailed:
|
||||
msg = ("Failed to migrate data for subcloud %s, check individual "
|
||||
"log at %s for detailed output."
|
||||
% (subcloud_name, log_file))
|
||||
raise Exception(msg)
|
||||
msg_orch = ("Failed to migrate data, check individual "
|
||||
"log at %s or run %s for details"
|
||||
% (log_file, consts.ERROR_DESC_CMD))
|
||||
raise Exception(msg_orch)
|
||||
|
||||
|
||||
class MigratingDataState(BaseState):
|
||||
@ -142,7 +141,8 @@ class MigratingDataState(BaseState):
|
||||
ansible_subcloud_inventory_file = os.path.join(
|
||||
consts.ANSIBLE_OVERRIDES_PATH,
|
||||
strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
|
||||
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
|
||||
'_playbook_output.log'
|
||||
# Send skip_patching=true to prevent the playbook from applying any patches present in the
|
||||
# upgrade_data. All the required patches will be included in the generated install iso.
|
||||
data_migrating_cmd = [
|
||||
@ -152,12 +152,17 @@ class MigratingDataState(BaseState):
|
||||
% (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]
|
||||
|
||||
try:
|
||||
migrate_subcloud_data(strategy_step.subcloud.name,
|
||||
data_migrating_cmd)
|
||||
migrate_subcloud_data(data_migrating_cmd, log_file)
|
||||
except Exception as e:
|
||||
# Two error messages: one for subcloud error description and logs and
|
||||
# one for orchestrator strategy_step detail (shorter than the previous).
|
||||
msg_subcloud = utils.find_ansible_error_msg(
|
||||
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
|
||||
error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg_subcloud)
|
||||
self.error_log(strategy_step, str(e))
|
||||
raise
|
||||
|
||||
|
@ -90,6 +90,10 @@ class PreCheckState(BaseState):
|
||||
if (host.administrative == consts.ADMIN_LOCKED and upgrades):
|
||||
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
|
||||
|
||||
# Clean old error messages
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=consts.ERROR_DESC_EMPTY)
|
||||
# The health conditions acceptable for upgrade are:
|
||||
# a) subcloud is completely healthy (i.e. no failed checks)
|
||||
# b) subcloud only fails alarm check and it only has non-management
|
||||
@ -106,8 +110,14 @@ class PreCheckState(BaseState):
|
||||
#
|
||||
# These could be Kubernetes or other related failure(s) which has not been
|
||||
# converted into an alarm condition.
|
||||
details = "System health check failed. Please run 'system health-query' " \
|
||||
"command on the subcloud for more details."
|
||||
error_desc_msg = ("System health check failed. \n %s" %
|
||||
fails)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed. Please run 'system health-query' "
|
||||
"command on the subcloud or %s on central for details"
|
||||
% (consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
@ -125,9 +135,16 @@ class PreCheckState(BaseState):
|
||||
for alarm in alarms:
|
||||
if alarm.alarm_id not in alarm_ignore_list:
|
||||
if alarm.mgmt_affecting == "True":
|
||||
details = "System health check failed due to alarm %s. " \
|
||||
"Please run 'system health-query' " \
|
||||
"command on the subcloud for more details." % alarm.alarm_id
|
||||
error_desc_msg = ("System health check failed due to alarm %s. "
|
||||
"System health: \n %s" %
|
||||
(alarm.alarm_id, system_health))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed due to alarm %s. "
|
||||
"Please run 'system health-query' "
|
||||
"command on the subcloud or %s on central for details." %
|
||||
(alarm.alarm_id, consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
@ -135,9 +152,16 @@ class PreCheckState(BaseState):
|
||||
)
|
||||
else:
|
||||
# Multiple failures
|
||||
details = "System health check failed due to multiple failures. " \
|
||||
"Please run 'system health-query' command on the " \
|
||||
"subcloud for more details."
|
||||
error_desc_msg = ("System health check failed due to multiple failures. "
|
||||
"Health: \n %s" %
|
||||
(system_health))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed due to multiple failures. "
|
||||
"Please run 'system health-query' command on the "
|
||||
"subcloud or %s on central for details." %
|
||||
(consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -201,11 +201,12 @@ class UpgradingSimplexState(BaseState):
|
||||
if not subcloud.data_install:
|
||||
# Set the deploy status to pre-install-failed so it can be
|
||||
# handled accordingly in pre check step.
|
||||
message = ("Failed to get upgrade data from install")
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
||||
error_description=message)
|
||||
|
||||
message = ("Failed to get upgrade data from install")
|
||||
self.warn_log(strategy_step, message)
|
||||
raise Exception(message)
|
||||
|
||||
@ -337,6 +338,8 @@ class UpgradingSimplexState(BaseState):
|
||||
|
||||
def perform_subcloud_install(self, strategy_step, session, install_values):
|
||||
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, strategy_step.subcloud.name) + \
|
||||
'_playbook_output.log'
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL)
|
||||
@ -350,7 +353,8 @@ class UpgradingSimplexState(BaseState):
|
||||
except Exception as e:
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
||||
error_description=str(e)[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, str(e))
|
||||
# TODO(jkung): cleanup to be implemented within SubcloudInstall
|
||||
install.cleanup()
|
||||
@ -379,9 +383,15 @@ class UpgradingSimplexState(BaseState):
|
||||
try:
|
||||
install.install(consts.DC_ANSIBLE_LOG_DIR, install_command)
|
||||
except Exception as e:
|
||||
# Detailed error message for subcloud error description field.
|
||||
# Exception message for strategy_step detail.
|
||||
msg = utils.find_ansible_error_msg(
|
||||
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_INSTALLING)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg)
|
||||
self.error_log(strategy_step, str(e))
|
||||
install.cleanup()
|
||||
raise
|
||||
|
Loading…
Reference in New Issue
Block a user