Merge "Enhance error reporting on subcloud upgrade"
This commit is contained in:
commit
b6badeaa4d
@ -31,6 +31,7 @@ from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
|
||||
from dccommon import exceptions
|
||||
from dccommon import install_consts
|
||||
from dccommon import utils as common_utils
|
||||
from dcmanager.common import consts as common_consts
|
||||
from dcmanager.common import utils
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@ -618,7 +619,7 @@ class SubcloudInstall(object):
|
||||
# for cleanup on process restart/SWACT.
|
||||
common_utils.run_playbook(log_file, install_command)
|
||||
except exceptions.PlaybookExecutionFailed:
|
||||
msg = ("Failed to install the subcloud %s, check individual "
|
||||
"log at %s for detailed output."
|
||||
% (self.name, log_file))
|
||||
msg = ("Failed to install %s, check individual "
|
||||
"log at %s or run %s for details"
|
||||
% (self.name, log_file, common_consts.ERROR_DESC_CMD))
|
||||
raise Exception(msg)
|
||||
|
@ -184,6 +184,7 @@ DEPLOY_STATE_DONE = 'complete'
|
||||
|
||||
# Subcloud errors
|
||||
ERROR_DESC_EMPTY = 'No errors present'
|
||||
ERROR_DESC_CMD = 'dcmanager subcloud errors <subcloud-name>'
|
||||
|
||||
# error_description max length
|
||||
ERROR_DESCRIPTION_LENGTH = 2048
|
||||
|
@ -178,7 +178,7 @@ class CertificateUploadError(DCManagerException):
|
||||
|
||||
|
||||
class LicenseInstallError(DCManagerException):
|
||||
message = _("Error while installing license on subcloud: %(subcloud_id)s")
|
||||
message = _("Error while installing license on subcloud: %(subcloud_id)s. %(error_message)s")
|
||||
|
||||
|
||||
class LicenseMissingError(DCManagerException):
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -97,8 +97,15 @@ class CompletingUpgradeState(BaseState):
|
||||
# invoke the API 'upgrade-complete'
|
||||
# This is a partially blocking call that raises exception on failure.
|
||||
# We will re-attempt even if that failure is encountered
|
||||
self._upgrade_complete(strategy_step)
|
||||
|
||||
try:
|
||||
message = self._upgrade_complete(strategy_step)
|
||||
except Exception as e:
|
||||
msg = ("Failed to complete upgrade. %s" %
|
||||
str(e))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise
|
||||
# 'completion' deletes the upgrade. Need to loop until it is deleted
|
||||
counter = 0
|
||||
while True:
|
||||
@ -112,7 +119,12 @@ class CompletingUpgradeState(BaseState):
|
||||
break
|
||||
counter += 1
|
||||
if counter >= self.max_queries:
|
||||
raise Exception("Timeout waiting for completion to complete")
|
||||
msg = ("Timeout waiting for completion to complete: %s:" %
|
||||
message)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise Exception(msg)
|
||||
time.sleep(self.sleep_duration)
|
||||
|
||||
# When we return from this method without throwing an exception, the
|
||||
|
@ -11,6 +11,7 @@ from dcmanager.common import utils
|
||||
|
||||
from dcmanager.common.exceptions import StrategyStoppedException
|
||||
from dcmanager.common.exceptions import VaultLoadMissingError
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
|
||||
REGION_ONE_SYSTEM_INFO_CACHE_TYPE
|
||||
@ -155,11 +156,20 @@ class ImportingLoadState(BaseState):
|
||||
# Send only the required fields
|
||||
creation_keys = ['software_version', 'compatible_version', 'required_patches']
|
||||
target_load = {key: target_load[key] for key in creation_keys}
|
||||
load = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).import_load_metadata(target_load)
|
||||
self.info_log(strategy_step,
|
||||
"Load: %s is now: %s" % (
|
||||
load.software_version, load.state))
|
||||
try:
|
||||
load = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).import_load_metadata(target_load)
|
||||
self.info_log(strategy_step,
|
||||
"Load: %s is now: %s" % (
|
||||
load.software_version, load.state))
|
||||
except Exception as e:
|
||||
msg = ("Failed to import load metadata. %s" %
|
||||
str(e))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg)
|
||||
raise
|
||||
else:
|
||||
while True:
|
||||
# If event handler stop has been triggered, fail the state
|
||||
|
@ -6,6 +6,7 @@
|
||||
from dccommon import consts as dccommon_consts
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common import exceptions
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
from dcmanager.orchestrator.states.upgrade.cache.cache_specifications import \
|
||||
REGION_ONE_LICENSE_CACHE_TYPE
|
||||
@ -51,8 +52,15 @@ class InstallingLicenseState(BaseState):
|
||||
return self.next_state
|
||||
else:
|
||||
# An unexpected error occurred querying the license
|
||||
message = ('An unexpected error occurred querying the license %s. Detail: %s' %
|
||||
(dccommon_consts.SYSTEM_CONTROLLER_NAME,
|
||||
target_error))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise exceptions.LicenseInstallError(
|
||||
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME)
|
||||
subcloud_id=dccommon_consts.SYSTEM_CONTROLLER_NAME,
|
||||
error_message=target_error)
|
||||
|
||||
# retrieve the keystone session for the subcloud and query its license
|
||||
subcloud_sysinv_client = \
|
||||
@ -76,8 +84,17 @@ class InstallingLicenseState(BaseState):
|
||||
install_rc = subcloud_sysinv_client.install_license(target_license)
|
||||
install_error = install_rc.get('error')
|
||||
if len(install_error) != 0:
|
||||
# Save error response from sysinv into subcloud error description.
|
||||
# Provide exception with sysinv error response to strategy_step details
|
||||
message = ('Error installing license on subcloud %s. Detail: %s' %
|
||||
(strategy_step.subcloud.name,
|
||||
install_error))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=message[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
raise exceptions.LicenseInstallError(
|
||||
subcloud_id=strategy_step.subcloud_id)
|
||||
subcloud_id=strategy_step.subcloud_id,
|
||||
error_message=install_error)
|
||||
|
||||
# The license has been successfully installed. Move to the next stage
|
||||
self.info_log(strategy_step, "License installed.")
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -10,6 +10,7 @@ from dccommon.exceptions import PlaybookExecutionFailed
|
||||
from dccommon.utils import run_playbook
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common.exceptions import StrategyStoppedException
|
||||
from dcmanager.common import utils
|
||||
from dcmanager.db import api as db_api
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
|
||||
@ -32,16 +33,14 @@ DEFAULT_API_SLEEP = 60
|
||||
DEFAULT_ANSIBLE_SLEEP = 180
|
||||
|
||||
|
||||
def migrate_subcloud_data(subcloud_name, migrate_command):
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud_name) + \
|
||||
'_playbook_output.log'
|
||||
def migrate_subcloud_data(migrate_command, log_file):
|
||||
try:
|
||||
run_playbook(log_file, migrate_command)
|
||||
except PlaybookExecutionFailed:
|
||||
msg = ("Failed to migrate data for subcloud %s, check individual "
|
||||
"log at %s for detailed output."
|
||||
% (subcloud_name, log_file))
|
||||
raise Exception(msg)
|
||||
msg_orch = ("Failed to migrate data, check individual "
|
||||
"log at %s or run %s for details"
|
||||
% (log_file, consts.ERROR_DESC_CMD))
|
||||
raise Exception(msg_orch)
|
||||
|
||||
|
||||
class MigratingDataState(BaseState):
|
||||
@ -142,7 +141,8 @@ class MigratingDataState(BaseState):
|
||||
ansible_subcloud_inventory_file = os.path.join(
|
||||
consts.ANSIBLE_OVERRIDES_PATH,
|
||||
strategy_step.subcloud.name + consts.INVENTORY_FILE_POSTFIX)
|
||||
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, subcloud.name) + \
|
||||
'_playbook_output.log'
|
||||
# Send skip_patching=true to prevent the playbook from applying any patches present in the
|
||||
# upgrade_data. All the required patches will be included in the generated install iso.
|
||||
data_migrating_cmd = [
|
||||
@ -152,12 +152,17 @@ class MigratingDataState(BaseState):
|
||||
% (consts.TEMP_SYSADMIN_PASSWORD, consts.TEMP_SYSADMIN_PASSWORD)]
|
||||
|
||||
try:
|
||||
migrate_subcloud_data(strategy_step.subcloud.name,
|
||||
data_migrating_cmd)
|
||||
migrate_subcloud_data(data_migrating_cmd, log_file)
|
||||
except Exception as e:
|
||||
# Two error messages: one for subcloud error description and logs and
|
||||
# one for orchestrator strategy_step detail (shorter than the previous).
|
||||
msg_subcloud = utils.find_ansible_error_msg(
|
||||
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
|
||||
error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg_subcloud)
|
||||
self.error_log(strategy_step, str(e))
|
||||
raise
|
||||
|
||||
|
@ -90,6 +90,10 @@ class PreCheckState(BaseState):
|
||||
if (host.administrative == consts.ADMIN_LOCKED and upgrades):
|
||||
alarm_ignore_list.append(HOST_ADMINISTRATIVELY_LOCKED_ALARM)
|
||||
|
||||
# Clean old error messages
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=consts.ERROR_DESC_EMPTY)
|
||||
# The health conditions acceptable for upgrade are:
|
||||
# a) subcloud is completely healthy (i.e. no failed checks)
|
||||
# b) subcloud only fails alarm check and it only has non-management
|
||||
@ -106,8 +110,14 @@ class PreCheckState(BaseState):
|
||||
#
|
||||
# These could be Kubernetes or other related failure(s) which has not been
|
||||
# converted into an alarm condition.
|
||||
details = "System health check failed. Please run 'system health-query' " \
|
||||
"command on the subcloud for more details."
|
||||
error_desc_msg = ("System health check failed. \n %s" %
|
||||
fails)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed. Please run 'system health-query' "
|
||||
"command on the subcloud or %s on central for details"
|
||||
% (consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
@ -125,9 +135,16 @@ class PreCheckState(BaseState):
|
||||
for alarm in alarms:
|
||||
if alarm.alarm_id not in alarm_ignore_list:
|
||||
if alarm.mgmt_affecting == "True":
|
||||
details = "System health check failed due to alarm %s. " \
|
||||
"Please run 'system health-query' " \
|
||||
"command on the subcloud for more details." % alarm.alarm_id
|
||||
error_desc_msg = ("System health check failed due to alarm %s. "
|
||||
"System health: \n %s" %
|
||||
(alarm.alarm_id, system_health))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed due to alarm %s. "
|
||||
"Please run 'system health-query' "
|
||||
"command on the subcloud or %s on central for details." %
|
||||
(alarm.alarm_id, consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
@ -135,9 +152,16 @@ class PreCheckState(BaseState):
|
||||
)
|
||||
else:
|
||||
# Multiple failures
|
||||
details = "System health check failed due to multiple failures. " \
|
||||
"Please run 'system health-query' command on the " \
|
||||
"subcloud for more details."
|
||||
error_desc_msg = ("System health check failed due to multiple failures. "
|
||||
"Health: \n %s" %
|
||||
(system_health))
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
error_description=error_desc_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
details = ("System health check failed due to multiple failures. "
|
||||
"Please run 'system health-query' command on the "
|
||||
"subcloud or %s on central for details." %
|
||||
(consts.ERROR_DESC_CMD))
|
||||
self.error_log(strategy_step, "\n" + system_health)
|
||||
raise PreCheckFailedException(
|
||||
subcloud=strategy_step.subcloud.name,
|
||||
|
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -201,11 +201,12 @@ class UpgradingSimplexState(BaseState):
|
||||
if not subcloud.data_install:
|
||||
# Set the deploy status to pre-install-failed so it can be
|
||||
# handled accordingly in pre check step.
|
||||
message = ("Failed to get upgrade data from install")
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
||||
error_description=message)
|
||||
|
||||
message = ("Failed to get upgrade data from install")
|
||||
self.warn_log(strategy_step, message)
|
||||
raise Exception(message)
|
||||
|
||||
@ -337,6 +338,8 @@ class UpgradingSimplexState(BaseState):
|
||||
|
||||
def perform_subcloud_install(self, strategy_step, session, install_values):
|
||||
|
||||
log_file = os.path.join(consts.DC_ANSIBLE_LOG_DIR, strategy_step.subcloud.name) + \
|
||||
'_playbook_output.log'
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL)
|
||||
@ -350,7 +353,8 @@ class UpgradingSimplexState(BaseState):
|
||||
except Exception as e:
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
||||
error_description=str(e)[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, str(e))
|
||||
# TODO(jkung): cleanup to be implemented within SubcloudInstall
|
||||
install.cleanup()
|
||||
@ -379,9 +383,15 @@ class UpgradingSimplexState(BaseState):
|
||||
try:
|
||||
install.install(consts.DC_ANSIBLE_LOG_DIR, install_command)
|
||||
except Exception as e:
|
||||
# Detailed error message for subcloud error description field.
|
||||
# Exception message for strategy_step detail.
|
||||
msg = utils.find_ansible_error_msg(
|
||||
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_INSTALLING)
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED)
|
||||
deploy_status=consts.DEPLOY_STATE_INSTALL_FAILED,
|
||||
error_description=msg[0:consts.ERROR_DESCRIPTION_LENGTH])
|
||||
self.error_log(strategy_step, msg)
|
||||
self.error_log(strategy_step, str(e))
|
||||
install.cleanup()
|
||||
raise
|
||||
|
Loading…
Reference in New Issue
Block a user