Get migration scripts output for subcloud

With this change, the orchestrator retrieves the failed migration
script and its output from the platform log when the starting or
activating upgrade step fails.
This makes the orchestrator more descriptive in these
failure cases.

Test plan:
PASS: Modify migration script involving action = 'start' in
      order to make the starting upgrade step fail.
      Run subcloud upgrade strategy.
      Check strategy details to verify msg.
PASS: Modify migration script involving action = 'activate' in order
      to make the activating upgrade step fail. Run subcloud upgrade
      strategy.
      Check strategy details to verify msg.
PASS: Run strategy with working migration scripts and check that the
      strategy gets past the starting and activating upgrade steps.

Story: 2010768
Task: 48079

Depends-On: https://review.opendev.org/c/starlingx/config/+/883831

Signed-off-by: fperez <fabrizio.perez@windriver.com>
Change-Id: I05d857b2e98d1fe71eac7348991df6353058611c
This commit is contained in:
fperez
2023-05-22 11:23:37 -03:00
parent 4af71c2e29
commit 8c9a2162ed
6 changed files with 58 additions and 12 deletions

View File

@@ -1,5 +1,5 @@
# Copyright 2016 Ericsson AB
# Copyright (c) 2017-2022 Wind River Systems, Inc.
# Copyright (c) 2017-2023 Wind River Systems, Inc.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
@@ -391,6 +391,10 @@ class SysinvClient(base.DriverBase):
"""Get a list of upgrades."""
return self.sysinv_client.upgrade.list()
def get_error_msg(self):
    """Return the upgrade message reported by the sysinv upgrade API.

    Delegates to ``get_upgrade_msg()`` on the client's ``upgrade``
    manager and hands its result back unchanged.
    """
    upgrade_api = self.sysinv_client.upgrade
    return upgrade_api.get_upgrade_msg()
def upgrade_activate(self):
"""Invoke the API for 'system upgrade-activate', which is an update """
patch = [{'op': 'replace',

View File

@@ -187,6 +187,7 @@ DEPLOY_STATE_RECONFIGURING_NETWORK = 'reconfiguring-network'
DEPLOY_STATE_RECONFIGURING_NETWORK_FAILED = 'network-reconfiguration-failed'
# Subcloud errors
# Text for the case where a subcloud has no error recorded
# (NOTE(review): usage not visible here -- confirm against callers).
ERROR_DESC_EMPTY = 'No errors present'
# Fallback text returned when retrieving the real error message from the
# subcloud itself raises an exception.
ERROR_DESC_FAILED = 'Failed to get error message. Please check sysinv log'
# Command string embedded in strategy failure details to point the operator
# at where to look, on the central cloud, for more information.
ERROR_DESC_CMD = 'dcmanager subcloud errors <subcloud-name>'
# Static content for error messages

View File

@@ -945,3 +945,19 @@ def decode_and_normalize_passwd(input_passwd):
passwd = "'" + passwd + "'"
return passwd
def get_failure_msg(subcloud_name):
    """Fetch the upgrade failure message from a subcloud's sysinv.

    Builds an OpenStack driver for the region named ``subcloud_name``,
    looks up the 'sysinv' endpoint through its keystone client, and asks
    a sysinv client on that endpoint for its error message.

    :param subcloud_name: name of the subcloud (used as the region name)
    :returns: the value from ``SysinvClient.get_error_msg()``, or
        ``consts.ERROR_DESC_FAILED`` if any step raises an exception
    """
    try:
        keystone = OpenStackDriver(region_name=subcloud_name,
                                   region_clients=None).keystone_client
        sysinv_endpoint = keystone.endpoint_cache.get_endpoint('sysinv')
        client = SysinvClient(subcloud_name,
                              keystone.session,
                              endpoint=sysinv_endpoint)
        return client.get_error_msg()
    except Exception as e:
        # Best effort: this runs while reporting another failure, so log
        # the problem and return the generic fallback instead of raising.
        LOG.exception("{}: {}".format(subcloud_name, e))
        return consts.ERROR_DESC_FAILED

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -7,6 +7,8 @@ import time
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
ACTIVATING_COMPLETED_STATES = ['activation-complete',
@@ -84,8 +86,15 @@ class ActivatingUpgradeState(BaseState):
# if max retries have occurred, fail the state
if activate_retry_counter >= self.max_failed_retries:
raise Exception("Failed to activate upgrade. Please check "
"sysinv.log on the subcloud for details.")
error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("Failed to activate upgrade. Please check "
"sysinv.log on the subcloud or "
"%s on central for details." %
(consts.ERROR_DESC_CMD))
raise Exception(details)
# We may need multiple attempts to issue the first activate
# if keystone is down, impacting the ability to send the activate
@@ -137,9 +146,15 @@ class ActivatingUpgradeState(BaseState):
break
audit_counter += 1
if audit_counter >= self.max_queries:
raise Exception("Timeout waiting for activation to complete. "
"Please check sysinv.log on the subcloud for "
"details.")
error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("Timeout waiting for activation to complete. "
"Please check sysinv.log on the subcloud or "
"%s on central for details." %
(consts.ERROR_DESC_CMD))
raise Exception(details)
time.sleep(self.sleep_duration)
# When we return from this method without throwing an exception, the

View File

@@ -158,10 +158,13 @@ class MigratingDataState(BaseState):
# one for orchestrator strategy_step detail (shorter than the previous).
msg_subcloud = utils.find_ansible_error_msg(
strategy_step.subcloud.name, log_file, consts.DEPLOY_STATE_MIGRATING_DATA)
# Get script output in case it is available
error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
failure = ('%s \n%s' % (error_msg, msg_subcloud))
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
deploy_status=consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
error_description=msg_subcloud[0:consts.ERROR_DESCRIPTION_LENGTH])
error_description=failure[0:consts.ERROR_DESCRIPTION_LENGTH])
self.error_log(strategy_step, msg_subcloud)
self.error_log(strategy_step, str(e))
raise

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -9,6 +9,7 @@ from dccommon.drivers.openstack.vim import ALARM_RESTRICTIONS_RELAXED
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
DEFAULT_FORCE_FLAG = False
@@ -95,9 +96,15 @@ class StartingUpgradeState(BaseState):
if upgrade_state in UPGRADE_RETRY_STATES:
retry_counter += 1
if retry_counter >= self.max_failed_retries:
raise Exception("Failed to start upgrade. Please "
"check sysinv.log on the subcloud for "
"details.")
error_msg = utils.get_failure_msg(strategy_step.subcloud.name)
db_api.subcloud_update(
self.context, strategy_step.subcloud_id,
error_description=error_msg[0:consts.ERROR_DESCRIPTION_LENGTH])
details = ("Failed to start upgrade. Please "
"check sysinv.log on the subcloud or "
"%s on central for details." %
(consts.ERROR_DESC_CMD))
raise Exception(details)
self.warn_log(strategy_step,
"Upgrade start failed, retrying... State=%s"
% upgrade_state)