Hold off dc audit and sync until activation is complete
Currently, DC audit and sync resume as soon as the subcloud becomes unlocked-enabled-available after data migration, while the DC orchestrator proceeds to the activating phase for the subcloud. During upgrade activation, many platform activities occur at the same time on the subcloud. To reduce load on the subcloud, DC audit and sync are held off until upgrade activation is complete. Note: Platform service changes to reduce CPU contention during the activation phase are addressed separately. There are 2 caveats as a result of this change: a) extended subcloud offline status (an alarm condition) and b) the user is not able to log in to the subcloud using the sysadmin password until the subcloud upgrade is complete. Test Plan: - Verify successful simplex subcloud upgrade and that no audit or sync activities take place until upgrade activation is complete. - Induce data migration failure and verify that the strategy advances to "upgrading simplex" step upon retry. - Induce a data migration timeout (i.e. subcloud takes an abnormal amount of time to unlock but succeeds eventually with or without manual intervention) and verify that the strategy advances to "activating upgrade" step following online checks upon retry. - Induce activation failure and verify that the strategy advances to "activating upgrade" step upon retry. - Induce completing upgrade failure and verify that the strategy advances to "completing upgrade" step upon retry. - Induce a data migration timeout in the first upgrade and an activation failure in the second attempt and verify that the strategy advances to "migrating data" step upon the third attempt. - Induce a failure right after successful install and verify that the strategy advances to "migrating data" step upon retry. This is a rare corner case. Story: 2010798 Task: 48672 Depends-On: https://review.opendev.org/c/starlingx/ansible-playbooks/+/893004 Signed-off-by: Tee Ngo <tee.ngo@windriver.com> Change-Id: Iafc8cea145c314d325fafd0b4b25076053f751ba
This commit is contained in:
parent
7073e525b9
commit
76e56e73d9
@ -121,7 +121,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
|
|||||||
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
|
||||||
consts.DEPLOY_STATE_INSTALLING,
|
consts.DEPLOY_STATE_INSTALLING,
|
||||||
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
|
consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
|
||||||
consts.DEPLOY_STATE_MIGRATED,
|
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
|
||||||
consts.DEPLOY_STATE_RESTORING,
|
consts.DEPLOY_STATE_RESTORING,
|
||||||
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
|
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
|
||||||
consts.DEPLOY_STATE_RESTORE_FAILED]
|
consts.DEPLOY_STATE_RESTORE_FAILED]
|
||||||
|
@ -207,6 +207,7 @@ DEPLOY_STATE_CONFIG_ABORTED = 'config-aborted'
|
|||||||
DEPLOY_STATE_MIGRATING_DATA = 'migrating-data'
|
DEPLOY_STATE_MIGRATING_DATA = 'migrating-data'
|
||||||
DEPLOY_STATE_DATA_MIGRATION_FAILED = 'data-migration-failed'
|
DEPLOY_STATE_DATA_MIGRATION_FAILED = 'data-migration-failed'
|
||||||
DEPLOY_STATE_MIGRATED = 'migrated'
|
DEPLOY_STATE_MIGRATED = 'migrated'
|
||||||
|
DEPLOY_STATE_UPGRADE_ACTIVATED = 'upgrade-activated'
|
||||||
DEPLOY_STATE_PRE_RESTORE = 'pre-restore'
|
DEPLOY_STATE_PRE_RESTORE = 'pre-restore'
|
||||||
DEPLOY_STATE_RESTORE_PREP_FAILED = 'restore-prep-failed'
|
DEPLOY_STATE_RESTORE_PREP_FAILED = 'restore-prep-failed'
|
||||||
DEPLOY_STATE_RESTORING = 'restoring'
|
DEPLOY_STATE_RESTORING = 'restoring'
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2023 Wind River Systems, Inc.
|
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
@ -21,7 +21,7 @@ ACTIVATING_IN_PROGRESS_STATES = ['activating', 'activating-hosts', ]
|
|||||||
# Max time: 60 minutes = 60 queries x 60 seconds sleep between queries
|
# Max time: 60 minutes = 60 queries x 60 seconds sleep between queries
|
||||||
DEFAULT_MAX_QUERIES = 60
|
DEFAULT_MAX_QUERIES = 60
|
||||||
DEFAULT_SLEEP_DURATION = 60
|
DEFAULT_SLEEP_DURATION = 60
|
||||||
MAX_FAILED_RETRIES = 10
|
MAX_FAILED_RETRIES = 3
|
||||||
|
|
||||||
|
|
||||||
class ActivatingUpgradeState(BaseState):
|
class ActivatingUpgradeState(BaseState):
|
||||||
@ -122,6 +122,7 @@ class ActivatingUpgradeState(BaseState):
|
|||||||
upgrade_state = self.get_upgrade_state(strategy_step)
|
upgrade_state = self.get_upgrade_state(strategy_step)
|
||||||
if upgrade_state in ACTIVATING_RETRY_STATES:
|
if upgrade_state in ACTIVATING_RETRY_STATES:
|
||||||
# We failed. Better try again
|
# We failed. Better try again
|
||||||
|
time.sleep(self.sleep_duration * activate_retry_counter)
|
||||||
activate_retry_counter += 1
|
activate_retry_counter += 1
|
||||||
self.info_log(strategy_step,
|
self.info_log(strategy_step,
|
||||||
"Activation failed, retrying... State=%s"
|
"Activation failed, retrying... State=%s"
|
||||||
@ -159,4 +160,6 @@ class ActivatingUpgradeState(BaseState):
|
|||||||
|
|
||||||
# When we return from this method without throwing an exception, the
|
# When we return from this method without throwing an exception, the
|
||||||
# state machine can proceed to the next state
|
# state machine can proceed to the next state
|
||||||
|
db_api.subcloud_update(self.context, strategy_step.subcloud_id,
|
||||||
|
deploy_status=consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
|
||||||
return self.next_state
|
return self.next_state
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
@ -30,9 +30,6 @@ DEFAULT_FAILED_SLEEP = 60
|
|||||||
DEFAULT_MAX_API_QUERIES = 30
|
DEFAULT_MAX_API_QUERIES = 30
|
||||||
DEFAULT_API_SLEEP = 60
|
DEFAULT_API_SLEEP = 60
|
||||||
|
|
||||||
# sleep for 3 minutes after ansible completes
|
|
||||||
DEFAULT_ANSIBLE_SLEEP = 180
|
|
||||||
|
|
||||||
|
|
||||||
def migrate_subcloud_data(migrate_command, log_file):
|
def migrate_subcloud_data(migrate_command, log_file):
|
||||||
try:
|
try:
|
||||||
@ -51,7 +48,6 @@ class MigratingDataState(BaseState):
|
|||||||
super(MigratingDataState, self).__init__(
|
super(MigratingDataState, self).__init__(
|
||||||
next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0, region_name=region_name)
|
next_state=consts.STRATEGY_STATE_UNLOCKING_CONTROLLER_0, region_name=region_name)
|
||||||
|
|
||||||
self.ansible_sleep = DEFAULT_ANSIBLE_SLEEP
|
|
||||||
self.max_api_queries = DEFAULT_MAX_API_QUERIES
|
self.max_api_queries = DEFAULT_MAX_API_QUERIES
|
||||||
self.api_sleep_duration = DEFAULT_API_SLEEP
|
self.api_sleep_duration = DEFAULT_API_SLEEP
|
||||||
self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
|
self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
|
||||||
@ -170,10 +166,6 @@ class MigratingDataState(BaseState):
|
|||||||
self.error_log(strategy_step, str(e))
|
self.error_log(strategy_step, str(e))
|
||||||
raise
|
raise
|
||||||
|
|
||||||
# Ansible invokes an unlock. Need to wait for the unlock to complete.
|
|
||||||
# Wait for 3 minutes for mtc/scripts to shut down services
|
|
||||||
# todo(abailey): split this into smaller sleeps to allow stopping early
|
|
||||||
time.sleep(self.ansible_sleep)
|
|
||||||
# wait up to 60 minutes for reboot to complete
|
# wait up to 60 minutes for reboot to complete
|
||||||
self.wait_for_unlock(strategy_step)
|
self.wait_for_unlock(strategy_step)
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
@ -245,8 +245,9 @@ class PreCheckState(BaseState):
|
|||||||
return self.next_state
|
return self.next_state
|
||||||
|
|
||||||
# Skip subcloud online checks if the subcloud deploy status is
|
# Skip subcloud online checks if the subcloud deploy status is
|
||||||
# "migrated".
|
# either "migrated" or "upgrade-activated".
|
||||||
if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
|
if subcloud.deploy_status in [consts.DEPLOY_STATE_MIGRATED,
|
||||||
|
consts.DEPLOY_STATE_UPGRADE_ACTIVATED]:
|
||||||
self.info_log(strategy_step, "Online subcloud checks skipped.")
|
self.info_log(strategy_step, "Online subcloud checks skipped.")
|
||||||
else:
|
else:
|
||||||
self._perform_subcloud_online_checks(strategy_step,
|
self._perform_subcloud_online_checks(strategy_step,
|
||||||
@ -254,20 +255,30 @@ class PreCheckState(BaseState):
|
|||||||
subcloud_fm_client,
|
subcloud_fm_client,
|
||||||
host, upgrades)
|
host, upgrades)
|
||||||
|
|
||||||
if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
|
if subcloud.deploy_status == consts.DEPLOY_STATE_UPGRADE_ACTIVATED:
|
||||||
# If the subcloud has completed data migration, advance directly
|
# If the subcloud has completed upgrade activation, advance directly
|
||||||
# to activating upgrade step.
|
# to completing step.
|
||||||
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
self.override_next_state(consts.STRATEGY_STATE_COMPLETING_UPGRADE)
|
||||||
elif subcloud.deploy_status == consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
|
elif subcloud.deploy_status == consts.DEPLOY_STATE_DATA_MIGRATION_FAILED:
|
||||||
# If the subcloud deploy status is data-migration-failed but
|
# If the subcloud deploy status is data-migration-failed but
|
||||||
# it is online and has passed subcloud online checks, it must have
|
# it is online and has passed subcloud online checks, it must have
|
||||||
# timed out while waiting for the subcloud to reboot previously and
|
# timed out while waiting for the subcloud to unlock previously and
|
||||||
# has successfully been unlocked since. Update the subcloud deploy
|
# has successfully been unlocked since. Update the subcloud deploy
|
||||||
# status and advance to activating upgrade step.
|
# status and advance to activating upgrade step.
|
||||||
db_api.subcloud_update(
|
db_api.subcloud_update(
|
||||||
self.context, strategy_step.subcloud_id,
|
self.context, strategy_step.subcloud_id,
|
||||||
deploy_status=consts.DEPLOY_STATE_MIGRATED)
|
deploy_status=consts.DEPLOY_STATE_MIGRATED)
|
||||||
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
||||||
|
elif subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
|
||||||
|
# If the subcloud deploy status is migrated but it is online, it
|
||||||
|
# must have undergone 2 upgrade attempts:
|
||||||
|
# - in 1st upgrade attempt: strategy timed out while waiting
|
||||||
|
# for the subcloud to unlock
|
||||||
|
# - in 2nd upgrade attempt: the subcloud was unlocked successfully
|
||||||
|
# (with or without manual interventions) but failed to activate.
|
||||||
|
# Advance to activating upgrade step so activation can be retried
|
||||||
|
# after the manual intervention.
|
||||||
|
self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
||||||
else:
|
else:
|
||||||
# Duplex case
|
# Duplex case
|
||||||
if upgrades:
|
if upgrades:
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2020, 2022 Wind River Systems, Inc.
|
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
@ -7,6 +7,7 @@ import itertools
|
|||||||
import mock
|
import mock
|
||||||
|
|
||||||
from dcmanager.common import consts
|
from dcmanager.common import consts
|
||||||
|
from dcmanager.db.sqlalchemy import api as db_api
|
||||||
from dcmanager.orchestrator.states.upgrade import activating
|
from dcmanager.orchestrator.states.upgrade import activating
|
||||||
|
|
||||||
from dcmanager.tests.unit.orchestrator.states.fakes import FakeUpgrade
|
from dcmanager.tests.unit.orchestrator.states.fakes import FakeUpgrade
|
||||||
@ -85,6 +86,11 @@ class TestSwUpgradeActivatingStage(TestSwUpgradeState):
|
|||||||
# verify the API call was invoked
|
# verify the API call was invoked
|
||||||
self.sysinv_client.upgrade_activate.assert_called()
|
self.sysinv_client.upgrade_activate.assert_called()
|
||||||
|
|
||||||
|
# verify the DB update was invoked
|
||||||
|
updated_subcloud = db_api.subcloud_get(self.ctx,
|
||||||
|
self.subcloud.id)
|
||||||
|
self.assertEqual(updated_subcloud.deploy_status, consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
|
||||||
|
|
||||||
# On success, the state should be updated to the next state
|
# On success, the state should be updated to the next state
|
||||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
self.on_success_state)
|
self.on_success_state)
|
||||||
|
@ -30,8 +30,6 @@ CONTROLLER_0_UNLOCKED = \
|
|||||||
"DEFAULT_API_SLEEP", 1)
|
"DEFAULT_API_SLEEP", 1)
|
||||||
@mock.patch("dcmanager.orchestrator.states.upgrade.migrating_data."
|
@mock.patch("dcmanager.orchestrator.states.upgrade.migrating_data."
|
||||||
"DEFAULT_FAILED_SLEEP", 1)
|
"DEFAULT_FAILED_SLEEP", 1)
|
||||||
@mock.patch("dcmanager.orchestrator.states.upgrade.migrating_data."
|
|
||||||
"DEFAULT_ANSIBLE_SLEEP", 3)
|
|
||||||
class TestSwUpgradeMigratingDataStage(TestSwUpgradeState):
|
class TestSwUpgradeMigratingDataStage(TestSwUpgradeState):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#
|
#
|
||||||
# Copyright (c) 2020-2022 Wind River Systems, Inc.
|
# Copyright (c) 2020-2023 Wind River Systems, Inc.
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
#
|
||||||
@ -423,16 +423,16 @@ class TestSwUpgradePreCheckStage(TestSwUpgradeState):
|
|||||||
|
|
||||||
class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
|
class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
|
||||||
|
|
||||||
def test_upgrade_pre_check_subcloud_online_migrated(self):
|
def test_upgrade_pre_check_subcloud_online_activated(self):
|
||||||
"""Test pre check step where the subcloud is online and running N+1 load
|
"""Test pre check step where the subcloud is online and running N+1 load
|
||||||
|
|
||||||
The pre-check in this scenario should advance directly to 'activating upgrade'.
|
The pre-check in this scenario should advance directly to 'completing upgrade'.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Update the subcloud to have deploy state as "migrated"
|
# Update the subcloud to have deploy state as "upgrade-activated"
|
||||||
db_api.subcloud_update(self.ctx,
|
db_api.subcloud_update(self.ctx,
|
||||||
self.subcloud.id,
|
self.subcloud.id,
|
||||||
deploy_status=consts.DEPLOY_STATE_MIGRATED)
|
deploy_status=consts.DEPLOY_STATE_UPGRADE_ACTIVATED)
|
||||||
|
|
||||||
# invoke the strategy state operation on the orch thread
|
# invoke the strategy state operation on the orch thread
|
||||||
self.worker.perform_state_action(self.strategy_step)
|
self.worker.perform_state_action(self.strategy_step)
|
||||||
@ -443,9 +443,9 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
|
|||||||
# verify the get host filesystem API call was not invoked
|
# verify the get host filesystem API call was not invoked
|
||||||
self.sysinv_client.get_host_filesystem.assert_not_called()
|
self.sysinv_client.get_host_filesystem.assert_not_called()
|
||||||
|
|
||||||
# Verify the expected next state happened (activating upgrade)
|
# Verify the expected next state happened (completing upgrade)
|
||||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
consts.STRATEGY_STATE_COMPLETING_UPGRADE)
|
||||||
|
|
||||||
def test_upgrade_pre_check_subcloud_online_migrate_failed(self):
|
def test_upgrade_pre_check_subcloud_online_migrate_failed(self):
|
||||||
"""Test pre check step where the subcloud is online following an unlock timeout
|
"""Test pre check step where the subcloud is online following an unlock timeout
|
||||||
@ -484,6 +484,30 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
|
|||||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
||||||
|
|
||||||
|
def test_upgrade_pre_check_subcloud_online_migrated(self):
|
||||||
|
"""Test pre check step where the subcloud is online following an activation failure
|
||||||
|
|
||||||
|
The pre-check in this scenario should advance directly to 'activating upgrade'.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Update the subcloud to have deploy state as "migrated"
|
||||||
|
db_api.subcloud_update(self.ctx,
|
||||||
|
self.subcloud.id,
|
||||||
|
deploy_status=consts.DEPLOY_STATE_MIGRATED)
|
||||||
|
|
||||||
|
# invoke the strategy state operation on the orch thread
|
||||||
|
self.worker.perform_state_action(self.strategy_step)
|
||||||
|
|
||||||
|
# verify the get system health API call was not invoked
|
||||||
|
self.sysinv_client.get_system_health_upgrade.assert_not_called()
|
||||||
|
|
||||||
|
# verify the get host filesystem API call was not invoked
|
||||||
|
self.sysinv_client.get_host_filesystem.assert_not_called()
|
||||||
|
|
||||||
|
# Verify the expected next state happened (activating upgrade)
|
||||||
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
|
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
||||||
|
|
||||||
def test_upgrade_pre_check_subcloud_online_no_data_install(self):
|
def test_upgrade_pre_check_subcloud_online_no_data_install(self):
|
||||||
"""Test pre check step where the subcloud is online without data install
|
"""Test pre check step where the subcloud is online without data install
|
||||||
|
|
||||||
@ -621,6 +645,28 @@ class TestSwUpgradePreCheckSimplexStage(TestSwUpgradePreCheckStage):
|
|||||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
consts.STRATEGY_STATE_MIGRATING_DATA)
|
consts.STRATEGY_STATE_MIGRATING_DATA)
|
||||||
|
|
||||||
|
def test_upgrade_pre_check_subcloud_jumps_to_activating(self):
|
||||||
|
"""Test pre check step which jumps to activating upgrade state
|
||||||
|
|
||||||
|
The pre-check should transition in this scenario to activating upgrade
|
||||||
|
state if the subcloud is now offline, and the deploy status can be
|
||||||
|
handled by that state.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Update the subcloud to have deploy state as "migrated",
|
||||||
|
# and availability status as "offline"
|
||||||
|
db_api.subcloud_update(self.ctx,
|
||||||
|
self.subcloud.id,
|
||||||
|
deploy_status=consts.DEPLOY_STATE_MIGRATED,
|
||||||
|
availability_status=dccommon_consts.AVAILABILITY_OFFLINE)
|
||||||
|
|
||||||
|
# invoke the strategy state operation on the orch thread
|
||||||
|
self.worker.perform_state_action(self.strategy_step)
|
||||||
|
|
||||||
|
# Verify the expected next state happened (activating upgrade)
|
||||||
|
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||||
|
consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
|
||||||
|
|
||||||
def test_upgrade_pre_check_subcloud_jumps_to_upgrading(self):
|
def test_upgrade_pre_check_subcloud_jumps_to_upgrading(self):
|
||||||
"""Test pre check step which jumps to the upgrading state
|
"""Test pre check step which jumps to the upgrading state
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user