Adding retries during upgrade complete state
During upgrade complete, many services are restarting and API calls may randomly fail. This change adds retry wrapping around all client creation and API calls during that upgrade state. Currently each call is attempted up to 5 times, with a 5-second delay between attempts.

Story: 2009665
Task: 44070
Change-Id: Ifacfd364c2e961fd396db695658abe2d027757c3
Signed-off-by: albailey <Al.Bailey@windriver.com>
This commit is contained in:
parent
855232c872
commit
7c88723d06
@ -95,6 +95,7 @@ Summary: DC Orchestrator
|
||||
# TODO(John): should we add Requires lines?
|
||||
Requires: openstack-ras
|
||||
Requires: python-psutil
|
||||
Requires: python-retrying
|
||||
|
||||
%description dcorch
|
||||
Distributed Cloud Orchestrator
|
||||
|
@ -1,8 +1,9 @@
|
||||
#
|
||||
# Copyright (c) 2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
import retrying
|
||||
import time
|
||||
|
||||
from dcmanager.common import consts
|
||||
@ -15,6 +16,12 @@ from dcmanager.orchestrator.states.base import BaseState
|
||||
DEFAULT_MAX_QUERIES = 60
|
||||
DEFAULT_SLEEP_DURATION = 10
|
||||
|
||||
# There are additional retry loops for actions that should never fail
|
||||
# The sleep duration and number of retries are shorter, since these should
|
||||
# only occur if a service is being restarted
|
||||
RETRY_MAX_ATTEMPTS = 5
|
||||
RETRY_SLEEP_MILLIS = 5000
|
||||
|
||||
|
||||
class CompletingUpgradeState(BaseState):
|
||||
"""Upgrade state actions for completing an upgrade"""
|
||||
@ -26,10 +33,43 @@ class CompletingUpgradeState(BaseState):
|
||||
self.sleep_duration = DEFAULT_SLEEP_DURATION
|
||||
self.max_queries = DEFAULT_MAX_QUERIES
|
||||
|
||||
def finalize_upgrade(self, strategy_step):
|
||||
sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
|
||||
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
|
||||
wait_fixed=RETRY_SLEEP_MILLIS)
|
||||
def _get_software_version(self, strategy_step):
|
||||
"""Internal utility method to query software version from a subcloud
|
||||
|
||||
software_version = sysinv_client.get_system().software_version
|
||||
This method is 'retry' wrapped to attempt multiple times with a
|
||||
small wait period between attempts if any exception is raised
|
||||
"""
|
||||
region = self.get_region_name(strategy_step)
|
||||
return self.get_sysinv_client(region).get_system().software_version
|
||||
|
||||
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
|
||||
wait_fixed=RETRY_SLEEP_MILLIS)
|
||||
def _get_upgrades(self, strategy_step):
|
||||
"""Internal utility method to query a subcloud for its upgrades
|
||||
|
||||
This method is 'retry' wrapped to attempt multiple times with a
|
||||
small wait period between attempts if any exception is raised
|
||||
"""
|
||||
region = self.get_region_name(strategy_step)
|
||||
return self.get_sysinv_client(region).get_upgrades()
|
||||
|
||||
@retrying.retry(stop_max_attempt_number=RETRY_MAX_ATTEMPTS,
|
||||
wait_fixed=RETRY_SLEEP_MILLIS)
|
||||
def _upgrade_complete(self, strategy_step):
|
||||
"""Internal utility method to complete an upgrade in a subcloud
|
||||
|
||||
This method is 'retry' wrapped to attempt multiple times with a
|
||||
small wait period between attempts if any exception is raised
|
||||
|
||||
returns None
|
||||
"""
|
||||
region = self.get_region_name(strategy_step)
|
||||
return self.get_sysinv_client(region).upgrade_complete()
|
||||
|
||||
def finalize_upgrade(self, strategy_step):
|
||||
software_version = self._get_software_version(strategy_step)
|
||||
|
||||
db_api.subcloud_update(
|
||||
self.context, strategy_step.subcloud_id,
|
||||
@ -37,27 +77,33 @@ class CompletingUpgradeState(BaseState):
|
||||
software_version=software_version)
|
||||
return self.next_state
|
||||
|
||||
# todo(abailey): determine if service restarts can be made predictable
|
||||
# todo(abailey): other states should have similar retry decorators and
|
||||
# this may also be reasonable to add within the client API calls.
|
||||
def perform_state_action(self, strategy_step):
|
||||
"""Complete an upgrade on a subcloud
|
||||
|
||||
We should never cache the client. Re-query it.
|
||||
Returns the next state in the state machine on success.
|
||||
Any exceptions raised by this method set the strategy to FAILED.
|
||||
|
||||
This state runs during a time when manifests are applying and services
|
||||
are restarting, and therefore any API call in this method can randomly
|
||||
fail. To accommodate this, every call is wrapped with retries.
|
||||
"""
|
||||
# get the sysinv client for the subcloud
|
||||
sysinv_client = self.get_sysinv_client(strategy_step.subcloud.name)
|
||||
|
||||
# upgrade-complete causes the upgrade to be deleted.
|
||||
# if no upgrade exists, there is no need to call it.
|
||||
# The API should always return a list
|
||||
upgrades = sysinv_client.get_upgrades()
|
||||
upgrades = self._get_upgrades(strategy_step)
|
||||
if len(upgrades) == 0:
|
||||
self.info_log(strategy_step,
|
||||
"No upgrades exist. Nothing needs completing")
|
||||
return self.finalize_upgrade(strategy_step)
|
||||
|
||||
# invoke the API 'upgrade-complete'
|
||||
# This is a partially blocking call that raises exception on failure.
|
||||
sysinv_client.upgrade_complete()
|
||||
# We will re-attempt even if that failure is encountered
|
||||
self._upgrade_complete(strategy_step)
|
||||
|
||||
# 'completion' deletes the upgrade. Need to loop until it is deleted
|
||||
counter = 0
|
||||
@ -66,11 +112,9 @@ class CompletingUpgradeState(BaseState):
|
||||
if self.stopped():
|
||||
raise StrategyStoppedException()
|
||||
|
||||
upgrades = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).get_upgrades()
|
||||
upgrades = self._get_upgrades(strategy_step)
|
||||
if len(upgrades) == 0:
|
||||
self.info_log(strategy_step,
|
||||
"Upgrade completed.")
|
||||
self.info_log(strategy_step, "Upgrade completed.")
|
||||
break
|
||||
counter += 1
|
||||
if counter >= self.max_queries:
|
||||
|
@ -48,6 +48,7 @@ python-novaclient>=7.1.0 # Apache-2.0
|
||||
python-keystoneclient>=3.8.0 # Apache-2.0
|
||||
pycrypto>=2.6 # Public Domain
|
||||
requests_toolbelt
|
||||
retrying
|
||||
keyring
|
||||
kubernetes # Apache-2.0
|
||||
psutil
|
||||
|
@ -21,4 +21,4 @@ pylint==1.9.2;python_version<"3.0" # GPLv2
|
||||
pylint==2.3.1;python_version>="3.0" # GPLv2
|
||||
PyYAML>=3.1.0
|
||||
yamllint<1.26.1;python_version>="3.0" # GPLv2
|
||||
python-dev-tools;python_version>="3.0"
|
||||
#python-dev-tools;python_version>="3.0"
|
||||
|
Loading…
Reference in New Issue
Block a user