Add reboot handling to unlock state for DC upgrades
When an unlock is triggered, the host is rebooted. The maximum amount of time for this state to complete is 1 hour: reboot time (30 minutes) plus pre- and post-reboot time (30 minutes). The unlock state handler for upgrade orchestration will now loop and ignore failures for a period of time while waiting for the host to become unlocked / enabled. Change-Id: Ie7ad8dbfecd6ffbe241d098c8677a8e25f400b07 Story: 2007403 Task: 40048 Signed-off-by: albailey <Al.Bailey@windriver.com>
This commit is contained in:
parent
a307733a52
commit
cd12fc31a1
@ -43,6 +43,10 @@ AVAILABILITY_ONLINE = "online"
|
||||
ADMIN_LOCKED = 'locked'
|
||||
ADMIN_UNLOCKED = 'unlocked'
|
||||
|
||||
# operational status for hosts
|
||||
OPERATIONAL_ENABLED = 'enabled'
|
||||
OPERATIONAL_DISABLED = 'disabled'
|
||||
|
||||
# Subcloud sync status
|
||||
SYNC_STATUS_UNKNOWN = "unknown"
|
||||
SYNC_STATUS_IN_SYNC = "in-sync"
|
||||
|
@ -6,29 +6,32 @@
|
||||
import time
|
||||
|
||||
from dcmanager.common.consts import ADMIN_UNLOCKED
|
||||
from dcmanager.common.consts import OPERATIONAL_ENABLED
|
||||
from dcmanager.manager.states.base import BaseState
|
||||
|
||||
DEFAULT_MAX_QUERIES = 6
|
||||
DEFAULT_SLEEP_DURATION = 10
|
||||
# When an unlock occurs, a reboot is triggered. During reboot, API calls fail.
|
||||
# The max time allowed here is 30 minutes (i.e. 30 queries with a 1 minute sleep)
|
||||
DEFAULT_MAX_FAILED_QUERIES = 30
|
||||
DEFAULT_FAILED_SLEEP = 60
|
||||
|
||||
# Before and after reboot, the unlock needs to prepare for shutdown and
|
||||
# do post-reboot activities during which time the API will succeed, but the
|
||||
# expected states will not yet be set.
|
||||
# The max time allowed here is 30 minutes (i.e. 30 queries with a 1 minute sleep)
|
||||
DEFAULT_MAX_API_QUERIES = 30
|
||||
DEFAULT_API_SLEEP = 60
|
||||
|
||||
|
||||
class UnlockHostState(BaseState):
|
||||
"""Orchestration state for unlocking a host"""
|
||||
"""Orchestration state for unlocking a host."""
|
||||
|
||||
def __init__(self,
|
||||
hostname='controller-0',
|
||||
max_queries=DEFAULT_MAX_QUERIES,
|
||||
sleep_duration=DEFAULT_SLEEP_DURATION):
|
||||
def __init__(self, hostname='controller-0'):
|
||||
super(UnlockHostState, self).__init__()
|
||||
self.target_hostname = hostname
|
||||
# max time to wait (in seconds) is: sleep_duration * max_queries
|
||||
self.sleep_duration = sleep_duration
|
||||
self.max_queries = max_queries
|
||||
|
||||
def check_async_counter(self, counter):
|
||||
if counter >= self.max_queries:
|
||||
raise Exception("Timeout waiting for unlock to complete")
|
||||
time.sleep(self.sleep_duration)
|
||||
self.max_api_queries = DEFAULT_MAX_API_QUERIES
|
||||
self.api_sleep_duration = DEFAULT_API_SLEEP
|
||||
self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
|
||||
self.failed_sleep_duration = DEFAULT_FAILED_SLEEP
|
||||
|
||||
def perform_state_action(self, strategy_step):
|
||||
"""Unlocks a host on the subcloud
|
||||
@ -57,19 +60,64 @@ class UnlockHostState(BaseState):
|
||||
if (response.ihost_action != 'unlock' or response.task != 'Unlocking'):
|
||||
raise Exception("Unable to unlock host %s" % self.target_hostname)
|
||||
|
||||
# this action is asynchronous, query until it completes or times out
|
||||
async_counter = 0
|
||||
# unlock triggers a reboot.
|
||||
# must ignore certain errors until the system completes the reboot
|
||||
# or a timeout occurs
|
||||
|
||||
# Allow separate durations for failures (ie: reboot) and api retries
|
||||
api_counter = 0
|
||||
fail_counter = 0
|
||||
# Allow just one failed auth (token expired)
|
||||
auth_failure = False
|
||||
|
||||
while True:
|
||||
# query the administrative state to see if it is the new state.
|
||||
host = sysinv_client.get_host(self.target_hostname)
|
||||
if host.administrative == ADMIN_UNLOCKED:
|
||||
msg = "Host: %s is now: %s" % (self.target_hostname,
|
||||
host.administrative)
|
||||
self.info_log(strategy_step, msg)
|
||||
break
|
||||
async_counter += 1
|
||||
# check_async_counter throws exception if loops exceeded or aborted
|
||||
self.check_async_counter(async_counter)
|
||||
try:
|
||||
# query the administrative state to see if it is the new state.
|
||||
host = sysinv_client.get_host(self.target_hostname)
|
||||
if (host.administrative == ADMIN_UNLOCKED and
|
||||
host.operational == OPERATIONAL_ENABLED):
|
||||
# Success. Break out of the loop.
|
||||
msg = "Host: %s is now: %s %s" % (self.target_hostname,
|
||||
host.administrative,
|
||||
host.operational)
|
||||
self.info_log(strategy_step, msg)
|
||||
break
|
||||
# no exception was raised so reset fail and auth checks
|
||||
auth_failure = False
|
||||
fail_counter = 0
|
||||
except Exception as e:
|
||||
if e.message == "Authorization failed":
|
||||
# Since a token could expire while waiting, generate
|
||||
# a new token (by re-creating the client) and re-try the
|
||||
# request, but only once.
|
||||
if not auth_failure:
|
||||
auth_failure = True
|
||||
self.info_log(strategy_step,
|
||||
"Authorization failure. Retrying...")
|
||||
ks_client = self.get_keystone_client(
|
||||
strategy_step.subcloud.name)
|
||||
sysinv_client = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name,
|
||||
ks_client.session)
|
||||
continue
|
||||
else:
|
||||
raise Exception("Repeated authorization failures.")
|
||||
else:
|
||||
# Handle other exceptions due to being unreachable
|
||||
# for a significant period of time when there is a
|
||||
# controller swact, or in the case of AIO-SX,
|
||||
# when the controller reboots.
|
||||
fail_counter += 1
|
||||
if fail_counter >= self.max_failed_queries:
|
||||
raise Exception("Timeout waiting for reboot to complete")
|
||||
time.sleep(self.failed_sleep_duration)
|
||||
# skip the api_counter
|
||||
continue
|
||||
# If the max counter is exceeded, raise a timeout exception
|
||||
api_counter += 1
|
||||
if api_counter >= self.max_api_queries:
|
||||
raise Exception("Timeout waiting for unlock to complete")
|
||||
time.sleep(self.api_sleep_duration)
|
||||
|
||||
# If we are here, the loop broke out cleanly and the action succeeded
|
||||
# When we return from this method without throwing an exception, the
|
||||
|
@ -6,8 +6,8 @@
|
||||
import mock
|
||||
import uuid
|
||||
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.manager.states.base import BaseState
|
||||
from sysinv.common import constants as sysinv_constants
|
||||
|
||||
from dcmanager.tests.unit.manager.test_sw_upgrade import TestSwUpgrade
|
||||
|
||||
@ -70,14 +70,16 @@ class FakeController(object):
|
||||
def __init__(self,
|
||||
host_id=1,
|
||||
hostname='controller-0',
|
||||
administrative=sysinv_constants.ADMIN_UNLOCKED,
|
||||
availability=sysinv_constants.AVAILABILITY_AVAILABLE,
|
||||
administrative=consts.ADMIN_UNLOCKED,
|
||||
operational=consts.OPERATIONAL_ENABLED,
|
||||
availability=consts.AVAILABILITY_ONLINE,
|
||||
ihost_action=None,
|
||||
target_load=UPGRADED_VERSION,
|
||||
task=None):
|
||||
self.id = host_id
|
||||
self.hostname = hostname
|
||||
self.administrative = administrative
|
||||
self.operational = operational
|
||||
self.availability = availability
|
||||
self.ihost_action = ihost_action
|
||||
self.target_load = target_load
|
||||
|
@ -7,14 +7,16 @@ import itertools
|
||||
import mock
|
||||
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.manager.states.unlock_host import DEFAULT_MAX_QUERIES
|
||||
from dcmanager.manager.states import unlock_host
|
||||
|
||||
from dcmanager.tests.unit.manager.states.upgrade.test_base \
|
||||
import FakeController
|
||||
from dcmanager.tests.unit.manager.states.upgrade.test_base \
|
||||
import TestSwUpgradeState
|
||||
|
||||
CONTROLLER_0_UNLOCKED = FakeController(administrative=consts.ADMIN_UNLOCKED)
|
||||
CONTROLLER_0_UNLOCKED = \
|
||||
FakeController(administrative=consts.ADMIN_UNLOCKED,
|
||||
operational=consts.OPERATIONAL_ENABLED)
|
||||
CONTROLLER_0_LOCKED = FakeController(administrative=consts.ADMIN_LOCKED)
|
||||
CONTROLLER_0_UNLOCKING = FakeController(administrative=consts.ADMIN_LOCKED,
|
||||
ihost_action='unlock',
|
||||
@ -25,6 +27,11 @@ CONTROLLER_0_UNLOCKING_FAILED = \
|
||||
task='Swacting')
|
||||
|
||||
|
||||
@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_MAX_API_QUERIES", 3)
|
||||
@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_MAX_FAILED_QUERIES",
|
||||
3)
|
||||
@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_API_SLEEP", 1)
|
||||
@mock.patch("dcmanager.manager.states.unlock_host.DEFAULT_FAILED_SLEEP", 1)
|
||||
class TestSwUpgradeUnlockControllerStage(TestSwUpgradeState):
|
||||
|
||||
def setUp(self):
|
||||
@ -102,7 +109,7 @@ class TestSwUpgradeUnlockControllerStage(TestSwUpgradeState):
|
||||
self.sysinv_client.unlock_host.assert_called()
|
||||
|
||||
# verify the query was invoked: 1 + max_attempts times
|
||||
self.assertEqual(DEFAULT_MAX_QUERIES + 1,
|
||||
self.assertEqual(unlock_host.DEFAULT_MAX_API_QUERIES + 1,
|
||||
self.sysinv_client.get_host.call_count)
|
||||
|
||||
# verify that state failed due to subcloud never finishing the unlock
|
||||
|
Loading…
Reference in New Issue
Block a user