Add API retries in finishing firmware update
In firmware update orchestration, after the subcloud is unlocked, the runtime manifests on the subcloud may take some time to complete, causing the sysinv API calls that query the subcloud to fail. This update adds retry attempts to the finishing firmware update strategy step. Closes-Bug: 1922281 Change-Id: I678eaf701d9c0cf40907fb8c378a5a2c531a88ae Signed-off-by: Teresa Ho <teresa.ho@windriver.com>
This commit is contained in:
@@ -1,14 +1,22 @@
|
||||
#
|
||||
# Copyright (c) 2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
import time
|
||||
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.common.exceptions import StrategyStoppedException
|
||||
from dcmanager.orchestrator.states.base import BaseState
|
||||
from dcmanager.orchestrator.states.firmware import utils
|
||||
from dcmanager.rpc import client as dcmanager_rpc_client
|
||||
from dcorch.common import consts as dcorch_consts
|
||||
|
||||
# When an unlock occurs, a reboot is triggered. During reboot, API calls fail.
|
||||
# The max time allowed here is 30 minutes (i.e. 30 queries with a 1-minute sleep)
|
||||
DEFAULT_MAX_FAILED_QUERIES = 30
|
||||
DEFAULT_FAILED_SLEEP = 60
|
||||
|
||||
|
||||
class FinishingFwUpdateState(BaseState):
|
||||
"""State for finishing the firmware update."""
|
||||
@@ -16,6 +24,8 @@ class FinishingFwUpdateState(BaseState):
|
||||
def __init__(self, region_name):
|
||||
super(FinishingFwUpdateState, self).__init__(
|
||||
next_state=consts.STRATEGY_STATE_COMPLETE, region_name=region_name)
|
||||
self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
|
||||
self.failed_sleep_duration = DEFAULT_FAILED_SLEEP
|
||||
|
||||
def align_subcloud_status(self, strategy_step):
|
||||
self.info_log(strategy_step,
|
||||
@@ -51,13 +61,26 @@ class FinishingFwUpdateState(BaseState):
|
||||
|
||||
# get the list of enabled devices on the subcloud
|
||||
enabled_host_device_list = []
|
||||
subcloud_hosts = self.get_sysinv_client(region).get_hosts()
|
||||
for host in subcloud_hosts:
|
||||
host_devices = self.get_sysinv_client(
|
||||
region).get_host_device_list(host.uuid)
|
||||
for device in host_devices:
|
||||
if device.enabled:
|
||||
enabled_host_device_list.append(device)
|
||||
fail_counter = 0
|
||||
while True:
|
||||
# If event handler stop has been triggered, fail the state
|
||||
if self.stopped():
|
||||
raise StrategyStoppedException()
|
||||
try:
|
||||
subcloud_hosts = self.get_sysinv_client(region).get_hosts()
|
||||
for host in subcloud_hosts:
|
||||
host_devices = self.get_sysinv_client(
|
||||
region).get_host_device_list(host.uuid)
|
||||
for device in host_devices:
|
||||
if device.enabled:
|
||||
enabled_host_device_list.append(device)
|
||||
break
|
||||
except Exception:
|
||||
if fail_counter >= self.max_failed_queries:
|
||||
raise Exception("Timeout waiting to query subcloud hosts")
|
||||
fail_counter += 1
|
||||
time.sleep(self.failed_sleep_duration)
|
||||
|
||||
if not enabled_host_device_list:
|
||||
# There are no enabled devices in this subcloud, so break out
|
||||
# of this handler, since there will be nothing to examine
|
||||
@@ -66,14 +89,26 @@ class FinishingFwUpdateState(BaseState):
|
||||
self.align_subcloud_status(strategy_step)
|
||||
return self.next_state
|
||||
|
||||
# determine list of applied subcloud images
|
||||
subcloud_images = self.get_sysinv_client(region).get_device_images()
|
||||
applied_subcloud_images = \
|
||||
utils.filter_applied_images(subcloud_images,
|
||||
expected_value=True)
|
||||
# Retrieve the device image states on this subcloud.
|
||||
subcloud_device_image_states = self.get_sysinv_client(
|
||||
region).get_device_image_states()
|
||||
fail_counter = 0
|
||||
while True:
|
||||
# If event handler stop has been triggered, fail the state
|
||||
if self.stopped():
|
||||
raise StrategyStoppedException()
|
||||
try:
|
||||
# determine list of applied subcloud images
|
||||
subcloud_images = self.get_sysinv_client(region).get_device_images()
|
||||
applied_subcloud_images = \
|
||||
utils.filter_applied_images(subcloud_images,
|
||||
expected_value=True)
|
||||
# Retrieve the device image states on this subcloud.
|
||||
subcloud_device_image_states = self.get_sysinv_client(
|
||||
region).get_device_image_states()
|
||||
break
|
||||
except Exception:
|
||||
if fail_counter >= self.max_failed_queries:
|
||||
raise Exception("Timeout waiting to query subcloud device image info")
|
||||
fail_counter += 1
|
||||
time.sleep(self.failed_sleep_duration)
|
||||
|
||||
device_map = utils.to_uuid_map(enabled_host_device_list)
|
||||
image_map = utils.to_uuid_map(applied_subcloud_images)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@@ -7,7 +7,7 @@ import mock
|
||||
|
||||
from dccommon.drivers.openstack import vim
|
||||
from dcmanager.common import consts
|
||||
from dcmanager.orchestrator.states.firmware.finishing_fw_update import FinishingFwUpdateState
|
||||
from dcmanager.orchestrator.states.firmware import finishing_fw_update
|
||||
|
||||
from dcmanager.tests.unit.fakes import FakeVimStrategy
|
||||
from dcmanager.tests.unit.orchestrator.states.firmware.test_base \
|
||||
@@ -16,6 +16,9 @@ from dcmanager.tests.unit.orchestrator.states.firmware.test_base \
|
||||
STRATEGY_APPLIED = FakeVimStrategy(state=vim.STATE_APPLIED)
|
||||
|
||||
|
||||
@mock.patch("dcmanager.orchestrator.states.firmware.finishing_fw_update.DEFAULT_MAX_FAILED_QUERIES",
|
||||
3)
|
||||
@mock.patch("dcmanager.orchestrator.states.firmware.finishing_fw_update.DEFAULT_FAILED_SLEEP", 1)
|
||||
class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
|
||||
|
||||
def setUp(self):
|
||||
@@ -35,8 +38,9 @@ class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
|
||||
self.vim_client.get_strategy = mock.MagicMock()
|
||||
self.vim_client.delete_strategy = mock.MagicMock()
|
||||
self.sysinv_client.get_hosts = mock.MagicMock()
|
||||
self.sysinv_client.get_host_device_list = mock.MagicMock()
|
||||
|
||||
p = mock.patch.object(FinishingFwUpdateState, 'align_subcloud_status')
|
||||
p = mock.patch.object(finishing_fw_update.FinishingFwUpdateState, 'align_subcloud_status')
|
||||
self.mock_align = p.start()
|
||||
self.addCleanup(p.stop)
|
||||
|
||||
@@ -75,3 +79,27 @@ class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
|
||||
# Successful promotion to next state
|
||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||
self.on_success_state)
|
||||
|
||||
def test_finishing_vim_strategy_failure_get_hosts(self):
|
||||
"""Test finishing firmware update with communication error to subcloud"""
|
||||
|
||||
# mock the get_hosts query so that it fails and raises an exception
|
||||
self.sysinv_client.get_hosts.side_effect = \
|
||||
Exception("HTTP CommunicationError")
|
||||
|
||||
# invoke the strategy state operation on the orch thread
|
||||
self.worker.perform_state_action(self.strategy_step)
|
||||
|
||||
# verify the query was actually attempted
|
||||
self.sysinv_client.get_hosts.assert_called()
|
||||
|
||||
# verify the query was tried max retries + 1 times
|
||||
self.assertEqual(finishing_fw_update.DEFAULT_MAX_FAILED_QUERIES + 1,
|
||||
self.sysinv_client.get_hosts.call_count)
|
||||
|
||||
# verify the subsequent sysinv command was never attempted
|
||||
self.sysinv_client.get_host_device_list.assert_not_called()
|
||||
|
||||
# verify that the strategy step moves to the failed state
|
||||
self.assert_step_updated(self.strategy_step.subcloud_id,
|
||||
consts.STRATEGY_STATE_FAILED)
|
||||
|
||||
Reference in New Issue
Block a user