Add API retries in finishing firmware update

In firmware update orchestration, after the subcloud is unlocked,
the runtime manifests on the subcloud may take some time to complete,
causing the sysinv API calls that query the subcloud to fail.
This update adds retry attempts to the finishing firmware
update strategy step.

Closes-Bug: 1922281

Change-Id: I678eaf701d9c0cf40907fb8c378a5a2c531a88ae
Signed-off-by: Teresa Ho <teresa.ho@windriver.com>
This commit is contained in:
Teresa Ho
2021-04-01 15:38:30 -04:00
parent 4c3d7fb9c8
commit b0b3cfe07b
2 changed files with 82 additions and 19 deletions

View File

@@ -1,14 +1,22 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import time
from dcmanager.common import consts
from dcmanager.common.exceptions import StrategyStoppedException
from dcmanager.orchestrator.states.base import BaseState
from dcmanager.orchestrator.states.firmware import utils
from dcmanager.rpc import client as dcmanager_rpc_client
from dcorch.common import consts as dcorch_consts
# When an unlock occurs, a reboot is triggered. During reboot, API calls fail.
# The max time allowed here is 30 minutes (ie: 30 queries with 1 minute sleep)
DEFAULT_MAX_FAILED_QUERIES = 30
DEFAULT_FAILED_SLEEP = 60
class FinishingFwUpdateState(BaseState):
"""State for finishing the firmware update."""
@@ -16,6 +24,8 @@ class FinishingFwUpdateState(BaseState):
def __init__(self, region_name):
    """Initialise the finishing-firmware-update state for a region.

    Passes STRATEGY_STATE_COMPLETE to the base state as ``next_state``
    and records the retry settings used when subcloud queries fail.

    :param region_name: forwarded unchanged to the base state.
    """
    super(FinishingFwUpdateState, self).__init__(
        region_name=region_name,
        next_state=consts.STRATEGY_STATE_COMPLETE,
    )
    # Retry settings are copied from the module-level defaults onto the
    # instance; the query retry loops in this state read them from here.
    self.failed_sleep_duration = DEFAULT_FAILED_SLEEP
    self.max_failed_queries = DEFAULT_MAX_FAILED_QUERIES
def align_subcloud_status(self, strategy_step):
self.info_log(strategy_step,
@@ -51,13 +61,26 @@ class FinishingFwUpdateState(BaseState):
# get the list of enabled devices on the subcloud
enabled_host_device_list = []
subcloud_hosts = self.get_sysinv_client(region).get_hosts()
for host in subcloud_hosts:
host_devices = self.get_sysinv_client(
region).get_host_device_list(host.uuid)
for device in host_devices:
if device.enabled:
enabled_host_device_list.append(device)
fail_counter = 0
while True:
# If event handler stop has been triggered, fail the state
if self.stopped():
raise StrategyStoppedException()
try:
subcloud_hosts = self.get_sysinv_client(region).get_hosts()
for host in subcloud_hosts:
host_devices = self.get_sysinv_client(
region).get_host_device_list(host.uuid)
for device in host_devices:
if device.enabled:
enabled_host_device_list.append(device)
break
except Exception:
if fail_counter >= self.max_failed_queries:
raise Exception("Timeout waiting to query subcloud hosts")
fail_counter += 1
time.sleep(self.failed_sleep_duration)
if not enabled_host_device_list:
# There are no enabled devices in this subcloud, so break out
# of this handler, since there will be nothing to examine
@@ -66,14 +89,26 @@ class FinishingFwUpdateState(BaseState):
self.align_subcloud_status(strategy_step)
return self.next_state
# determine list of applied subcloud images
subcloud_images = self.get_sysinv_client(region).get_device_images()
applied_subcloud_images = \
utils.filter_applied_images(subcloud_images,
expected_value=True)
# Retrieve the device image states on this subcloud.
subcloud_device_image_states = self.get_sysinv_client(
region).get_device_image_states()
fail_counter = 0
while True:
# If event handler stop has been triggered, fail the state
if self.stopped():
raise StrategyStoppedException()
try:
# determine list of applied subcloud images
subcloud_images = self.get_sysinv_client(region).get_device_images()
applied_subcloud_images = \
utils.filter_applied_images(subcloud_images,
expected_value=True)
# Retrieve the device image states on this subcloud.
subcloud_device_image_states = self.get_sysinv_client(
region).get_device_image_states()
break
except Exception:
if fail_counter >= self.max_failed_queries:
raise Exception("Timeout waiting to query subcloud device image info")
fail_counter += 1
time.sleep(self.failed_sleep_duration)
device_map = utils.to_uuid_map(enabled_host_device_list)
image_map = utils.to_uuid_map(applied_subcloud_images)

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -7,7 +7,7 @@ import mock
from dccommon.drivers.openstack import vim
from dcmanager.common import consts
from dcmanager.orchestrator.states.firmware.finishing_fw_update import FinishingFwUpdateState
from dcmanager.orchestrator.states.firmware import finishing_fw_update
from dcmanager.tests.unit.fakes import FakeVimStrategy
from dcmanager.tests.unit.orchestrator.states.firmware.test_base \
@@ -16,6 +16,9 @@ from dcmanager.tests.unit.orchestrator.states.firmware.test_base \
STRATEGY_APPLIED = FakeVimStrategy(state=vim.STATE_APPLIED)
@mock.patch("dcmanager.orchestrator.states.firmware.finishing_fw_update.DEFAULT_MAX_FAILED_QUERIES",
3)
@mock.patch("dcmanager.orchestrator.states.firmware.finishing_fw_update.DEFAULT_FAILED_SLEEP", 1)
class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
def setUp(self):
@@ -35,8 +38,9 @@ class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
self.vim_client.get_strategy = mock.MagicMock()
self.vim_client.delete_strategy = mock.MagicMock()
self.sysinv_client.get_hosts = mock.MagicMock()
self.sysinv_client.get_host_device_list = mock.MagicMock()
p = mock.patch.object(FinishingFwUpdateState, 'align_subcloud_status')
p = mock.patch.object(finishing_fw_update.FinishingFwUpdateState, 'align_subcloud_status')
self.mock_align = p.start()
self.addCleanup(p.stop)
@@ -75,3 +79,27 @@ class TestFwUpdateFinishingFwUpdateStage(TestFwUpdateState):
# Successful promotion to next state
self.assert_step_updated(self.strategy_step.subcloud_id,
self.on_success_state)
def test_finishing_vim_strategy_failure_get_hosts(self):
    """Step fails when every get_hosts query to the subcloud raises."""
    get_hosts = self.sysinv_client.get_hosts

    # Make every get_hosts query raise a communication error.
    get_hosts.side_effect = Exception("HTTP CommunicationError")

    # Run the strategy state action through the worker.
    self.worker.perform_state_action(self.strategy_step)

    # The query must have been attempted at all ...
    get_hosts.assert_called()

    # ... and exactly (max failed queries + 1) times: the first attempt
    # plus one retry per allowed failure.
    expected_attempts = finishing_fw_update.DEFAULT_MAX_FAILED_QUERIES + 1
    self.assertEqual(expected_attempts, get_hosts.call_count)

    # The follow-up per-host device query must never be reached.
    self.sysinv_client.get_host_device_list.assert_not_called()

    # The strategy step must end up in the failed state.
    self.assert_step_updated(self.strategy_step.subcloud_id,
                             consts.STRATEGY_STATE_FAILED)