Handle NodeLocked failures

Some actions can fail due to the node being locked as part of
normal operations. This was previously handled silently by
python-ironicclient, but when inspector was changed to use
openstacksdk, this was no longer handled.

Adds the retry wrapper around critical path methods involving
power/reboot ops and node updates which require locks.

Change-Id: I3d26cf46da02b3e8f3f773c0aeaed6843e0f26e5
Story: 2009107
Task: 42966
This commit is contained in:
Julia Kreger 2021-08-09 08:16:47 -07:00 committed by Steve Baker
parent 948325cae7
commit 7f6c4c4378
5 changed files with 66 additions and 12 deletions

View File

@ -123,8 +123,8 @@ def _do_introspect(node_info, ironic):
if node_info.manage_boot:
try:
ironic.set_node_boot_device(
node_info.uuid, 'pxe',
ir_utils.call_with_retries(
ironic.set_node_boot_device, node_info.uuid, 'pxe',
persistent=_persistent_ramdisk_boot(node_info.node()))
except Exception as exc:
raise utils.Error(_('Failed to set boot device to PXE: %s') % exc,
@ -133,7 +133,8 @@ def _do_introspect(node_info, ironic):
_wait_for_turn(node_info)
try:
ironic.set_node_power_state(node_info.uuid, 'rebooting')
ir_utils.call_with_retries(
ironic.set_node_power_state, node_info.uuid, 'rebooting')
except Exception as exc:
raise utils.Error(_('Failed to power on the node, check its '
'power management configuration: %s') % exc,
@ -175,7 +176,8 @@ def _abort(node_info, ironic):
LOG.debug('Forcing power-off', node_info=node_info)
if node_info.manage_boot:
try:
ironic.set_node_power_state(node_info.uuid, 'power off')
ir_utils.call_with_retries(
ironic.set_node_power_state, node_info.uuid, 'power off')
except Exception as exc:
LOG.warning('Failed to power off node: %s', exc,
node_info=node_info)

View File

@ -461,7 +461,8 @@ class NodeInfo(object):
:param ironic: Ironic client to use instead of self.ironic
"""
ironic = ironic or self.ironic
ironic.add_node_trait(self.uuid, trait)
ir_utils.call_with_retries(
ironic.add_node_trait, self.uuid, trait)
def remove_trait(self, trait, ironic=None):
"""Remove a trait from the node.
@ -471,6 +472,8 @@ class NodeInfo(object):
"""
ironic = ironic or self.ironic
try:
# TODO(TheJulia): This should really have a retry around it for
# connection failure, however its not a big deal.
ironic.remove_node_trait(self.uuid, trait)
except os_exc.NotFoundException:
LOG.debug('Trait %s is not set, cannot remove', trait,
@ -486,8 +489,8 @@ class NodeInfo(object):
ports = self.ports()
if isinstance(port, str):
port = ports[port]
ironic.delete_port(port.id)
ir_utils.call_with_retries(
ironic.delete_port, port.id)
del ports[port.address]
def get_by_path(self, path):

View File

@ -92,6 +92,34 @@ class TestIntrospect(BaseTest):
self.node_info.acquire_lock.assert_called_once_with()
self.node_info.release_lock.assert_called_once_with()
def test_ok_retries_node_locked(self, client_mock, start_mock):
cli = self._prepare(client_mock)
start_mock.return_value = self.node_info
cli.set_node_power_state.side_effect = [
os_exc.ConflictException("Locked"),
None]
introspect.introspect(self.node.uuid)
cli.get_node.assert_called_once_with(self.uuid)
cli.validate_node.assert_called_once_with(self.uuid, required='power')
start_mock.assert_called_once_with(self.uuid,
bmc_address=[self.bmc_address],
manage_boot=True,
ironic=cli)
self.node_info.ports.assert_called_once_with()
self.node_info.add_attribute.assert_called_once_with(
'mac', self.macs)
self.sync_filter_mock.assert_called_with(cli)
cli.set_node_boot_device.assert_called_once_with(
self.uuid, 'pxe', persistent=False)
cli.set_node_power_state.assert_has_calls([
mock.call(self.uuid, 'rebooting'),
mock.call(self.uuid, 'rebooting')])
self.node_info.acquire_lock.assert_called_once_with()
self.node_info.release_lock.assert_called_once_with()
@mock.patch.object(ir_utils, 'get_ipmi_address', autospec=True)
def test_resolved_bmc_address(self, ipmi_mock, client_mock, start_mock):
self.node.driver_info['ipmi_address'] = 'example.com'
@ -177,8 +205,9 @@ class TestIntrospect(BaseTest):
cli.set_node_boot_device.assert_called_once_with(self.uuid,
'pxe',
persistent=False)
cli.set_node_power_state.assert_called_once_with(self.uuid,
'rebooting')
power_call = mock.call(self.uuid, 'rebooting')
cli.set_node_power_state.assert_has_calls([
power_call, power_call, power_call, power_call, power_call])
start_mock.return_value.finished.assert_called_once_with(
introspect.istate.Events.error, error=mock.ANY)
self.node_info.acquire_lock.assert_called_once_with()
@ -218,9 +247,9 @@ class TestIntrospect(BaseTest):
bmc_address=[self.bmc_address],
manage_boot=True,
ironic=cli)
cli.set_node_boot_device.assert_called_once_with(self.uuid,
'pxe',
persistent=False)
dev_call = mock.call(self.uuid, 'pxe', persistent=False)
cli.set_node_boot_device.assert_has_calls([
dev_call, dev_call, dev_call, dev_call, dev_call])
cli.set_node_power_state.assert_not_called()
start_mock.return_value.finished.assert_called_once_with(
introspect.istate.Events.error, error=mock.ANY)

View File

@ -19,6 +19,7 @@ import unittest
from unittest import mock
import automaton
from openstack import exceptions as os_exc
from oslo_config import cfg
from oslo_utils import timeutils
from oslo_utils import uuidutils
@ -759,6 +760,14 @@ class TestUpdate(test_base.NodeTest):
self.ironic.delete_port.assert_called_once_with('0')
self.assertEqual(['mac1'], list(self.node_info.ports()))
def test_delete_port_retries(self):
self.ironic.delete_port.side_effect = \
os_exc.ConflictException("Locked")
self.assertRaises(
os_exc.ConflictException,
self.node_info.delete_port, self.ports['mac0'])
self.assertEqual(5, self.ironic.delete_port.call_count)
def test_delete_port_by_mac(self):
self.node_info.delete_port('mac0')

View File

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes issues in Inspector where various tasks would not have retry logic
applied to them and may sporadically fail. This is because the OpenStack
SDK does not comprehend the NodeLocked error, which previously
python-ironicclient silently handled. Basic operations such as
"power reboot" and "set boot device" will now be retried automatically if
they fail.
For more information, please see
`story 2009107 <https://storyboard.openstack.org/#!/story/2009107>`_.