Merge "Handle NodeLocked failures"

This commit is contained in:
Zuul 2021-08-11 00:58:32 +00:00 committed by Gerrit Code Review
commit 00c2e99b5a
5 changed files with 66 additions and 12 deletions

@ -123,8 +123,8 @@ def _do_introspect(node_info, ironic):
if node_info.manage_boot: if node_info.manage_boot:
try: try:
ironic.set_node_boot_device( ir_utils.call_with_retries(
node_info.uuid, 'pxe', ironic.set_node_boot_device, node_info.uuid, 'pxe',
persistent=_persistent_ramdisk_boot(node_info.node())) persistent=_persistent_ramdisk_boot(node_info.node()))
except Exception as exc: except Exception as exc:
raise utils.Error(_('Failed to set boot device to PXE: %s') % exc, raise utils.Error(_('Failed to set boot device to PXE: %s') % exc,
@ -133,7 +133,8 @@ def _do_introspect(node_info, ironic):
_wait_for_turn(node_info) _wait_for_turn(node_info)
try: try:
ironic.set_node_power_state(node_info.uuid, 'rebooting') ir_utils.call_with_retries(
ironic.set_node_power_state, node_info.uuid, 'rebooting')
except Exception as exc: except Exception as exc:
raise utils.Error(_('Failed to power on the node, check its ' raise utils.Error(_('Failed to power on the node, check its '
'power management configuration: %s') % exc, 'power management configuration: %s') % exc,
@ -175,7 +176,8 @@ def _abort(node_info, ironic):
LOG.debug('Forcing power-off', node_info=node_info) LOG.debug('Forcing power-off', node_info=node_info)
if node_info.manage_boot: if node_info.manage_boot:
try: try:
ironic.set_node_power_state(node_info.uuid, 'power off') ir_utils.call_with_retries(
ironic.set_node_power_state, node_info.uuid, 'power off')
except Exception as exc: except Exception as exc:
LOG.warning('Failed to power off node: %s', exc, LOG.warning('Failed to power off node: %s', exc,
node_info=node_info) node_info=node_info)

@ -461,7 +461,8 @@ class NodeInfo(object):
:param ironic: Ironic client to use instead of self.ironic :param ironic: Ironic client to use instead of self.ironic
""" """
ironic = ironic or self.ironic ironic = ironic or self.ironic
ironic.add_node_trait(self.uuid, trait) ir_utils.call_with_retries(
ironic.add_node_trait, self.uuid, trait)
def remove_trait(self, trait, ironic=None): def remove_trait(self, trait, ironic=None):
"""Remove a trait from the node. """Remove a trait from the node.
@ -471,6 +472,8 @@ class NodeInfo(object):
""" """
ironic = ironic or self.ironic ironic = ironic or self.ironic
try: try:
# TODO(TheJulia): This should really have a retry around it for
# connection failure, however its not a big deal.
ironic.remove_node_trait(self.uuid, trait) ironic.remove_node_trait(self.uuid, trait)
except os_exc.NotFoundException: except os_exc.NotFoundException:
LOG.debug('Trait %s is not set, cannot remove', trait, LOG.debug('Trait %s is not set, cannot remove', trait,
@ -486,8 +489,8 @@ class NodeInfo(object):
ports = self.ports() ports = self.ports()
if isinstance(port, str): if isinstance(port, str):
port = ports[port] port = ports[port]
ir_utils.call_with_retries(
ironic.delete_port(port.id) ironic.delete_port, port.id)
del ports[port.address] del ports[port.address]
def get_by_path(self, path): def get_by_path(self, path):

@ -92,6 +92,34 @@ class TestIntrospect(BaseTest):
self.node_info.acquire_lock.assert_called_once_with() self.node_info.acquire_lock.assert_called_once_with()
self.node_info.release_lock.assert_called_once_with() self.node_info.release_lock.assert_called_once_with()
def test_ok_retries_node_locked(self, client_mock, start_mock):
cli = self._prepare(client_mock)
start_mock.return_value = self.node_info
cli.set_node_power_state.side_effect = [
os_exc.ConflictException("Locked"),
None]
introspect.introspect(self.node.uuid)
cli.get_node.assert_called_once_with(self.uuid)
cli.validate_node.assert_called_once_with(self.uuid, required='power')
start_mock.assert_called_once_with(self.uuid,
bmc_address=[self.bmc_address],
manage_boot=True,
ironic=cli)
self.node_info.ports.assert_called_once_with()
self.node_info.add_attribute.assert_called_once_with(
'mac', self.macs)
self.sync_filter_mock.assert_called_with(cli)
cli.set_node_boot_device.assert_called_once_with(
self.uuid, 'pxe', persistent=False)
cli.set_node_power_state.assert_has_calls([
mock.call(self.uuid, 'rebooting'),
mock.call(self.uuid, 'rebooting')])
self.node_info.acquire_lock.assert_called_once_with()
self.node_info.release_lock.assert_called_once_with()
@mock.patch.object(ir_utils, 'get_ipmi_address', autospec=True) @mock.patch.object(ir_utils, 'get_ipmi_address', autospec=True)
def test_resolved_bmc_address(self, ipmi_mock, client_mock, start_mock): def test_resolved_bmc_address(self, ipmi_mock, client_mock, start_mock):
self.node.driver_info['ipmi_address'] = 'example.com' self.node.driver_info['ipmi_address'] = 'example.com'
@ -177,8 +205,9 @@ class TestIntrospect(BaseTest):
cli.set_node_boot_device.assert_called_once_with(self.uuid, cli.set_node_boot_device.assert_called_once_with(self.uuid,
'pxe', 'pxe',
persistent=False) persistent=False)
cli.set_node_power_state.assert_called_once_with(self.uuid, power_call = mock.call(self.uuid, 'rebooting')
'rebooting') cli.set_node_power_state.assert_has_calls([
power_call, power_call, power_call, power_call, power_call])
start_mock.return_value.finished.assert_called_once_with( start_mock.return_value.finished.assert_called_once_with(
introspect.istate.Events.error, error=mock.ANY) introspect.istate.Events.error, error=mock.ANY)
self.node_info.acquire_lock.assert_called_once_with() self.node_info.acquire_lock.assert_called_once_with()
@ -218,9 +247,9 @@ class TestIntrospect(BaseTest):
bmc_address=[self.bmc_address], bmc_address=[self.bmc_address],
manage_boot=True, manage_boot=True,
ironic=cli) ironic=cli)
cli.set_node_boot_device.assert_called_once_with(self.uuid, dev_call = mock.call(self.uuid, 'pxe', persistent=False)
'pxe', cli.set_node_boot_device.assert_has_calls([
persistent=False) dev_call, dev_call, dev_call, dev_call, dev_call])
cli.set_node_power_state.assert_not_called() cli.set_node_power_state.assert_not_called()
start_mock.return_value.finished.assert_called_once_with( start_mock.return_value.finished.assert_called_once_with(
introspect.istate.Events.error, error=mock.ANY) introspect.istate.Events.error, error=mock.ANY)

@ -19,6 +19,7 @@ import unittest
from unittest import mock from unittest import mock
import automaton import automaton
from openstack import exceptions as os_exc
from oslo_config import cfg from oslo_config import cfg
from oslo_utils import timeutils from oslo_utils import timeutils
from oslo_utils import uuidutils from oslo_utils import uuidutils
@ -759,6 +760,14 @@ class TestUpdate(test_base.NodeTest):
self.ironic.delete_port.assert_called_once_with('0') self.ironic.delete_port.assert_called_once_with('0')
self.assertEqual(['mac1'], list(self.node_info.ports())) self.assertEqual(['mac1'], list(self.node_info.ports()))
def test_delete_port_retries(self):
self.ironic.delete_port.side_effect = \
os_exc.ConflictException("Locked")
self.assertRaises(
os_exc.ConflictException,
self.node_info.delete_port, self.ports['mac0'])
self.assertEqual(5, self.ironic.delete_port.call_count)
def test_delete_port_by_mac(self): def test_delete_port_by_mac(self):
self.node_info.delete_port('mac0') self.node_info.delete_port('mac0')

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes issues in Inspector where various tasks would not have retry logic
applied to them and may sporadically fail. This is because the OpenStack
SDK does not comprehend the NodeLocked error, which previously
python-ironicclient silently handled. Basic operations such as
"power reboot" and "set boot device" will now be retried automatically if
they fail.
For more information, please see
`story 2009107 <https://storyboard.openstack.org/#!/story/2009107>`_.