Handle NodeLocked failures
Some actions can fail due to the node being locked as part of normal operations. This was previously handled silently by python-ironicclient, but when inspector was changed to use openstacksdk, this was no longer handled. Adds the retry wrapper around critical path methods involving power/reboot ops and node updates which require locks. Change-Id: I3d26cf46da02b3e8f3f773c0aeaed6843e0f26e5 Story: 2009107 Task: 42966
This commit is contained in:
parent
948325cae7
commit
7f6c4c4378
@ -123,8 +123,8 @@ def _do_introspect(node_info, ironic):
|
||||
|
||||
if node_info.manage_boot:
|
||||
try:
|
||||
ironic.set_node_boot_device(
|
||||
node_info.uuid, 'pxe',
|
||||
ir_utils.call_with_retries(
|
||||
ironic.set_node_boot_device, node_info.uuid, 'pxe',
|
||||
persistent=_persistent_ramdisk_boot(node_info.node()))
|
||||
except Exception as exc:
|
||||
raise utils.Error(_('Failed to set boot device to PXE: %s') % exc,
|
||||
@ -133,7 +133,8 @@ def _do_introspect(node_info, ironic):
|
||||
_wait_for_turn(node_info)
|
||||
|
||||
try:
|
||||
ironic.set_node_power_state(node_info.uuid, 'rebooting')
|
||||
ir_utils.call_with_retries(
|
||||
ironic.set_node_power_state, node_info.uuid, 'rebooting')
|
||||
except Exception as exc:
|
||||
raise utils.Error(_('Failed to power on the node, check its '
|
||||
'power management configuration: %s') % exc,
|
||||
@ -175,7 +176,8 @@ def _abort(node_info, ironic):
|
||||
LOG.debug('Forcing power-off', node_info=node_info)
|
||||
if node_info.manage_boot:
|
||||
try:
|
||||
ironic.set_node_power_state(node_info.uuid, 'power off')
|
||||
ir_utils.call_with_retries(
|
||||
ironic.set_node_power_state, node_info.uuid, 'power off')
|
||||
except Exception as exc:
|
||||
LOG.warning('Failed to power off node: %s', exc,
|
||||
node_info=node_info)
|
||||
|
@ -461,7 +461,8 @@ class NodeInfo(object):
|
||||
:param ironic: Ironic client to use instead of self.ironic
|
||||
"""
|
||||
ironic = ironic or self.ironic
|
||||
ironic.add_node_trait(self.uuid, trait)
|
||||
ir_utils.call_with_retries(
|
||||
ironic.add_node_trait, self.uuid, trait)
|
||||
|
||||
def remove_trait(self, trait, ironic=None):
|
||||
"""Remove a trait from the node.
|
||||
@ -471,6 +472,8 @@ class NodeInfo(object):
|
||||
"""
|
||||
ironic = ironic or self.ironic
|
||||
try:
|
||||
# TODO(TheJulia): This should really have a retry around it for
|
||||
# connection failure, however its not a big deal.
|
||||
ironic.remove_node_trait(self.uuid, trait)
|
||||
except os_exc.NotFoundException:
|
||||
LOG.debug('Trait %s is not set, cannot remove', trait,
|
||||
@ -486,8 +489,8 @@ class NodeInfo(object):
|
||||
ports = self.ports()
|
||||
if isinstance(port, str):
|
||||
port = ports[port]
|
||||
|
||||
ironic.delete_port(port.id)
|
||||
ir_utils.call_with_retries(
|
||||
ironic.delete_port, port.id)
|
||||
del ports[port.address]
|
||||
|
||||
def get_by_path(self, path):
|
||||
|
@ -92,6 +92,34 @@ class TestIntrospect(BaseTest):
|
||||
self.node_info.acquire_lock.assert_called_once_with()
|
||||
self.node_info.release_lock.assert_called_once_with()
|
||||
|
||||
def test_ok_retries_node_locked(self, client_mock, start_mock):
|
||||
cli = self._prepare(client_mock)
|
||||
start_mock.return_value = self.node_info
|
||||
cli.set_node_power_state.side_effect = [
|
||||
os_exc.ConflictException("Locked"),
|
||||
None]
|
||||
introspect.introspect(self.node.uuid)
|
||||
|
||||
cli.get_node.assert_called_once_with(self.uuid)
|
||||
cli.validate_node.assert_called_once_with(self.uuid, required='power')
|
||||
|
||||
start_mock.assert_called_once_with(self.uuid,
|
||||
bmc_address=[self.bmc_address],
|
||||
manage_boot=True,
|
||||
ironic=cli)
|
||||
self.node_info.ports.assert_called_once_with()
|
||||
self.node_info.add_attribute.assert_called_once_with(
|
||||
'mac', self.macs)
|
||||
self.sync_filter_mock.assert_called_with(cli)
|
||||
cli.set_node_boot_device.assert_called_once_with(
|
||||
self.uuid, 'pxe', persistent=False)
|
||||
cli.set_node_power_state.assert_has_calls([
|
||||
mock.call(self.uuid, 'rebooting'),
|
||||
mock.call(self.uuid, 'rebooting')])
|
||||
|
||||
self.node_info.acquire_lock.assert_called_once_with()
|
||||
self.node_info.release_lock.assert_called_once_with()
|
||||
|
||||
@mock.patch.object(ir_utils, 'get_ipmi_address', autospec=True)
|
||||
def test_resolved_bmc_address(self, ipmi_mock, client_mock, start_mock):
|
||||
self.node.driver_info['ipmi_address'] = 'example.com'
|
||||
@ -177,8 +205,9 @@ class TestIntrospect(BaseTest):
|
||||
cli.set_node_boot_device.assert_called_once_with(self.uuid,
|
||||
'pxe',
|
||||
persistent=False)
|
||||
cli.set_node_power_state.assert_called_once_with(self.uuid,
|
||||
'rebooting')
|
||||
power_call = mock.call(self.uuid, 'rebooting')
|
||||
cli.set_node_power_state.assert_has_calls([
|
||||
power_call, power_call, power_call, power_call, power_call])
|
||||
start_mock.return_value.finished.assert_called_once_with(
|
||||
introspect.istate.Events.error, error=mock.ANY)
|
||||
self.node_info.acquire_lock.assert_called_once_with()
|
||||
@ -218,9 +247,9 @@ class TestIntrospect(BaseTest):
|
||||
bmc_address=[self.bmc_address],
|
||||
manage_boot=True,
|
||||
ironic=cli)
|
||||
cli.set_node_boot_device.assert_called_once_with(self.uuid,
|
||||
'pxe',
|
||||
persistent=False)
|
||||
dev_call = mock.call(self.uuid, 'pxe', persistent=False)
|
||||
cli.set_node_boot_device.assert_has_calls([
|
||||
dev_call, dev_call, dev_call, dev_call, dev_call])
|
||||
cli.set_node_power_state.assert_not_called()
|
||||
start_mock.return_value.finished.assert_called_once_with(
|
||||
introspect.istate.Events.error, error=mock.ANY)
|
||||
|
@ -19,6 +19,7 @@ import unittest
|
||||
from unittest import mock
|
||||
|
||||
import automaton
|
||||
from openstack import exceptions as os_exc
|
||||
from oslo_config import cfg
|
||||
from oslo_utils import timeutils
|
||||
from oslo_utils import uuidutils
|
||||
@ -759,6 +760,14 @@ class TestUpdate(test_base.NodeTest):
|
||||
self.ironic.delete_port.assert_called_once_with('0')
|
||||
self.assertEqual(['mac1'], list(self.node_info.ports()))
|
||||
|
||||
def test_delete_port_retries(self):
|
||||
self.ironic.delete_port.side_effect = \
|
||||
os_exc.ConflictException("Locked")
|
||||
self.assertRaises(
|
||||
os_exc.ConflictException,
|
||||
self.node_info.delete_port, self.ports['mac0'])
|
||||
self.assertEqual(5, self.ironic.delete_port.call_count)
|
||||
|
||||
def test_delete_port_by_mac(self):
|
||||
self.node_info.delete_port('mac0')
|
||||
|
||||
|
@ -0,0 +1,11 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
Fixes issues in Inspector where various tasks would not have retry logic
|
||||
applied to them and may sporadically fail. This is because the OpenStack
|
||||
SDK does not comprehend the NodeLocked error, which previously
|
||||
python-ironicclient silently handled. Basic operations such as
|
||||
"power reboot" and "set boot device" will now be retried automatically if
|
||||
they fail.
|
||||
For more information, please see
|
||||
`story 2009107 <https://storyboard.openstack.org/#!/story/2009107>`_.
|
Loading…
Reference in New Issue
Block a user