From 7f6c4c4378eff3d7fb7ae600faf17445dac0111d Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Mon, 9 Aug 2021 08:16:47 -0700 Subject: [PATCH] Handle NodeLocked failures Some actions can fail due to the node being locked as part of normal operations. This was previously handled silently by python-ironicclient, but when inspector was changed to use openstacksdk, this was no longer handled. Adds the retry wrapper around critical path methods involving power/reboot ops and node updates which require locks. Change-Id: I3d26cf46da02b3e8f3f773c0aeaed6843e0f26e5 Story: 2009107 Task: 42966 --- ironic_inspector/introspect.py | 10 +++-- ironic_inspector/node_cache.py | 9 +++-- ironic_inspector/test/unit/test_introspect.py | 39 ++++++++++++++++--- ironic_inspector/test/unit/test_node_cache.py | 9 +++++ ...e_transient_failures-e1da302fd1d06528.yaml | 11 ++++++ 5 files changed, 66 insertions(+), 12 deletions(-) create mode 100644 releasenotes/notes/retry_to_handle_transient_failures-e1da302fd1d06528.yaml diff --git a/ironic_inspector/introspect.py b/ironic_inspector/introspect.py index 45b62cea6..13297baa0 100644 --- a/ironic_inspector/introspect.py +++ b/ironic_inspector/introspect.py @@ -123,8 +123,8 @@ def _do_introspect(node_info, ironic): if node_info.manage_boot: try: - ironic.set_node_boot_device( - node_info.uuid, 'pxe', + ir_utils.call_with_retries( + ironic.set_node_boot_device, node_info.uuid, 'pxe', persistent=_persistent_ramdisk_boot(node_info.node())) except Exception as exc: raise utils.Error(_('Failed to set boot device to PXE: %s') % exc, @@ -133,7 +133,8 @@ def _do_introspect(node_info, ironic): _wait_for_turn(node_info) try: - ironic.set_node_power_state(node_info.uuid, 'rebooting') + ir_utils.call_with_retries( + ironic.set_node_power_state, node_info.uuid, 'rebooting') except Exception as exc: raise utils.Error(_('Failed to power on the node, check its ' 'power management configuration: %s') % exc, @@ -175,7 +176,8 @@ def _abort(node_info, ironic): LOG.debug('Forcing power-off', node_info=node_info) if node_info.manage_boot: try: - ironic.set_node_power_state(node_info.uuid, 'power off') + ir_utils.call_with_retries( + ironic.set_node_power_state, node_info.uuid, 'power off') except Exception as exc: LOG.warning('Failed to power off node: %s', exc, node_info=node_info) diff --git a/ironic_inspector/node_cache.py b/ironic_inspector/node_cache.py index 01d8e6ff7..a2591ab52 100644 --- a/ironic_inspector/node_cache.py +++ b/ironic_inspector/node_cache.py @@ -461,7 +461,8 @@ class NodeInfo(object): :param ironic: Ironic client to use instead of self.ironic """ ironic = ironic or self.ironic - ironic.add_node_trait(self.uuid, trait) + ir_utils.call_with_retries( + ironic.add_node_trait, self.uuid, trait) def remove_trait(self, trait, ironic=None): """Remove a trait from the node. @@ -471,6 +472,8 @@ class NodeInfo(object): """ ironic = ironic or self.ironic try: + # TODO(TheJulia): This should really have a retry around it for + # connection failure, however its not a big deal. ironic.remove_node_trait(self.uuid, trait) except os_exc.NotFoundException: LOG.debug('Trait %s is not set, cannot remove', trait, @@ -486,8 +489,8 @@ class NodeInfo(object): ports = self.ports() if isinstance(port, str): port = ports[port] - - ironic.delete_port(port.id) + ir_utils.call_with_retries( + ironic.delete_port, port.id) del ports[port.address] def get_by_path(self, path): diff --git a/ironic_inspector/test/unit/test_introspect.py b/ironic_inspector/test/unit/test_introspect.py index 705979bfd..62caf8d29 100644 --- a/ironic_inspector/test/unit/test_introspect.py +++ b/ironic_inspector/test/unit/test_introspect.py @@ -92,6 +92,34 @@ class TestIntrospect(BaseTest): self.node_info.acquire_lock.assert_called_once_with() self.node_info.release_lock.assert_called_once_with() + def test_ok_retries_node_locked(self, client_mock, start_mock): + cli = self._prepare(client_mock) + start_mock.return_value = self.node_info + cli.set_node_power_state.side_effect = [ + os_exc.ConflictException("Locked"), + None] + introspect.introspect(self.node.uuid) + + cli.get_node.assert_called_once_with(self.uuid) + cli.validate_node.assert_called_once_with(self.uuid, required='power') + + start_mock.assert_called_once_with(self.uuid, + bmc_address=[self.bmc_address], + manage_boot=True, + ironic=cli) + self.node_info.ports.assert_called_once_with() + self.node_info.add_attribute.assert_called_once_with( + 'mac', self.macs) + self.sync_filter_mock.assert_called_with(cli) + cli.set_node_boot_device.assert_called_once_with( + self.uuid, 'pxe', persistent=False) + cli.set_node_power_state.assert_has_calls([ + mock.call(self.uuid, 'rebooting'), + mock.call(self.uuid, 'rebooting')]) + + self.node_info.acquire_lock.assert_called_once_with() + self.node_info.release_lock.assert_called_once_with() + @mock.patch.object(ir_utils, 'get_ipmi_address', autospec=True) def test_resolved_bmc_address(self, ipmi_mock, client_mock, start_mock): self.node.driver_info['ipmi_address'] = 'example.com' @@ -177,8 +205,9 @@ class TestIntrospect(BaseTest): cli.set_node_boot_device.assert_called_once_with(self.uuid, 'pxe', persistent=False) - cli.set_node_power_state.assert_called_once_with(self.uuid, - 'rebooting') + power_call = mock.call(self.uuid, 'rebooting') + cli.set_node_power_state.assert_has_calls([ + power_call, power_call, power_call, power_call, power_call]) start_mock.return_value.finished.assert_called_once_with( introspect.istate.Events.error, error=mock.ANY) self.node_info.acquire_lock.assert_called_once_with() @@ -218,9 +247,9 @@ class TestIntrospect(BaseTest): bmc_address=[self.bmc_address], manage_boot=True, ironic=cli) - cli.set_node_boot_device.assert_called_once_with(self.uuid, - 'pxe', - persistent=False) + dev_call = mock.call(self.uuid, 'pxe', persistent=False) + cli.set_node_boot_device.assert_has_calls([ + dev_call, dev_call, dev_call, dev_call, dev_call]) cli.set_node_power_state.assert_not_called() start_mock.return_value.finished.assert_called_once_with( introspect.istate.Events.error, error=mock.ANY) diff --git a/ironic_inspector/test/unit/test_node_cache.py b/ironic_inspector/test/unit/test_node_cache.py index f1fdb5e98..0b4ddf682 100644 --- a/ironic_inspector/test/unit/test_node_cache.py +++ b/ironic_inspector/test/unit/test_node_cache.py @@ -19,6 +19,7 @@ import unittest from unittest import mock import automaton +from openstack import exceptions as os_exc from oslo_config import cfg from oslo_utils import timeutils from oslo_utils import uuidutils @@ -759,6 +760,14 @@ class TestUpdate(test_base.NodeTest): self.ironic.delete_port.assert_called_once_with('0') self.assertEqual(['mac1'], list(self.node_info.ports())) + def test_delete_port_retries(self): + self.ironic.delete_port.side_effect = \ + os_exc.ConflictException("Locked") + self.assertRaises( + os_exc.ConflictException, + self.node_info.delete_port, self.ports['mac0']) + self.assertEqual(5, self.ironic.delete_port.call_count) + def test_delete_port_by_mac(self): self.node_info.delete_port('mac0') diff --git a/releasenotes/notes/retry_to_handle_transient_failures-e1da302fd1d06528.yaml b/releasenotes/notes/retry_to_handle_transient_failures-e1da302fd1d06528.yaml new file mode 100644 index 000000000..34a2a982f --- /dev/null +++ b/releasenotes/notes/retry_to_handle_transient_failures-e1da302fd1d06528.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Fixes issues in Inspector where various tasks would not have retry logic + applied to them and may sporadically fail. This is because the OpenStack + SDK does not comprehend the NodeLocked error, which previously + python-ironicclient silently handled. Basic operations such as + "power reboot" and "set boot device" will now be retried automatically if + they fail. + For more information, please see + `story 2009107 `_.