From 56cb5f52fb833bd77bd0324ab24281edd59ab83e Mon Sep 17 00:00:00 2001 From: Jay Faulkner Date: Thu, 20 Nov 2025 15:14:36 -0800 Subject: [PATCH] [ironic] Ensure unprovision happens for new states States were added to the Ironic API to enable the node servicing feature, which can be performed on nodes provisioned with Nova instances. Current nova, if asked to delete these instances, will only remove the instance metadata and not tear them down. This change has two parts: - I have added the new, relevant states to _UNPROVISION_STATES in driver.py, which now allows Nova to know that SERVIC* states and DEPLOYHOLD are safe to unprovision from. - I have added all existing ironic states to ironic_states.py and the PROVISION_STATE_LIST constant and check the state against it -- in a case where a completely unknown state is returned, we should attempt an unprovision. This fix needs to be backported as far as possible, as this bug has existed since Antelope / 2023.1 (DEPLOYHOLD) or Bobcat / 2023.3 (SERVIC*). Assisted-by: Claude Code Closes-bug: #2131960 Change-Id: I31c70d35b0e6e9f8d2252bfb2f0bdec477cc6cc7 Signed-off-by: Jay Faulkner --- nova/tests/unit/virt/ironic/test_driver.py | 85 +++++++++++++++++++ nova/virt/ironic/driver.py | 17 ++-- nova/virt/ironic/ironic_states.py | 38 ++++++++- ...960-servicing-states-446d0e58bb9aa5f2.yaml | 25 ++++++ 4 files changed, 157 insertions(+), 8 deletions(-) create mode 100644 releasenotes/notes/bug-2131960-servicing-states-446d0e58bb9aa5f2.yaml diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index fc2ae12917b3..988f61a62d43 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -1746,6 +1746,91 @@ class IronicDriverTestCase(test.NoDBTestCase): for state in ironic_states.PROVISION_STATE_LIST: self._test_destroy(state) + @mock.patch.object(ironic_driver.IronicDriver, + '_remove_instance_info_from_node') + @mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy') + def test_destroy_servicing_states_unprovision(self, mock_cleanup_deploy, + mock_remove_instance_info): + """Test that servicing and deployhold states trigger unprovisioning. + + This is a regression test for bug 2131960 where nodes in servicing + states (SERVICING, SERVICEWAIT, SERVICEHOLD, SERVICEFAIL) and + DEPLOYHOLD were not being properly unprovisioned when destroyed. + """ + # Test all servicing-related states and deployhold + servicing_states = [ + ironic_states.DEPLOYHOLD, + ironic_states.SERVICING, + ironic_states.SERVICEWAIT, + ironic_states.SERVICEFAIL, + ironic_states.SERVICEHOLD, + ] + + for state in servicing_states: + mock_cleanup_deploy.reset_mock() + mock_remove_instance_info.reset_mock() + self.mock_conn.set_node_provision_state.reset_mock() + + node_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' + network_info = 'foo' + + node = _get_cached_node( + driver='fake', id=node_id, provision_state=state) + instance = fake_instance.fake_instance_obj(self.ctx, node=node_id) + + def fake_set_node_provision_state(*_): + node.provision_state = None + + self.mock_conn.nodes.return_value = iter([node]) + self.mock_conn.set_node_provision_state.side_effect = \ + fake_set_node_provision_state + self.driver.destroy(self.ctx, instance, network_info, None) + + # All these states should trigger unprovisioning, not cleanup + self.mock_conn.set_node_provision_state.assert_called_once_with( + node_id, 'deleted', + ) + self.assertFalse(mock_remove_instance_info.called) + self.assertFalse(mock_cleanup_deploy.called) + + @mock.patch.object(ironic_driver.IronicDriver, + '_remove_instance_info_from_node') + @mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy') + def test_destroy_unknown_state_unprovision(self, mock_cleanup_deploy, + mock_remove_instance_info): + """Test that unknown provision states trigger unprovisioning. + + This is a regression test for bug 2131960. As a safety mechanism, + nodes in provision states that are not recognized (not in + PROVISION_STATE_LIST) should trigger unprovisioning rather than + cleanup to ensure proper node teardown even when encountering + unexpected or future Ironic states. + """ + # Test with a completely unknown state that doesn't exist in + # PROVISION_STATE_LIST + node_id = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' + network_info = 'foo' + unknown_state = 'unknown-future-state' + + node = _get_cached_node( + driver='fake', id=node_id, provision_state=unknown_state) + instance = fake_instance.fake_instance_obj(self.ctx, node=node_id) + + def fake_set_node_provision_state(*_): + node.provision_state = None + + self.mock_conn.nodes.return_value = iter([node]) + self.mock_conn.set_node_provision_state.side_effect = \ + fake_set_node_provision_state + self.driver.destroy(self.ctx, instance, network_info, None) + + # Unknown states should trigger unprovisioning for safety + self.mock_conn.set_node_provision_state.assert_called_once_with( + node_id, 'deleted', + ) + self.assertFalse(mock_remove_instance_info.called) + self.assertFalse(mock_cleanup_deploy.called) + @mock.patch.object(ironic_driver.IronicDriver, '_validate_instance_and_node') @mock.patch.object(ironic_driver.IronicDriver, diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 6da6c457ee5b..647fca237032 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -71,10 +71,12 @@ _POWER_STATE_MAP = { _UNPROVISION_STATES = (ironic_states.ACTIVE, ironic_states.DEPLOYFAIL, ironic_states.ERROR, ironic_states.DEPLOYWAIT, - ironic_states.DEPLOYING, ironic_states.RESCUE, - ironic_states.RESCUING, ironic_states.RESCUEWAIT, - ironic_states.RESCUEFAIL, ironic_states.UNRESCUING, - ironic_states.UNRESCUEFAIL) + ironic_states.DEPLOYING, ironic_states.DEPLOYHOLD, + ironic_states.RESCUE, ironic_states.RESCUING, + ironic_states.RESCUEWAIT, ironic_states.RESCUEFAIL, + ironic_states.UNRESCUING, ironic_states.UNRESCUEFAIL, + ironic_states.SERVICING, ironic_states.SERVICEWAIT, + ironic_states.SERVICEFAIL, ironic_states.SERVICEHOLD) _NODE_FIELDS = ('uuid', 'power_state', 'target_power_state', 'provision_state', 'target_provision_state', 'last_error', 'maintenance', @@ -1370,13 +1372,18 @@ class IronicDriver(virt_driver.ComputeDriver): # without raising any exceptions. return - if node.provision_state in _UNPROVISION_STATES: + if (node.provision_state in _UNPROVISION_STATES or + node.provision_state not in ironic_states.PROVISION_STATE_LIST): # NOTE(mgoddard): Ironic's node tear-down procedure includes all of # the things we do in _cleanup_deploy, so let's not repeat them # here. Doing so would also race with the node cleaning process, # which may acquire the node lock and prevent us from making # changes to the node. See # https://bugs.launchpad.net/nova/+bug/2019977 + # NOTE(JayF): For maximum safety, we call unprovision also in cases + # where we see an unrecognized state. See + # https://bugs.launchpad.net/nova/+bug/2131960 for more + # information. self._unprovision(instance, node) else: self._cleanup_deploy(node, instance, network_info) diff --git a/nova/virt/ironic/ironic_states.py b/nova/virt/ironic/ironic_states.py index aafdd47e64a1..f2e93a902261 100644 --- a/nova/virt/ironic/ironic_states.py +++ b/nova/virt/ironic/ironic_states.py @@ -110,6 +110,9 @@ the driver to finish a cleaning step. CLEANFAIL = 'clean failed' """ Node failed cleaning. This requires operator intervention to resolve. """ +CLEANHOLD = 'clean hold' +""" Node is being held by a cleaning step. """ + ERROR = 'error' """ An error occurred during node processing. @@ -131,6 +134,8 @@ inspected node shall transition to MANAGEABLE status. INSPECTFAIL = 'inspect failed' """ Node inspection failed. """ +INSPECTWAIT = 'inspect wait' +""" Node is waiting for inspection callback. """ RESCUE = 'rescue' """ Node is in rescue mode. @@ -158,6 +163,30 @@ UNRESCUEFAIL = 'unrescue failed' UNRESCUING = "unrescuing" """ Node is unrescuing. """ +DEPLOYHOLD = 'deploy hold' +""" Node is being held by a deploy step. """ + +SERVICING = 'servicing' +""" Node is actively being changed by a service step. """ + +SERVICEWAIT = 'service wait' +""" Node is waiting for an operation to complete. """ + +SERVICEFAIL = 'service failed' +""" Node has failed in a service step execution. """ + +SERVICEHOLD = 'service hold' +""" Node is being held for direct intervention from a service step. """ + +ENROLL = 'enroll' +""" Node being entrolled into Ironic. +Nova should never see a node in this state.""" + +VERIFYING = 'verifying' +""" Node driver attributes being verified. +Nova should never see a node in this state.""" + + ############## # Power states ############## @@ -176,7 +205,10 @@ REBOOT = 'rebooting' ################## PROVISION_STATE_LIST = (NOSTATE, MANAGEABLE, AVAILABLE, ACTIVE, DEPLOYWAIT, - DEPLOYING, DEPLOYFAIL, DEPLOYDONE, DELETING, DELETED, - CLEANING, CLEANWAIT, CLEANFAIL, ERROR, REBUILD, - INSPECTING, INSPECTFAIL) + DEPLOYING, DEPLOYFAIL, DEPLOYDONE, DEPLOYHOLD, + DELETING, DELETED, CLEANING, CLEANWAIT, CLEANFAIL, + CLEANHOLD, ERROR, REBUILD, INSPECTING, INSPECTFAIL, + INSPECTWAIT, RESCUE, RESCUEFAIL, RESCUEWAIT, RESCUING, + SERVICING, SERVICEWAIT, SERVICEFAIL, SERVICEHOLD, + UNRESCUEFAIL, UNRESCUING, ENROLL, VERIFYING) """ A list of all provision states. """ diff --git a/releasenotes/notes/bug-2131960-servicing-states-446d0e58bb9aa5f2.yaml b/releasenotes/notes/bug-2131960-servicing-states-446d0e58bb9aa5f2.yaml new file mode 100644 index 000000000000..326b8a7c50b4 --- /dev/null +++ b/releasenotes/notes/bug-2131960-servicing-states-446d0e58bb9aa5f2.yaml @@ -0,0 +1,25 @@ +--- +security: + - | + [`bug 2131960 `_] + Fixed improper teardown of Ironic nodes in servicing states. + Previously, when Nova's virt driver called ``destroy()`` on Ironic + nodes in certain states (``deploy hold``, ``servicing``, ``service wait``, + ``service hold``, ``service failed``), these nodes would fail to undergo + proper teardown, resulting in orphaned instances where instance metadata + was removed but nodes remained unprovisioned in Ironic. In flat networking + configurations, this could allow nodes traversing servicing states to + briefly access tenant networks. In Neutron networking configurations, + while VIFs would detach correctly, nodes would remain in Ironic with a + shutdown IPA, and future deprovisioning attempts would fail due to missing + networking metadata. These states are now properly handled to ensure + complete node deprovisioning. +fixes: + - | + [`bug 2131960 `_] + Fixed a bug where Ironic nodes in servicing-related provision states + (``deploy hold``, ``servicing``, ``service wait``, ``service hold``, + ``service failed``) were not properly unprovisioned when instances were + destroyed. The Nova Ironic virt driver now correctly triggers + unprovisioning for nodes in these states, preventing orphaned node + configurations in Ironic.