From 1b28f649feaf2c9929f15214814f8af950e5c19c Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 17 May 2023 14:20:54 +0100 Subject: [PATCH] ironic: Fix ConflictException when deleting server When unprovision works via Ironic, all operations in _cleanup_deploy have already been completed. Previous to this patch, we continue attempting all the clean up steps, which eventually errors out with BadRequest, or similar, and we complete the delete. Howerver, if cleaning has started, we hit a conflict exception, so we don't hit the expected error above. Prior to moving to the SDK, that landed in Caracal, we would retry on conflict errors. You can tweak the config to keep retrying for the length of time cleaning usually takes in your enviroment. After this patch: Ieda5636a5e80ea4af25db2e90be241869759e30c We now hard fail with this error: openstack.exceptions.ConflictException: Failed to detach VIF xxx from bare metal node yyy ... Node yyy is locked by host zzz, please retry after the current operation is completed. This change simply skips calling the operations that will always error out, avoiding the need to wait for cleainging to complete before getting the expected error message. Closes-Bug: #2019977 Related-Bug: #1815799 Change-Id: I60971b58cf1f24bdb5d40f668e380ebfee2ac495 (cherry picked from commit 6ebd8a56d1c79fc15f385f232ce0e22937b2fdac) --- nova/tests/unit/virt/ironic/test_driver.py | 39 ++----------------- nova/virt/ironic/driver.py | 31 ++++++--------- .../notes/bug-2019977-4afe7658394130b8.yaml | 7 ++++ 3 files changed, 22 insertions(+), 55 deletions(-) create mode 100644 releasenotes/notes/bug-2019977-4afe7658394130b8.yaml diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 95ee0c00808f..e696a24c00b2 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -1723,8 +1723,6 @@ class IronicDriverTestCase(test.NoDBTestCase): self.mock_conn.nodes.assert_called_with( instance_id=instance.uuid, fields=ironic_driver._NODE_FIELDS) - mock_cleanup_deploy.assert_called_with(node, instance, network_info, - remove_instance_info=False) # For states that makes sense check if set_node_provision_state has # been called @@ -1735,7 +1733,8 @@ class IronicDriverTestCase(test.NoDBTestCase): self.assertFalse(mock_remove_instance_info.called) else: self.mock_conn.set_node_provision_state.assert_not_called() - mock_remove_instance_info.assert_called_once_with(node) + mock_cleanup_deploy.assert_called_with( + node, instance, network_info) # we call this innter function twice so we need to reset mocks self.mock_conn.set_node_provision_state.reset_mock() @@ -1765,27 +1764,7 @@ class IronicDriverTestCase(test.NoDBTestCase): self.driver.destroy, self.ctx, instance, None, None, ) - mock_clean.assert_called_once_with(node, instance, None, - remove_instance_info=False) - - @mock.patch.object(ironic_driver.IronicDriver, - '_validate_instance_and_node') - @mock.patch.object(ironic_driver.IronicDriver, - '_cleanup_deploy') - def test_destroy_trigger_remove_info_fail(self, mock_clean, fake_validate): - node_uuid = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee' - node = ironic_utils.get_test_node( - driver='fake', id=node_uuid, - provision_state=ironic_states.AVAILABLE) - fake_validate.return_value = node - instance = fake_instance.fake_instance_obj(self.ctx, - node=node_uuid) - self.mock_conn.update_node.side_effect = SystemError('unexpected ' - 'error') - self.assertRaises(SystemError, self.driver.destroy, - self.ctx, instance, None, None) - mock_clean.assert_called_once_with(node, instance, None, - remove_instance_info=False) + mock_clean.assert_not_called() @mock.patch.object(ironic_driver.IronicDriver, '_validate_instance_and_node') @@ -2617,18 +2596,6 @@ class IronicDriverTestCase(test.NoDBTestCase): mock_unvif.assert_called_once_with(node, instance, None) mock_remove_info.assert_called_once_with(node) - @mock.patch.object(ironic_driver.IronicDriver, '_unplug_vifs') - @mock.patch.object(ironic_driver.IronicDriver, - '_cleanup_volume_target_info') - def test__cleanup_deploy_no_remove_ii(self, mock_vol, mock_unvif): - # TODO(TheJulia): This REALLY should be updated to cover all of the - # calls that take place. - node = ironic_utils.get_test_node(driver='fake') - instance = fake_instance.fake_instance_obj(self.ctx, node=node.id) - self.driver._cleanup_deploy(node, instance, remove_instance_info=False) - mock_vol.assert_called_once_with(instance) - mock_unvif.assert_called_once_with(node, instance, None) - class IronicDriverSyncTestCase(IronicDriverTestCase): diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index d3c7d191bba7..271728d989ec 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -502,12 +502,10 @@ class IronicDriver(virt_driver.ComputeDriver): 'reason': e}, instance=instance) - def _cleanup_deploy(self, node, instance, network_info=None, - remove_instance_info=True): + def _cleanup_deploy(self, node, instance, network_info=None): self._cleanup_volume_target_info(instance) self._unplug_vifs(node, instance, network_info) - if remove_instance_info: - self._remove_instance_info_from_node(node) + self._remove_instance_info_from_node(node) def _wait_for_active(self, instance): """Wait for the node to be marked as ACTIVE in Ironic.""" @@ -1364,21 +1362,16 @@ class IronicDriver(virt_driver.ComputeDriver): # without raising any exceptions. return - try: - if node.provision_state in _UNPROVISION_STATES: - self._unprovision(instance, node) - else: - # NOTE(hshiina): if spawn() fails before ironic starts - # provisioning, instance information should be - # removed from ironic node. - self._remove_instance_info_from_node(node) - finally: - # NOTE(mgoddard): We don't need to remove instance info at this - # point since we will have already done it. The destroy will only - # succeed if this method returns without error, so we will end up - # removing the instance info eventually. - self._cleanup_deploy(node, instance, network_info, - remove_instance_info=False) + if node.provision_state in _UNPROVISION_STATES: + # NOTE(mgoddard): Ironic's node tear-down procedure includes all of + # the things we do in _cleanup_deploy, so let's not repeat them + # here. Doing so would also race with the node cleaning process, + # which may acquire the node lock and prevent us from making + # changes to the node. See + # https://bugs.launchpad.net/nova/+bug/2019977 + self._unprovision(instance, node) + else: + self._cleanup_deploy(node, instance, network_info) LOG.info('Successfully unprovisioned Ironic node %s', node.id, instance=instance) diff --git a/releasenotes/notes/bug-2019977-4afe7658394130b8.yaml b/releasenotes/notes/bug-2019977-4afe7658394130b8.yaml new file mode 100644 index 000000000000..51d8fe1846d7 --- /dev/null +++ b/releasenotes/notes/bug-2019977-4afe7658394130b8.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes an issue seen when using bare metal (Ironic) instances where an + instance could fail to delete. See `Bug 2019977`_ for more details. + + .. _Bug 2019977: https://bugs.launchpad.net/nova/+bug/2019977