From 31c80874087b87dca3353710bdd36c775eae89b5 Mon Sep 17 00:00:00 2001 From: Kaifeng Wang Date: Wed, 14 Sep 2022 23:42:40 +0800 Subject: [PATCH] Fix nodes stuck at cleaning on Network Service issues Ironic validates network interface before the cleaning process, currently invalid parameter is captured but for not others. There is chance that a node could be stucked at the cleaning state on networking issues or temporary service down of neutron service. This patch adds NetworkError to the exception hanlding to cover such cases. Change-Id: If20de2ad4ae4177dea10b7ebfc9a91ca6fbabdb9 --- ironic/conductor/cleaning.py | 2 +- ironic/tests/unit/conductor/test_cleaning.py | 26 ++++++++++++++----- ...tuck-on-networkerror-4aedbf3673413af6.yaml | 8 ++++++ 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index e3151d4b8f..53d66ddd81 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -69,7 +69,7 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False): task.driver.power.validate(task) if not disable_ramdisk: task.driver.network.validate(task) - except exception.InvalidParameterValue as e: + except (exception.InvalidParameterValue, exception.NetworkError) as e: msg = (_('Validation of node %(node)s for cleaning failed: %(msg)s') % {'node': node.uuid, 'msg': e}) return utils.cleaning_error_handler(task, msg) diff --git a/ironic/tests/unit/conductor/test_cleaning.py b/ironic/tests/unit/conductor/test_cleaning.py index 65261450af..a4c3d57b6b 100644 --- a/ironic/tests/unit/conductor/test_cleaning.py +++ b/ironic/tests/unit/conductor/test_cleaning.py @@ -51,8 +51,6 @@ class DoNodeCleanTestCase(db_base.DbTestCase): 'step': 'build_raid', 'priority': 0, 'interface': 'deploy'} def __do_node_clean_validate_fail(self, mock_validate, clean_steps=None): - # InvalidParameterValue should cause node to go to CLEANFAIL - mock_validate.side_effect = exception.InvalidParameterValue('error') tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE node = obj_utils.create_test_node( self.context, driver='fake-hardware', @@ -68,26 +66,42 @@ class DoNodeCleanTestCase(db_base.DbTestCase): self.assertIsNone(node.fault) mock_validate.assert_called_once_with(mock.ANY, mock.ANY) + def __do_node_clean_validate_fail_invalid(self, mock_validate, + clean_steps=None): + # InvalidParameterValue should cause node to go to CLEANFAIL + mock_validate.side_effect = exception.InvalidParameterValue('error') + self.__do_node_clean_validate_fail(mock_validate, + clean_steps=clean_steps) + @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) def test__do_node_clean_automated_power_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate) + self.__do_node_clean_validate_fail_invalid(mock_validate) @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) def test__do_node_clean_manual_power_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) + self.__do_node_clean_validate_fail_invalid(mock_validate, + clean_steps=[]) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', autospec=True) def test__do_node_clean_automated_network_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate) + self.__do_node_clean_validate_fail_invalid(mock_validate) @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', autospec=True) def test__do_node_clean_manual_network_validate_fail(self, mock_validate): - self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) + self.__do_node_clean_validate_fail_invalid(mock_validate, + clean_steps=[]) + + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + def test__do_node_clean_network_error_fail(self, mock_validate): + # NetworkError should cause node to go to CLEANFAIL + mock_validate.side_effect = exception.NetworkError() + self.__do_node_clean_validate_fail(mock_validate) @mock.patch.object(conductor_utils, 'LOG', autospec=True) @mock.patch.object(conductor_steps, 'set_node_cleaning_steps', diff --git a/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml b/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml new file mode 100644 index 0000000000..f7769afc1e --- /dev/null +++ b/releasenotes/notes/fix-cleaning-stuck-on-networkerror-4aedbf3673413af6.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue where cleaning operations could fail in such a way that was + not easily recoverable when pre-cleaning network interface configuration + was validated, yet contained invalid configuration. + Now Ironic properly captures the error and exits from cleaning in a + state which allows for cleaning to be retried.