Fix nodes stuck at cleaning on Network Service issues

Ironic validates the network interface before the cleaning process.
Currently only invalid parameters are caught, but not other errors.
As a result, a node can get stuck in the cleaning state when there
are networking issues or the neutron service is temporarily
unavailable.

This patch adds NetworkError to the exception handling to cover
such cases.

Change-Id: If20de2ad4ae4177dea10b7ebfc9a91ca6fbabdb9
This commit is contained in:
Kaifeng Wang 2022-09-14 23:42:40 +08:00 committed by Julia Kreger
parent aae524a46c
commit 31c8087408
3 changed files with 29 additions and 7 deletions

View File

@ -69,7 +69,7 @@ def do_node_clean(task, clean_steps=None, disable_ramdisk=False):
task.driver.power.validate(task) task.driver.power.validate(task)
if not disable_ramdisk: if not disable_ramdisk:
task.driver.network.validate(task) task.driver.network.validate(task)
except exception.InvalidParameterValue as e: except (exception.InvalidParameterValue, exception.NetworkError) as e:
msg = (_('Validation of node %(node)s for cleaning failed: %(msg)s') % msg = (_('Validation of node %(node)s for cleaning failed: %(msg)s') %
{'node': node.uuid, 'msg': e}) {'node': node.uuid, 'msg': e})
return utils.cleaning_error_handler(task, msg) return utils.cleaning_error_handler(task, msg)

View File

@ -51,8 +51,6 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
'step': 'build_raid', 'priority': 0, 'interface': 'deploy'} 'step': 'build_raid', 'priority': 0, 'interface': 'deploy'}
def __do_node_clean_validate_fail(self, mock_validate, clean_steps=None): def __do_node_clean_validate_fail(self, mock_validate, clean_steps=None):
# InvalidParameterValue should cause node to go to CLEANFAIL
mock_validate.side_effect = exception.InvalidParameterValue('error')
tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE tgt_prov_state = states.MANAGEABLE if clean_steps else states.AVAILABLE
node = obj_utils.create_test_node( node = obj_utils.create_test_node(
self.context, driver='fake-hardware', self.context, driver='fake-hardware',
@ -68,26 +66,42 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
self.assertIsNone(node.fault) self.assertIsNone(node.fault)
mock_validate.assert_called_once_with(mock.ANY, mock.ANY) mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
def __do_node_clean_validate_fail_invalid(self, mock_validate,
clean_steps=None):
# InvalidParameterValue should cause node to go to CLEANFAIL
mock_validate.side_effect = exception.InvalidParameterValue('error')
self.__do_node_clean_validate_fail(mock_validate,
clean_steps=clean_steps)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate', @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
autospec=True) autospec=True)
def test__do_node_clean_automated_power_validate_fail(self, mock_validate): def test__do_node_clean_automated_power_validate_fail(self, mock_validate):
self.__do_node_clean_validate_fail(mock_validate) self.__do_node_clean_validate_fail_invalid(mock_validate)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate', @mock.patch('ironic.drivers.modules.fake.FakePower.validate',
autospec=True) autospec=True)
def test__do_node_clean_manual_power_validate_fail(self, mock_validate): def test__do_node_clean_manual_power_validate_fail(self, mock_validate):
self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) self.__do_node_clean_validate_fail_invalid(mock_validate,
clean_steps=[])
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True) autospec=True)
def test__do_node_clean_automated_network_validate_fail(self, def test__do_node_clean_automated_network_validate_fail(self,
mock_validate): mock_validate):
self.__do_node_clean_validate_fail(mock_validate) self.__do_node_clean_validate_fail_invalid(mock_validate)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True) autospec=True)
def test__do_node_clean_manual_network_validate_fail(self, mock_validate): def test__do_node_clean_manual_network_validate_fail(self, mock_validate):
self.__do_node_clean_validate_fail(mock_validate, clean_steps=[]) self.__do_node_clean_validate_fail_invalid(mock_validate,
clean_steps=[])
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
def test__do_node_clean_network_error_fail(self, mock_validate):
# NetworkError should cause node to go to CLEANFAIL
mock_validate.side_effect = exception.NetworkError()
self.__do_node_clean_validate_fail(mock_validate)
@mock.patch.object(conductor_utils, 'LOG', autospec=True) @mock.patch.object(conductor_utils, 'LOG', autospec=True)
@mock.patch.object(conductor_steps, 'set_node_cleaning_steps', @mock.patch.object(conductor_steps, 'set_node_cleaning_steps',

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue where cleaning operations could fail in such a way that was
not easily recoverable when pre-cleaning network interface configuration
was validated, yet contained invalid configuration.
Now Ironic properly captures the error and exits from cleaning in a
state which allows for cleaning to be retried.