Merge "Add timeout to SERVICEWAIT"

This commit is contained in:
Zuul 2024-10-08 15:53:54 +00:00 committed by Gerrit Code Review
commit fca4843295
4 changed files with 60 additions and 0 deletions

View File

@ -1904,6 +1904,28 @@ class ConductorManager(base_manager.BaseConductorManager):
callback_method=utils.cleanup_rescuewait_timeout callback_method=utils.cleanup_rescuewait_timeout
) )
@METRICS.timer('ConductorManager._check_servicewait_timeouts')
@periodics.periodic(
    spacing=CONF.conductor.check_provision_state_interval,
    enabled=(CONF.conductor.check_provision_state_interval > 0
             and CONF.conductor.service_callback_timeout > 0))
def _check_servicewait_timeouts(self, context):
    """Periodically fail nodes whose servicing callback has timed out.

    The task runs every ``[conductor]check_provision_state_interval``
    seconds and is only enabled when both that interval and
    ``[conductor]service_callback_timeout`` are greater than zero.

    Nodes are selected when they are unreserved, not in maintenance, in
    the SERVICEWAIT provision state, and match the ``provisioned_before``
    filter built from the callback timeout. Matching nodes are handed to
    ``_fail_if_in_state`` with ``keep_target_state=True`` and
    ``utils.cleanup_servicewait_timeout`` as the cleanup callback.

    :param context: request context
    """
    # Key order and values mirror the other *WAIT timeout checks' filter
    # shape: unlocked, non-maintenance nodes stuck in the wait state.
    stale_servicewait = {
        'reserved': False,
        'provision_state': states.SERVICEWAIT,
        'maintenance': False,
        'provisioned_before': CONF.conductor.service_callback_timeout,
    }
    self._fail_if_in_state(
        context,
        stale_servicewait,
        states.SERVICEWAIT,
        'provision_updated_at',
        keep_target_state=True,
        callback_method=utils.cleanup_servicewait_timeout)
@METRICS.timer('ConductorManager._sync_local_state') @METRICS.timer('ConductorManager._sync_local_state')
@periodics.node_periodic( @periodics.node_periodic(
purpose='node take over', purpose='node take over',

View File

@ -170,6 +170,13 @@ opts = [
'ramdisk doing the cleaning. If the timeout is reached ' 'ramdisk doing the cleaning. If the timeout is reached '
'the node will be put in the "clean failed" provision ' 'the node will be put in the "clean failed" provision '
'state. Set to 0 to disable timeout.')), 'state. Set to 0 to disable timeout.')),
# Callback timeout for nodes waiting on the servicing ramdisk. Per the help
# text below, a value of 0 disables the timeout; negative values are
# excluded by min=0.
cfg.IntOpt('service_callback_timeout',
default=1800,
min=0,
help=_('Timeout (seconds) to wait for a callback from the '
'ramdisk doing the servicing. If the timeout is reached '
'the node will be put in the "service failed" provision '
'state. Set to 0 to disable timeout.')),
cfg.IntOpt('rescue_callback_timeout', cfg.IntOpt('rescue_callback_timeout',
default=1800, default=1800,
min=0, min=0,

View File

@ -2370,6 +2370,31 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
mock_clean_up.assert_called_once_with(mock.ANY, mock.ANY) mock_clean_up.assert_called_once_with(mock.ANY, mock.ANY)
node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF) node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_service',
            autospec=True)
@mock.patch.object(conductor_utils, 'node_power_action', autospec=True)
def test_check_servicewait_timeouts(self, node_power_mock, mock_clean_up):
    """A node stuck in SERVICEWAIT past the timeout moves to SERVICEFAIL.

    The node is created with ``provision_updated_at`` far in the past and
    the callback timeout is overridden to 1 second, so the periodic check
    must treat it as timed out. The target provision state is only
    asserted to be preserved, so the specific value used for it is
    arbitrary for this test.
    """
    self._start_service()
    CONF.set_override('service_callback_timeout', 1, group='conductor')

    expected_target = states.RESCUE
    long_ago = datetime.datetime(2000, 1, 1, 0, 0)
    stuck_node = obj_utils.create_test_node(
        self.context,
        driver='fake-hardware',
        network_interface='flat',
        provision_state=states.SERVICEWAIT,
        target_provision_state=expected_target,
        provision_updated_at=long_ago)

    self.service._check_servicewait_timeouts(self.context)
    self._stop_service()
    stuck_node.refresh()

    # State transition: failed, with the original target kept intact.
    self.assertEqual(states.SERVICEFAIL, stuck_node.provision_state)
    self.assertEqual(expected_target, stuck_node.target_provision_state)

    # The failure reason must be recorded on the node.
    recorded_error = stuck_node.last_error
    self.assertIsNotNone(recorded_error)
    self.assertIn('Timeout reached while servicing the node',
                  recorded_error)

    # The deploy interface teardown runs once; power is left untouched.
    mock_clean_up.assert_called_once_with(mock.ANY, mock.ANY)
    node_power_mock.assert_not_called()
@mgr_utils.mock_record_keepalive @mgr_utils.mock_record_keepalive
class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Adds a timeout to the ``service wait`` state. Previously, a node stuck in
this state would remain in it forever. The timeout value can be adjusted
via the new option ``[conductor]service_callback_timeout``.