diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 58654fc2d1..553c09b3f5 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1904,6 +1904,28 @@ class ConductorManager(base_manager.BaseConductorManager): callback_method=utils.cleanup_rescuewait_timeout ) + @METRICS.timer('ConductorManager._check_servicewait_timeouts') + @periodics.periodic( + spacing=CONF.conductor.check_provision_state_interval, + enabled=CONF.conductor.check_provision_state_interval > 0 + and CONF.conductor.service_callback_timeout > 0) + def _check_servicewait_timeouts(self, context): + """Periodically check if servicing has timed out waiting for heartbeat. + + :param context: request context + + """ + callback_timeout = CONF.conductor.service_callback_timeout + filters = {'reserved': False, + 'provision_state': states.SERVICEWAIT, + 'maintenance': False, + 'provisioned_before': callback_timeout} + self._fail_if_in_state( + context, filters, states.SERVICEWAIT, + 'provision_updated_at', + keep_target_state=True, + callback_method=utils.cleanup_servicewait_timeout) + @METRICS.timer('ConductorManager._sync_local_state') @periodics.node_periodic( purpose='node take over', diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 809c48cf79..3b3e1581c0 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -170,6 +170,13 @@ opts = [ 'ramdisk doing the cleaning. If the timeout is reached ' 'the node will be put in the "clean failed" provision ' 'state. Set to 0 to disable timeout.')), + cfg.IntOpt('service_callback_timeout', + default=1800, + min=0, + help=_('Timeout (seconds) to wait for a callback from the ' + 'ramdisk doing the servicing. If the timeout is reached ' + 'the node will be put in the "service failed" provision ' + 'state. Set to 0 to disable timeout.')), cfg.IntOpt('rescue_callback_timeout', default=1800, min=0, diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index a258f5a9af..5dc4e45e42 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -2370,6 +2370,31 @@ class CheckTimeoutsTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): mock_clean_up.assert_called_once_with(mock.ANY, mock.ANY) node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_service', + autospec=True) + @mock.patch.object(conductor_utils, 'node_power_action', autospec=True) + def test_check_servicewait_timeouts(self, node_power_mock, mock_clean_up): + self._start_service() + CONF.set_override('service_callback_timeout', 1, group='conductor') + tgt_prov_state = states.RESCUE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + network_interface='flat', + provision_state=states.SERVICEWAIT, + target_provision_state=tgt_prov_state, + provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0)) + + self.service._check_servicewait_timeouts(self.context) + self._stop_service() + node.refresh() + self.assertEqual(states.SERVICEFAIL, node.provision_state) + self.assertEqual(tgt_prov_state, node.target_provision_state) + self.assertIsNotNone(node.last_error) + self.assertIn('Timeout reached while servicing the node', + node.last_error) + mock_clean_up.assert_called_once_with(mock.ANY, mock.ANY) + node_power_mock.assert_not_called() + @mgr_utils.mock_record_keepalive class DoNodeTearDownTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): diff --git a/releasenotes/notes/servicewait-timeout-6ab4aca87cf76cc2.yaml b/releasenotes/notes/servicewait-timeout-6ab4aca87cf76cc2.yaml new file mode 100644 index 0000000000..5493c8cb6d --- /dev/null +++ b/releasenotes/notes/servicewait-timeout-6ab4aca87cf76cc2.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + Adds a timeout to the ``service wait`` state. Previously, a node stuck in + this state would remain in it forever. The timeout value can be adjusted + via the new option ``[conductor]service_callback_timeout``.