diff --git a/etc/ironic/ironic.conf.sample b/etc/ironic/ironic.conf.sample
index fdad52d21e..dc23de31e5 100644
--- a/etc/ironic/ironic.conf.sample
+++ b/etc/ironic/ironic.conf.sample
@@ -1267,6 +1267,12 @@
 # (integer value)
 #check_provision_state_interval = 60
 
+# Interval (seconds) between checks of rescue timeouts. This
+# option is part of rescue feature work, which is not
+# currently exposed to users. (integer value)
+# Minimum value: 1
+#check_rescue_state_interval = 60
+
 # Timeout (seconds) to wait for a callback from a deploy
 # ramdisk. Set to 0 to disable timeout. (integer value)
 #deploy_callback_timeout = 1800
@@ -1355,6 +1361,14 @@
 # disable timeout. (integer value)
 #clean_callback_timeout = 1800
 
+# Timeout (seconds) to wait for a callback from the rescue
+# ramdisk. If the timeout is reached the node will be put in
+# the "rescue failed" provision state. Set to 0 to disable
+# timeout. This option is part of rescue feature work, which
+# is not currently exposed to users. (integer value)
+# Minimum value: 0
+#rescue_callback_timeout = 1800
+
 # Timeout (in seconds) of soft reboot and soft power off
 # operation. This value always has to be positive. (integer
 # value)
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index 0fd6bffc6f..ffba6b3dad 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -1727,6 +1727,27 @@ class ConductorManager(base_manager.BaseConductorManager):
                                keep_target_state=True,
                                callback_method=utils.cleanup_cleanwait_timeout)
 
+    @METRICS.timer('ConductorManager._check_rescuewait_timeouts')
+    @periodics.periodic(spacing=CONF.conductor.check_rescue_state_interval,
+                        enabled=bool(CONF.conductor.rescue_callback_timeout))
+    def _check_rescuewait_timeouts(self, context):
+        """Periodically checks if rescue has timed out waiting for heartbeat.
+
+        If a rescue call has timed out, fail the rescue and clean up.
+
+        :param context: request context.
+        """
+        callback_timeout = CONF.conductor.rescue_callback_timeout
+        filters = {'reserved': False,
+                   'provision_state': states.RESCUEWAIT,
+                   'maintenance': False,
+                   'provisioned_before': callback_timeout}
+        self._fail_if_in_state(context, filters, states.RESCUEWAIT,
+                               'provision_updated_at',
+                               keep_target_state=True,
+                               callback_method=utils.cleanup_rescuewait_timeout
+                               )
+
     @METRICS.timer('ConductorManager._sync_local_state')
     @periodics.periodic(spacing=CONF.conductor.sync_local_state_interval)
     def _sync_local_state(self, context):
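Note (illustrative, not part of the patch): as with the conductor's other
periodic tasks, the @periodics.periodic decorator from futurist only tags
_check_rescuewait_timeouts with its spacing; the base conductor manager
gathers tagged methods into a periodic worker at start-up, and
enabled=False (the case when rescue_callback_timeout is 0) skips the
method entirely. A minimal self-contained sketch of that pattern, with
made-up names (RescueChecker, tick):

    # Sketch only: RescueChecker/tick are hypothetical names; 60 mirrors
    # the check_rescue_state_interval default.
    from futurist import periodics

    class RescueChecker(object):
        @periodics.periodic(spacing=60, enabled=True)
        def tick(self):
            print('checking for rescue-wait timeouts...')

    worker = periodics.PeriodicWorker.create([RescueChecker()])
    worker.start()  # blocks, running tick() roughly every 60 seconds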
+ """ + node = task.node + msg = _('Timeout reached while waiting for rescue ramdisk callback ' + 'for node') + errmsg = msg + ' %(node)s' + LOG.error(errmsg, {'node': node.uuid}) + try: + node_power_action(task, states.POWER_OFF) + task.driver.rescue.clean_up(task) + node.last_error = msg + node.save() + except Exception as e: + if isinstance(e, exception.IronicException): + error_msg = _('Cleanup failed for %(node_info)s after rescue ' + 'timeout: %(error)s') + node_info = ('node') + node.last_error = error_msg % {'node_info': node_info, 'error': e} + node_info = ('node %s') % node.uuid + LOG.error(error_msg, {'node_info': node_info, 'error': e}) + else: + node.last_error = _('Rescue timed out, but an unhandled ' + 'exception was encountered while aborting. ' + 'More info may be found in the log file.') + LOG.exception('Rescue timed out for node %(node)s, an exception ' + 'was encountered while aborting. Error: %(err)s', + {'node': node.uuid, 'err': e}) + node.save() + + def _spawn_error_handler(e, node, state): """Handle spawning error for node.""" if isinstance(e, exception.NoFreeConductorWorker): node.last_error = (_("No free conductor workers available")) node.save() LOG.warning("No free conductor workers available to perform " - "%(state)s on node %(node)s", - {'state': state, 'node': node.uuid}) + "%(operation)s on node %(node)s", + {'operation': state, 'node': node.uuid}) def spawn_cleaning_error_handler(e, node): diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 5cdbf72864..ae01347310 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -52,6 +52,12 @@ opts = [ default=60, help=_('Interval between checks of provision timeouts, ' 'in seconds.')), + cfg.IntOpt('check_rescue_state_interval', + default=60, + min=1, + help=_('Interval (seconds) between checks of rescue ' + 'timeouts. This option is part of rescue feature ' + 'work, which is not currently exposed to users.')), cfg.IntOpt('deploy_callback_timeout', default=1800, help=_('Timeout (seconds) to wait for a callback from ' @@ -145,6 +151,15 @@ opts = [ 'ramdisk doing the cleaning. If the timeout is reached ' 'the node will be put in the "clean failed" provision ' 'state. Set to 0 to disable timeout.')), + cfg.IntOpt('rescue_callback_timeout', + default=1800, + min=0, + help=_('Timeout (seconds) to wait for a callback from the ' + 'rescue ramdisk. If the timeout is reached the node ' + 'will be put in the "rescue failed" provision state. ' + 'Set to 0 to disable timeout. 
diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py
index 5cdbf72864..ae01347310 100644
--- a/ironic/conf/conductor.py
+++ b/ironic/conf/conductor.py
@@ -52,6 +52,12 @@ opts = [
                default=60,
                help=_('Interval between checks of provision timeouts, '
                       'in seconds.')),
+    cfg.IntOpt('check_rescue_state_interval',
+               default=60,
+               min=1,
+               help=_('Interval (seconds) between checks of rescue '
+                      'timeouts. This option is part of rescue feature '
+                      'work, which is not currently exposed to users.')),
     cfg.IntOpt('deploy_callback_timeout',
                default=1800,
                help=_('Timeout (seconds) to wait for a callback from '
@@ -145,6 +151,15 @@ opts = [
                       'ramdisk doing the cleaning. If the timeout is reached '
                       'the node will be put in the "clean failed" provision '
                       'state. Set to 0 to disable timeout.')),
+    cfg.IntOpt('rescue_callback_timeout',
+               default=1800,
+               min=0,
+               help=_('Timeout (seconds) to wait for a callback from the '
+                      'rescue ramdisk. If the timeout is reached the node '
+                      'will be put in the "rescue failed" provision state. '
+                      'Set to 0 to disable timeout. This option is part of '
+                      'rescue feature work, which is not currently exposed '
+                      'to users.')),
     cfg.IntOpt('soft_power_off_timeout',
                default=600,
                min=1,
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 286d5b67b1..431282cbcd 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -1610,6 +1610,32 @@ class DoNodeDeployTearDownTestCase(mgr_utils.ServiceSetUpMixin,
     def test__check_cleanwait_timeouts_manual_clean(self):
         self._check_cleanwait_timeouts(manual=True)
 
+    @mock.patch('ironic.drivers.modules.fake.FakeRescue.clean_up')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_check_rescuewait_timeouts(self, node_power_mock,
+                                       mock_clean_up):
+        self._start_service()
+        CONF.set_override('rescue_callback_timeout', 1, group='conductor')
+        tgt_prov_state = states.RESCUE
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            rescue_interface='fake',
+            network_interface='flat',
+            provision_state=states.RESCUEWAIT,
+            target_provision_state=tgt_prov_state,
+            provision_updated_at=datetime.datetime(2000, 1, 1, 0, 0))
+
+        self.service._check_rescuewait_timeouts(self.context)
+        self._stop_service()
+        node.refresh()
+        self.assertEqual(states.RESCUEFAIL, node.provision_state)
+        self.assertEqual(tgt_prov_state, node.target_provision_state)
+        self.assertIsNotNone(node.last_error)
+        self.assertIn('Timeout reached while waiting for rescue ramdisk',
+                      node.last_error)
+        mock_clean_up.assert_called_once_with(mock.ANY)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+
     def test_do_node_tear_down_invalid_state(self):
         self._start_service()
         # test node.provision_state is incorrect for tear_down
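Note (illustrative, not part of the patch): the test drives the timeout
path by overriding rescue_callback_timeout to one second and backdating
provision_updated_at to the year 2000, so the 'provisioned_before' filter
is guaranteed to match. The same oslo.config override technique outside a
test fixture looks like this (assuming the [conductor] options above are
already registered):

    from oslo_config import cfg

    CONF = cfg.CONF
    CONF.set_override('rescue_callback_timeout', 1, group='conductor')
    # ... exercise code reading CONF.conductor.rescue_callback_timeout ...
    CONF.clear_override('rescue_callback_timeout', group='conductor')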
diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py
index f4d547233f..9eb75e6dbf 100644
--- a/ironic/tests/unit/conductor/test_utils.py
+++ b/ironic/tests/unit/conductor/test_utils.py
@@ -1075,8 +1075,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
     def setUp(self):
         super(ErrorHandlersTestCase, self).setUp()
         self.task = mock.Mock(spec=task_manager.TaskManager)
-        self.task.driver = mock.Mock(spec_set=['deploy'])
+        self.task.driver = mock.Mock(spec_set=['deploy', 'network', 'rescue'])
         self.task.node = mock.Mock(spec_set=objects.Node)
+        self.task.shared = False
         self.node = self.task.node
         # NOTE(mariojv) Some of the test cases that use the task below require
         # strict typing of the node power state fields and would fail if passed
@@ -1237,6 +1238,43 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.assertFalse(self.node.save.called)
         self.assertFalse(log_mock.warning.called)
 
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout(self, node_power_mock, log_mock):
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertTrue(log_mock.error.called)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('Timeout reached', self.node.last_error)
+        self.node.save.assert_called_once_with()
+
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout_known_exc(
+            self, node_power_mock, log_mock):
+        clean_up_mock = self.task.driver.rescue.clean_up
+        clean_up_mock.side_effect = exception.IronicException('moocow')
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertEqual(2, log_mock.error.call_count)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('moocow', self.node.last_error)
+        self.node.save.assert_called_once_with()
+
+    @mock.patch.object(conductor_utils, 'LOG')
+    @mock.patch.object(conductor_utils, 'node_power_action')
+    def test_cleanup_rescuewait_timeout_unknown_exc(
+            self, node_power_mock, log_mock):
+        clean_up_mock = self.task.driver.rescue.clean_up
+        clean_up_mock.side_effect = Exception('moocow')
+        conductor_utils.cleanup_rescuewait_timeout(self.task)
+        self.assertTrue(log_mock.error.called)
+        node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF)
+        self.task.driver.rescue.clean_up.assert_called_once_with(self.task)
+        self.assertIn('Rescue timed out', self.node.last_error)
+        self.node.save.assert_called_once_with()
+        self.assertTrue(log_mock.exception.called)
+
 
 class ValidatePortPhysnetTestCase(db_base.DbTestCase):
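Once the rescue feature is exposed to users, the two new options would be
tuned in ironic.conf as follows (the values shown are the defaults added
by this patch):

    [conductor]
    # Check for rescue-wait timeouts once a minute.
    check_rescue_state_interval = 60
    # Fail a node stuck in "rescue wait" after 30 minutes; 0 disables the
    # timeout.
    rescue_callback_timeout = 1800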