diff --git a/ironic/conf/ipmi.py b/ironic/conf/ipmi.py index 253cebd3d1..80ade2322f 100644 --- a/ironic/conf/ipmi.py +++ b/ironic/conf/ipmi.py @@ -35,6 +35,12 @@ opts = [ 'sent to a server. There is a risk with some hardware ' 'that setting this too low may cause the BMC to crash. ' 'Recommended setting is 5 seconds.')), + cfg.BoolOpt('kill_on_timeout', + default=True, + help=_('Kill `ipmitool` process invoked by ironic to read ' + 'node power state if `ipmitool` process does not exit ' + 'after `command_retry_timeout` timeout expires. ' + 'Recommended setting is True')), ] diff --git a/ironic/drivers/modules/ipmitool.py b/ironic/drivers/modules/ipmitool.py index e4e9e6e671..1dcaa04bcb 100644 --- a/ironic/drivers/modules/ipmitool.py +++ b/ironic/drivers/modules/ipmitool.py @@ -30,6 +30,7 @@ DRIVER. """ import contextlib +import functools import os import re import subprocess @@ -378,12 +379,41 @@ def _parse_driver_info(node): } -def _exec_ipmitool(driver_info, command, check_exit_code=None): +def _exec_ipmitool_wait(timeout, driver_info, popen_obj): + wait_interval = min(timeout, 0.5) + + while timeout >= 0: + if not popen_obj.poll(): + return + + time.sleep(wait_interval) + timeout -= wait_interval + + LOG.warning('Killing timed out IPMI process "%(cmd)s" for node %(node)s.', + {'node': driver_info['uuid'], 'cmd': popen_obj.cmd}) + + popen_obj.terminate() + time.sleep(0.5) + if popen_obj.poll(): + popen_obj.kill() + + time.sleep(1) + + if popen_obj.poll(): + LOG.warning('Could not kill IPMI process "%(cmd)s" for node %(node)s.', + {'node': driver_info['uuid'], 'cmd': popen_obj.cmd}) + + +def _exec_ipmitool(driver_info, command, check_exit_code=None, + kill_on_timeout=False): """Execute the ipmitool command. :param driver_info: the ipmitool parameters for accessing a node. :param command: the ipmitool command to be executed. :param check_exit_code: Single bool, int, or list of allowed exit codes. + :param kill_on_timeout: if `True`, kill unresponsive ipmitool on + `min_command_interval` timeout. Default is `False`. Makes no + effect on Windows. :returns: (stdout, stderr) from executing the command. :raises: PasswordFileFailedToCreate from creating or writing to the temporary file. @@ -426,6 +456,21 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None): args.append('-N') args.append(str(CONF.ipmi.min_command_interval)) + extra_args = {} + + if kill_on_timeout: + # NOTE(etingof): We can't trust ipmitool to terminate in time. + # Therefore we have to kill it if it is running for longer than + # we asked it to. + # For that purpose we inject the time-capped `popen.wait` call + # before the uncapped `popen.communicate` is called internally. + # That gives us a chance to kill misbehaving `ipmitool` child. + extra_args['on_execute'] = functools.partial( + _exec_ipmitool_wait, timeout, driver_info) + + if check_exit_code is not None: + extra_args['check_exit_code'] = check_exit_code + end_time = (time.time() + timeout) while True: @@ -439,9 +484,6 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None): # Resetting the list that will be utilized so the password arguments # from any previous execution are preserved. cmd_args = args[:] - extra_args = {} - if check_exit_code is not None: - extra_args['check_exit_code'] = check_exit_code # 'ipmitool' command will prompt password if there is no '-f' # option, we set it to '\0' to write a password file to support # empty password @@ -567,7 +609,8 @@ def _power_status(driver_info): """ cmd = "power status" try: - out_err = _exec_ipmitool(driver_info, cmd) + out_err = _exec_ipmitool( + driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout) except (exception.PasswordFileFailedToCreate, processutils.ProcessExecutionError) as e: LOG.warning("IPMI power status failed for node %(node_id)s with " @@ -1066,7 +1109,8 @@ class IPMIManagement(base.ManagementInterface): # extended sensor informations cmd = "sdr -v" try: - out, err = _exec_ipmitool(driver_info, cmd) + out, err = _exec_ipmitool( + driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout) except (exception.PasswordFileFailedToCreate, processutils.ProcessExecutionError) as e: raise exception.FailedToGetSensorData(node=task.node.uuid, diff --git a/ironic/tests/unit/drivers/modules/test_ipmitool.py b/ironic/tests/unit/drivers/modules/test_ipmitool.py index eb5fb1b50e..1c8e73bb27 100644 --- a/ironic/tests/unit/drivers/modules/test_ipmitool.py +++ b/ironic/tests/unit/drivers/modules/test_ipmitool.py @@ -996,6 +996,14 @@ class IPMIToolPrivateMethodTestCase(Base): mock_support.assert_called_once_with('timing') mock_exec.assert_called_once_with(*args) + def test__exec_ipmitool_wait(self): + mock_popen = mock.MagicMock() + mock_popen.poll.side_effect = [1, 1, 1, 1, 1] + ipmi._exec_ipmitool_wait(1, {'uuid': ''}, mock_popen) + + self.assertTrue(mock_popen.terminate.called) + self.assertTrue(mock_popen.kill.called) + @mock.patch.object(ipmi, '_is_option_supported', autospec=True) @mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub) @mock.patch.object(utils, 'execute', autospec=True) @@ -1267,7 +1275,8 @@ class IPMIToolPrivateMethodTestCase(Base): state = ipmi._power_status(self.info) - mock_exec.assert_called_once_with(self.info, "power status") + mock_exec.assert_called_once_with(self.info, "power status", + kill_on_timeout=True) self.assertEqual(states.POWER_ON, state) @mock.patch.object(ipmi, '_exec_ipmitool', autospec=True) @@ -1276,7 +1285,8 @@ class IPMIToolPrivateMethodTestCase(Base): state = ipmi._power_status(self.info) - mock_exec.assert_called_once_with(self.info, "power status") + mock_exec.assert_called_once_with(self.info, "power status", + kill_on_timeout=True) self.assertEqual(states.POWER_OFF, state) @mock.patch.object(ipmi, '_exec_ipmitool', autospec=True) @@ -1285,7 +1295,8 @@ class IPMIToolPrivateMethodTestCase(Base): state = ipmi._power_status(self.info) - mock_exec.assert_called_once_with(self.info, "power status") + mock_exec.assert_called_once_with(self.info, "power status", + kill_on_timeout=True) self.assertEqual(states.ERROR, state) @mock.patch.object(ipmi, '_exec_ipmitool', autospec=True) @@ -1294,14 +1305,15 @@ class IPMIToolPrivateMethodTestCase(Base): self.assertRaises(exception.IPMIFailure, ipmi._power_status, self.info) - mock_exec.assert_called_once_with(self.info, "power status") + mock_exec.assert_called_once_with(self.info, "power status", + kill_on_timeout=True) @mock.patch.object(ipmi, '_exec_ipmitool', autospec=True) @mock.patch('eventlet.greenthread.sleep', autospec=True) def test__power_on_max_retries(self, sleep_mock, mock_exec): self.config(command_retry_timeout=2, group='ipmi') - def side_effect(driver_info, command): + def side_effect(driver_info, command, **kwargs): resp_dict = {"power status": ["Chassis Power is off\n", None], "power on": [None, None]} return resp_dict.get(command, ["Bad\n", None]) @@ -1309,8 +1321,8 @@ class IPMIToolPrivateMethodTestCase(Base): mock_exec.side_effect = side_effect expected = [mock.call(self.info, "power on"), - mock.call(self.info, "power status"), - mock.call(self.info, "power status")] + mock.call(self.info, "power status", kill_on_timeout=True), + mock.call(self.info, "power status", kill_on_timeout=True)] with task_manager.acquire(self.context, self.node.uuid) as task: self.assertRaises(exception.PowerStateFailure, @@ -1322,7 +1334,7 @@ class IPMIToolPrivateMethodTestCase(Base): @mock.patch('eventlet.greenthread.sleep', autospec=True) def test__soft_power_off(self, sleep_mock, mock_exec): - def side_effect(driver_info, command): + def side_effect(driver_info, command, **kwargs): resp_dict = {"power status": ["Chassis Power is off\n", None], "power soft": [None, None]} return resp_dict.get(command, ["Bad\n", None]) @@ -1330,7 +1342,7 @@ class IPMIToolPrivateMethodTestCase(Base): mock_exec.side_effect = side_effect expected = [mock.call(self.info, "power soft"), - mock.call(self.info, "power status")] + mock.call(self.info, "power status", kill_on_timeout=True)] with task_manager.acquire(self.context, self.node.uuid) as task: state = ipmi._soft_power_off(task, self.info) @@ -1342,7 +1354,7 @@ class IPMIToolPrivateMethodTestCase(Base): @mock.patch('eventlet.greenthread.sleep', autospec=True) def test__soft_power_off_max_retries(self, sleep_mock, mock_exec): - def side_effect(driver_info, command): + def side_effect(driver_info, command, **kwargs): resp_dict = {"power status": ["Chassis Power is on\n", None], "power soft": [None, None]} return resp_dict.get(command, ["Bad\n", None]) @@ -1350,8 +1362,8 @@ class IPMIToolPrivateMethodTestCase(Base): mock_exec.side_effect = side_effect expected = [mock.call(self.info, "power soft"), - mock.call(self.info, "power status"), - mock.call(self.info, "power status")] + mock.call(self.info, "power status", kill_on_timeout=True), + mock.call(self.info, "power status", kill_on_timeout=True)] with task_manager.acquire(self.context, self.node.uuid) as task: self.assertRaises(exception.PowerStateFailure, @@ -1402,9 +1414,9 @@ class IPMIToolDriverTestCase(Base): returns = iter([["Chassis Power is off\n", None], ["Chassis Power is on\n", None], ["\n", None]]) - expected = [mock.call(self.info, "power status"), - mock.call(self.info, "power status"), - mock.call(self.info, "power status")] + expected = [mock.call(self.info, "power status", kill_on_timeout=True), + mock.call(self.info, "power status", kill_on_timeout=True), + mock.call(self.info, "power status", kill_on_timeout=True)] mock_exec.side_effect = returns with task_manager.acquire(self.context, self.node.uuid) as task: @@ -1426,7 +1438,8 @@ class IPMIToolDriverTestCase(Base): self.assertRaises(exception.IPMIFailure, self.power.get_power_state, task) - mock_exec.assert_called_once_with(self.info, "power status") + mock_exec.assert_called_once_with(self.info, "power status", + kill_on_timeout=True) @mock.patch.object(ipmi, '_power_on', autospec=True) @mock.patch.object(ipmi, '_power_off', autospec=True) diff --git a/releasenotes/notes/force-out-hung-ipmitool-process-519c7567bcbaa882.yaml b/releasenotes/notes/force-out-hung-ipmitool-process-519c7567bcbaa882.yaml new file mode 100644 index 0000000000..4199c76090 --- /dev/null +++ b/releasenotes/notes/force-out-hung-ipmitool-process-519c7567bcbaa882.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + Kill ``ipmitool`` process invoked by ironic to read node's power state if + ``ipmitool`` process does not exit after configured timeout expires. It + appears pretty common for ``ipmitool`` to run for five minutes (with + current ironic defauls) once it hits a non-responsive bare metal node. + This could slow down the management of other nodes due periodic tasks + slots exhaustion. The new behaviour could is enabled by default, but + could be disabled via the ``[ipmi]kill_on_timeout`` ironic configuration + option.