Merge "Kill misbehaving `ipmitool` process" into stable/rocky
This commit is contained in:
commit
574676e85b
|
@ -35,6 +35,12 @@ opts = [
|
|||
'sent to a server. There is a risk with some hardware '
|
||||
'that setting this too low may cause the BMC to crash. '
|
||||
'Recommended setting is 5 seconds.')),
|
||||
cfg.BoolOpt('kill_on_timeout',
|
||||
default=True,
|
||||
help=_('Kill `ipmitool` process invoked by ironic to read '
|
||||
'node power state if `ipmitool` process does not exit '
|
||||
'after `command_retry_timeout` timeout expires. '
|
||||
'Recommended setting is True')),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ DRIVER.
|
|||
"""
|
||||
|
||||
import contextlib
|
||||
import functools
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
@ -378,12 +379,41 @@ def _parse_driver_info(node):
|
|||
}
|
||||
|
||||
|
||||
def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
||||
def _exec_ipmitool_wait(timeout, driver_info, popen_obj):
|
||||
wait_interval = min(timeout, 0.5)
|
||||
|
||||
while timeout >= 0:
|
||||
if not popen_obj.poll():
|
||||
return
|
||||
|
||||
time.sleep(wait_interval)
|
||||
timeout -= wait_interval
|
||||
|
||||
LOG.warning('Killing timed out IPMI process "%(cmd)s" for node %(node)s.',
|
||||
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||
|
||||
popen_obj.terminate()
|
||||
time.sleep(0.5)
|
||||
if popen_obj.poll():
|
||||
popen_obj.kill()
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
if popen_obj.poll():
|
||||
LOG.warning('Could not kill IPMI process "%(cmd)s" for node %(node)s.',
|
||||
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||
|
||||
|
||||
def _exec_ipmitool(driver_info, command, check_exit_code=None,
|
||||
kill_on_timeout=False):
|
||||
"""Execute the ipmitool command.
|
||||
|
||||
:param driver_info: the ipmitool parameters for accessing a node.
|
||||
:param command: the ipmitool command to be executed.
|
||||
:param check_exit_code: Single bool, int, or list of allowed exit codes.
|
||||
:param kill_on_timeout: if `True`, kill unresponsive ipmitool on
|
||||
`min_command_interval` timeout. Default is `False`. Makes no
|
||||
effect on Windows.
|
||||
:returns: (stdout, stderr) from executing the command.
|
||||
:raises: PasswordFileFailedToCreate from creating or writing to the
|
||||
temporary file.
|
||||
|
@ -426,6 +456,21 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
|||
args.append('-N')
|
||||
args.append(str(CONF.ipmi.min_command_interval))
|
||||
|
||||
extra_args = {}
|
||||
|
||||
if kill_on_timeout:
|
||||
# NOTE(etingof): We can't trust ipmitool to terminate in time.
|
||||
# Therefore we have to kill it if it is running for longer than
|
||||
# we asked it to.
|
||||
# For that purpose we inject the time-capped `popen.wait` call
|
||||
# before the uncapped `popen.communicate` is called internally.
|
||||
# That gives us a chance to kill misbehaving `ipmitool` child.
|
||||
extra_args['on_execute'] = functools.partial(
|
||||
_exec_ipmitool_wait, timeout, driver_info)
|
||||
|
||||
if check_exit_code is not None:
|
||||
extra_args['check_exit_code'] = check_exit_code
|
||||
|
||||
end_time = (time.time() + timeout)
|
||||
|
||||
while True:
|
||||
|
@ -439,9 +484,6 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
|||
# Resetting the list that will be utilized so the password arguments
|
||||
# from any previous execution are preserved.
|
||||
cmd_args = args[:]
|
||||
extra_args = {}
|
||||
if check_exit_code is not None:
|
||||
extra_args['check_exit_code'] = check_exit_code
|
||||
# 'ipmitool' command will prompt password if there is no '-f'
|
||||
# option, we set it to '\0' to write a password file to support
|
||||
# empty password
|
||||
|
@ -567,7 +609,8 @@ def _power_status(driver_info):
|
|||
"""
|
||||
cmd = "power status"
|
||||
try:
|
||||
out_err = _exec_ipmitool(driver_info, cmd)
|
||||
out_err = _exec_ipmitool(
|
||||
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||
except (exception.PasswordFileFailedToCreate,
|
||||
processutils.ProcessExecutionError) as e:
|
||||
LOG.warning("IPMI power status failed for node %(node_id)s with "
|
||||
|
@ -1066,7 +1109,8 @@ class IPMIManagement(base.ManagementInterface):
|
|||
# extended sensor informations
|
||||
cmd = "sdr -v"
|
||||
try:
|
||||
out, err = _exec_ipmitool(driver_info, cmd)
|
||||
out, err = _exec_ipmitool(
|
||||
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||
except (exception.PasswordFileFailedToCreate,
|
||||
processutils.ProcessExecutionError) as e:
|
||||
raise exception.FailedToGetSensorData(node=task.node.uuid,
|
||||
|
|
|
@ -996,6 +996,14 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
mock_support.assert_called_once_with('timing')
|
||||
mock_exec.assert_called_once_with(*args)
|
||||
|
||||
def test__exec_ipmitool_wait(self):
|
||||
mock_popen = mock.MagicMock()
|
||||
mock_popen.poll.side_effect = [1, 1, 1, 1, 1]
|
||||
ipmi._exec_ipmitool_wait(1, {'uuid': ''}, mock_popen)
|
||||
|
||||
self.assertTrue(mock_popen.terminate.called)
|
||||
self.assertTrue(mock_popen.kill.called)
|
||||
|
||||
@mock.patch.object(ipmi, '_is_option_supported', autospec=True)
|
||||
@mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub)
|
||||
@mock.patch.object(utils, 'execute', autospec=True)
|
||||
|
@ -1267,7 +1275,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
|
||||
state = ipmi._power_status(self.info)
|
||||
|
||||
mock_exec.assert_called_once_with(self.info, "power status")
|
||||
mock_exec.assert_called_once_with(self.info, "power status",
|
||||
kill_on_timeout=True)
|
||||
self.assertEqual(states.POWER_ON, state)
|
||||
|
||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||
|
@ -1276,7 +1285,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
|
||||
state = ipmi._power_status(self.info)
|
||||
|
||||
mock_exec.assert_called_once_with(self.info, "power status")
|
||||
mock_exec.assert_called_once_with(self.info, "power status",
|
||||
kill_on_timeout=True)
|
||||
self.assertEqual(states.POWER_OFF, state)
|
||||
|
||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||
|
@ -1285,7 +1295,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
|
||||
state = ipmi._power_status(self.info)
|
||||
|
||||
mock_exec.assert_called_once_with(self.info, "power status")
|
||||
mock_exec.assert_called_once_with(self.info, "power status",
|
||||
kill_on_timeout=True)
|
||||
self.assertEqual(states.ERROR, state)
|
||||
|
||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||
|
@ -1294,14 +1305,15 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
self.assertRaises(exception.IPMIFailure,
|
||||
ipmi._power_status,
|
||||
self.info)
|
||||
mock_exec.assert_called_once_with(self.info, "power status")
|
||||
mock_exec.assert_called_once_with(self.info, "power status",
|
||||
kill_on_timeout=True)
|
||||
|
||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||
def test__power_on_max_retries(self, sleep_mock, mock_exec):
|
||||
self.config(command_retry_timeout=2, group='ipmi')
|
||||
|
||||
def side_effect(driver_info, command):
|
||||
def side_effect(driver_info, command, **kwargs):
|
||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||
"power on": [None, None]}
|
||||
return resp_dict.get(command, ["Bad\n", None])
|
||||
|
@ -1309,8 +1321,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
mock_exec.side_effect = side_effect
|
||||
|
||||
expected = [mock.call(self.info, "power on"),
|
||||
mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power status")]
|
||||
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||
|
||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||
self.assertRaises(exception.PowerStateFailure,
|
||||
|
@ -1322,7 +1334,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||
def test__soft_power_off(self, sleep_mock, mock_exec):
|
||||
|
||||
def side_effect(driver_info, command):
|
||||
def side_effect(driver_info, command, **kwargs):
|
||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||
"power soft": [None, None]}
|
||||
return resp_dict.get(command, ["Bad\n", None])
|
||||
|
@ -1330,7 +1342,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
mock_exec.side_effect = side_effect
|
||||
|
||||
expected = [mock.call(self.info, "power soft"),
|
||||
mock.call(self.info, "power status")]
|
||||
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||
|
||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||
state = ipmi._soft_power_off(task, self.info)
|
||||
|
@ -1342,7 +1354,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||
def test__soft_power_off_max_retries(self, sleep_mock, mock_exec):
|
||||
|
||||
def side_effect(driver_info, command):
|
||||
def side_effect(driver_info, command, **kwargs):
|
||||
resp_dict = {"power status": ["Chassis Power is on\n", None],
|
||||
"power soft": [None, None]}
|
||||
return resp_dict.get(command, ["Bad\n", None])
|
||||
|
@ -1350,8 +1362,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
|||
mock_exec.side_effect = side_effect
|
||||
|
||||
expected = [mock.call(self.info, "power soft"),
|
||||
mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power status")]
|
||||
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||
|
||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||
self.assertRaises(exception.PowerStateFailure,
|
||||
|
@ -1402,9 +1414,9 @@ class IPMIToolDriverTestCase(Base):
|
|||
returns = iter([["Chassis Power is off\n", None],
|
||||
["Chassis Power is on\n", None],
|
||||
["\n", None]])
|
||||
expected = [mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power status"),
|
||||
mock.call(self.info, "power status")]
|
||||
expected = [mock.call(self.info, "power status", kill_on_timeout=True),
|
||||
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||
mock_exec.side_effect = returns
|
||||
|
||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||
|
@ -1426,7 +1438,8 @@ class IPMIToolDriverTestCase(Base):
|
|||
self.assertRaises(exception.IPMIFailure,
|
||||
self.power.get_power_state,
|
||||
task)
|
||||
mock_exec.assert_called_once_with(self.info, "power status")
|
||||
mock_exec.assert_called_once_with(self.info, "power status",
|
||||
kill_on_timeout=True)
|
||||
|
||||
@mock.patch.object(ipmi, '_power_on', autospec=True)
|
||||
@mock.patch.object(ipmi, '_power_off', autospec=True)
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
---
|
||||
fixes:
|
||||
- |
|
||||
Kill ``ipmitool`` process invoked by ironic to read node's power state if
|
||||
``ipmitool`` process does not exit after configured timeout expires. It
|
||||
appears pretty common for ``ipmitool`` to run for five minutes (with
|
||||
current ironic defauls) once it hits a non-responsive bare metal node.
|
||||
This could slow down the management of other nodes due periodic tasks
|
||||
slots exhaustion. The new behaviour could is enabled by default, but
|
||||
could be disabled via the ``[ipmi]kill_on_timeout`` ironic configuration
|
||||
option.
|
Loading…
Reference in New Issue