Merge "Kill misbehaving `ipmitool` process" into stable/rocky
This commit is contained in:
commit
574676e85b
|
@ -35,6 +35,12 @@ opts = [
|
||||||
'sent to a server. There is a risk with some hardware '
|
'sent to a server. There is a risk with some hardware '
|
||||||
'that setting this too low may cause the BMC to crash. '
|
'that setting this too low may cause the BMC to crash. '
|
||||||
'Recommended setting is 5 seconds.')),
|
'Recommended setting is 5 seconds.')),
|
||||||
|
cfg.BoolOpt('kill_on_timeout',
|
||||||
|
default=True,
|
||||||
|
help=_('Kill `ipmitool` process invoked by ironic to read '
|
||||||
|
'node power state if `ipmitool` process does not exit '
|
||||||
|
'after `command_retry_timeout` timeout expires. '
|
||||||
|
'Recommended setting is True')),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ DRIVER.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import functools
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -378,12 +379,41 @@ def _parse_driver_info(node):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
def _exec_ipmitool_wait(timeout, driver_info, popen_obj):
|
||||||
|
wait_interval = min(timeout, 0.5)
|
||||||
|
|
||||||
|
while timeout >= 0:
|
||||||
|
if not popen_obj.poll():
|
||||||
|
return
|
||||||
|
|
||||||
|
time.sleep(wait_interval)
|
||||||
|
timeout -= wait_interval
|
||||||
|
|
||||||
|
LOG.warning('Killing timed out IPMI process "%(cmd)s" for node %(node)s.',
|
||||||
|
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||||
|
|
||||||
|
popen_obj.terminate()
|
||||||
|
time.sleep(0.5)
|
||||||
|
if popen_obj.poll():
|
||||||
|
popen_obj.kill()
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if popen_obj.poll():
|
||||||
|
LOG.warning('Could not kill IPMI process "%(cmd)s" for node %(node)s.',
|
||||||
|
{'node': driver_info['uuid'], 'cmd': popen_obj.cmd})
|
||||||
|
|
||||||
|
|
||||||
|
def _exec_ipmitool(driver_info, command, check_exit_code=None,
|
||||||
|
kill_on_timeout=False):
|
||||||
"""Execute the ipmitool command.
|
"""Execute the ipmitool command.
|
||||||
|
|
||||||
:param driver_info: the ipmitool parameters for accessing a node.
|
:param driver_info: the ipmitool parameters for accessing a node.
|
||||||
:param command: the ipmitool command to be executed.
|
:param command: the ipmitool command to be executed.
|
||||||
:param check_exit_code: Single bool, int, or list of allowed exit codes.
|
:param check_exit_code: Single bool, int, or list of allowed exit codes.
|
||||||
|
:param kill_on_timeout: if `True`, kill unresponsive ipmitool on
|
||||||
|
`min_command_interval` timeout. Default is `False`. Makes no
|
||||||
|
effect on Windows.
|
||||||
:returns: (stdout, stderr) from executing the command.
|
:returns: (stdout, stderr) from executing the command.
|
||||||
:raises: PasswordFileFailedToCreate from creating or writing to the
|
:raises: PasswordFileFailedToCreate from creating or writing to the
|
||||||
temporary file.
|
temporary file.
|
||||||
|
@ -426,6 +456,21 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
||||||
args.append('-N')
|
args.append('-N')
|
||||||
args.append(str(CONF.ipmi.min_command_interval))
|
args.append(str(CONF.ipmi.min_command_interval))
|
||||||
|
|
||||||
|
extra_args = {}
|
||||||
|
|
||||||
|
if kill_on_timeout:
|
||||||
|
# NOTE(etingof): We can't trust ipmitool to terminate in time.
|
||||||
|
# Therefore we have to kill it if it is running for longer than
|
||||||
|
# we asked it to.
|
||||||
|
# For that purpose we inject the time-capped `popen.wait` call
|
||||||
|
# before the uncapped `popen.communicate` is called internally.
|
||||||
|
# That gives us a chance to kill misbehaving `ipmitool` child.
|
||||||
|
extra_args['on_execute'] = functools.partial(
|
||||||
|
_exec_ipmitool_wait, timeout, driver_info)
|
||||||
|
|
||||||
|
if check_exit_code is not None:
|
||||||
|
extra_args['check_exit_code'] = check_exit_code
|
||||||
|
|
||||||
end_time = (time.time() + timeout)
|
end_time = (time.time() + timeout)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
@ -439,9 +484,6 @@ def _exec_ipmitool(driver_info, command, check_exit_code=None):
|
||||||
# Resetting the list that will be utilized so the password arguments
|
# Resetting the list that will be utilized so the password arguments
|
||||||
# from any previous execution are preserved.
|
# from any previous execution are preserved.
|
||||||
cmd_args = args[:]
|
cmd_args = args[:]
|
||||||
extra_args = {}
|
|
||||||
if check_exit_code is not None:
|
|
||||||
extra_args['check_exit_code'] = check_exit_code
|
|
||||||
# 'ipmitool' command will prompt password if there is no '-f'
|
# 'ipmitool' command will prompt password if there is no '-f'
|
||||||
# option, we set it to '\0' to write a password file to support
|
# option, we set it to '\0' to write a password file to support
|
||||||
# empty password
|
# empty password
|
||||||
|
@ -567,7 +609,8 @@ def _power_status(driver_info):
|
||||||
"""
|
"""
|
||||||
cmd = "power status"
|
cmd = "power status"
|
||||||
try:
|
try:
|
||||||
out_err = _exec_ipmitool(driver_info, cmd)
|
out_err = _exec_ipmitool(
|
||||||
|
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||||
except (exception.PasswordFileFailedToCreate,
|
except (exception.PasswordFileFailedToCreate,
|
||||||
processutils.ProcessExecutionError) as e:
|
processutils.ProcessExecutionError) as e:
|
||||||
LOG.warning("IPMI power status failed for node %(node_id)s with "
|
LOG.warning("IPMI power status failed for node %(node_id)s with "
|
||||||
|
@ -1066,7 +1109,8 @@ class IPMIManagement(base.ManagementInterface):
|
||||||
# extended sensor informations
|
# extended sensor informations
|
||||||
cmd = "sdr -v"
|
cmd = "sdr -v"
|
||||||
try:
|
try:
|
||||||
out, err = _exec_ipmitool(driver_info, cmd)
|
out, err = _exec_ipmitool(
|
||||||
|
driver_info, cmd, kill_on_timeout=CONF.ipmi.kill_on_timeout)
|
||||||
except (exception.PasswordFileFailedToCreate,
|
except (exception.PasswordFileFailedToCreate,
|
||||||
processutils.ProcessExecutionError) as e:
|
processutils.ProcessExecutionError) as e:
|
||||||
raise exception.FailedToGetSensorData(node=task.node.uuid,
|
raise exception.FailedToGetSensorData(node=task.node.uuid,
|
||||||
|
|
|
@ -996,6 +996,14 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
mock_support.assert_called_once_with('timing')
|
mock_support.assert_called_once_with('timing')
|
||||||
mock_exec.assert_called_once_with(*args)
|
mock_exec.assert_called_once_with(*args)
|
||||||
|
|
||||||
|
def test__exec_ipmitool_wait(self):
|
||||||
|
mock_popen = mock.MagicMock()
|
||||||
|
mock_popen.poll.side_effect = [1, 1, 1, 1, 1]
|
||||||
|
ipmi._exec_ipmitool_wait(1, {'uuid': ''}, mock_popen)
|
||||||
|
|
||||||
|
self.assertTrue(mock_popen.terminate.called)
|
||||||
|
self.assertTrue(mock_popen.kill.called)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_is_option_supported', autospec=True)
|
@mock.patch.object(ipmi, '_is_option_supported', autospec=True)
|
||||||
@mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub)
|
@mock.patch.object(ipmi, '_make_password_file', _make_password_file_stub)
|
||||||
@mock.patch.object(utils, 'execute', autospec=True)
|
@mock.patch.object(utils, 'execute', autospec=True)
|
||||||
|
@ -1267,7 +1275,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.POWER_ON, state)
|
self.assertEqual(states.POWER_ON, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
|
@ -1276,7 +1285,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.POWER_OFF, state)
|
self.assertEqual(states.POWER_OFF, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
|
@ -1285,7 +1295,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
|
|
||||||
state = ipmi._power_status(self.info)
|
state = ipmi._power_status(self.info)
|
||||||
|
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
self.assertEqual(states.ERROR, state)
|
self.assertEqual(states.ERROR, state)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
|
@ -1294,14 +1305,15 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
self.assertRaises(exception.IPMIFailure,
|
self.assertRaises(exception.IPMIFailure,
|
||||||
ipmi._power_status,
|
ipmi._power_status,
|
||||||
self.info)
|
self.info)
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
@mock.patch.object(ipmi, '_exec_ipmitool', autospec=True)
|
||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__power_on_max_retries(self, sleep_mock, mock_exec):
|
def test__power_on_max_retries(self, sleep_mock, mock_exec):
|
||||||
self.config(command_retry_timeout=2, group='ipmi')
|
self.config(command_retry_timeout=2, group='ipmi')
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||||
"power on": [None, None]}
|
"power on": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
|
@ -1309,8 +1321,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power on"),
|
expected = [mock.call(self.info, "power on"),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
self.assertRaises(exception.PowerStateFailure,
|
self.assertRaises(exception.PowerStateFailure,
|
||||||
|
@ -1322,7 +1334,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__soft_power_off(self, sleep_mock, mock_exec):
|
def test__soft_power_off(self, sleep_mock, mock_exec):
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
resp_dict = {"power status": ["Chassis Power is off\n", None],
|
||||||
"power soft": [None, None]}
|
"power soft": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
|
@ -1330,7 +1342,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power soft"),
|
expected = [mock.call(self.info, "power soft"),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
state = ipmi._soft_power_off(task, self.info)
|
state = ipmi._soft_power_off(task, self.info)
|
||||||
|
@ -1342,7 +1354,7 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
@mock.patch('eventlet.greenthread.sleep', autospec=True)
|
||||||
def test__soft_power_off_max_retries(self, sleep_mock, mock_exec):
|
def test__soft_power_off_max_retries(self, sleep_mock, mock_exec):
|
||||||
|
|
||||||
def side_effect(driver_info, command):
|
def side_effect(driver_info, command, **kwargs):
|
||||||
resp_dict = {"power status": ["Chassis Power is on\n", None],
|
resp_dict = {"power status": ["Chassis Power is on\n", None],
|
||||||
"power soft": [None, None]}
|
"power soft": [None, None]}
|
||||||
return resp_dict.get(command, ["Bad\n", None])
|
return resp_dict.get(command, ["Bad\n", None])
|
||||||
|
@ -1350,8 +1362,8 @@ class IPMIToolPrivateMethodTestCase(Base):
|
||||||
mock_exec.side_effect = side_effect
|
mock_exec.side_effect = side_effect
|
||||||
|
|
||||||
expected = [mock.call(self.info, "power soft"),
|
expected = [mock.call(self.info, "power soft"),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
self.assertRaises(exception.PowerStateFailure,
|
self.assertRaises(exception.PowerStateFailure,
|
||||||
|
@ -1402,9 +1414,9 @@ class IPMIToolDriverTestCase(Base):
|
||||||
returns = iter([["Chassis Power is off\n", None],
|
returns = iter([["Chassis Power is off\n", None],
|
||||||
["Chassis Power is on\n", None],
|
["Chassis Power is on\n", None],
|
||||||
["\n", None]])
|
["\n", None]])
|
||||||
expected = [mock.call(self.info, "power status"),
|
expected = [mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status"),
|
mock.call(self.info, "power status", kill_on_timeout=True),
|
||||||
mock.call(self.info, "power status")]
|
mock.call(self.info, "power status", kill_on_timeout=True)]
|
||||||
mock_exec.side_effect = returns
|
mock_exec.side_effect = returns
|
||||||
|
|
||||||
with task_manager.acquire(self.context, self.node.uuid) as task:
|
with task_manager.acquire(self.context, self.node.uuid) as task:
|
||||||
|
@ -1426,7 +1438,8 @@ class IPMIToolDriverTestCase(Base):
|
||||||
self.assertRaises(exception.IPMIFailure,
|
self.assertRaises(exception.IPMIFailure,
|
||||||
self.power.get_power_state,
|
self.power.get_power_state,
|
||||||
task)
|
task)
|
||||||
mock_exec.assert_called_once_with(self.info, "power status")
|
mock_exec.assert_called_once_with(self.info, "power status",
|
||||||
|
kill_on_timeout=True)
|
||||||
|
|
||||||
@mock.patch.object(ipmi, '_power_on', autospec=True)
|
@mock.patch.object(ipmi, '_power_on', autospec=True)
|
||||||
@mock.patch.object(ipmi, '_power_off', autospec=True)
|
@mock.patch.object(ipmi, '_power_off', autospec=True)
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
Kill ``ipmitool`` process invoked by ironic to read node's power state if
|
||||||
|
``ipmitool`` process does not exit after configured timeout expires. It
|
||||||
|
appears pretty common for ``ipmitool`` to run for five minutes (with
|
||||||
|
current ironic defauls) once it hits a non-responsive bare metal node.
|
||||||
|
This could slow down the management of other nodes due periodic tasks
|
||||||
|
slots exhaustion. The new behaviour could is enabled by default, but
|
||||||
|
could be disabled via the ``[ipmi]kill_on_timeout`` ironic configuration
|
||||||
|
option.
|
Loading…
Reference in New Issue