Config to stop powering off nodes on failure
This adds the config option deploy.power_off_after_deploy_failure, which will prevent powering off nodes once a deployment fails. Powering off nodes means loss of the log output from the agent. Choosing to leave the machine powered on would give an operator the opportunity to troubleshoot the issue using agent logs on the system in case of failure, or to prevent needing to power the machine back on for provisioning after cleaning. As a note, Nova will still attempt to delete the node on a failed deployment, so the option will not be very effective in non-standalone deployments until a similar patch is put into nova. Related-bug: 1531217 Change-Id: I7c25946de50ffda2869307cb4016e0ccfe81d192 Co-Authored-By: Josh Gachnang <josh@servercobra.com>
This commit is contained in:
parent
797e6f20e0
commit
6c96257142
@ -919,6 +919,10 @@
|
||||
# Deprecated group/name - [agent]/agent_erase_devices_iterations
|
||||
#erase_devices_iterations=1
|
||||
|
||||
# Whether to power off a node after deploy failure. Defaults
|
||||
# to True. (boolean value)
|
||||
#power_off_after_deploy_failure=true
|
||||
|
||||
|
||||
[dhcp]
|
||||
|
||||
|
@ -71,6 +71,10 @@ deploy_opts = [
|
||||
deprecated_group='agent',
|
||||
default=1,
|
||||
help=_('Number of iterations to be run for erasing devices.')),
|
||||
cfg.BoolOpt('power_off_after_deploy_failure',
|
||||
default=True,
|
||||
help=_('Whether to power off a node after deploy failure. '
|
||||
'Defaults to True.')),
|
||||
]
|
||||
CONF = cfg.CONF
|
||||
CONF.register_opts(deploy_opts, group='deploy')
|
||||
@ -492,15 +496,15 @@ def set_failed_state(task, msg):
|
||||
% {'node': node.uuid, 'state': node.provision_state})
|
||||
LOG.exception(msg2)
|
||||
|
||||
try:
|
||||
manager_utils.node_power_action(task, states.POWER_OFF)
|
||||
except Exception:
|
||||
msg2 = (_LE('Node %s failed to power off while handling deploy '
|
||||
'failure. This may be a serious condition. Node '
|
||||
'should be removed from Ironic or put in maintenance '
|
||||
'mode until the problem is resolved.') % node.uuid)
|
||||
LOG.exception(msg2)
|
||||
|
||||
if CONF.deploy.power_off_after_deploy_failure:
|
||||
try:
|
||||
manager_utils.node_power_action(task, states.POWER_OFF)
|
||||
except Exception:
|
||||
msg2 = (_LE('Node %s failed to power off while handling deploy '
|
||||
'failure. This may be a serious condition. Node '
|
||||
'should be removed from Ironic or put in maintenance '
|
||||
'mode until the problem is resolved.') % node.uuid)
|
||||
LOG.exception(msg2)
|
||||
# NOTE(deva): node_power_action() erases node.last_error
|
||||
# so we need to set it here.
|
||||
node.last_error = msg
|
||||
|
@ -1252,7 +1252,7 @@ class OtherFunctionTestCase(db_base.DbTestCase):
|
||||
autospec=True)
|
||||
def _test_set_failed_state(self, mock_event, mock_power, mock_log,
|
||||
event_value=None, power_value=None,
|
||||
log_calls=None):
|
||||
log_calls=None, poweroff=True):
|
||||
err_msg = 'some failure'
|
||||
mock_event.side_effect = event_value
|
||||
mock_power.side_effect = power_value
|
||||
@ -1260,9 +1260,12 @@ class OtherFunctionTestCase(db_base.DbTestCase):
|
||||
shared=False) as task:
|
||||
utils.set_failed_state(task, err_msg)
|
||||
mock_event.assert_called_once_with(task, 'fail')
|
||||
mock_power.assert_called_once_with(task, states.POWER_OFF)
|
||||
if poweroff:
|
||||
mock_power.assert_called_once_with(task, states.POWER_OFF)
|
||||
else:
|
||||
self.assertFalse(mock_power.called)
|
||||
self.assertEqual(err_msg, task.node.last_error)
|
||||
if log_calls:
|
||||
if (log_calls and poweroff):
|
||||
mock_log.exception.assert_has_calls(log_calls)
|
||||
else:
|
||||
self.assertFalse(mock_log.called)
|
||||
@ -1283,6 +1286,23 @@ class OtherFunctionTestCase(db_base.DbTestCase):
|
||||
power_value=iter([exc_param] * len(calls)),
|
||||
log_calls=calls)
|
||||
|
||||
def test_set_failed_state_no_poweroff(self):
|
||||
cfg.CONF.deploy.power_off_after_deploy_failure = False
|
||||
exc_state = exception.InvalidState('invalid state')
|
||||
exc_param = exception.InvalidParameterValue('invalid parameter')
|
||||
mock_call = mock.call(mock.ANY)
|
||||
self._test_set_failed_state(poweroff=False)
|
||||
calls = [mock_call]
|
||||
self._test_set_failed_state(event_value=iter([exc_state] * len(calls)),
|
||||
log_calls=calls, poweroff=False)
|
||||
calls = [mock_call]
|
||||
self._test_set_failed_state(power_value=iter([exc_param] * len(calls)),
|
||||
log_calls=calls, poweroff=False)
|
||||
calls = [mock_call, mock_call]
|
||||
self._test_set_failed_state(event_value=iter([exc_state] * len(calls)),
|
||||
power_value=iter([exc_param] * len(calls)),
|
||||
log_calls=calls, poweroff=False)
|
||||
|
||||
def test_get_boot_option(self):
|
||||
self.node.instance_info = {'capabilities': '{"boot_option": "local"}'}
|
||||
result = utils.get_boot_option(self.node)
|
||||
|
@ -0,0 +1,12 @@
|
||||
---
|
||||
features:
|
||||
- Operators can now set
|
||||
deploy.power_off_after_deploy_failure to leave nodes
|
||||
powered on when a deployment fails. This is useful
|
||||
for troubleshooting deployment issues. As a note,
|
||||
Nova will still attempt to delete a node after a failed
|
||||
deployment, so deploy.power_off_after_deploy_failure
|
||||
may not be very effective in non-standalone
|
||||
deployments until a similar patch to ironic's driver in
|
||||
nova is proposed.
|
||||
|
Loading…
Reference in New Issue
Block a user