Don't collect logs from powered off nodes

The agent deploy mixin tries to collect logs from the deployment ramdisk
after powering off a node. This is clearly not going to work, as it
requires a call to the IPA API.

This change avoids collecting logs from the ramdisk in failure scenarios
where the node is powered off.

Change-Id: I353e73e3612ece893276c683ce469f8561e9ebf9
Closes-Bug: #1732939
This commit is contained in:
Mark Goddard 2017-11-17 16:17:29 +00:00
parent af050b5d69
commit 752c82ccd9
4 changed files with 69 additions and 7 deletions

View File

@ -203,10 +203,18 @@ def _get_completed_cleaning_command(task, commands):
@METRICS.timer('log_and_raise_deployment_error') @METRICS.timer('log_and_raise_deployment_error')
def log_and_raise_deployment_error(task, msg): def log_and_raise_deployment_error(task, msg, collect_logs=True):
"""Helper method to log the error and raise exception.""" """Helper method to log the error and raise exception.
:param task: a TaskManager instance containing the node to act on.
:param msg: the message to set in last_error of the node.
:param collect_logs: Boolean indicating whether to attempt to collect
logs from IPA-based ramdisk. Defaults to True.
Actual log collection is also affected by
CONF.agent.deploy_logs_collect config option.
"""
LOG.error(msg) LOG.error(msg)
deploy_utils.set_failed_state(task, msg) deploy_utils.set_failed_state(task, msg, collect_logs=collect_logs)
raise exception.InstanceDeployFailure(msg) raise exception.InstanceDeployFailure(msg)
@ -590,7 +598,13 @@ class AgentDeployMixin(HeartbeatMixin):
{'node': node.uuid, 'error': error}) {'node': node.uuid, 'error': error})
manager_utils.node_power_action(task, states.POWER_OFF) manager_utils.node_power_action(task, states.POWER_OFF)
except Exception as e:
msg = (_('Error rebooting node %(node)s after deploy. '
'Error: %(error)s') %
{'node': node.uuid, 'error': e})
log_and_raise_deployment_error(task, msg)
try:
task.driver.network.remove_provisioning_network(task) task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task) task.driver.network.configure_tenant_networks(task)
@ -599,7 +613,9 @@ class AgentDeployMixin(HeartbeatMixin):
msg = (_('Error rebooting node %(node)s after deploy. ' msg = (_('Error rebooting node %(node)s after deploy. '
'Error: %(error)s') % 'Error: %(error)s') %
{'node': node.uuid, 'error': e}) {'node': node.uuid, 'error': e})
log_and_raise_deployment_error(task, msg) # NOTE(mgoddard): Don't collect logs since the node has been
# powered off.
log_and_raise_deployment_error(task, msg, collect_logs=False)
task.process_event('done') task.process_event('done')
LOG.info('Deployment to node %s done', task.node.uuid) LOG.info('Deployment to node %s done', task.node.uuid)

View File

@ -504,7 +504,7 @@ def set_failed_state(task, msg, collect_logs=True):
:param task: a TaskManager instance containing the node to act on. :param task: a TaskManager instance containing the node to act on.
:param msg: the message to set in last_error of the node. :param msg: the message to set in last_error of the node.
:param collect_logs: boolean indicating whether to attempt collect :param collect_logs: Boolean indicating whether to attempt to collect
logs from IPA-based ramdisk. Defaults to True. logs from IPA-based ramdisk. Defaults to True.
Actual log collection is also affected by Actual log collection is also affected by
CONF.agent.deploy_logs_collect config option. CONF.agent.deploy_logs_collect config option.

View File

@ -437,7 +437,7 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
configure_tenant_net_mock.assert_called_once_with(mock.ANY, task) configure_tenant_net_mock.assert_called_once_with(mock.ANY, task)
self.assertEqual(states.DEPLOYFAIL, task.node.provision_state) self.assertEqual(states.DEPLOYFAIL, task.node.provision_state)
self.assertEqual(states.ACTIVE, task.node.target_provision_state) self.assertEqual(states.ACTIVE, task.node.target_provision_state)
mock_collect.assert_called_once_with(task.node) self.assertFalse(mock_collect.called)
@mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
@mock.patch.object(time, 'sleep', lambda seconds: None) @mock.patch.object(time, 'sleep', lambda seconds: None)
@ -446,7 +446,7 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
spec=types.FunctionType) spec=types.FunctionType)
@mock.patch.object(agent_client.AgentClient, 'power_off', @mock.patch.object(agent_client.AgentClient, 'power_off',
spec=types.FunctionType) spec=types.FunctionType)
def test_reboot_and_finish_deploy_power_action_fails( def test_reboot_and_finish_deploy_power_off_fails(
self, power_off_mock, get_power_state_mock, self, power_off_mock, get_power_state_mock,
node_power_action_mock, mock_collect): node_power_action_mock, mock_collect):
self.node.provision_state = states.DEPLOYING self.node.provision_state = states.DEPLOYING
@ -467,6 +467,44 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
self.assertEqual(states.ACTIVE, task.node.target_provision_state) self.assertEqual(states.ACTIVE, task.node.target_provision_state)
mock_collect.assert_called_once_with(task.node) mock_collect.assert_called_once_with(task.node)
@mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
@mock.patch.object(time, 'sleep', lambda seconds: None)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(fake.FakePower, 'get_power_state',
spec=types.FunctionType)
@mock.patch.object(agent_client.AgentClient, 'power_off',
spec=types.FunctionType)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.'
'remove_provisioning_network', spec_set=True, autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.'
'configure_tenant_networks', spec_set=True, autospec=True)
def test_reboot_and_finish_deploy_power_on_fails(
self, configure_tenant_net_mock, remove_provisioning_net_mock,
power_off_mock, get_power_state_mock,
node_power_action_mock, mock_collect):
self.node.provision_state = states.DEPLOYING
self.node.target_provision_state = states.ACTIVE
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
get_power_state_mock.return_value = states.POWER_ON
node_power_action_mock.side_effect = [None,
RuntimeError("boom")]
self.assertRaises(exception.InstanceDeployFailure,
self.deploy.reboot_and_finish_deploy,
task)
power_off_mock.assert_called_once_with(task.node)
self.assertEqual(7, get_power_state_mock.call_count)
node_power_action_mock.assert_has_calls([
mock.call(task, states.POWER_OFF),
mock.call(task, states.POWER_ON)])
remove_provisioning_net_mock.assert_called_once_with(mock.ANY,
task)
configure_tenant_net_mock.assert_called_once_with(mock.ANY, task)
self.assertEqual(states.DEPLOYFAIL, task.node.provision_state)
self.assertEqual(states.ACTIVE, task.node.target_provision_state)
self.assertFalse(mock_collect.called)
@mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True) @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'sync', @mock.patch.object(agent_client.AgentClient, 'sync',

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue where if a failure occurs during deployment, the Bare Metal
service could attempt to collect logs from a node that had been powered
off. This would result in a number of failed attempts to collect the logs
before failing the deployment. See `bug 1732939
<https://bugs.launchpad.net/ironic/+bug/1732939>`_ for details.