From 752c82ccd9a696d99fbed50e27b9c4e02908a25c Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 17 Nov 2017 16:17:29 +0000 Subject: [PATCH] Don't collect logs from powered off nodes The agent deploy mixin tries to collect logs from the deployment ramdisk after powering off a node. This is clearly not going to work, as it requires a call to the IPA API. This change avoids collecting logs from the ramdisk in failure scenarios where the node is powered off. Change-Id: I353e73e3612ece893276c683ce469f8561e9ebf9 Closes-Bug: #1732939 --- ironic/drivers/modules/agent_base_vendor.py | 24 +++++++++-- ironic/drivers/modules/deploy_utils.py | 2 +- .../drivers/modules/test_agent_base_vendor.py | 42 ++++++++++++++++++- ...eboot-log-collection-c3e22fc166135e61.yaml | 8 ++++ 4 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/fix-reboot-log-collection-c3e22fc166135e61.yaml diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 139f025f3f..efb1551e83 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -203,10 +203,18 @@ def _get_completed_cleaning_command(task, commands): @METRICS.timer('log_and_raise_deployment_error') -def log_and_raise_deployment_error(task, msg): - """Helper method to log the error and raise exception.""" +def log_and_raise_deployment_error(task, msg, collect_logs=True): + """Helper method to log the error and raise exception. + + :param task: a TaskManager instance containing the node to act on. + :param msg: the message to set in last_error of the node. + :param collect_logs: Boolean indicating whether to attempt to collect + logs from IPA-based ramdisk. Defaults to True. + Actual log collection is also affected by + CONF.agent.deploy_logs_collect config option. + """ LOG.error(msg) - deploy_utils.set_failed_state(task, msg) + deploy_utils.set_failed_state(task, msg, collect_logs=collect_logs) raise exception.InstanceDeployFailure(msg) @@ -590,7 +598,13 @@ class AgentDeployMixin(HeartbeatMixin): {'node': node.uuid, 'error': error}) manager_utils.node_power_action(task, states.POWER_OFF) + except Exception as e: + msg = (_('Error rebooting node %(node)s after deploy. ' + 'Error: %(error)s') % + {'node': node.uuid, 'error': e}) + log_and_raise_deployment_error(task, msg) + try: task.driver.network.remove_provisioning_network(task) task.driver.network.configure_tenant_networks(task) @@ -599,7 +613,9 @@ class AgentDeployMixin(HeartbeatMixin): msg = (_('Error rebooting node %(node)s after deploy. ' 'Error: %(error)s') % {'node': node.uuid, 'error': e}) - log_and_raise_deployment_error(task, msg) + # NOTE(mgoddard): Don't collect logs since the node has been + # powered off. + log_and_raise_deployment_error(task, msg, collect_logs=False) task.process_event('done') LOG.info('Deployment to node %s done', task.node.uuid) diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index cdd52d567d..d42a4174e0 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -504,7 +504,7 @@ def set_failed_state(task, msg, collect_logs=True): :param task: a TaskManager instance containing the node to act on. :param msg: the message to set in last_error of the node. - :param collect_logs: boolean indicating whether to attempt collect + :param collect_logs: Boolean indicating whether to attempt to collect logs from IPA-based ramdisk. Defaults to True. Actual log collection is also affected by CONF.agent.deploy_logs_collect config option. diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index a043ab6683..09a7e288b9 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -437,7 +437,7 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): configure_tenant_net_mock.assert_called_once_with(mock.ANY, task) self.assertEqual(states.DEPLOYFAIL, task.node.provision_state) self.assertEqual(states.ACTIVE, task.node.target_provision_state) - mock_collect.assert_called_once_with(task.node) + self.assertFalse(mock_collect.called) @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) @mock.patch.object(time, 'sleep', lambda seconds: None) @@ -446,7 +446,7 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): spec=types.FunctionType) @mock.patch.object(agent_client.AgentClient, 'power_off', spec=types.FunctionType) - def test_reboot_and_finish_deploy_power_action_fails( + def test_reboot_and_finish_deploy_power_off_fails( self, power_off_mock, get_power_state_mock, node_power_action_mock, mock_collect): self.node.provision_state = states.DEPLOYING @@ -467,6 +467,44 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): self.assertEqual(states.ACTIVE, task.node.target_provision_state) mock_collect.assert_called_once_with(task.node) + @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(fake.FakePower, 'get_power_state', + spec=types.FunctionType) + @mock.patch.object(agent_client.AgentClient, 'power_off', + spec=types.FunctionType) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.' + 'remove_provisioning_network', spec_set=True, autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.' + 'configure_tenant_networks', spec_set=True, autospec=True) + def test_reboot_and_finish_deploy_power_on_fails( + self, configure_tenant_net_mock, remove_provisioning_net_mock, + power_off_mock, get_power_state_mock, + node_power_action_mock, mock_collect): + self.node.provision_state = states.DEPLOYING + self.node.target_provision_state = states.ACTIVE + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + get_power_state_mock.return_value = states.POWER_ON + node_power_action_mock.side_effect = [None, + RuntimeError("boom")] + self.assertRaises(exception.InstanceDeployFailure, + self.deploy.reboot_and_finish_deploy, + task) + power_off_mock.assert_called_once_with(task.node) + self.assertEqual(7, get_power_state_mock.call_count) + node_power_action_mock.assert_has_calls([ + mock.call(task, states.POWER_OFF), + mock.call(task, states.POWER_ON)]) + remove_provisioning_net_mock.assert_called_once_with(mock.ANY, + task) + configure_tenant_net_mock.assert_called_once_with(mock.ANY, task) + self.assertEqual(states.DEPLOYFAIL, task.node.provision_state) + self.assertEqual(states.ACTIVE, task.node.target_provision_state) + self.assertFalse(mock_collect.called) + @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) @mock.patch.object(manager_utils, 'node_power_action', autospec=True) @mock.patch.object(agent_client.AgentClient, 'sync', diff --git a/releasenotes/notes/fix-reboot-log-collection-c3e22fc166135e61.yaml b/releasenotes/notes/fix-reboot-log-collection-c3e22fc166135e61.yaml new file mode 100644 index 0000000000..fd9bf2e582 --- /dev/null +++ b/releasenotes/notes/fix-reboot-log-collection-c3e22fc166135e61.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an issue where if a failure occurs during deployment, the Bare Metal + service could attempt to collect logs from a node that had been powered + off. This would result in a number of failed attempts to collect the logs + before failing the deployment. See `bug 1732939 + `_ for details.