From 0fa282eee7697427d70eef5255c2113397c7d816 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Thu, 28 May 2020 18:01:13 +0200 Subject: [PATCH] agent: do not hard power off if soft power off actually succeeds Change-Id: I747e7bd2a341c828006e9f2216c81dbf9a01eec6 --- ironic/drivers/modules/agent_base.py | 17 +++++-- .../unit/drivers/modules/test_agent_base.py | 45 ++++++++++++++++++- .../agent-power-off-2115fcfaac030bd0.yaml | 5 +++ 3 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml diff --git a/ironic/drivers/modules/agent_base.py b/ironic/drivers/modules/agent_base.py index 0cdd854703..61ef16451c 100644 --- a/ironic/drivers/modules/agent_base.py +++ b/ironic/drivers/modules/agent_base.py @@ -1015,16 +1015,25 @@ class AgentDeployMixin(HeartbeatMixin): if not oob_power_off: try: self._client.power_off(node) - _wait_until_powered_off(task) except Exception as e: - LOG.warning('Failed to soft power off node %(node_uuid)s ' - 'in at least %(timeout)d seconds. ' + LOG.warning('Failed to soft power off node %(node_uuid)s. ' '%(cls)s: %(error)s', {'node_uuid': node.uuid, - 'timeout': (wait * (attempts - 1)) / 1000, 'cls': e.__class__.__name__, 'error': e}, exc_info=not isinstance( e, exception.IronicException)) + + # NOTE(dtantsur): in rare cases it may happen that the power + # off request comes through but we never receive the response. + # Check the power state before trying to force off. + try: + _wait_until_powered_off(task) + except Exception: + LOG.warning('Failed to soft power off node %(node_uuid)s ' + 'in at least %(timeout)d seconds. Forcing ' + 'hard power off and proceeding.', + {'node_uuid': node.uuid, + 'timeout': (wait * (attempts - 1)) / 1000}) manager_utils.node_power_action(task, states.POWER_OFF) else: # Flush the file system prior to hard rebooting the node diff --git a/ironic/tests/unit/drivers/modules/test_agent_base.py b/ironic/tests/unit/drivers/modules/test_agent_base.py index 31a4ade11d..d684fda9b5 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base.py @@ -926,7 +926,10 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy', autospec=True) @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) + @mock.patch.object(time, 'sleep', lambda seconds: None) @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(fake.FakePower, 'get_power_state', + spec=types.FunctionType) @mock.patch.object(agent_client.AgentClient, 'power_off', spec=types.FunctionType) @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.' @@ -935,13 +938,15 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): 'configure_tenant_networks', spec_set=True, autospec=True) def test_reboot_and_finish_deploy_soft_poweroff_fails( self, configure_tenant_net_mock, remove_provisioning_net_mock, - power_off_mock, node_power_action_mock, mock_collect, resume_mock): + power_off_mock, get_power_state_mock, node_power_action_mock, + mock_collect, resume_mock): power_off_mock.side_effect = RuntimeError("boom") self.node.provision_state = states.DEPLOYING self.node.target_provision_state = states.ACTIVE self.node.save() with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: + get_power_state_mock.return_value = states.POWER_ON self.deploy.reboot_and_finish_deploy(task) power_off_mock.assert_called_once_with(task.node) node_power_action_mock.assert_has_calls([ @@ -954,6 +959,44 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest): self.assertEqual(states.ACTIVE, task.node.target_provision_state) self.assertFalse(mock_collect.called) + @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy', + autospec=True) + @mock.patch.object(driver_utils, 'collect_ramdisk_logs', autospec=True) + @mock.patch.object(time, 'sleep', lambda seconds: None) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(fake.FakePower, 'get_power_state', + spec=types.FunctionType) + @mock.patch.object(agent_client.AgentClient, 'power_off', + spec=types.FunctionType) + @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.' + 'remove_provisioning_network', spec_set=True, autospec=True) + @mock.patch('ironic.drivers.modules.network.noop.NoopNetwork.' + 'configure_tenant_networks', spec_set=True, autospec=True) + def test_reboot_and_finish_deploy_soft_poweroff_race( + self, configure_tenant_net_mock, remove_provisioning_net_mock, + power_off_mock, get_power_state_mock, node_power_action_mock, + mock_collect, resume_mock): + # Test the situation when soft power off works, but ironic doesn't + # learn about it. + power_off_mock.side_effect = RuntimeError("boom") + self.node.provision_state = states.DEPLOYING + self.node.target_provision_state = states.ACTIVE + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + get_power_state_mock.side_effect = [states.POWER_ON, + states.POWER_OFF] + self.deploy.reboot_and_finish_deploy(task) + power_off_mock.assert_called_once_with(task.node) + node_power_action_mock.assert_called_once_with( + task, states.POWER_ON) + remove_provisioning_net_mock.assert_called_once_with(mock.ANY, + task) + configure_tenant_net_mock.assert_called_once_with(mock.ANY, task) + self.assertEqual(states.DEPLOYWAIT, task.node.provision_state) + self.assertEqual(states.ACTIVE, task.node.target_provision_state) + self.assertFalse(mock_collect.called) + @mock.patch.object(manager_utils, 'power_on_node_if_needed') @mock.patch.object(manager_utils, 'notify_conductor_resume_deploy', autospec=True) diff --git a/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml b/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml new file mode 100644 index 0000000000..74f84fbab3 --- /dev/null +++ b/releasenotes/notes/agent-power-off-2115fcfaac030bd0.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes a rare issue when agent successfully powers off a node after + deployment, but ironic never learns about it and does another reboot.