diff --git a/doc/source/deploy/cleaning.rst b/doc/source/deploy/cleaning.rst index 483746a553..59c0d5e7de 100644 --- a/doc/source/deploy/cleaning.rst +++ b/doc/source/deploy/cleaning.rst @@ -207,8 +207,7 @@ ironic-python-agent ships with a minimal cleaning configuration, only erasing disks. However, with this ramdisk, you can add your own cleaning steps and/or override default cleaning steps with a custom Hardware Manager. -There is currently no support for in-band cleaning using the ironic pxe -ramdisk. +In-band cleaning is not supported by the deprecated bash ramdisk. Out-of-band ----------- diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 1ee2f8cd7c..b7a6b1d9c2 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -210,6 +210,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, node.clean_step = {} info = node.driver_internal_info info.pop('clean_step_index', None) + # Clear any leftover metadata about cleaning reboots + info.pop('cleaning_reboot', None) node.driver_internal_info = info # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 9df0cd392b..491a958289 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -289,19 +289,39 @@ class BaseAgentVendor(base.VendorInterface): unable to tell if an ordering change will cause a cleaning issue so it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart the entire cleaning cycle. For manual cleaning, we don't. + + Additionally, if a clean_step includes the reboot_requested property + set to True, this method will coordinate the reboot once the step is + completed. """ node = task.node # For manual clean, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is (the default) AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE - command = self._get_completed_cleaning_command(task) + agent_commands = self._client.get_commands_status(task.node) + + if (not agent_commands and + task.node.driver_internal_info.get('cleaning_reboot')): + # Node finished a cleaning step that requested a reboot, and + # this is the first heartbeat after booting. Continue cleaning. + info = task.node.driver_internal_info + info.pop('cleaning_reboot', None) + task.node.driver_internal_info = info + self.notify_conductor_resume_clean(task) + return + + if not agent_commands: + # Agent has no commands whatsoever + return + + command = self._get_completed_cleaning_command(task, agent_commands) LOG.debug('Cleaning command status for node %(node)s on step %(step)s:' ' %(command)s', {'node': node.uuid, 'step': node.clean_step, 'command': command}) if not command: - # Command is not done yet + # Agent command in progress return if command.get('command_status') == 'FAILED': @@ -377,6 +397,10 @@ class BaseAgentVendor(base.VendorInterface): LOG.exception(msg) return manager_utils.cleaning_error_handler(task, msg) + if task.node.clean_step.get('reboot_requested'): + self._cleaning_reboot(task) + return + LOG.info(_LI('Agent on node %s returned cleaning command success, ' 'moving to next clean step'), node.uuid) self.notify_conductor_resume_clean(task) @@ -389,6 +413,34 @@ class BaseAgentVendor(base.VendorInterface): LOG.error(msg) return manager_utils.cleaning_error_handler(task, msg) + def _cleaning_reboot(self, task): + """Reboots a node out of band after a clean step that requires it. + + If an agent clean step has 'reboot_requested': True, reboots the + node when the step is completed. Will put the node in CLEANFAIL + if the node cannot be rebooted. + + :param task: a TaskManager instance + """ + try: + manager_utils.node_power_action(task, states.REBOOT) + except Exception as e: + msg = (_('Reboot requested by clean step %(step)s failed for ' + 'node %(node)s: %(err)s') % + {'step': task.node.clean_step, + 'node': task.node.uuid, + 'err': e}) + LOG.error(msg) + # do not set cleaning_reboot if we didn't reboot + manager_utils.cleaning_error_handler(task, msg) + return + + # Signify that we've rebooted + driver_internal_info = task.node.driver_internal_info + driver_internal_info['cleaning_reboot'] = True + task.node.driver_internal_info = driver_internal_info + task.node.save() + @base.passthru(['POST']) @task_manager.require_exclusive_lock def heartbeat(self, task, **kwargs): @@ -540,9 +592,12 @@ class BaseAgentVendor(base.VendorInterface): 'node': node.as_dict() } - def _get_completed_cleaning_command(self, task): - """Returns None or a completed cleaning command from the agent.""" - commands = self._client.get_commands_status(task.node) + def _get_completed_cleaning_command(self, task, commands): + """Returns None or a completed cleaning command from the agent. + + :param commands: a set of command results from the agent, typically + fetched with agent_client.get_commands_status() + """ if not commands: return diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 29f4ee94b4..625dc8d5b5 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -958,6 +958,77 @@ class TestBaseAgentVendor(db_base.DbTestCase): self.passthru.continue_cleaning(task) notify_mock.assert_called_once_with(mock.ANY, task) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + def test__cleaning_reboot(self, mock_reboot): + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.passthru._cleaning_reboot(task) + mock_reboot.assert_called_once_with(task, states.REBOOT) + self.assertTrue(task.node.driver_internal_info.get( + 'cleaning_reboot')) + + @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + def test__cleaning_reboot_fail(self, mock_reboot, mock_handler): + mock_reboot.side_effect = iter([RuntimeError("broken")]) + + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.passthru._cleaning_reboot(task) + mock_reboot.assert_called_once_with(task, states.REBOOT) + mock_handler.assert_called_once_with(task, mock.ANY) + self.assertNotIn('cleaning_reboot', + task.node.driver_internal_info) + + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_cleaning_reboot(self, status_mock, reboot_mock): + # Test a successful execute clean step on the agent, with reboot + self.node.clean_step = { + 'priority': 42, + 'interface': 'deploy', + 'step': 'reboot_me_afterwards', + 'reboot_requested': True + } + self.node.save() + status_mock.return_value = [{ + 'command_status': 'SUCCEEDED', + 'command_name': 'execute_clean_step', + 'command_result': { + 'clean_step': self.node.clean_step + } + }] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.passthru.continue_cleaning(task) + reboot_mock.assert_called_once_with(task, states.REBOOT) + + @mock.patch.object(agent_base_vendor.BaseAgentVendor, + 'notify_conductor_resume_clean', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + def test_continue_cleaning_after_reboot(self, status_mock, notify_mock): + # Test a successful execute clean step on the agent, with reboot + self.node.clean_step = { + 'priority': 42, + 'interface': 'deploy', + 'step': 'reboot_me_afterwards', + 'reboot_requested': True + } + driver_internal_info = self.node.driver_internal_info + driver_internal_info['cleaning_reboot'] = True + self.node.driver_internal_info = driver_internal_info + self.node.save() + # Represents a freshly booted agent with no commands + status_mock.return_value = [] + with task_manager.acquire(self.context, self.node['uuid'], + shared=False) as task: + self.passthru.continue_cleaning(task) + notify_mock.assert_called_once_with(mock.ANY, task) + self.assertNotIn('cleaning_reboot', + task.node.driver_internal_info) + @mock.patch.object(agent_base_vendor, '_get_post_clean_step_hook', autospec=True) @mock.patch.object(agent_base_vendor.BaseAgentVendor, diff --git a/releasenotes/notes/agent-can-request-reboot-6238e13e2e898f68.yaml b/releasenotes/notes/agent-can-request-reboot-6238e13e2e898f68.yaml new file mode 100644 index 0000000000..a1a2920f14 --- /dev/null +++ b/releasenotes/notes/agent-can-request-reboot-6238e13e2e898f68.yaml @@ -0,0 +1,8 @@ +--- +features: + - This adds the reboot_requested option for in-band + cleaning. If set to true, Ironic will reboot the node + after that step has completed and before continuing + with the next step. This option is useful for when + some action, such as a BIOS upgrade or setting change, + requires a reboot to take effect.