Support reboot_requested bool on agent clean_steps
After IPA has completed a cleaning step such as upgrading firmware, it may require a reboot. Ironic should be the one controlling the power state and out of band reboots are preferable to in band reboots. Change-Id: Iee3aeea198c556ab9adb9dcfd949d9cffb51418c Co-Authored-By: Josh Gachnang <josh@servercobra.com> Closes-bug: 1526918
This commit is contained in:
parent
797e6f20e0
commit
bf985bcdd9
@ -207,8 +207,7 @@ ironic-python-agent ships with a minimal cleaning configuration, only erasing
|
||||
disks. However, with this ramdisk, you can add your own cleaning steps and/or
|
||||
override default cleaning steps with a custom Hardware Manager.
|
||||
|
||||
There is currently no support for in-band cleaning using the ironic pxe
|
||||
ramdisk.
|
||||
In-band cleaning is not supported by the deprecated bash ramdisk.
|
||||
|
||||
Out-of-band
|
||||
-----------
|
||||
|
@ -210,6 +210,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
|
||||
node.clean_step = {}
|
||||
info = node.driver_internal_info
|
||||
info.pop('clean_step_index', None)
|
||||
# Clear any leftover metadata about cleaning reboots
|
||||
info.pop('cleaning_reboot', None)
|
||||
node.driver_internal_info = info
|
||||
# For manual cleaning, the target provision state is MANAGEABLE, whereas
|
||||
# for automated cleaning, it is AVAILABLE.
|
||||
|
@ -289,19 +289,39 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
unable to tell if an ordering change will cause a cleaning issue so
|
||||
it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
|
||||
the entire cleaning cycle. For manual cleaning, we don't.
|
||||
|
||||
Additionally, if a clean_step includes the reboot_requested property
|
||||
set to True, this method will coordinate the reboot once the step is
|
||||
completed.
|
||||
"""
|
||||
node = task.node
|
||||
# For manual clean, the target provision state is MANAGEABLE, whereas
|
||||
# for automated cleaning, it is (the default) AVAILABLE.
|
||||
manual_clean = node.target_provision_state == states.MANAGEABLE
|
||||
command = self._get_completed_cleaning_command(task)
|
||||
agent_commands = self._client.get_commands_status(task.node)
|
||||
|
||||
if (not agent_commands and
|
||||
task.node.driver_internal_info.get('cleaning_reboot')):
|
||||
# Node finished a cleaning step that requested a reboot, and
|
||||
# this is the first heartbeat after booting. Continue cleaning.
|
||||
info = task.node.driver_internal_info
|
||||
info.pop('cleaning_reboot', None)
|
||||
task.node.driver_internal_info = info
|
||||
self.notify_conductor_resume_clean(task)
|
||||
return
|
||||
|
||||
if not agent_commands:
|
||||
# Agent has no commands whatsoever
|
||||
return
|
||||
|
||||
command = self._get_completed_cleaning_command(task, agent_commands)
|
||||
LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
|
||||
' %(command)s', {'node': node.uuid,
|
||||
'step': node.clean_step,
|
||||
'command': command})
|
||||
|
||||
if not command:
|
||||
# Command is not done yet
|
||||
# Agent command in progress
|
||||
return
|
||||
|
||||
if command.get('command_status') == 'FAILED':
|
||||
@ -377,6 +397,10 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
LOG.exception(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
|
||||
if task.node.clean_step.get('reboot_requested'):
|
||||
self._cleaning_reboot(task)
|
||||
return
|
||||
|
||||
LOG.info(_LI('Agent on node %s returned cleaning command success, '
|
||||
'moving to next clean step'), node.uuid)
|
||||
self.notify_conductor_resume_clean(task)
|
||||
@ -389,6 +413,34 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
LOG.error(msg)
|
||||
return manager_utils.cleaning_error_handler(task, msg)
|
||||
|
||||
def _cleaning_reboot(self, task):
|
||||
"""Reboots a node out of band after a clean step that requires it.
|
||||
|
||||
If an agent clean step has 'reboot_requested': True, reboots the
|
||||
node when the step is completed. Will put the node in CLEANFAIL
|
||||
if the node cannot be rebooted.
|
||||
|
||||
:param task: a TaskManager instance
|
||||
"""
|
||||
try:
|
||||
manager_utils.node_power_action(task, states.REBOOT)
|
||||
except Exception as e:
|
||||
msg = (_('Reboot requested by clean step %(step)s failed for '
|
||||
'node %(node)s: %(err)s') %
|
||||
{'step': task.node.clean_step,
|
||||
'node': task.node.uuid,
|
||||
'err': e})
|
||||
LOG.error(msg)
|
||||
# do not set cleaning_reboot if we didn't reboot
|
||||
manager_utils.cleaning_error_handler(task, msg)
|
||||
return
|
||||
|
||||
# Signify that we've rebooted
|
||||
driver_internal_info = task.node.driver_internal_info
|
||||
driver_internal_info['cleaning_reboot'] = True
|
||||
task.node.driver_internal_info = driver_internal_info
|
||||
task.node.save()
|
||||
|
||||
@base.passthru(['POST'])
|
||||
@task_manager.require_exclusive_lock
|
||||
def heartbeat(self, task, **kwargs):
|
||||
@ -540,9 +592,12 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
'node': node.as_dict()
|
||||
}
|
||||
|
||||
def _get_completed_cleaning_command(self, task):
|
||||
"""Returns None or a completed cleaning command from the agent."""
|
||||
commands = self._client.get_commands_status(task.node)
|
||||
def _get_completed_cleaning_command(self, task, commands):
|
||||
"""Returns None or a completed cleaning command from the agent.
|
||||
|
||||
:param commands: a set of command results from the agent, typically
|
||||
fetched with agent_client.get_commands_status()
|
||||
"""
|
||||
if not commands:
|
||||
return
|
||||
|
||||
|
@ -958,6 +958,77 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
self.passthru.continue_cleaning(task)
|
||||
notify_mock.assert_called_once_with(mock.ANY, task)
|
||||
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
def test__cleaning_reboot(self, mock_reboot):
|
||||
with task_manager.acquire(self.context, self.node['uuid'],
|
||||
shared=False) as task:
|
||||
self.passthru._cleaning_reboot(task)
|
||||
mock_reboot.assert_called_once_with(task, states.REBOOT)
|
||||
self.assertTrue(task.node.driver_internal_info.get(
|
||||
'cleaning_reboot'))
|
||||
|
||||
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
def test__cleaning_reboot_fail(self, mock_reboot, mock_handler):
|
||||
mock_reboot.side_effect = iter([RuntimeError("broken")])
|
||||
|
||||
with task_manager.acquire(self.context, self.node['uuid'],
|
||||
shared=False) as task:
|
||||
self.passthru._cleaning_reboot(task)
|
||||
mock_reboot.assert_called_once_with(task, states.REBOOT)
|
||||
mock_handler.assert_called_once_with(task, mock.ANY)
|
||||
self.assertNotIn('cleaning_reboot',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
|
||||
autospec=True)
|
||||
def test_continue_cleaning_reboot(self, status_mock, reboot_mock):
|
||||
# Test a successful execute clean step on the agent, with reboot
|
||||
self.node.clean_step = {
|
||||
'priority': 42,
|
||||
'interface': 'deploy',
|
||||
'step': 'reboot_me_afterwards',
|
||||
'reboot_requested': True
|
||||
}
|
||||
self.node.save()
|
||||
status_mock.return_value = [{
|
||||
'command_status': 'SUCCEEDED',
|
||||
'command_name': 'execute_clean_step',
|
||||
'command_result': {
|
||||
'clean_step': self.node.clean_step
|
||||
}
|
||||
}]
|
||||
with task_manager.acquire(self.context, self.node['uuid'],
|
||||
shared=False) as task:
|
||||
self.passthru.continue_cleaning(task)
|
||||
reboot_mock.assert_called_once_with(task, states.REBOOT)
|
||||
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
'notify_conductor_resume_clean', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
|
||||
autospec=True)
|
||||
def test_continue_cleaning_after_reboot(self, status_mock, notify_mock):
|
||||
# Test a successful execute clean step on the agent, with reboot
|
||||
self.node.clean_step = {
|
||||
'priority': 42,
|
||||
'interface': 'deploy',
|
||||
'step': 'reboot_me_afterwards',
|
||||
'reboot_requested': True
|
||||
}
|
||||
driver_internal_info = self.node.driver_internal_info
|
||||
driver_internal_info['cleaning_reboot'] = True
|
||||
self.node.driver_internal_info = driver_internal_info
|
||||
self.node.save()
|
||||
# Represents a freshly booted agent with no commands
|
||||
status_mock.return_value = []
|
||||
with task_manager.acquire(self.context, self.node['uuid'],
|
||||
shared=False) as task:
|
||||
self.passthru.continue_cleaning(task)
|
||||
notify_mock.assert_called_once_with(mock.ANY, task)
|
||||
self.assertNotIn('cleaning_reboot',
|
||||
task.node.driver_internal_info)
|
||||
|
||||
@mock.patch.object(agent_base_vendor,
|
||||
'_get_post_clean_step_hook', autospec=True)
|
||||
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
features:
|
||||
- This adds the reboot_requested option for in-band
|
||||
cleaning. If set to true, Ironic will reboot the node
|
||||
after that step has completed and before continuing
|
||||
with the next step. This option is useful for when
|
||||
some action, such as a BIOS upgrade or setting change,
|
||||
requires a reboot to take effect.
|
Loading…
Reference in New Issue
Block a user