Support reboot_requested bool on agent clean_steps
After IPA has completed a cleaning step such as upgrading firmware, it
may require a reboot. Ironic should be the one controlling the power
state, and out-of-band reboots are preferable to in-band reboots.

Change-Id: Iee3aeea198c556ab9adb9dcfd949d9cffb51418c
Co-Authored-By: Josh Gachnang <josh@servercobra.com>
Closes-bug: 1526918
commit bf985bcdd9
parent 797e6f20e0
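In rough terms, the change works like this: when a completed clean step carries reboot_requested, the conductor power-cycles the node itself and records a cleaning_reboot flag; the first empty heartbeat from the freshly booted agent then clears the flag and resumes cleaning. Below is a minimal, self-contained sketch of that handshake — the function and variable names are illustrative only, not Ironic's API; the real implementation is in the diff that follows.

# Illustrative sketch of the reboot handshake added by this change.
# Plain dicts stand in for the node object; names are not Ironic's API.


def finish_clean_step(node, clean_step, reboot_node):
    """Called after the agent reports a clean step as SUCCEEDED."""
    if not clean_step.get('reboot_requested'):
        return 'continue-to-next-step'
    # Out-of-band power action, driven by the conductor rather than IPA.
    reboot_node(node)
    # Only record the flag once the reboot actually happened, so a failed
    # power action cannot leave stale state behind.
    node['driver_internal_info']['cleaning_reboot'] = True
    return 'waiting-for-agent-to-come-back'


def on_heartbeat(node, agent_commands):
    """Called when the agent heartbeats during cleaning."""
    if not agent_commands and node['driver_internal_info'].pop(
            'cleaning_reboot', None):
        # Freshly booted agent with no commands: the reboot we asked for
        # has happened, so resume cleaning with the next step.
        return 'resume-cleaning'
    return 'inspect-agent-commands'


node = {'driver_internal_info': {}}
step = {'step': 'upgrade_firmware', 'reboot_requested': True}
print(finish_clean_step(node, step, reboot_node=lambda n: None))
print(on_heartbeat(node, agent_commands=[]))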
@@ -207,8 +207,7 @@ ironic-python-agent ships with a minimal cleaning configuration, only erasing
 disks. However, with this ramdisk, you can add your own cleaning steps and/or
 override default cleaning steps with a custom Hardware Manager.
 
-There is currently no support for in-band cleaning using the ironic pxe
-ramdisk.
+In-band cleaning is not supported by the deprecated bash ramdisk.
 
 Out-of-band
 -----------
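As a concrete illustration of the custom Hardware Manager mentioned above, a clean step that asks for a reboot could be declared in ironic-python-agent roughly as follows. This is a hypothetical sketch: the class, step name, and priority are made up, and the get_clean_steps contract is assumed from IPA's HardwareManager interface.

# Hypothetical example of a custom ironic-python-agent hardware manager
# whose clean step asks Ironic for an out-of-band reboot afterwards.
from ironic_python_agent import hardware


class ExampleFirmwareManager(hardware.HardwareManager):

    def evaluate_hardware_support(self):
        # Claim this manager over the generic one on matching hardware.
        return hardware.HardwareSupport.SERVICE_PROVIDER

    def get_clean_steps(self, node, ports):
        return [{
            'step': 'upgrade_firmware',
            'priority': 10,
            'interface': 'deploy',
            # New flag: Ironic reboots the node out of band once this
            # step reports success, before running the next step.
            'reboot_requested': True,
        }]

    def upgrade_firmware(self, node, ports):
        # Flash the firmware here; the new image takes effect after the
        # reboot Ironic performs when the step completes.
        pass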
@@ -210,6 +210,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
         node.clean_step = {}
         info = node.driver_internal_info
         info.pop('clean_step_index', None)
+        # Clear any leftover metadata about cleaning reboots
+        info.pop('cleaning_reboot', None)
         node.driver_internal_info = info
     # For manual cleaning, the target provision state is MANAGEABLE, whereas
     # for automated cleaning, it is AVAILABLE.
@@ -289,19 +289,39 @@ class BaseAgentVendor(base.VendorInterface):
         unable to tell if an ordering change will cause a cleaning issue so
         it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
         the entire cleaning cycle. For manual cleaning, we don't.
+
+        Additionally, if a clean_step includes the reboot_requested property
+        set to True, this method will coordinate the reboot once the step is
+        completed.
         """
         node = task.node
         # For manual clean, the target provision state is MANAGEABLE, whereas
         # for automated cleaning, it is (the default) AVAILABLE.
         manual_clean = node.target_provision_state == states.MANAGEABLE
-        command = self._get_completed_cleaning_command(task)
+        agent_commands = self._client.get_commands_status(task.node)
+
+        if (not agent_commands and
+                task.node.driver_internal_info.get('cleaning_reboot')):
+            # Node finished a cleaning step that requested a reboot, and
+            # this is the first heartbeat after booting. Continue cleaning.
+            info = task.node.driver_internal_info
+            info.pop('cleaning_reboot', None)
+            task.node.driver_internal_info = info
+            self.notify_conductor_resume_clean(task)
+            return
+
+        if not agent_commands:
+            # Agent has no commands whatsoever
+            return
+
+        command = self._get_completed_cleaning_command(task, agent_commands)
         LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
                   ' %(command)s', {'node': node.uuid,
                                    'step': node.clean_step,
                                    'command': command})
 
         if not command:
-            # Command is not done yet
+            # Agent command in progress
             return
 
         if command.get('command_status') == 'FAILED':
@@ -377,6 +397,10 @@ class BaseAgentVendor(base.VendorInterface):
                     LOG.exception(msg)
                     return manager_utils.cleaning_error_handler(task, msg)
 
+            if task.node.clean_step.get('reboot_requested'):
+                self._cleaning_reboot(task)
+                return
+
             LOG.info(_LI('Agent on node %s returned cleaning command success, '
                          'moving to next clean step'), node.uuid)
             self.notify_conductor_resume_clean(task)
@@ -389,6 +413,34 @@ class BaseAgentVendor(base.VendorInterface):
             LOG.error(msg)
             return manager_utils.cleaning_error_handler(task, msg)
 
+    def _cleaning_reboot(self, task):
+        """Reboots a node out of band after a clean step that requires it.
+
+        If an agent clean step has 'reboot_requested': True, reboots the
+        node when the step is completed. Will put the node in CLEANFAIL
+        if the node cannot be rebooted.
+
+        :param task: a TaskManager instance
+        """
+        try:
+            manager_utils.node_power_action(task, states.REBOOT)
+        except Exception as e:
+            msg = (_('Reboot requested by clean step %(step)s failed for '
+                     'node %(node)s: %(err)s') %
+                   {'step': task.node.clean_step,
+                    'node': task.node.uuid,
+                    'err': e})
+            LOG.error(msg)
+            # do not set cleaning_reboot if we didn't reboot
+            manager_utils.cleaning_error_handler(task, msg)
+            return
+
+        # Signify that we've rebooted
+        driver_internal_info = task.node.driver_internal_info
+        driver_internal_info['cleaning_reboot'] = True
+        task.node.driver_internal_info = driver_internal_info
+        task.node.save()
+
     @base.passthru(['POST'])
     @task_manager.require_exclusive_lock
     def heartbeat(self, task, **kwargs):
@@ -540,9 +592,12 @@ class BaseAgentVendor(base.VendorInterface):
                 'node': node.as_dict()
             }
 
-    def _get_completed_cleaning_command(self, task):
-        """Returns None or a completed cleaning command from the agent."""
-        commands = self._client.get_commands_status(task.node)
+    def _get_completed_cleaning_command(self, task, commands):
+        """Returns None or a completed cleaning command from the agent.
+
+        :param commands: a set of command results from the agent, typically
+                         fetched with agent_client.get_commands_status()
+        """
         if not commands:
             return
 
@@ -958,6 +958,77 @@ class TestBaseAgentVendor(db_base.DbTestCase):
             self.passthru.continue_cleaning(task)
         notify_mock.assert_called_once_with(mock.ANY, task)
 
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    def test__cleaning_reboot(self, mock_reboot):
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.passthru._cleaning_reboot(task)
+            mock_reboot.assert_called_once_with(task, states.REBOOT)
+            self.assertTrue(task.node.driver_internal_info.get(
+                'cleaning_reboot'))
+
+    @mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    def test__cleaning_reboot_fail(self, mock_reboot, mock_handler):
+        mock_reboot.side_effect = iter([RuntimeError("broken")])
+
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.passthru._cleaning_reboot(task)
+            mock_reboot.assert_called_once_with(task, states.REBOOT)
+            mock_handler.assert_called_once_with(task, mock.ANY)
+            self.assertNotIn('cleaning_reboot',
+                             task.node.driver_internal_info)
+
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_reboot(self, status_mock, reboot_mock):
+        # Test a successful execute clean step on the agent, with reboot
+        self.node.clean_step = {
+            'priority': 42,
+            'interface': 'deploy',
+            'step': 'reboot_me_afterwards',
+            'reboot_requested': True
+        }
+        self.node.save()
+        status_mock.return_value = [{
+            'command_status': 'SUCCEEDED',
+            'command_name': 'execute_clean_step',
+            'command_result': {
+                'clean_step': self.node.clean_step
+            }
+        }]
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.passthru.continue_cleaning(task)
+            reboot_mock.assert_called_once_with(task, states.REBOOT)
+
+    @mock.patch.object(agent_base_vendor.BaseAgentVendor,
+                       'notify_conductor_resume_clean', autospec=True)
+    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
+                       autospec=True)
+    def test_continue_cleaning_after_reboot(self, status_mock, notify_mock):
+        # Test a successful execute clean step on the agent, with reboot
+        self.node.clean_step = {
+            'priority': 42,
+            'interface': 'deploy',
+            'step': 'reboot_me_afterwards',
+            'reboot_requested': True
+        }
+        driver_internal_info = self.node.driver_internal_info
+        driver_internal_info['cleaning_reboot'] = True
+        self.node.driver_internal_info = driver_internal_info
+        self.node.save()
+        # Represents a freshly booted agent with no commands
+        status_mock.return_value = []
+        with task_manager.acquire(self.context, self.node['uuid'],
+                                  shared=False) as task:
+            self.passthru.continue_cleaning(task)
+            notify_mock.assert_called_once_with(mock.ANY, task)
+            self.assertNotIn('cleaning_reboot',
+                             task.node.driver_internal_info)
+
     @mock.patch.object(agent_base_vendor,
                        '_get_post_clean_step_hook', autospec=True)
     @mock.patch.object(agent_base_vendor.BaseAgentVendor,
@@ -0,0 +1,8 @@
+---
+features:
+  - This adds the reboot_requested option for in-band
+    cleaning. If set to true, Ironic will reboot the node
+    after that step has completed and before continuing
+    with the next step. This option is useful for when
+    some action, such as a BIOS upgrade or setting change,
+    requires a reboot to take effect.