Support reboot_requested bool on agent clean_steps

After IPA has completed a cleaning step such as upgrading firmware, it
may require a reboot. Ironic should be the one controlling the power
state and out of band reboots are preferable to in band reboots.

Change-Id: Iee3aeea198c556ab9adb9dcfd949d9cffb51418c
Co-Authored-By: Josh Gachnang <josh@servercobra.com>
Closes-Bug: #1526918
This commit is contained in:
Jay Faulkner 2016-03-23 18:46:03 +00:00
parent 797e6f20e0
commit bf985bcdd9
5 changed files with 142 additions and 7 deletions

View File

@ -207,8 +207,7 @@ ironic-python-agent ships with a minimal cleaning configuration, only erasing
disks. However, with this ramdisk, you can add your own cleaning steps and/or disks. However, with this ramdisk, you can add your own cleaning steps and/or
override default cleaning steps with a custom Hardware Manager. override default cleaning steps with a custom Hardware Manager.
There is currently no support for in-band cleaning using the ironic pxe In-band cleaning is not supported by the deprecated bash ramdisk.
ramdisk.
Out-of-band Out-of-band
----------- -----------

View File

@ -210,6 +210,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
node.clean_step = {} node.clean_step = {}
info = node.driver_internal_info info = node.driver_internal_info
info.pop('clean_step_index', None) info.pop('clean_step_index', None)
# Clear any leftover metadata about cleaning reboots
info.pop('cleaning_reboot', None)
node.driver_internal_info = info node.driver_internal_info = info
# For manual cleaning, the target provision state is MANAGEABLE, whereas # For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE. # for automated cleaning, it is AVAILABLE.

View File

@ -289,19 +289,39 @@ class BaseAgentVendor(base.VendorInterface):
unable to tell if an ordering change will cause a cleaning issue so unable to tell if an ordering change will cause a cleaning issue so
it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart it returns CLEAN_VERSION_MISMATCH. For automated cleaning, we restart
the entire cleaning cycle. For manual cleaning, we don't. the entire cleaning cycle. For manual cleaning, we don't.
Additionally, if a clean_step includes the reboot_requested property
set to True, this method will coordinate the reboot once the step is
completed.
""" """
node = task.node node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas # For manual clean, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is (the default) AVAILABLE. # for automated cleaning, it is (the default) AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE manual_clean = node.target_provision_state == states.MANAGEABLE
command = self._get_completed_cleaning_command(task) agent_commands = self._client.get_commands_status(task.node)
if (not agent_commands and
task.node.driver_internal_info.get('cleaning_reboot')):
# Node finished a cleaning step that requested a reboot, and
# this is the first heartbeat after booting. Continue cleaning.
info = task.node.driver_internal_info
info.pop('cleaning_reboot', None)
task.node.driver_internal_info = info
self.notify_conductor_resume_clean(task)
return
if not agent_commands:
# Agent has no commands whatsoever
return
command = self._get_completed_cleaning_command(task, agent_commands)
LOG.debug('Cleaning command status for node %(node)s on step %(step)s:' LOG.debug('Cleaning command status for node %(node)s on step %(step)s:'
' %(command)s', {'node': node.uuid, ' %(command)s', {'node': node.uuid,
'step': node.clean_step, 'step': node.clean_step,
'command': command}) 'command': command})
if not command: if not command:
# Command is not done yet # Agent command in progress
return return
if command.get('command_status') == 'FAILED': if command.get('command_status') == 'FAILED':
@ -377,6 +397,10 @@ class BaseAgentVendor(base.VendorInterface):
LOG.exception(msg) LOG.exception(msg)
return manager_utils.cleaning_error_handler(task, msg) return manager_utils.cleaning_error_handler(task, msg)
if task.node.clean_step.get('reboot_requested'):
self._cleaning_reboot(task)
return
LOG.info(_LI('Agent on node %s returned cleaning command success, ' LOG.info(_LI('Agent on node %s returned cleaning command success, '
'moving to next clean step'), node.uuid) 'moving to next clean step'), node.uuid)
self.notify_conductor_resume_clean(task) self.notify_conductor_resume_clean(task)
@ -389,6 +413,34 @@ class BaseAgentVendor(base.VendorInterface):
LOG.error(msg) LOG.error(msg)
return manager_utils.cleaning_error_handler(task, msg) return manager_utils.cleaning_error_handler(task, msg)
def _cleaning_reboot(self, task):
    """Reboot a node out of band after a clean step that requires it.

    If an agent clean step has 'reboot_requested': True, reboots the
    node when the step is completed. Will put the node in CLEANFAIL
    if the node cannot be rebooted.

    :param task: a TaskManager instance
    """
    node = task.node
    try:
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Reboot requested by clean step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': node.clean_step,
                'node': node.uuid,
                'err': e})
        LOG.error(msg)
        # The reboot never happened, so do not record a pending
        # cleaning reboot -- just fail the cleaning cycle.
        manager_utils.cleaning_error_handler(task, msg)
        return

    # Record that a cleaning reboot is in flight so the next heartbeat
    # knows to resume cleaning rather than wait for a command result.
    info = node.driver_internal_info
    info['cleaning_reboot'] = True
    node.driver_internal_info = info
    node.save()
@base.passthru(['POST']) @base.passthru(['POST'])
@task_manager.require_exclusive_lock @task_manager.require_exclusive_lock
def heartbeat(self, task, **kwargs): def heartbeat(self, task, **kwargs):
@ -540,9 +592,12 @@ class BaseAgentVendor(base.VendorInterface):
'node': node.as_dict() 'node': node.as_dict()
} }
def _get_completed_cleaning_command(self, task): def _get_completed_cleaning_command(self, task, commands):
"""Returns None or a completed cleaning command from the agent.""" """Returns None or a completed cleaning command from the agent.
commands = self._client.get_commands_status(task.node)
:param commands: a set of command results from the agent, typically
fetched with agent_client.get_commands_status()
"""
if not commands: if not commands:
return return

View File

@ -958,6 +958,77 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.passthru.continue_cleaning(task) self.passthru.continue_cleaning(task)
notify_mock.assert_called_once_with(mock.ANY, task) notify_mock.assert_called_once_with(mock.ANY, task)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test__cleaning_reboot(self, mock_reboot):
    """A successful reboot sets the 'cleaning_reboot' flag on the node."""
    with task_manager.acquire(self.context, self.node['uuid'],
                              shared=False) as task:
        self.passthru._cleaning_reboot(task)
        mock_reboot.assert_called_once_with(task, states.REBOOT)
        info = task.node.driver_internal_info
        self.assertTrue(info.get('cleaning_reboot'))
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
def test__cleaning_reboot_fail(self, mock_reboot, mock_handler):
    """A failed reboot invokes the error handler and sets no flag."""
    # side_effect may be an exception instance directly; it is raised
    # on each call, which is equivalent to iter([exc]) for one call.
    mock_reboot.side_effect = RuntimeError("broken")
    with task_manager.acquire(self.context, self.node['uuid'],
                              shared=False) as task:
        self.passthru._cleaning_reboot(task)
        mock_reboot.assert_called_once_with(task, states.REBOOT)
        mock_handler.assert_called_once_with(task, mock.ANY)
        self.assertNotIn('cleaning_reboot',
                         task.node.driver_internal_info)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                   autospec=True)
def test_continue_cleaning_reboot(self, status_mock, reboot_mock):
    """A completed step with reboot_requested triggers an OOB reboot."""
    self.node.clean_step = {
        'priority': 42,
        'interface': 'deploy',
        'step': 'reboot_me_afterwards',
        'reboot_requested': True,
    }
    self.node.save()
    # Agent reports the clean step as finished successfully.
    status_mock.return_value = [
        {'command_status': 'SUCCEEDED',
         'command_name': 'execute_clean_step',
         'command_result': {'clean_step': self.node.clean_step}},
    ]
    with task_manager.acquire(self.context, self.node['uuid'],
                              shared=False) as task:
        self.passthru.continue_cleaning(task)
        reboot_mock.assert_called_once_with(task, states.REBOOT)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'notify_conductor_resume_clean', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_continue_cleaning_after_reboot(self, status_mock, notify_mock):
# Test a successful execute clean step on the agent, with reboot
self.node.clean_step = {
'priority': 42,
'interface': 'deploy',
'step': 'reboot_me_afterwards',
'reboot_requested': True
}
driver_internal_info = self.node.driver_internal_info
driver_internal_info['cleaning_reboot'] = True
self.node.driver_internal_info = driver_internal_info
self.node.save()
# Represents a freshly booted agent with no commands
status_mock.return_value = []
with task_manager.acquire(self.context, self.node['uuid'],
shared=False) as task:
self.passthru.continue_cleaning(task)
notify_mock.assert_called_once_with(mock.ANY, task)
self.assertNotIn('cleaning_reboot',
task.node.driver_internal_info)
@mock.patch.object(agent_base_vendor, @mock.patch.object(agent_base_vendor,
'_get_post_clean_step_hook', autospec=True) '_get_post_clean_step_hook', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor, @mock.patch.object(agent_base_vendor.BaseAgentVendor,

View File

@ -0,0 +1,8 @@
---
features:
  - Adds the ``reboot_requested`` option for in-band
    cleaning steps. If set to true, Ironic will reboot the
    node after that step has completed and before continuing
    with the next step. This option is useful when an action,
    such as a BIOS upgrade or a firmware setting change,
    requires a reboot to take effect.