Agent: Out-of-band power off on deploy
The patch is adding a new property called "deploy_forces_oob_reboot" for the Agent deployment methodology. If the property is specified in the node's driver_info with the value of True, Ironic will call IPA to flush the file system and then issue a out-of-band reboot at the end of the deployment to the node to boot into the user's image. Some hardware/firmware may have problems doing a power off in-band (usually faulty hardware/bad firmware, see the bug this commit is fixing) therefore we need Ironic to do it directly in the BMC. This patch also closes one more consistency gap between the IPA deploy model and the now deprecated bash ramdisk one which used the hard reboot to finish the deployment by default. Depends-On: I5cd1d1b821426e995dc584452494b93ab23917e0 Closes-Bug: #1512492 Change-Id: I5e389c5245826f86466aedfcb9730117e24b59db
This commit is contained in:
parent
0ad5b13b5a
commit
824ad1676b
@ -104,3 +104,23 @@ Steps to enable proxies
|
||||
``image_no_proxy`` to driver_info properties in each node that will use the
|
||||
proxy. Please refer to ``ironic driver-properties`` output of the
|
||||
``agent_*`` driver you're using for descriptions of these properties.
|
||||
|
||||
Advanced configuration
|
||||
======================
|
||||
|
||||
Out-of-band Vs. in-band power off on deploy
|
||||
-------------------------------------------
|
||||
|
||||
After deploying an image onto the node's hard disk Ironic will reboot
|
||||
the machine into the new image. By default this power action happens
|
||||
``in-band``, meaning that the ironic-conductor will instruct the IPA
|
||||
ramdisk to power itself off.
|
||||
|
||||
Some hardware may have a problem with the default approach and
|
||||
would require Ironic to talk directly to the management controller
|
||||
to switch the power off and on again. In order to tell Ironic to do
|
||||
that you have to update the node's ``driver_info`` field and set the
|
||||
``deploy_forces_oob_reboot`` parameter with the value of **True**. For
|
||||
example, the below command sets this configuration in a specific node::
|
||||
|
||||
ironic node-update <UUID or name> add driver_info/deploy_forces_oob_reboot=True
|
||||
|
@ -22,6 +22,7 @@ import time
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
from oslo_utils import excutils
|
||||
from oslo_utils import strutils
|
||||
from oslo_utils import timeutils
|
||||
import retrying
|
||||
|
||||
@ -82,6 +83,13 @@ LOG = log.getLogger(__name__)
|
||||
# completing 'delete_configuration' of raid interface.
|
||||
POST_CLEAN_STEP_HOOKS = {}
|
||||
|
||||
VENDOR_PROPERTIES = {
|
||||
'deploy_forces_oob_reboot': _(
|
||||
'Whether Ironic should force a reboot of the Node via the out-of-band '
|
||||
'channel after deployment is complete. Provides compatiblity with '
|
||||
'older deploy ramdisks. Defaults to False. Optional.')
|
||||
}
|
||||
|
||||
|
||||
def _get_client():
|
||||
client = agent_client.AgentClient()
|
||||
@ -178,9 +186,7 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
|
||||
:returns: dictionary of <property name>:<property description> entries.
|
||||
"""
|
||||
# NOTE(jroll) all properties are set by the driver,
|
||||
# not by the operator.
|
||||
return {}
|
||||
return VENDOR_PROPERTIES
|
||||
|
||||
def validate(self, task, method, **kwargs):
|
||||
"""Validate the driver-specific Node deployment info.
|
||||
@ -688,18 +694,38 @@ class BaseAgentVendor(base.VendorInterface):
|
||||
return task.driver.power.get_power_state(task)
|
||||
|
||||
node = task.node
|
||||
# Whether ironic should power off the node via out-of-band or
|
||||
# in-band methods
|
||||
oob_power_off = strutils.bool_from_string(
|
||||
node.driver_info.get('deploy_forces_oob_reboot', False))
|
||||
|
||||
try:
|
||||
try:
|
||||
self._client.power_off(node)
|
||||
_wait_until_powered_off(task)
|
||||
except Exception as e:
|
||||
LOG.warning(
|
||||
_LW('Failed to soft power off node %(node_uuid)s '
|
||||
'in at least %(timeout)d seconds. Error: %(error)s'),
|
||||
{'node_uuid': node.uuid,
|
||||
'timeout': (wait * (attempts - 1)) / 1000,
|
||||
'error': e})
|
||||
if not oob_power_off:
|
||||
try:
|
||||
self._client.power_off(node)
|
||||
_wait_until_powered_off(task)
|
||||
except Exception as e:
|
||||
LOG.warning(
|
||||
_LW('Failed to soft power off node %(node_uuid)s '
|
||||
'in at least %(timeout)d seconds. '
|
||||
'Error: %(error)s'),
|
||||
{'node_uuid': node.uuid,
|
||||
'timeout': (wait * (attempts - 1)) / 1000,
|
||||
'error': e})
|
||||
else:
|
||||
# Flush the file system prior to hard rebooting the node
|
||||
result = self._client.sync(node)
|
||||
error = result.get('faultstring')
|
||||
if error:
|
||||
if 'Unknown command' in error:
|
||||
error = _('The version of the IPA ramdisk used in '
|
||||
'the deployment do not support the '
|
||||
'command "sync"')
|
||||
LOG.warning(_LW(
|
||||
'Failed to flush the file system prior to hard '
|
||||
'rebooting the node %(node)s. Error: %(error)s'),
|
||||
{'node': node.uuid, 'error': error})
|
||||
|
||||
manager_utils.node_power_action(task, states.REBOOT)
|
||||
except Exception as e:
|
||||
msg = (_('Error rebooting node %(node)s after deploy. '
|
||||
|
@ -174,3 +174,10 @@ class AgentClient(object):
|
||||
return self._command(node=node,
|
||||
method='standby.power_off',
|
||||
params={})
|
||||
|
||||
def sync(self, node):
|
||||
"""Flush file system buffers forcing changed blocks to disk."""
|
||||
return self._command(node=node,
|
||||
method='standby.sync',
|
||||
params={},
|
||||
wait=True)
|
||||
|
@ -57,9 +57,6 @@ class IloVirtualMediaAgentVendorInterface(agent.AgentVendorInterface):
|
||||
class VendorPassthru(iscsi_deploy.VendorPassthru):
|
||||
"""Vendor-specific interfaces for iLO deploy drivers."""
|
||||
|
||||
def get_properties(self):
|
||||
return {}
|
||||
|
||||
def validate(self, task, method, **kwargs):
|
||||
"""Validate vendor-specific actions.
|
||||
|
||||
|
@ -701,9 +701,6 @@ class ISCSIDeploy(base.DeployInterface):
|
||||
class VendorPassthru(agent_base_vendor.BaseAgentVendor):
|
||||
"""Interface to mix IPMI and PXE vendor-specific interfaces."""
|
||||
|
||||
def get_properties(self):
|
||||
return {}
|
||||
|
||||
def validate(self, task, method, **kwargs):
|
||||
"""Validates the inputs for a vendor passthru.
|
||||
|
||||
|
@ -3803,7 +3803,8 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
|
||||
self._check_driver_properties("fake_ssh", expected)
|
||||
|
||||
def test_driver_properties_fake_pxe(self):
|
||||
expected = ['deploy_kernel', 'deploy_ramdisk']
|
||||
expected = ['deploy_kernel', 'deploy_ramdisk',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("fake_pxe", expected)
|
||||
|
||||
def test_driver_properties_fake_seamicro(self):
|
||||
@ -3824,34 +3825,37 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
|
||||
'ipmi_transit_address', 'ipmi_target_channel',
|
||||
'ipmi_target_address', 'ipmi_local_address',
|
||||
'deploy_kernel', 'deploy_ramdisk', 'ipmi_protocol_version',
|
||||
'ipmi_force_boot_device'
|
||||
]
|
||||
'ipmi_force_boot_device', 'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("pxe_ipmitool", expected)
|
||||
|
||||
def test_driver_properties_pxe_ipminative(self):
|
||||
expected = ['ipmi_address', 'ipmi_password', 'ipmi_username',
|
||||
'deploy_kernel', 'deploy_ramdisk',
|
||||
'ipmi_terminal_port', 'ipmi_force_boot_device']
|
||||
'ipmi_terminal_port', 'ipmi_force_boot_device',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("pxe_ipminative", expected)
|
||||
|
||||
def test_driver_properties_pxe_ssh(self):
|
||||
expected = ['deploy_kernel', 'deploy_ramdisk',
|
||||
'ssh_address', 'ssh_username', 'ssh_virt_type',
|
||||
'ssh_key_contents', 'ssh_key_filename',
|
||||
'ssh_password', 'ssh_port', 'ssh_terminal_port']
|
||||
'ssh_password', 'ssh_port', 'ssh_terminal_port',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("pxe_ssh", expected)
|
||||
|
||||
def test_driver_properties_pxe_seamicro(self):
|
||||
expected = ['deploy_kernel', 'deploy_ramdisk',
|
||||
'seamicro_api_endpoint', 'seamicro_password',
|
||||
'seamicro_server_id', 'seamicro_username',
|
||||
'seamicro_api_version', 'seamicro_terminal_port']
|
||||
'seamicro_api_version', 'seamicro_terminal_port',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("pxe_seamicro", expected)
|
||||
|
||||
def test_driver_properties_pxe_snmp(self):
|
||||
expected = ['deploy_kernel', 'deploy_ramdisk',
|
||||
'snmp_driver', 'snmp_address', 'snmp_port', 'snmp_version',
|
||||
'snmp_community', 'snmp_security', 'snmp_outlet']
|
||||
'snmp_community', 'snmp_security', 'snmp_outlet',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("pxe_snmp", expected)
|
||||
|
||||
def test_driver_properties_fake_ilo(self):
|
||||
@ -3862,13 +3866,15 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
|
||||
def test_driver_properties_ilo_iscsi(self):
|
||||
expected = ['ilo_address', 'ilo_username', 'ilo_password',
|
||||
'client_port', 'client_timeout', 'ilo_deploy_iso',
|
||||
'console_port', 'ilo_change_password']
|
||||
'console_port', 'ilo_change_password',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("iscsi_ilo", expected)
|
||||
|
||||
def test_driver_properties_agent_ilo(self):
|
||||
expected = ['ilo_address', 'ilo_username', 'ilo_password',
|
||||
'client_port', 'client_timeout', 'ilo_deploy_iso',
|
||||
'console_port', 'ilo_change_password']
|
||||
'console_port', 'ilo_change_password',
|
||||
'deploy_forces_oob_reboot']
|
||||
self._check_driver_properties("agent_ilo", expected)
|
||||
|
||||
def test_driver_properties_fail(self):
|
||||
|
@ -684,6 +684,60 @@ class TestBaseAgentVendor(db_base.DbTestCase):
|
||||
self.assertEqual(states.DEPLOYFAIL, task.node.provision_state)
|
||||
self.assertEqual(states.ACTIVE, task.node.target_provision_state)
|
||||
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'sync',
|
||||
spec=types.FunctionType)
|
||||
def test_reboot_and_finish_deploy_power_action_oob_power_off(
|
||||
self, sync_mock, node_power_action_mock):
|
||||
# Enable force power off
|
||||
driver_info = self.node.driver_info
|
||||
driver_info['deploy_forces_oob_reboot'] = True
|
||||
self.node.driver_info = driver_info
|
||||
|
||||
self.node.provision_state = states.DEPLOYING
|
||||
self.node.target_provision_state = states.ACTIVE
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
self.passthru.reboot_and_finish_deploy(task)
|
||||
|
||||
sync_mock.assert_called_once_with(task.node)
|
||||
node_power_action_mock.assert_called_once_with(
|
||||
task, states.REBOOT)
|
||||
self.assertEqual(states.ACTIVE, task.node.provision_state)
|
||||
self.assertEqual(states.NOSTATE, task.node.target_provision_state)
|
||||
|
||||
@mock.patch.object(agent_base_vendor.LOG, 'warning', autospec=True)
|
||||
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
|
||||
@mock.patch.object(agent_client.AgentClient, 'sync',
|
||||
spec=types.FunctionType)
|
||||
def test_reboot_and_finish_deploy_power_action_oob_power_off_failed(
|
||||
self, sync_mock, node_power_action_mock, log_mock):
|
||||
# Enable force power off
|
||||
driver_info = self.node.driver_info
|
||||
driver_info['deploy_forces_oob_reboot'] = True
|
||||
self.node.driver_info = driver_info
|
||||
|
||||
self.node.provision_state = states.DEPLOYING
|
||||
self.node.target_provision_state = states.ACTIVE
|
||||
self.node.save()
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
sync_mock.return_value = {'faultstring': 'Unknown command: blah'}
|
||||
self.passthru.reboot_and_finish_deploy(task)
|
||||
|
||||
sync_mock.assert_called_once_with(task.node)
|
||||
node_power_action_mock.assert_called_once_with(
|
||||
task, states.REBOOT)
|
||||
self.assertEqual(states.ACTIVE, task.node.provision_state)
|
||||
self.assertEqual(states.NOSTATE, task.node.target_provision_state)
|
||||
log_error = ('The version of the IPA ramdisk used in the '
|
||||
'deployment do not support the command "sync"')
|
||||
log_mock.assert_called_once_with(
|
||||
'Failed to flush the file system prior to hard rebooting the '
|
||||
'node %(node)s. Error: %(error)s',
|
||||
{'node': task.node.uuid, 'error': log_error})
|
||||
|
||||
@mock.patch.object(agent_client.AgentClient, 'install_bootloader',
|
||||
autospec=True)
|
||||
@mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True)
|
||||
@ -1276,3 +1330,7 @@ class TestRefreshCleanSteps(TestBaseAgentVendor):
|
||||
task)
|
||||
client_mock.assert_called_once_with(mock.ANY, task.node,
|
||||
task.ports)
|
||||
|
||||
def test_get_properties(self):
|
||||
expected = agent_base_vendor.VENDOR_PROPERTIES
|
||||
self.assertEqual(expected, self.passthru.get_properties())
|
||||
|
@ -236,3 +236,9 @@ class TestAgentClient(base.TestCase):
|
||||
self.client.power_off(self.node)
|
||||
self.client._command.assert_called_once_with(
|
||||
node=self.node, method='standby.power_off', params={})
|
||||
|
||||
def test_sync(self):
|
||||
self.client._command = mock.MagicMock(spec_set=[])
|
||||
self.client.sync(self.node)
|
||||
self.client._command.assert_called_once_with(
|
||||
node=self.node, method='standby.sync', params={}, wait=True)
|
||||
|
@ -32,6 +32,7 @@ from ironic.common.glance_service import base_image_service
|
||||
from ironic.common import pxe_utils
|
||||
from ironic.common import states
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.drivers.modules import agent_base_vendor
|
||||
from ironic.drivers.modules import deploy_utils
|
||||
from ironic.drivers.modules import pxe
|
||||
from ironic.tests.unit.conductor import mgr_utils
|
||||
@ -549,6 +550,7 @@ class PXEBootTestCase(db_base.DbTestCase):
|
||||
|
||||
def test_get_properties(self):
|
||||
expected = pxe.COMMON_PROPERTIES
|
||||
expected.update(agent_base_vendor.VENDOR_PROPERTIES)
|
||||
with task_manager.acquire(self.context, self.node.uuid,
|
||||
shared=True) as task:
|
||||
self.assertEqual(expected, task.driver.get_properties())
|
||||
|
7
releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml
Normal file
7
releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
---
|
||||
fixes:
|
||||
- Fixes a problem where some hardware/firmware (specially faulty ones)
|
||||
won't come back online after an in-band ACPI soft power off by adding
|
||||
a new driver property called "deploy_forces_oob_reboot" that can be set
|
||||
to the nodes being deployed by the IPA ramdisk. If the value of this
|
||||
property is True, Ironic will power cycle the node via out-of-band.
|
Loading…
Reference in New Issue
Block a user