Agent: Out-of-band power off on deploy

The patch is adding  a new property called "deploy_forces_oob_reboot"
for the Agent deployment methodology. If the property is specified in
the node's driver_info with the value of True, Ironic will call IPA to
flush the file system and then issue a out-of-band reboot at the end of
the deployment to the node to boot into the user's image.

Some hardware/firmware may have problems doing a power off in-band
(usually faulty hardware/bad firmware, see the bug this commit is fixing)
therefore we need Ironic to do it directly in the BMC.

This patch also closes one more consistency gap between the IPA deploy
model and the now deprecated bash ramdisk one which used the hard reboot
to finish the deployment by default.

Depends-On: I5cd1d1b821426e995dc584452494b93ab23917e0
Closes-Bug: #1512492
Change-Id: I5e389c5245826f86466aedfcb9730117e24b59db
This commit is contained in:
Lucas Alvares Gomes 2016-03-14 12:20:58 +00:00
parent 0ad5b13b5a
commit 824ad1676b
10 changed files with 154 additions and 28 deletions

View File

@ -104,3 +104,23 @@ Steps to enable proxies
``image_no_proxy`` to driver_info properties in each node that will use the
proxy. Please refer to ``ironic driver-properties`` output of the
``agent_*`` driver you're using for descriptions of these properties.
Advanced configuration
======================
Out-of-band Vs. in-band power off on deploy
-------------------------------------------
After deploying an image onto the node's hard disk Ironic will reboot
the machine into the new image. By default this power action happens
``in-band``, meaning that the ironic-conductor will instruct the IPA
ramdisk to power itself off.
Some hardware may have a problem with the default approach and
would require Ironic to talk directly to the management controller
to switch the power off and on again. In order to tell Ironic to do
that you have to update the node's ``driver_info`` field and set the
``deploy_forces_oob_reboot`` parameter with the value of **True**. For
example, the below command sets this configuration in a specific node::
ironic node-update <UUID or name> add driver_info/deploy_forces_oob_reboot=True

View File

@ -22,6 +22,7 @@ import time
from oslo_config import cfg
from oslo_log import log
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
import retrying
@ -82,6 +83,13 @@ LOG = log.getLogger(__name__)
# completing 'delete_configuration' of raid interface.
POST_CLEAN_STEP_HOOKS = {}
VENDOR_PROPERTIES = {
'deploy_forces_oob_reboot': _(
'Whether Ironic should force a reboot of the Node via the out-of-band '
'channel after deployment is complete. Provides compatiblity with '
'older deploy ramdisks. Defaults to False. Optional.')
}
def _get_client():
client = agent_client.AgentClient()
@ -178,9 +186,7 @@ class BaseAgentVendor(base.VendorInterface):
:returns: dictionary of <property name>:<property description> entries.
"""
# NOTE(jroll) all properties are set by the driver,
# not by the operator.
return {}
return VENDOR_PROPERTIES
def validate(self, task, method, **kwargs):
"""Validate the driver-specific Node deployment info.
@ -688,18 +694,38 @@ class BaseAgentVendor(base.VendorInterface):
return task.driver.power.get_power_state(task)
node = task.node
# Whether ironic should power off the node via out-of-band or
# in-band methods
oob_power_off = strutils.bool_from_string(
node.driver_info.get('deploy_forces_oob_reboot', False))
try:
try:
self._client.power_off(node)
_wait_until_powered_off(task)
except Exception as e:
LOG.warning(
_LW('Failed to soft power off node %(node_uuid)s '
'in at least %(timeout)d seconds. Error: %(error)s'),
{'node_uuid': node.uuid,
'timeout': (wait * (attempts - 1)) / 1000,
'error': e})
if not oob_power_off:
try:
self._client.power_off(node)
_wait_until_powered_off(task)
except Exception as e:
LOG.warning(
_LW('Failed to soft power off node %(node_uuid)s '
'in at least %(timeout)d seconds. '
'Error: %(error)s'),
{'node_uuid': node.uuid,
'timeout': (wait * (attempts - 1)) / 1000,
'error': e})
else:
# Flush the file system prior to hard rebooting the node
result = self._client.sync(node)
error = result.get('faultstring')
if error:
if 'Unknown command' in error:
error = _('The version of the IPA ramdisk used in '
'the deployment do not support the '
'command "sync"')
LOG.warning(_LW(
'Failed to flush the file system prior to hard '
'rebooting the node %(node)s. Error: %(error)s'),
{'node': node.uuid, 'error': error})
manager_utils.node_power_action(task, states.REBOOT)
except Exception as e:
msg = (_('Error rebooting node %(node)s after deploy. '

View File

@ -174,3 +174,10 @@ class AgentClient(object):
return self._command(node=node,
method='standby.power_off',
params={})
def sync(self, node):
"""Flush file system buffers forcing changed blocks to disk."""
return self._command(node=node,
method='standby.sync',
params={},
wait=True)

View File

@ -57,9 +57,6 @@ class IloVirtualMediaAgentVendorInterface(agent.AgentVendorInterface):
class VendorPassthru(iscsi_deploy.VendorPassthru):
"""Vendor-specific interfaces for iLO deploy drivers."""
def get_properties(self):
return {}
def validate(self, task, method, **kwargs):
"""Validate vendor-specific actions.

View File

@ -701,9 +701,6 @@ class ISCSIDeploy(base.DeployInterface):
class VendorPassthru(agent_base_vendor.BaseAgentVendor):
"""Interface to mix IPMI and PXE vendor-specific interfaces."""
def get_properties(self):
return {}
def validate(self, task, method, **kwargs):
"""Validates the inputs for a vendor passthru.

View File

@ -3803,7 +3803,8 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
self._check_driver_properties("fake_ssh", expected)
def test_driver_properties_fake_pxe(self):
expected = ['deploy_kernel', 'deploy_ramdisk']
expected = ['deploy_kernel', 'deploy_ramdisk',
'deploy_forces_oob_reboot']
self._check_driver_properties("fake_pxe", expected)
def test_driver_properties_fake_seamicro(self):
@ -3824,34 +3825,37 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
'ipmi_transit_address', 'ipmi_target_channel',
'ipmi_target_address', 'ipmi_local_address',
'deploy_kernel', 'deploy_ramdisk', 'ipmi_protocol_version',
'ipmi_force_boot_device'
]
'ipmi_force_boot_device', 'deploy_forces_oob_reboot']
self._check_driver_properties("pxe_ipmitool", expected)
def test_driver_properties_pxe_ipminative(self):
expected = ['ipmi_address', 'ipmi_password', 'ipmi_username',
'deploy_kernel', 'deploy_ramdisk',
'ipmi_terminal_port', 'ipmi_force_boot_device']
'ipmi_terminal_port', 'ipmi_force_boot_device',
'deploy_forces_oob_reboot']
self._check_driver_properties("pxe_ipminative", expected)
def test_driver_properties_pxe_ssh(self):
expected = ['deploy_kernel', 'deploy_ramdisk',
'ssh_address', 'ssh_username', 'ssh_virt_type',
'ssh_key_contents', 'ssh_key_filename',
'ssh_password', 'ssh_port', 'ssh_terminal_port']
'ssh_password', 'ssh_port', 'ssh_terminal_port',
'deploy_forces_oob_reboot']
self._check_driver_properties("pxe_ssh", expected)
def test_driver_properties_pxe_seamicro(self):
expected = ['deploy_kernel', 'deploy_ramdisk',
'seamicro_api_endpoint', 'seamicro_password',
'seamicro_server_id', 'seamicro_username',
'seamicro_api_version', 'seamicro_terminal_port']
'seamicro_api_version', 'seamicro_terminal_port',
'deploy_forces_oob_reboot']
self._check_driver_properties("pxe_seamicro", expected)
def test_driver_properties_pxe_snmp(self):
expected = ['deploy_kernel', 'deploy_ramdisk',
'snmp_driver', 'snmp_address', 'snmp_port', 'snmp_version',
'snmp_community', 'snmp_security', 'snmp_outlet']
'snmp_community', 'snmp_security', 'snmp_outlet',
'deploy_forces_oob_reboot']
self._check_driver_properties("pxe_snmp", expected)
def test_driver_properties_fake_ilo(self):
@ -3862,13 +3866,15 @@ class ManagerTestProperties(tests_db_base.DbTestCase):
def test_driver_properties_ilo_iscsi(self):
expected = ['ilo_address', 'ilo_username', 'ilo_password',
'client_port', 'client_timeout', 'ilo_deploy_iso',
'console_port', 'ilo_change_password']
'console_port', 'ilo_change_password',
'deploy_forces_oob_reboot']
self._check_driver_properties("iscsi_ilo", expected)
def test_driver_properties_agent_ilo(self):
expected = ['ilo_address', 'ilo_username', 'ilo_password',
'client_port', 'client_timeout', 'ilo_deploy_iso',
'console_port', 'ilo_change_password']
'console_port', 'ilo_change_password',
'deploy_forces_oob_reboot']
self._check_driver_properties("agent_ilo", expected)
def test_driver_properties_fail(self):

View File

@ -684,6 +684,60 @@ class TestBaseAgentVendor(db_base.DbTestCase):
self.assertEqual(states.DEPLOYFAIL, task.node.provision_state)
self.assertEqual(states.ACTIVE, task.node.target_provision_state)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'sync',
spec=types.FunctionType)
def test_reboot_and_finish_deploy_power_action_oob_power_off(
self, sync_mock, node_power_action_mock):
# Enable force power off
driver_info = self.node.driver_info
driver_info['deploy_forces_oob_reboot'] = True
self.node.driver_info = driver_info
self.node.provision_state = states.DEPLOYING
self.node.target_provision_state = states.ACTIVE
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.passthru.reboot_and_finish_deploy(task)
sync_mock.assert_called_once_with(task.node)
node_power_action_mock.assert_called_once_with(
task, states.REBOOT)
self.assertEqual(states.ACTIVE, task.node.provision_state)
self.assertEqual(states.NOSTATE, task.node.target_provision_state)
@mock.patch.object(agent_base_vendor.LOG, 'warning', autospec=True)
@mock.patch.object(manager_utils, 'node_power_action', autospec=True)
@mock.patch.object(agent_client.AgentClient, 'sync',
spec=types.FunctionType)
def test_reboot_and_finish_deploy_power_action_oob_power_off_failed(
self, sync_mock, node_power_action_mock, log_mock):
# Enable force power off
driver_info = self.node.driver_info
driver_info['deploy_forces_oob_reboot'] = True
self.node.driver_info = driver_info
self.node.provision_state = states.DEPLOYING
self.node.target_provision_state = states.ACTIVE
self.node.save()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
sync_mock.return_value = {'faultstring': 'Unknown command: blah'}
self.passthru.reboot_and_finish_deploy(task)
sync_mock.assert_called_once_with(task.node)
node_power_action_mock.assert_called_once_with(
task, states.REBOOT)
self.assertEqual(states.ACTIVE, task.node.provision_state)
self.assertEqual(states.NOSTATE, task.node.target_provision_state)
log_error = ('The version of the IPA ramdisk used in the '
'deployment do not support the command "sync"')
log_mock.assert_called_once_with(
'Failed to flush the file system prior to hard rebooting the '
'node %(node)s. Error: %(error)s',
{'node': task.node.uuid, 'error': log_error})
@mock.patch.object(agent_client.AgentClient, 'install_bootloader',
autospec=True)
@mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True)
@ -1276,3 +1330,7 @@ class TestRefreshCleanSteps(TestBaseAgentVendor):
task)
client_mock.assert_called_once_with(mock.ANY, task.node,
task.ports)
def test_get_properties(self):
expected = agent_base_vendor.VENDOR_PROPERTIES
self.assertEqual(expected, self.passthru.get_properties())

View File

@ -236,3 +236,9 @@ class TestAgentClient(base.TestCase):
self.client.power_off(self.node)
self.client._command.assert_called_once_with(
node=self.node, method='standby.power_off', params={})
def test_sync(self):
self.client._command = mock.MagicMock(spec_set=[])
self.client.sync(self.node)
self.client._command.assert_called_once_with(
node=self.node, method='standby.sync', params={}, wait=True)

View File

@ -32,6 +32,7 @@ from ironic.common.glance_service import base_image_service
from ironic.common import pxe_utils
from ironic.common import states
from ironic.conductor import task_manager
from ironic.drivers.modules import agent_base_vendor
from ironic.drivers.modules import deploy_utils
from ironic.drivers.modules import pxe
from ironic.tests.unit.conductor import mgr_utils
@ -549,6 +550,7 @@ class PXEBootTestCase(db_base.DbTestCase):
def test_get_properties(self):
expected = pxe.COMMON_PROPERTIES
expected.update(agent_base_vendor.VENDOR_PROPERTIES)
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.assertEqual(expected, task.driver.get_properties())

View File

@ -0,0 +1,7 @@
---
fixes:
- Fixes a problem where some hardware/firmware (specially faulty ones)
won't come back online after an in-band ACPI soft power off by adding
a new driver property called "deploy_forces_oob_reboot" that can be set
to the nodes being deployed by the IPA ramdisk. If the value of this
property is True, Ironic will power cycle the node via out-of-band.