From 824ad1676bd8032fb4a4eb8ffc7625a376a64371 Mon Sep 17 00:00:00 2001 From: Lucas Alvares Gomes Date: Mon, 14 Mar 2016 12:20:58 +0000 Subject: [PATCH] Agent: Out-of-band power off on deploy The patch is adding a new property called "deploy_forces_oob_reboot" for the Agent deployment methodology. If the property is specified in the node's driver_info with the value of True, Ironic will call IPA to flush the file system and then issue a out-of-band reboot at the end of the deployment to the node to boot into the user's image. Some hardware/firmware may have problems doing a power off in-band (usually faulty hardware/bad firmware, see the bug this commit is fixing) therefore we need Ironic to do it directly in the BMC. This patch also closes one more consistency gap between the IPA deploy model and the now deprecated bash ramdisk one which used the hard reboot to finish the deployment by default. Depends-On: I5cd1d1b821426e995dc584452494b93ab23917e0 Closes-Bug: #1512492 Change-Id: I5e389c5245826f86466aedfcb9730117e24b59db --- doc/source/drivers/ipa.rst | 20 +++++++ ironic/drivers/modules/agent_base_vendor.py | 52 ++++++++++++----- ironic/drivers/modules/agent_client.py | 7 +++ ironic/drivers/modules/ilo/vendor.py | 3 - ironic/drivers/modules/iscsi_deploy.py | 3 - ironic/tests/unit/conductor/test_manager.py | 24 +++++--- .../drivers/modules/test_agent_base_vendor.py | 58 +++++++++++++++++++ .../unit/drivers/modules/test_agent_client.py | 6 ++ ironic/tests/unit/drivers/modules/test_pxe.py | 2 + .../notes/oob-power-off-7bbdf5947ed24bf8.yaml | 7 +++ 10 files changed, 154 insertions(+), 28 deletions(-) create mode 100644 releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml diff --git a/doc/source/drivers/ipa.rst b/doc/source/drivers/ipa.rst index aa5ecf7a0e..bebbafba5d 100644 --- a/doc/source/drivers/ipa.rst +++ b/doc/source/drivers/ipa.rst @@ -104,3 +104,23 @@ Steps to enable proxies ``image_no_proxy`` to driver_info properties in each node that will use the proxy. Please refer to ``ironic driver-properties`` output of the ``agent_*`` driver you're using for descriptions of these properties. + +Advanced configuration +====================== + +Out-of-band Vs. in-band power off on deploy +------------------------------------------- + +After deploying an image onto the node's hard disk Ironic will reboot +the machine into the new image. By default this power action happens +``in-band``, meaning that the ironic-conductor will instruct the IPA +ramdisk to power itself off. + +Some hardware may have a problem with the default approach and +would require Ironic to talk directly to the management controller +to switch the power off and on again. In order to tell Ironic to do +that you have to update the node's ``driver_info`` field and set the +``deploy_forces_oob_reboot`` parameter with the value of **True**. For +example, the below command sets this configuration in a specific node:: + + ironic node-update add driver_info/deploy_forces_oob_reboot=True diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index bf10d1a35f..9df0cd392b 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -22,6 +22,7 @@ import time from oslo_config import cfg from oslo_log import log from oslo_utils import excutils +from oslo_utils import strutils from oslo_utils import timeutils import retrying @@ -82,6 +83,13 @@ LOG = log.getLogger(__name__) # completing 'delete_configuration' of raid interface. POST_CLEAN_STEP_HOOKS = {} +VENDOR_PROPERTIES = { + 'deploy_forces_oob_reboot': _( + 'Whether Ironic should force a reboot of the Node via the out-of-band ' + 'channel after deployment is complete. Provides compatiblity with ' + 'older deploy ramdisks. Defaults to False. Optional.') +} + def _get_client(): client = agent_client.AgentClient() @@ -178,9 +186,7 @@ class BaseAgentVendor(base.VendorInterface): :returns: dictionary of : entries. """ - # NOTE(jroll) all properties are set by the driver, - # not by the operator. - return {} + return VENDOR_PROPERTIES def validate(self, task, method, **kwargs): """Validate the driver-specific Node deployment info. @@ -688,18 +694,38 @@ class BaseAgentVendor(base.VendorInterface): return task.driver.power.get_power_state(task) node = task.node + # Whether ironic should power off the node via out-of-band or + # in-band methods + oob_power_off = strutils.bool_from_string( + node.driver_info.get('deploy_forces_oob_reboot', False)) try: - try: - self._client.power_off(node) - _wait_until_powered_off(task) - except Exception as e: - LOG.warning( - _LW('Failed to soft power off node %(node_uuid)s ' - 'in at least %(timeout)d seconds. Error: %(error)s'), - {'node_uuid': node.uuid, - 'timeout': (wait * (attempts - 1)) / 1000, - 'error': e}) + if not oob_power_off: + try: + self._client.power_off(node) + _wait_until_powered_off(task) + except Exception as e: + LOG.warning( + _LW('Failed to soft power off node %(node_uuid)s ' + 'in at least %(timeout)d seconds. ' + 'Error: %(error)s'), + {'node_uuid': node.uuid, + 'timeout': (wait * (attempts - 1)) / 1000, + 'error': e}) + else: + # Flush the file system prior to hard rebooting the node + result = self._client.sync(node) + error = result.get('faultstring') + if error: + if 'Unknown command' in error: + error = _('The version of the IPA ramdisk used in ' + 'the deployment do not support the ' + 'command "sync"') + LOG.warning(_LW( + 'Failed to flush the file system prior to hard ' + 'rebooting the node %(node)s. Error: %(error)s'), + {'node': node.uuid, 'error': error}) + manager_utils.node_power_action(task, states.REBOOT) except Exception as e: msg = (_('Error rebooting node %(node)s after deploy. ' diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 86c985e599..d27c8dbd13 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -174,3 +174,10 @@ class AgentClient(object): return self._command(node=node, method='standby.power_off', params={}) + + def sync(self, node): + """Flush file system buffers forcing changed blocks to disk.""" + return self._command(node=node, + method='standby.sync', + params={}, + wait=True) diff --git a/ironic/drivers/modules/ilo/vendor.py b/ironic/drivers/modules/ilo/vendor.py index d991a145d5..981afa5999 100644 --- a/ironic/drivers/modules/ilo/vendor.py +++ b/ironic/drivers/modules/ilo/vendor.py @@ -57,9 +57,6 @@ class IloVirtualMediaAgentVendorInterface(agent.AgentVendorInterface): class VendorPassthru(iscsi_deploy.VendorPassthru): """Vendor-specific interfaces for iLO deploy drivers.""" - def get_properties(self): - return {} - def validate(self, task, method, **kwargs): """Validate vendor-specific actions. diff --git a/ironic/drivers/modules/iscsi_deploy.py b/ironic/drivers/modules/iscsi_deploy.py index 0206b738d0..bc111643e7 100644 --- a/ironic/drivers/modules/iscsi_deploy.py +++ b/ironic/drivers/modules/iscsi_deploy.py @@ -701,9 +701,6 @@ class ISCSIDeploy(base.DeployInterface): class VendorPassthru(agent_base_vendor.BaseAgentVendor): """Interface to mix IPMI and PXE vendor-specific interfaces.""" - def get_properties(self): - return {} - def validate(self, task, method, **kwargs): """Validates the inputs for a vendor passthru. diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 48cf4c038f..ffd1ec20d9 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -3803,7 +3803,8 @@ class ManagerTestProperties(tests_db_base.DbTestCase): self._check_driver_properties("fake_ssh", expected) def test_driver_properties_fake_pxe(self): - expected = ['deploy_kernel', 'deploy_ramdisk'] + expected = ['deploy_kernel', 'deploy_ramdisk', + 'deploy_forces_oob_reboot'] self._check_driver_properties("fake_pxe", expected) def test_driver_properties_fake_seamicro(self): @@ -3824,34 +3825,37 @@ class ManagerTestProperties(tests_db_base.DbTestCase): 'ipmi_transit_address', 'ipmi_target_channel', 'ipmi_target_address', 'ipmi_local_address', 'deploy_kernel', 'deploy_ramdisk', 'ipmi_protocol_version', - 'ipmi_force_boot_device' - ] + 'ipmi_force_boot_device', 'deploy_forces_oob_reboot'] self._check_driver_properties("pxe_ipmitool", expected) def test_driver_properties_pxe_ipminative(self): expected = ['ipmi_address', 'ipmi_password', 'ipmi_username', 'deploy_kernel', 'deploy_ramdisk', - 'ipmi_terminal_port', 'ipmi_force_boot_device'] + 'ipmi_terminal_port', 'ipmi_force_boot_device', + 'deploy_forces_oob_reboot'] self._check_driver_properties("pxe_ipminative", expected) def test_driver_properties_pxe_ssh(self): expected = ['deploy_kernel', 'deploy_ramdisk', 'ssh_address', 'ssh_username', 'ssh_virt_type', 'ssh_key_contents', 'ssh_key_filename', - 'ssh_password', 'ssh_port', 'ssh_terminal_port'] + 'ssh_password', 'ssh_port', 'ssh_terminal_port', + 'deploy_forces_oob_reboot'] self._check_driver_properties("pxe_ssh", expected) def test_driver_properties_pxe_seamicro(self): expected = ['deploy_kernel', 'deploy_ramdisk', 'seamicro_api_endpoint', 'seamicro_password', 'seamicro_server_id', 'seamicro_username', - 'seamicro_api_version', 'seamicro_terminal_port'] + 'seamicro_api_version', 'seamicro_terminal_port', + 'deploy_forces_oob_reboot'] self._check_driver_properties("pxe_seamicro", expected) def test_driver_properties_pxe_snmp(self): expected = ['deploy_kernel', 'deploy_ramdisk', 'snmp_driver', 'snmp_address', 'snmp_port', 'snmp_version', - 'snmp_community', 'snmp_security', 'snmp_outlet'] + 'snmp_community', 'snmp_security', 'snmp_outlet', + 'deploy_forces_oob_reboot'] self._check_driver_properties("pxe_snmp", expected) def test_driver_properties_fake_ilo(self): @@ -3862,13 +3866,15 @@ class ManagerTestProperties(tests_db_base.DbTestCase): def test_driver_properties_ilo_iscsi(self): expected = ['ilo_address', 'ilo_username', 'ilo_password', 'client_port', 'client_timeout', 'ilo_deploy_iso', - 'console_port', 'ilo_change_password'] + 'console_port', 'ilo_change_password', + 'deploy_forces_oob_reboot'] self._check_driver_properties("iscsi_ilo", expected) def test_driver_properties_agent_ilo(self): expected = ['ilo_address', 'ilo_username', 'ilo_password', 'client_port', 'client_timeout', 'ilo_deploy_iso', - 'console_port', 'ilo_change_password'] + 'console_port', 'ilo_change_password', + 'deploy_forces_oob_reboot'] self._check_driver_properties("agent_ilo", expected) def test_driver_properties_fail(self): diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index b4c1ded179..29f4ee94b4 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -684,6 +684,60 @@ class TestBaseAgentVendor(db_base.DbTestCase): self.assertEqual(states.DEPLOYFAIL, task.node.provision_state) self.assertEqual(states.ACTIVE, task.node.target_provision_state) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'sync', + spec=types.FunctionType) + def test_reboot_and_finish_deploy_power_action_oob_power_off( + self, sync_mock, node_power_action_mock): + # Enable force power off + driver_info = self.node.driver_info + driver_info['deploy_forces_oob_reboot'] = True + self.node.driver_info = driver_info + + self.node.provision_state = states.DEPLOYING + self.node.target_provision_state = states.ACTIVE + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.passthru.reboot_and_finish_deploy(task) + + sync_mock.assert_called_once_with(task.node) + node_power_action_mock.assert_called_once_with( + task, states.REBOOT) + self.assertEqual(states.ACTIVE, task.node.provision_state) + self.assertEqual(states.NOSTATE, task.node.target_provision_state) + + @mock.patch.object(agent_base_vendor.LOG, 'warning', autospec=True) + @mock.patch.object(manager_utils, 'node_power_action', autospec=True) + @mock.patch.object(agent_client.AgentClient, 'sync', + spec=types.FunctionType) + def test_reboot_and_finish_deploy_power_action_oob_power_off_failed( + self, sync_mock, node_power_action_mock, log_mock): + # Enable force power off + driver_info = self.node.driver_info + driver_info['deploy_forces_oob_reboot'] = True + self.node.driver_info = driver_info + + self.node.provision_state = states.DEPLOYING + self.node.target_provision_state = states.ACTIVE + self.node.save() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + sync_mock.return_value = {'faultstring': 'Unknown command: blah'} + self.passthru.reboot_and_finish_deploy(task) + + sync_mock.assert_called_once_with(task.node) + node_power_action_mock.assert_called_once_with( + task, states.REBOOT) + self.assertEqual(states.ACTIVE, task.node.provision_state) + self.assertEqual(states.NOSTATE, task.node.target_provision_state) + log_error = ('The version of the IPA ramdisk used in the ' + 'deployment do not support the command "sync"') + log_mock.assert_called_once_with( + 'Failed to flush the file system prior to hard rebooting the ' + 'node %(node)s. Error: %(error)s', + {'node': task.node.uuid, 'error': log_error}) + @mock.patch.object(agent_client.AgentClient, 'install_bootloader', autospec=True) @mock.patch.object(deploy_utils, 'try_set_boot_device', autospec=True) @@ -1276,3 +1330,7 @@ class TestRefreshCleanSteps(TestBaseAgentVendor): task) client_mock.assert_called_once_with(mock.ANY, task.node, task.ports) + + def test_get_properties(self): + expected = agent_base_vendor.VENDOR_PROPERTIES + self.assertEqual(expected, self.passthru.get_properties()) diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index fe5be2dba8..aead8915a7 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -236,3 +236,9 @@ class TestAgentClient(base.TestCase): self.client.power_off(self.node) self.client._command.assert_called_once_with( node=self.node, method='standby.power_off', params={}) + + def test_sync(self): + self.client._command = mock.MagicMock(spec_set=[]) + self.client.sync(self.node) + self.client._command.assert_called_once_with( + node=self.node, method='standby.sync', params={}, wait=True) diff --git a/ironic/tests/unit/drivers/modules/test_pxe.py b/ironic/tests/unit/drivers/modules/test_pxe.py index 7456262400..9e023ea182 100644 --- a/ironic/tests/unit/drivers/modules/test_pxe.py +++ b/ironic/tests/unit/drivers/modules/test_pxe.py @@ -32,6 +32,7 @@ from ironic.common.glance_service import base_image_service from ironic.common import pxe_utils from ironic.common import states from ironic.conductor import task_manager +from ironic.drivers.modules import agent_base_vendor from ironic.drivers.modules import deploy_utils from ironic.drivers.modules import pxe from ironic.tests.unit.conductor import mgr_utils @@ -549,6 +550,7 @@ class PXEBootTestCase(db_base.DbTestCase): def test_get_properties(self): expected = pxe.COMMON_PROPERTIES + expected.update(agent_base_vendor.VENDOR_PROPERTIES) with task_manager.acquire(self.context, self.node.uuid, shared=True) as task: self.assertEqual(expected, task.driver.get_properties()) diff --git a/releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml b/releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml new file mode 100644 index 0000000000..ddbbf152a1 --- /dev/null +++ b/releasenotes/notes/oob-power-off-7bbdf5947ed24bf8.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - Fixes a problem where some hardware/firmware (specially faulty ones) + won't come back online after an in-band ACPI soft power off by adding + a new driver property called "deploy_forces_oob_reboot" that can be set + to the nodes being deployed by the IPA ramdisk. If the value of this + property is True, Ironic will power cycle the node via out-of-band.