From 46f8c857524e8a45935836ac97463928a2d9a0a9 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Mon, 15 Jun 2020 15:30:39 +0200 Subject: [PATCH] Add agent power interface This change adds a new 'agent' power interface that can be used together with fast-track to deploy nodes without knowing their power credentials. It relies on the agent staying powered on during the whole pre-deployment and deployment process. Story: #2007771 Task: #39995 Change-Id: I3d7157c1c4464b650adebbd7f894ee33d0f8f25b --- ironic/conductor/task_manager.py | 13 ++ ironic/conductor/utils.py | 19 +- ironic/drivers/generic.py | 3 +- ironic/drivers/modules/agent_client.py | 17 ++ ironic/drivers/modules/agent_power.py | 220 ++++++++++++++++++ .../unit/drivers/modules/test_agent_power.py | 127 ++++++++++ .../notes/agent-power-a000fdf37cb870e4.yaml | 6 + setup.cfg | 1 + 8 files changed, 402 insertions(+), 4 deletions(-) create mode 100644 ironic/drivers/modules/agent_power.py create mode 100644 ironic/tests/unit/drivers/modules/test_agent_power.py create mode 100644 releasenotes/notes/agent-power-a000fdf37cb870e4.yaml diff --git a/ironic/conductor/task_manager.py b/ironic/conductor/task_manager.py index df68c27aa4..b4089d6187 100644 --- a/ironic/conductor/task_manager.py +++ b/ironic/conductor/task_manager.py @@ -336,6 +336,19 @@ class TaskManager(object): self._on_error_args = args self._on_error_kwargs = kwargs + def downgrade_lock(self): + """Downgrade the lock to a shared one.""" + if self.node is None: + raise RuntimeError("Cannot downgrade an already released lock") + + if not self.shared: + objects.Node.release(self.context, CONF.host, self.node.id) + self.shared = True + self.node.refresh() + LOG.debug("Successfully downgraded lock for %(purpose)s " + "on node %(node)s", + {'purpose': self._purpose, 'node': self.node.uuid}) + def release_resources(self): """Unlock a node and release resources. diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 60bc4a5427..3423969070 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -963,6 +963,21 @@ def value_within_timeout(value, timeout): return last_valid <= last +def agent_is_alive(node, timeout=None): + """Check that the agent is likely alive. + + The method then checks for the last agent heartbeat, and if it occured + within the timeout set by [deploy]fast_track_timeout, then agent is + presumed alive. + + :param node: A node object. + :param timeout: Heartbeat timeout, defaults to `fast_track_timeout`. + """ + return value_within_timeout( + node.driver_internal_info.get('agent_last_heartbeat'), + timeout or CONF.deploy.fast_track_timeout) + + def is_fast_track(task): """Checks a fast track is available. @@ -987,9 +1002,7 @@ def is_fast_track(task): {'node': task.node.uuid, 'error': task.node.last_error}) return False - if value_within_timeout( - task.node.driver_internal_info.get('agent_last_heartbeat'), - CONF.deploy.fast_track_timeout): + if agent_is_alive(task.node): return True else: LOG.debug('Node %(node)s should be fast-track-able, but the agent ' diff --git a/ironic/drivers/generic.py b/ironic/drivers/generic.py index 1e7a83c4b8..599e1139cf 100644 --- a/ironic/drivers/generic.py +++ b/ironic/drivers/generic.py @@ -18,6 +18,7 @@ Generic hardware types. from ironic.drivers import hardware_type from ironic.drivers.modules import agent +from ironic.drivers.modules import agent_power from ironic.drivers.modules.ansible import deploy as ansible_deploy from ironic.drivers.modules import fake from ironic.drivers.modules import inspector @@ -102,7 +103,7 @@ class ManualManagementHardware(GenericHardware): @property def supported_power_interfaces(self): """List of supported power interfaces.""" - return [fake.FakePower] + return [agent_power.AgentPower, fake.FakePower] @property def supported_vendor_interfaces(self): diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index e4395ed250..d1f3d67591 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -30,6 +30,8 @@ METRICS = metrics_utils.get_metrics_logger(__name__) DEFAULT_IPA_PORTAL_PORT = 3260 +REBOOT_COMMAND = 'run_image' + def get_command_error(command): """Extract an error string from the command result. @@ -564,6 +566,21 @@ class AgentClient(object): method='standby.power_off', params={}) + @METRICS.timer('AgentClient.reboot') + def reboot(self, node): + """Soft reboots the bare metal node by shutting down ramdisk OS. + + :param node: A Node object. + :raises: IronicException when failed to issue the request or there was + a malformed response from the agent. + :raises: AgentAPIError when agent failed to execute specified command. + :returns: A dict containing command response from agent. + See :func:`get_commands_status` for a command result sample. + """ + return self._command(node=node, + method='standby.%s' % REBOOT_COMMAND, + params={}) + @METRICS.timer('AgentClient.sync') def sync(self, node): """Flush file system buffers forcing changed blocks to disk. diff --git a/ironic/drivers/modules/agent_power.py b/ironic/drivers/modules/agent_power.py new file mode 100644 index 0000000000..11ef5711a7 --- /dev/null +++ b/ironic/drivers/modules/agent_power.py @@ -0,0 +1,220 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +""" +The agent power interface. +""" + +import time + +from oslo_config import cfg +from oslo_log import log +import retrying + +from ironic.common import exception +from ironic.common.i18n import _ +from ironic.common import states +from ironic.conductor import utils as cond_utils +from ironic.drivers import base +from ironic.drivers.modules import agent_client + + +CONF = cfg.CONF + +LOG = log.getLogger(__name__) + +_POWER_WAIT = 30 + + +class AgentPower(base.PowerInterface): + """Power interface using the running agent for power actions.""" + + def __init__(self): + super(AgentPower, self).__init__() + if not CONF.deploy.fast_track: + raise exception.InvalidParameterValue( + _('[deploy]fast_track must be True to enable the agent ' + 'power interface')) + self._client = agent_client.AgentClient() + + def get_properties(self): + """Return the properties of the interface. + + :returns: dictionary of : entries. + """ + return {} + + def validate(self, task): + """Validate the driver-specific Node deployment info. + + :param task: A TaskManager instance containing the node to act on. + :raises: InvalidParameterValue on malformed parameter(s) + """ + # NOTE(dtantsur): the fast_track option is mutable, so we have to check + # it again on validation. + if not CONF.deploy.fast_track: + raise exception.InvalidParameterValue( + _('[deploy]fast_track must be True to enable the agent ' + 'power interface')) + # TODO(dtantsur): support ACTIVE nodes + if not cond_utils.agent_is_alive(task.node): + raise exception.InvalidParameterValue( + _('Agent seems offline for node %s, the agent power interface ' + 'cannot be used') % task.node.uuid) + + def supports_power_sync(self, task): + """Check if power sync is supported for the given node. + + Not supported for the agent power since it is not possible to power + on/off nodes. + + :param task: A TaskManager instance containing the node to act on + with a **shared** lock. + :returns: boolean, whether power sync is supported. + """ + return False + + def get_supported_power_states(self, task): + """Get a list of the supported power states. + + Only contains REBOOT. + + :param task: A TaskManager instance containing the node to act on. + :returns: A list with the supported power states defined + in :mod:`ironic.common.states`. + """ + return [states.REBOOT, states.SOFT_REBOOT] + + def get_power_state(self, task): + """Return the power state of the task's node. + + Essentially, the only known state is POWER ON, everything else is + an error (or more precisely ``None``). + + :param task: A TaskManager instance containing the node to act on. + :returns: A power state. One of :mod:`ironic.common.states`. + """ + # TODO(dtantsur): support ACTIVE nodes + if cond_utils.agent_is_alive(task.node): + return states.POWER_ON + else: + LOG.error('Node %s is not fast-track-able, cannot determine ' + 'its power state via the "agent" power interface', + task.node.uuid) + return None + + def set_power_state(self, task, power_state, timeout=None): + """Set the power state of the task's node. + + :param task: A TaskManager instance containing the node to act on. + :param power_state: Power state from :mod:`ironic.common.states`. + Only REBOOT and SOFT_REBOOT are supported and are synonymous. + :param timeout: timeout (in seconds) positive integer (> 0) for any + power state. ``None`` indicates to use default timeout. + :raises: PowerStateFailure on non-supported power state. + """ + if power_state in (states.REBOOT, states.SOFT_REBOOT): + return self.reboot(task) + else: + LOG.error('Power state %(state)s is not implemented for node ' + '%(node)s using the "agent" power interface', + {'node': task.node.uuid, 'state': power_state}) + raise exception.PowerStateFailure(pstate=power_state) + + def reboot(self, task, timeout=None): + """Perform a reboot of the task's node. + + Only soft reboot is implemented. + + :param task: A TaskManager instance containing the node to act on. + :param timeout: timeout (in seconds) positive integer (> 0) for any + power state. ``None`` indicates to use default timeout. + """ + node = task.node + + self._client.reboot(node) + + info = node.driver_internal_info + # NOTE(dtantsur): wipe the agent token, otherwise the rebooted agent + # won't be able to heartbeat. This is mostly a precaution since the + # calling code in conductor is expected to handle it. + if not info.get('agent_secret_token_pregenerated'): + info.pop('agent_secret_token', None) + # NOTE(dtantsur): the URL may change on reboot, wipe it as well (but + # only after we call reboot). + info.pop('agent_url', None) + node.driver_internal_info = info + node.save() + + LOG.debug('Requested reboot of node %(node)s via the agent, waiting ' + '%(wait)d seconds for the node to power down', + {'node': task.node.uuid, 'wait': _POWER_WAIT}) + time.sleep(_POWER_WAIT) + + if (node.provision_state in (states.DEPLOYING, states.CLEANING) + and (node.driver_internal_info.get('deployment_reboot') + or node.driver_internal_info.get('cleaning_reboot'))): + # NOTE(dtantsur): we need to downgrade the lock otherwise + # heartbeats won't be processed. It should not have side effects + # for nodes in DEPLOYING/CLEANING. + task.downgrade_lock() + + try: + self._wait_for_reboot(task, timeout) + finally: + # The caller probably expects a lock, so re-acquire it + task.upgrade_lock() + + def _wait_for_reboot(self, task, timeout): + wait = CONF.agent.post_deploy_get_power_state_retry_interval + if not timeout: + timeout = CONF.agent.post_deploy_get_power_state_retries * wait + + @retrying.retry( + stop_max_delay=timeout, + retry_on_result=lambda result: not result, + retry_on_exception=( + lambda e: isinstance(e, exception.AgentConnectionFailed)), + wait_fixed=wait * 1000 + ) + def _wait_until_rebooted(task): + try: + status = self._client.get_commands_status( + task.node, retry_connection=False, expect_errors=True) + except exception.AgentConnectionFailed: + LOG.debug('Still waiting for the agent to come back on the ' + 'node %s', task.node.uuid) + raise + + if any(cmd['command_name'] == agent_client.REBOOT_COMMAND + for cmd in status): + LOG.debug('Still waiting for the agent to power off on the ' + 'node %s', task.node.uuid) + return False + + return True + + try: + _wait_until_rebooted(task) + except exception.AgentConnectionFailed as exc: + msg = _('Agent failed to come back on %(node)s with the "agent" ' + 'power interface: %(exc)s') % { + 'node': task.node.uuid, 'exc': exc} + LOG.error(msg) + raise exception.PowerStateFailure(msg) + except Exception as exc: + LOG.error('Could not reboot node %(node)s with the "agent" power ' + 'interface: %(exc)s', + {'node': task.node.uuid, 'exc': exc}) + raise exception.PowerStateFailure( + _('Unexpected error when rebooting through the agent: %s') + % exc) diff --git a/ironic/tests/unit/drivers/modules/test_agent_power.py b/ironic/tests/unit/drivers/modules/test_agent_power.py new file mode 100644 index 0000000000..0d4004c663 --- /dev/null +++ b/ironic/tests/unit/drivers/modules/test_agent_power.py @@ -0,0 +1,127 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import datetime +from unittest import mock + +from ironic.common import exception +from ironic.common import states +from ironic.conductor import task_manager +from ironic.drivers.modules import agent_client +from ironic.drivers.modules import agent_power +from ironic.tests.unit.db import base as db_base +from ironic.tests.unit.objects import utils as object_utils + + +@mock.patch('time.sleep', lambda _sec: None) +class AgentPowerTest(db_base.DbTestCase): + + def setUp(self): + super(AgentPowerTest, self).setUp() + self.config(fast_track=True, group='deploy') + self.power = agent_power.AgentPower() + dii = { + 'agent_last_heartbeat': datetime.datetime.now().strftime( + "%Y-%m-%dT%H:%M:%S.%f"), + 'deployment_reboot': True, + 'agent_url': 'http://url', + 'agent_secret_token': 'very secret', + } + self.node = object_utils.create_test_node( + self.context, driver_internal_info=dii, + provision_state=states.DEPLOYING) + self.task = mock.Mock(spec=task_manager.TaskManager, node=self.node) + + def test_basics(self): + self.assertEqual({}, self.power.get_properties()) + self.assertFalse(self.power.supports_power_sync(self.task)) + self.assertEqual([states.REBOOT, states.SOFT_REBOOT], + self.power.get_supported_power_states(self.task)) + + def test_validate(self): + self.power.validate(self.task) + + def test_validate_fails(self): + self.node.driver_internal_info['agent_last_heartbeat'] = \ + datetime.datetime(2010, 7, 19).strftime( + "%Y-%m-%dT%H:%M:%S.%f") + self.assertRaises(exception.InvalidParameterValue, + self.power.validate, self.task) + + del self.node.driver_internal_info['agent_last_heartbeat'] + self.assertRaises(exception.InvalidParameterValue, + self.power.validate, self.task) + + def test_get_power_state(self): + self.assertEqual(states.POWER_ON, + self.power.get_power_state(self.task)) + + def test_get_power_state_unknown(self): + self.node.driver_internal_info['agent_last_heartbeat'] = \ + datetime.datetime(2010, 7, 19).strftime( + "%Y-%m-%dT%H:%M:%S.%f") + self.assertIsNone(self.power.get_power_state(self.task)) + + del self.node.driver_internal_info['agent_last_heartbeat'] + self.assertIsNone(self.power.get_power_state(self.task)) + + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True) + def test_reboot(self, mock_reboot, mock_commands): + mock_commands.side_effect = [ + [{'command_name': 'run_image', 'command_status': 'RUNNING'}], + exception.AgentConnectionFailed, + exception.AgentConnectionFailed, + [{'command_name': 'get_deploy_steps', 'command_status': 'RUNNING'}] + ] + with task_manager.acquire(self.context, self.node.id) as task: + # Save the node since the upgrade_lock call changes it + node = task.node + self.power.reboot(task) + mock_reboot.assert_called_once_with(self.power._client, node) + mock_commands.assert_called_with(self.power._client, node, + retry_connection=False, + expect_errors=True) + self.assertEqual(4, mock_commands.call_count) + + node.refresh() + self.assertNotIn('agent_secret_token', node.driver_internal_info) + self.assertNotIn('agent_url', node.driver_internal_info) + + @mock.patch.object(agent_client.AgentClient, 'get_commands_status', + autospec=True) + @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True) + def test_reboot_timeout(self, mock_reboot, mock_commands): + mock_commands.side_effect = exception.AgentConnectionFailed + with task_manager.acquire(self.context, self.node.id) as task: + node = task.node + self.assertRaisesRegex(exception.PowerStateFailure, + 'Agent failed to come back', + self.power.reboot, task, timeout=0.001) + mock_commands.assert_called_with(self.power._client, node, + retry_connection=False, + expect_errors=True) + + @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True) + def test_reboot_another_state(self, mock_reboot): + with task_manager.acquire(self.context, self.node.id) as task: + task.node.provision_state = states.DEPLOYWAIT + self.power.reboot(task) + mock_reboot.assert_called_once_with(self.power._client, task.node) + + @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True) + def test_reboot_into_instance(self, mock_reboot): + with task_manager.acquire(self.context, self.node.id) as task: + del task.node.driver_internal_info['deployment_reboot'] + self.power.reboot(task) + mock_reboot.assert_called_once_with(self.power._client, task.node) diff --git a/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml b/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml new file mode 100644 index 0000000000..549a78b21d --- /dev/null +++ b/releasenotes/notes/agent-power-a000fdf37cb870e4.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + The new **experimental** ``agent`` power interface allows limited + provisioning operations on nodes without BMC credentials. See `story + 2007771 `_ for details. diff --git a/setup.cfg b/setup.cfg index 1101c09b59..c314e21a5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -121,6 +121,7 @@ ironic.hardware.interfaces.network = noop = ironic.drivers.modules.network.noop:NoopNetwork ironic.hardware.interfaces.power = + agent = ironic.drivers.modules.agent_power:AgentPower fake = ironic.drivers.modules.fake:FakePower ibmc = ironic.drivers.modules.ibmc.power:IBMCPower idrac = ironic.drivers.modules.drac.power:DracPower