Add agent power interface

This change adds a new 'agent' power interface that can be used together
with fast-track to deploy nodes without knowing their power credentials.
It relies on the agent staying powered on during the whole pre-deployment
and deployment process.

Story: #2007771
Task: #39995
Change-Id: I3d7157c1c4464b650adebbd7f894ee33d0f8f25b
This commit is contained in:
Dmitry Tantsur 2020-06-15 15:30:39 +02:00
parent e804f6c56b
commit 46f8c85752
8 changed files with 402 additions and 4 deletions

View File

@ -336,6 +336,19 @@ class TaskManager(object):
self._on_error_args = args
self._on_error_kwargs = kwargs
def downgrade_lock(self):
"""Downgrade the lock to a shared one."""
if self.node is None:
raise RuntimeError("Cannot downgrade an already released lock")
if not self.shared:
objects.Node.release(self.context, CONF.host, self.node.id)
self.shared = True
self.node.refresh()
LOG.debug("Successfully downgraded lock for %(purpose)s "
"on node %(node)s",
{'purpose': self._purpose, 'node': self.node.uuid})
def release_resources(self):
"""Unlock a node and release resources.

View File

@ -963,6 +963,21 @@ def value_within_timeout(value, timeout):
return last_valid <= last
def agent_is_alive(node, timeout=None):
"""Check that the agent is likely alive.
The method then checks for the last agent heartbeat, and if it occured
within the timeout set by [deploy]fast_track_timeout, then agent is
presumed alive.
:param node: A node object.
:param timeout: Heartbeat timeout, defaults to `fast_track_timeout`.
"""
return value_within_timeout(
node.driver_internal_info.get('agent_last_heartbeat'),
timeout or CONF.deploy.fast_track_timeout)
def is_fast_track(task):
"""Checks a fast track is available.
@ -987,9 +1002,7 @@ def is_fast_track(task):
{'node': task.node.uuid, 'error': task.node.last_error})
return False
if value_within_timeout(
task.node.driver_internal_info.get('agent_last_heartbeat'),
CONF.deploy.fast_track_timeout):
if agent_is_alive(task.node):
return True
else:
LOG.debug('Node %(node)s should be fast-track-able, but the agent '

View File

@ -18,6 +18,7 @@ Generic hardware types.
from ironic.drivers import hardware_type
from ironic.drivers.modules import agent
from ironic.drivers.modules import agent_power
from ironic.drivers.modules.ansible import deploy as ansible_deploy
from ironic.drivers.modules import fake
from ironic.drivers.modules import inspector
@ -102,7 +103,7 @@ class ManualManagementHardware(GenericHardware):
@property
def supported_power_interfaces(self):
"""List of supported power interfaces."""
return [fake.FakePower]
return [agent_power.AgentPower, fake.FakePower]
@property
def supported_vendor_interfaces(self):

View File

@ -30,6 +30,8 @@ METRICS = metrics_utils.get_metrics_logger(__name__)
DEFAULT_IPA_PORTAL_PORT = 3260
REBOOT_COMMAND = 'run_image'
def get_command_error(command):
"""Extract an error string from the command result.
@ -564,6 +566,21 @@ class AgentClient(object):
method='standby.power_off',
params={})
@METRICS.timer('AgentClient.reboot')
def reboot(self, node):
"""Soft reboots the bare metal node by shutting down ramdisk OS.
:param node: A Node object.
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
return self._command(node=node,
method='standby.%s' % REBOOT_COMMAND,
params={})
@METRICS.timer('AgentClient.sync')
def sync(self, node):
"""Flush file system buffers forcing changed blocks to disk.

View File

@ -0,0 +1,220 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""
The agent power interface.
"""
import time
from oslo_config import cfg
from oslo_log import log
import retrying
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import utils as cond_utils
from ironic.drivers import base
from ironic.drivers.modules import agent_client
CONF = cfg.CONF
LOG = log.getLogger(__name__)
_POWER_WAIT = 30
class AgentPower(base.PowerInterface):
"""Power interface using the running agent for power actions."""
def __init__(self):
super(AgentPower, self).__init__()
if not CONF.deploy.fast_track:
raise exception.InvalidParameterValue(
_('[deploy]fast_track must be True to enable the agent '
'power interface'))
self._client = agent_client.AgentClient()
def get_properties(self):
"""Return the properties of the interface.
:returns: dictionary of <property name>:<property description> entries.
"""
return {}
def validate(self, task):
"""Validate the driver-specific Node deployment info.
:param task: A TaskManager instance containing the node to act on.
:raises: InvalidParameterValue on malformed parameter(s)
"""
# NOTE(dtantsur): the fast_track option is mutable, so we have to check
# it again on validation.
if not CONF.deploy.fast_track:
raise exception.InvalidParameterValue(
_('[deploy]fast_track must be True to enable the agent '
'power interface'))
# TODO(dtantsur): support ACTIVE nodes
if not cond_utils.agent_is_alive(task.node):
raise exception.InvalidParameterValue(
_('Agent seems offline for node %s, the agent power interface '
'cannot be used') % task.node.uuid)
def supports_power_sync(self, task):
"""Check if power sync is supported for the given node.
Not supported for the agent power since it is not possible to power
on/off nodes.
:param task: A TaskManager instance containing the node to act on
with a **shared** lock.
:returns: boolean, whether power sync is supported.
"""
return False
def get_supported_power_states(self, task):
"""Get a list of the supported power states.
Only contains REBOOT.
:param task: A TaskManager instance containing the node to act on.
:returns: A list with the supported power states defined
in :mod:`ironic.common.states`.
"""
return [states.REBOOT, states.SOFT_REBOOT]
def get_power_state(self, task):
"""Return the power state of the task's node.
Essentially, the only known state is POWER ON, everything else is
an error (or more precisely ``None``).
:param task: A TaskManager instance containing the node to act on.
:returns: A power state. One of :mod:`ironic.common.states`.
"""
# TODO(dtantsur): support ACTIVE nodes
if cond_utils.agent_is_alive(task.node):
return states.POWER_ON
else:
LOG.error('Node %s is not fast-track-able, cannot determine '
'its power state via the "agent" power interface',
task.node.uuid)
return None
def set_power_state(self, task, power_state, timeout=None):
"""Set the power state of the task's node.
:param task: A TaskManager instance containing the node to act on.
:param power_state: Power state from :mod:`ironic.common.states`.
Only REBOOT and SOFT_REBOOT are supported and are synonymous.
:param timeout: timeout (in seconds) positive integer (> 0) for any
power state. ``None`` indicates to use default timeout.
:raises: PowerStateFailure on non-supported power state.
"""
if power_state in (states.REBOOT, states.SOFT_REBOOT):
return self.reboot(task)
else:
LOG.error('Power state %(state)s is not implemented for node '
'%(node)s using the "agent" power interface',
{'node': task.node.uuid, 'state': power_state})
raise exception.PowerStateFailure(pstate=power_state)
def reboot(self, task, timeout=None):
"""Perform a reboot of the task's node.
Only soft reboot is implemented.
:param task: A TaskManager instance containing the node to act on.
:param timeout: timeout (in seconds) positive integer (> 0) for any
power state. ``None`` indicates to use default timeout.
"""
node = task.node
self._client.reboot(node)
info = node.driver_internal_info
# NOTE(dtantsur): wipe the agent token, otherwise the rebooted agent
# won't be able to heartbeat. This is mostly a precaution since the
# calling code in conductor is expected to handle it.
if not info.get('agent_secret_token_pregenerated'):
info.pop('agent_secret_token', None)
# NOTE(dtantsur): the URL may change on reboot, wipe it as well (but
# only after we call reboot).
info.pop('agent_url', None)
node.driver_internal_info = info
node.save()
LOG.debug('Requested reboot of node %(node)s via the agent, waiting '
'%(wait)d seconds for the node to power down',
{'node': task.node.uuid, 'wait': _POWER_WAIT})
time.sleep(_POWER_WAIT)
if (node.provision_state in (states.DEPLOYING, states.CLEANING)
and (node.driver_internal_info.get('deployment_reboot')
or node.driver_internal_info.get('cleaning_reboot'))):
# NOTE(dtantsur): we need to downgrade the lock otherwise
# heartbeats won't be processed. It should not have side effects
# for nodes in DEPLOYING/CLEANING.
task.downgrade_lock()
try:
self._wait_for_reboot(task, timeout)
finally:
# The caller probably expects a lock, so re-acquire it
task.upgrade_lock()
def _wait_for_reboot(self, task, timeout):
wait = CONF.agent.post_deploy_get_power_state_retry_interval
if not timeout:
timeout = CONF.agent.post_deploy_get_power_state_retries * wait
@retrying.retry(
stop_max_delay=timeout,
retry_on_result=lambda result: not result,
retry_on_exception=(
lambda e: isinstance(e, exception.AgentConnectionFailed)),
wait_fixed=wait * 1000
)
def _wait_until_rebooted(task):
try:
status = self._client.get_commands_status(
task.node, retry_connection=False, expect_errors=True)
except exception.AgentConnectionFailed:
LOG.debug('Still waiting for the agent to come back on the '
'node %s', task.node.uuid)
raise
if any(cmd['command_name'] == agent_client.REBOOT_COMMAND
for cmd in status):
LOG.debug('Still waiting for the agent to power off on the '
'node %s', task.node.uuid)
return False
return True
try:
_wait_until_rebooted(task)
except exception.AgentConnectionFailed as exc:
msg = _('Agent failed to come back on %(node)s with the "agent" '
'power interface: %(exc)s') % {
'node': task.node.uuid, 'exc': exc}
LOG.error(msg)
raise exception.PowerStateFailure(msg)
except Exception as exc:
LOG.error('Could not reboot node %(node)s with the "agent" power '
'interface: %(exc)s',
{'node': task.node.uuid, 'exc': exc})
raise exception.PowerStateFailure(
_('Unexpected error when rebooting through the agent: %s')
% exc)

View File

@ -0,0 +1,127 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
from unittest import mock
from ironic.common import exception
from ironic.common import states
from ironic.conductor import task_manager
from ironic.drivers.modules import agent_client
from ironic.drivers.modules import agent_power
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as object_utils
@mock.patch('time.sleep', lambda _sec: None)
class AgentPowerTest(db_base.DbTestCase):
def setUp(self):
super(AgentPowerTest, self).setUp()
self.config(fast_track=True, group='deploy')
self.power = agent_power.AgentPower()
dii = {
'agent_last_heartbeat': datetime.datetime.now().strftime(
"%Y-%m-%dT%H:%M:%S.%f"),
'deployment_reboot': True,
'agent_url': 'http://url',
'agent_secret_token': 'very secret',
}
self.node = object_utils.create_test_node(
self.context, driver_internal_info=dii,
provision_state=states.DEPLOYING)
self.task = mock.Mock(spec=task_manager.TaskManager, node=self.node)
def test_basics(self):
self.assertEqual({}, self.power.get_properties())
self.assertFalse(self.power.supports_power_sync(self.task))
self.assertEqual([states.REBOOT, states.SOFT_REBOOT],
self.power.get_supported_power_states(self.task))
def test_validate(self):
self.power.validate(self.task)
def test_validate_fails(self):
self.node.driver_internal_info['agent_last_heartbeat'] = \
datetime.datetime(2010, 7, 19).strftime(
"%Y-%m-%dT%H:%M:%S.%f")
self.assertRaises(exception.InvalidParameterValue,
self.power.validate, self.task)
del self.node.driver_internal_info['agent_last_heartbeat']
self.assertRaises(exception.InvalidParameterValue,
self.power.validate, self.task)
def test_get_power_state(self):
self.assertEqual(states.POWER_ON,
self.power.get_power_state(self.task))
def test_get_power_state_unknown(self):
self.node.driver_internal_info['agent_last_heartbeat'] = \
datetime.datetime(2010, 7, 19).strftime(
"%Y-%m-%dT%H:%M:%S.%f")
self.assertIsNone(self.power.get_power_state(self.task))
del self.node.driver_internal_info['agent_last_heartbeat']
self.assertIsNone(self.power.get_power_state(self.task))
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
def test_reboot(self, mock_reboot, mock_commands):
mock_commands.side_effect = [
[{'command_name': 'run_image', 'command_status': 'RUNNING'}],
exception.AgentConnectionFailed,
exception.AgentConnectionFailed,
[{'command_name': 'get_deploy_steps', 'command_status': 'RUNNING'}]
]
with task_manager.acquire(self.context, self.node.id) as task:
# Save the node since the upgrade_lock call changes it
node = task.node
self.power.reboot(task)
mock_reboot.assert_called_once_with(self.power._client, node)
mock_commands.assert_called_with(self.power._client, node,
retry_connection=False,
expect_errors=True)
self.assertEqual(4, mock_commands.call_count)
node.refresh()
self.assertNotIn('agent_secret_token', node.driver_internal_info)
self.assertNotIn('agent_url', node.driver_internal_info)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
def test_reboot_timeout(self, mock_reboot, mock_commands):
mock_commands.side_effect = exception.AgentConnectionFailed
with task_manager.acquire(self.context, self.node.id) as task:
node = task.node
self.assertRaisesRegex(exception.PowerStateFailure,
'Agent failed to come back',
self.power.reboot, task, timeout=0.001)
mock_commands.assert_called_with(self.power._client, node,
retry_connection=False,
expect_errors=True)
@mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
def test_reboot_another_state(self, mock_reboot):
with task_manager.acquire(self.context, self.node.id) as task:
task.node.provision_state = states.DEPLOYWAIT
self.power.reboot(task)
mock_reboot.assert_called_once_with(self.power._client, task.node)
@mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
def test_reboot_into_instance(self, mock_reboot):
with task_manager.acquire(self.context, self.node.id) as task:
del task.node.driver_internal_info['deployment_reboot']
self.power.reboot(task)
mock_reboot.assert_called_once_with(self.power._client, task.node)

View File

@ -0,0 +1,6 @@
---
features:
- |
The new **experimental** ``agent`` power interface allows limited
provisioning operations on nodes without BMC credentials. See `story
2007771 <https://storyboard.openstack.org/#!/story/2007771>`_ for details.

View File

@ -121,6 +121,7 @@ ironic.hardware.interfaces.network =
noop = ironic.drivers.modules.network.noop:NoopNetwork
ironic.hardware.interfaces.power =
agent = ironic.drivers.modules.agent_power:AgentPower
fake = ironic.drivers.modules.fake:FakePower
ibmc = ironic.drivers.modules.ibmc.power:IBMCPower
idrac = ironic.drivers.modules.drac.power:DracPower