Add agent power interface
This change adds a new 'agent' power interface that can be used together with fast-track to deploy nodes without knowing their power credentials. It relies on the agent staying powered on during the whole pre-deployment and deployment process. Story: #2007771 Task: #39995 Change-Id: I3d7157c1c4464b650adebbd7f894ee33d0f8f25b
This commit is contained in:
parent
e804f6c56b
commit
46f8c85752
@ -336,6 +336,19 @@ class TaskManager(object):
|
||||
self._on_error_args = args
|
||||
self._on_error_kwargs = kwargs
|
||||
|
||||
def downgrade_lock(self):
    """Turn an exclusive lock held by this task into a shared one.

    Does nothing when the task already holds a shared lock.

    :raises: RuntimeError if the task has no node any more, i.e. its
        resources were already released.
    """
    if self.node is None:
        raise RuntimeError("Cannot downgrade an already released lock")

    if self.shared:
        # Already shared: there is no reservation to give up.
        return

    # Drop the reservation held by this conductor host, then reload the
    # node so the in-memory object matches what is stored.
    objects.Node.release(self.context, CONF.host, self.node.id)
    self.shared = True
    self.node.refresh()

    log_args = {'purpose': self._purpose, 'node': self.node.uuid}
    LOG.debug("Successfully downgraded lock for %(purpose)s "
              "on node %(node)s",
              log_args)
|
||||
|
||||
def release_resources(self):
|
||||
"""Unlock a node and release resources.
|
||||
|
||||
|
@ -963,6 +963,21 @@ def value_within_timeout(value, timeout):
|
||||
return last_valid <= last
|
||||
|
||||
|
||||
def agent_is_alive(node, timeout=None):
    """Check that the agent is likely alive.

    The last agent heartbeat recorded in the node's ``driver_internal_info``
    is compared against a timeout; if the heartbeat occurred within that
    timeout, the agent is presumed alive.

    :param node: A node object.
    :param timeout: Heartbeat timeout. A falsy value (the default ``None``)
        selects ``[deploy]fast_track_timeout``.
    :returns: The result of :func:`value_within_timeout` for the last
        heartbeat and the effective timeout.
    """
    last_heartbeat = node.driver_internal_info.get('agent_last_heartbeat')
    effective_timeout = timeout or CONF.deploy.fast_track_timeout
    return value_within_timeout(last_heartbeat, effective_timeout)
|
||||
|
||||
|
||||
def is_fast_track(task):
|
||||
"""Checks a fast track is available.
|
||||
|
||||
@ -987,9 +1002,7 @@ def is_fast_track(task):
|
||||
{'node': task.node.uuid, 'error': task.node.last_error})
|
||||
return False
|
||||
|
||||
if value_within_timeout(
|
||||
task.node.driver_internal_info.get('agent_last_heartbeat'),
|
||||
CONF.deploy.fast_track_timeout):
|
||||
if agent_is_alive(task.node):
|
||||
return True
|
||||
else:
|
||||
LOG.debug('Node %(node)s should be fast-track-able, but the agent '
|
||||
|
@ -18,6 +18,7 @@ Generic hardware types.
|
||||
|
||||
from ironic.drivers import hardware_type
|
||||
from ironic.drivers.modules import agent
|
||||
from ironic.drivers.modules import agent_power
|
||||
from ironic.drivers.modules.ansible import deploy as ansible_deploy
|
||||
from ironic.drivers.modules import fake
|
||||
from ironic.drivers.modules import inspector
|
||||
@ -102,7 +103,7 @@ class ManualManagementHardware(GenericHardware):
|
||||
@property
|
||||
def supported_power_interfaces(self):
|
||||
"""List of supported power interfaces."""
|
||||
return [fake.FakePower]
|
||||
return [agent_power.AgentPower, fake.FakePower]
|
||||
|
||||
@property
|
||||
def supported_vendor_interfaces(self):
|
||||
|
@ -30,6 +30,8 @@ METRICS = metrics_utils.get_metrics_logger(__name__)
|
||||
|
||||
DEFAULT_IPA_PORTAL_PORT = 3260
|
||||
|
||||
REBOOT_COMMAND = 'run_image'
|
||||
|
||||
|
||||
def get_command_error(command):
|
||||
"""Extract an error string from the command result.
|
||||
@ -564,6 +566,21 @@ class AgentClient(object):
|
||||
method='standby.power_off',
|
||||
params={})
|
||||
|
||||
@METRICS.timer('AgentClient.reboot')
def reboot(self, node):
    """Ask the agent to soft reboot the bare metal node.

    The reboot is requested by sending the ``standby`` extension command
    named by ``REBOOT_COMMAND`` with no parameters.

    :param node: A Node object.
    :raises: IronicException when failed to issue the request or there was
             a malformed response from the agent.
    :raises: AgentAPIError when agent failed to execute specified command.
    :returns: A dict containing command response from agent.
              See :func:`get_commands_status` for a command result sample.
    """
    command_name = 'standby.%s' % REBOOT_COMMAND
    return self._command(node=node, method=command_name, params={})
|
||||
|
||||
@METRICS.timer('AgentClient.sync')
|
||||
def sync(self, node):
|
||||
"""Flush file system buffers forcing changed blocks to disk.
|
||||
|
220
ironic/drivers/modules/agent_power.py
Normal file
220
ironic/drivers/modules/agent_power.py
Normal file
@ -0,0 +1,220 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
The agent power interface.
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
import retrying
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common.i18n import _
|
||||
from ironic.common import states
|
||||
from ironic.conductor import utils as cond_utils
|
||||
from ironic.drivers import base
|
||||
from ironic.drivers.modules import agent_client
|
||||
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
_POWER_WAIT = 30
|
||||
|
||||
|
||||
class AgentPower(base.PowerInterface):
    """Power interface using the running agent for power actions.

    The only power action that can be performed is a (soft) reboot, which is
    requested from the agent itself. The interface can only be enabled and
    validated while ``[deploy]fast_track`` is set.
    """

    def __init__(self):
        super(AgentPower, self).__init__()
        self._check_fast_track_enabled()
        self._client = agent_client.AgentClient()

    @staticmethod
    def _check_fast_track_enabled():
        """Ensure that the ``[deploy]fast_track`` option is enabled.

        :raises: InvalidParameterValue if fast track is disabled.
        """
        if not CONF.deploy.fast_track:
            raise exception.InvalidParameterValue(
                _('[deploy]fast_track must be True to enable the agent '
                  'power interface'))

    def get_properties(self):
        """Return the properties of the interface.

        :returns: dictionary of <property name>:<property description> entries.
            Always empty for this interface.
        """
        return {}

    def validate(self, task):
        """Validate that the agent power interface can be used for the node.

        :param task: A TaskManager instance containing the node to act on.
        :raises: InvalidParameterValue if ``[deploy]fast_track`` is disabled
            or the agent does not seem to be alive for the node.
        """
        # NOTE(dtantsur): the fast_track option is mutable, so we have to check
        # it again on validation.
        self._check_fast_track_enabled()
        # TODO(dtantsur): support ACTIVE nodes
        if not cond_utils.agent_is_alive(task.node):
            raise exception.InvalidParameterValue(
                _('Agent seems offline for node %s, the agent power interface '
                  'cannot be used') % task.node.uuid)

    def supports_power_sync(self, task):
        """Check if power sync is supported for the given node.

        Not supported for the agent power since it is not possible to power
        on/off nodes.

        :param task: A TaskManager instance containing the node to act on
            with a **shared** lock.
        :returns: boolean, whether power sync is supported. Always ``False``.
        """
        return False

    def get_supported_power_states(self, task):
        """Get a list of the supported power states.

        Only contains REBOOT and SOFT_REBOOT.

        :param task: A TaskManager instance containing the node to act on.
        :returns: A list with the supported power states defined
                  in :mod:`ironic.common.states`.
        """
        return [states.REBOOT, states.SOFT_REBOOT]

    def get_power_state(self, task):
        """Return the power state of the task's node.

        Essentially, the only known state is POWER ON, everything else is
        an error (or more precisely ``None``).

        :param task: A TaskManager instance containing the node to act on.
        :returns: ``states.POWER_ON`` if the agent is presumed alive,
            ``None`` otherwise.
        """
        # TODO(dtantsur): support ACTIVE nodes
        if cond_utils.agent_is_alive(task.node):
            return states.POWER_ON

        LOG.error('Node %s is not fast-track-able, cannot determine '
                  'its power state via the "agent" power interface',
                  task.node.uuid)
        return None

    def set_power_state(self, task, power_state, timeout=None):
        """Set the power state of the task's node.

        :param task: A TaskManager instance containing the node to act on.
        :param power_state: Power state from :mod:`ironic.common.states`.
            Only REBOOT and SOFT_REBOOT are supported and are synonymous.
        :param timeout: timeout (in seconds) positive integer (> 0) for any
            power state. ``None`` indicates to use default timeout.
        :raises: PowerStateFailure on non-supported power state.
        """
        if power_state in (states.REBOOT, states.SOFT_REBOOT):
            # Forward the caller's timeout instead of silently dropping it.
            return self.reboot(task, timeout=timeout)

        LOG.error('Power state %(state)s is not implemented for node '
                  '%(node)s using the "agent" power interface',
                  {'node': task.node.uuid, 'state': power_state})
        raise exception.PowerStateFailure(pstate=power_state)

    def reboot(self, task, timeout=None):
        """Perform a reboot of the task's node.

        Only soft reboot is implemented.

        :param task: A TaskManager instance containing the node to act on.
        :param timeout: timeout (in seconds) positive integer (> 0) for any
            power state. ``None`` indicates to use default timeout.
        :raises: PowerStateFailure if waiting for the agent to come back
            fails.
        """
        node = task.node

        self._client.reboot(node)

        info = node.driver_internal_info
        # NOTE(dtantsur): wipe the agent token, otherwise the rebooted agent
        # won't be able to heartbeat. This is mostly a precaution since the
        # calling code in conductor is expected to handle it.
        if not info.get('agent_secret_token_pregenerated'):
            info.pop('agent_secret_token', None)
        # NOTE(dtantsur): the URL may change on reboot, wipe it as well (but
        # only after we call reboot).
        info.pop('agent_url', None)
        node.driver_internal_info = info
        node.save()

        LOG.debug('Requested reboot of node %(node)s via the agent, waiting '
                  '%(wait)d seconds for the node to power down',
                  {'node': task.node.uuid, 'wait': _POWER_WAIT})
        time.sleep(_POWER_WAIT)

        if (node.provision_state in (states.DEPLOYING, states.CLEANING)
                and (node.driver_internal_info.get('deployment_reboot')
                     or node.driver_internal_info.get('cleaning_reboot'))):
            # NOTE(dtantsur): we need to downgrade the lock otherwise
            # heartbeats won't be processed. It should not have side effects
            # for nodes in DEPLOYING/CLEANING.
            task.downgrade_lock()
            try:
                self._wait_for_reboot(task, timeout)
            finally:
                # The caller probably expects a lock, so re-acquire it
                task.upgrade_lock()

    def _wait_for_reboot(self, task, timeout):
        """Poll the agent until it no longer reports the reboot command.

        :param task: A TaskManager instance containing the node to act on.
        :param timeout: overall timeout in seconds. A falsy value selects
            ``[agent]post_deploy_get_power_state_retries`` multiplied by
            ``[agent]post_deploy_get_power_state_retry_interval``.
        :raises: PowerStateFailure if the agent cannot be reached before the
            timeout expires or on any other unexpected error.
        """
        wait = CONF.agent.post_deploy_get_power_state_retry_interval
        if not timeout:
            timeout = CONF.agent.post_deploy_get_power_state_retries * wait

        @retrying.retry(
            # NOTE: the retrying library expresses all of its delays in
            # milliseconds, while both ``timeout`` and ``wait`` are seconds.
            stop_max_delay=timeout * 1000,
            retry_on_result=lambda result: not result,
            retry_on_exception=(
                lambda e: isinstance(e, exception.AgentConnectionFailed)),
            wait_fixed=wait * 1000
        )
        def _wait_until_rebooted(task):
            try:
                status = self._client.get_commands_status(
                    task.node, retry_connection=False, expect_errors=True)
            except exception.AgentConnectionFailed:
                LOG.debug('Still waiting for the agent to come back on the '
                          'node %s', task.node.uuid)
                raise

            # The reboot command still being listed is treated as the old
            # agent instance answering, i.e. the node has not gone down yet.
            if any(cmd['command_name'] == agent_client.REBOOT_COMMAND
                   for cmd in status):
                LOG.debug('Still waiting for the agent to power off on the '
                          'node %s', task.node.uuid)
                return False

            return True

        try:
            _wait_until_rebooted(task)
        except exception.AgentConnectionFailed as exc:
            msg = _('Agent failed to come back on %(node)s with the "agent" '
                    'power interface: %(exc)s') % {
                        'node': task.node.uuid, 'exc': exc}
            LOG.error(msg)
            raise exception.PowerStateFailure(msg)
        except Exception as exc:
            LOG.error('Could not reboot node %(node)s with the "agent" power '
                      'interface: %(exc)s',
                      {'node': task.node.uuid, 'exc': exc})
            raise exception.PowerStateFailure(
                _('Unexpected error when rebooting through the agent: %s')
                % exc)
|
127
ironic/tests/unit/drivers/modules/test_agent_power.py
Normal file
127
ironic/tests/unit/drivers/modules/test_agent_power.py
Normal file
@ -0,0 +1,127 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import datetime
|
||||
from unittest import mock
|
||||
|
||||
from ironic.common import exception
|
||||
from ironic.common import states
|
||||
from ironic.conductor import task_manager
|
||||
from ironic.drivers.modules import agent_client
|
||||
from ironic.drivers.modules import agent_power
|
||||
from ironic.tests.unit.db import base as db_base
|
||||
from ironic.tests.unit.objects import utils as object_utils
|
||||
|
||||
|
||||
@mock.patch('time.sleep', lambda _sec: None)
class AgentPowerTest(db_base.DbTestCase):
    """Unit tests for the agent power interface."""

    _HEARTBEAT_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"

    def setUp(self):
        super().setUp()
        self.config(fast_track=True, group='deploy')
        self.power = agent_power.AgentPower()
        # A node that has just sent a heartbeat and is in the middle of a
        # deployment reboot.
        internal_info = {
            'agent_last_heartbeat': datetime.datetime.now().strftime(
                self._HEARTBEAT_FORMAT),
            'deployment_reboot': True,
            'agent_url': 'http://url',
            'agent_secret_token': 'very secret',
        }
        self.node = object_utils.create_test_node(
            self.context, driver_internal_info=internal_info,
            provision_state=states.DEPLOYING)
        self.task = mock.Mock(spec=task_manager.TaskManager, node=self.node)

    def _make_heartbeat_stale(self):
        # Move the recorded heartbeat far into the past.
        stale = datetime.datetime(2010, 7, 19).strftime(
            self._HEARTBEAT_FORMAT)
        self.node.driver_internal_info['agent_last_heartbeat'] = stale

    def test_basics(self):
        self.assertEqual({}, self.power.get_properties())
        self.assertFalse(self.power.supports_power_sync(self.task))
        expected_states = [states.REBOOT, states.SOFT_REBOOT]
        self.assertEqual(expected_states,
                         self.power.get_supported_power_states(self.task))

    def test_validate(self):
        self.power.validate(self.task)

    def test_validate_fails(self):
        # Heartbeat is too old
        self._make_heartbeat_stale()
        self.assertRaises(exception.InvalidParameterValue,
                          self.power.validate, self.task)

        # Heartbeat is missing altogether
        del self.node.driver_internal_info['agent_last_heartbeat']
        self.assertRaises(exception.InvalidParameterValue,
                          self.power.validate, self.task)

    def test_get_power_state(self):
        actual = self.power.get_power_state(self.task)
        self.assertEqual(states.POWER_ON, actual)

    def test_get_power_state_unknown(self):
        # Heartbeat is too old
        self._make_heartbeat_stale()
        self.assertIsNone(self.power.get_power_state(self.task))

        # Heartbeat is missing altogether
        del self.node.driver_internal_info['agent_last_heartbeat']
        self.assertIsNone(self.power.get_power_state(self.task))

    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
    def test_reboot(self, mock_reboot, mock_commands):
        # Old agent still up, then two connection failures while the node
        # is down, then an answer without the reboot command.
        still_rebooting = [
            {'command_name': 'run_image', 'command_status': 'RUNNING'}]
        rebooted = [
            {'command_name': 'get_deploy_steps', 'command_status': 'RUNNING'}]
        mock_commands.side_effect = [
            still_rebooting,
            exception.AgentConnectionFailed,
            exception.AgentConnectionFailed,
            rebooted,
        ]
        with task_manager.acquire(self.context, self.node.id) as task:
            # Save the node since the upgrade_lock call changes it
            node = task.node
            self.power.reboot(task)

            client = self.power._client
            mock_reboot.assert_called_once_with(client, node)
            mock_commands.assert_called_with(client, node,
                                             retry_connection=False,
                                             expect_errors=True)
            self.assertEqual(4, mock_commands.call_count)

            node.refresh()
            for wiped_key in ('agent_secret_token', 'agent_url'):
                self.assertNotIn(wiped_key, node.driver_internal_info)

    @mock.patch.object(agent_client.AgentClient, 'get_commands_status',
                       autospec=True)
    @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
    def test_reboot_timeout(self, mock_reboot, mock_commands):
        mock_commands.side_effect = exception.AgentConnectionFailed
        with task_manager.acquire(self.context, self.node.id) as task:
            node = task.node
            with self.assertRaisesRegex(exception.PowerStateFailure,
                                        'Agent failed to come back'):
                self.power.reboot(task, timeout=0.001)
            mock_commands.assert_called_with(self.power._client, node,
                                             retry_connection=False,
                                             expect_errors=True)

    @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
    def test_reboot_another_state(self, mock_reboot):
        with task_manager.acquire(self.context, self.node.id) as task:
            task.node.provision_state = states.DEPLOYWAIT
            self.power.reboot(task)
            mock_reboot.assert_called_once_with(self.power._client, task.node)

    @mock.patch.object(agent_client.AgentClient, 'reboot', autospec=True)
    def test_reboot_into_instance(self, mock_reboot):
        with task_manager.acquire(self.context, self.node.id) as task:
            del task.node.driver_internal_info['deployment_reboot']
            self.power.reboot(task)
            mock_reboot.assert_called_once_with(self.power._client, task.node)
|
6
releasenotes/notes/agent-power-a000fdf37cb870e4.yaml
Normal file
6
releasenotes/notes/agent-power-a000fdf37cb870e4.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
The new **experimental** ``agent`` power interface allows limited
|
||||
provisioning operations on nodes without BMC credentials. See `story
|
||||
2007771 <https://storyboard.openstack.org/#!/story/2007771>`_ for details.
|
@ -121,6 +121,7 @@ ironic.hardware.interfaces.network =
|
||||
noop = ironic.drivers.modules.network.noop:NoopNetwork
|
||||
|
||||
ironic.hardware.interfaces.power =
|
||||
agent = ironic.drivers.modules.agent_power:AgentPower
|
||||
fake = ironic.drivers.modules.fake:FakePower
|
||||
ibmc = ironic.drivers.modules.ibmc.power:IBMCPower
|
||||
idrac = ironic.drivers.modules.drac.power:DracPower
|
||||
|
Loading…
Reference in New Issue
Block a user