Fix in-band cleaning for ramdisk and anaconda deploy

We have implemented the cleaning prepare/tear_down, but haven't
implemented fetching/running in-band clean steps. This change moves
the cleaning logic from AgentDeployMixin to AgentBaseMixin, where it
arguably belongs.

In a follow-up patch I'm planning to reduce the number of mix-ins we
currently have, but that won't be backportable.

Change-Id: Ibc5610b14cea487d26191249e5c0333fdcd4b914
This commit is contained in:
Dmitry Tantsur 2021-09-02 18:30:42 +02:00
parent 8ea1a438d3
commit 75304deefb
5 changed files with 199 additions and 131 deletions

View File

@ -201,7 +201,7 @@ def validate_http_provisioning_configuration(node):
deploy_utils.check_for_missing_params(params, error_msg)
class CustomAgentDeploy(agent_base.AgentDeployMixin, agent_base.AgentBaseMixin,
class CustomAgentDeploy(agent_base.AgentBaseMixin, agent_base.AgentDeployMixin,
base.DeployInterface):
"""A deploy interface that relies on a custom agent to deploy.

View File

@ -639,19 +639,16 @@ class HeartbeatMixin(object):
class AgentBaseMixin(object):
"""Mixin with base methods not relying on any deploy steps."""
"""Mixin with base methods not relying on any deploy steps.
Provides full support for in-band and out-of-band cleaning and
the machinery to support both deploy and clean in-band steps.
"""
def should_manage_boot(self, task):
"""Whether agent boot is managed by ironic."""
return True
def refresh_steps(self, task, step_type):
"""Refresh the node's cached steps.
:param task: a TaskManager instance
:param step_type: "clean" or "deploy"
"""
@METRICS.timer('AgentBaseMixin.tear_down')
@task_manager.require_exclusive_lock
def tear_down(self, task):
@ -701,7 +698,7 @@ class AgentBaseMixin(object):
"""
pass
@METRICS.timer('AgentDeployMixin.prepare_cleaning')
@METRICS.timer('AgentBaseMixin.prepare_cleaning')
def prepare_cleaning(self, task):
"""Boot into the agent to prepare for cleaning.
@ -719,7 +716,7 @@ class AgentBaseMixin(object):
self.refresh_steps(task, 'clean')
return result
@METRICS.timer('AgentDeployMixin.tear_down_cleaning')
@METRICS.timer('AgentBaseMixin.tear_down_cleaning')
def tear_down_cleaning(self, task):
"""Clean up the PXE and DHCP files after cleaning.
@ -730,64 +727,7 @@ class AgentBaseMixin(object):
deploy_utils.tear_down_inband_cleaning(
task, manage_boot=self.should_manage_boot(task))
class AgentOobStepsMixin(object):
"""Mixin with out-of-band deploy steps."""
@METRICS.timer('AgentDeployMixin.switch_to_tenant_network')
@base.deploy_step(priority=30)
@task_manager.require_exclusive_lock
def switch_to_tenant_network(self, task):
"""Deploy step to switch the node to the tenant network.
:param task: a TaskManager object containing the node
"""
try:
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
except Exception as e:
msg = (_('Error changing node %(node)s to tenant networks after '
'deploy. %(cls)s: %(error)s') %
{'node': task.node.uuid, 'cls': e.__class__.__name__,
'error': e})
# NOTE(mgoddard): Don't collect logs since the node has been
# powered off.
log_and_raise_deployment_error(task, msg, collect_logs=False,
exc=e)
@METRICS.timer('AgentDeployMixin.boot_instance')
@base.deploy_step(priority=20)
@task_manager.require_exclusive_lock
def boot_instance(self, task):
"""Deploy step to boot the final instance.
:param task: a TaskManager object containing the node
"""
can_power_on = (states.POWER_ON in
task.driver.power.get_supported_power_states(task))
try:
if can_power_on:
manager_utils.node_power_action(task, states.POWER_ON)
else:
LOG.debug('Not trying to power on node %s that does not '
'support powering on, assuming already running',
task.node.uuid)
except Exception as e:
msg = (_('Error booting node %(node)s after deploy. '
'%(cls)s: %(error)s') %
{'node': task.node.uuid, 'cls': e.__class__.__name__,
'error': e})
# NOTE(mgoddard): Don't collect logs since the node has been
# powered off.
log_and_raise_deployment_error(task, msg, collect_logs=False,
exc=e)
class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
"""Mixin with deploy methods."""
@METRICS.timer('AgentDeployMixin.get_clean_steps')
@METRICS.timer('AgentBaseMixin.get_clean_steps')
def get_clean_steps(self, task):
"""Get the list of clean steps from the agent.
@ -806,26 +746,7 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
task, 'clean', interface='deploy',
override_priorities=new_priorities)
@METRICS.timer('AgentDeployMixin.get_deploy_steps')
def get_deploy_steps(self, task):
"""Get the list of deploy steps from the agent.
:param task: a TaskManager object containing the node
:raises InstanceDeployFailure: if the deploy steps are not yet
available (cached), for example, when a node has just been
enrolled and has not been deployed yet.
:returns: A list of deploy step dictionaries
"""
steps = super(AgentDeployMixin, self).get_deploy_steps(task)[:]
ib_steps = get_steps(task, 'deploy', interface='deploy')
# NOTE(dtantsur): we allow in-band steps to be shadowed by out-of-band
# ones, see the docstring of execute_deploy_step for details.
steps += [step for step in ib_steps
# FIXME(dtantsur): nested loops are not too efficient
if not conductor_steps.find_step(steps, step)]
return steps
@METRICS.timer('AgentDeployMixin.refresh_steps')
@METRICS.timer('AgentBaseMixin.refresh_steps')
def refresh_steps(self, task, step_type):
"""Refresh the node's cached clean/deploy steps from the booted agent.
@ -911,7 +832,7 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
'%(steps)s', {'node': node.uuid, 'steps': steps,
'type': step_type})
@METRICS.timer('AgentDeployMixin.execute_clean_step')
@METRICS.timer('AgentBaseMixin.execute_clean_step')
def execute_clean_step(self, task, step):
"""Execute a clean step asynchronously on the agent.
@ -923,37 +844,6 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
"""
return execute_step(task, step, 'clean')
@METRICS.timer('AgentDeployMixin.execute_deploy_step')
def execute_deploy_step(self, task, step):
"""Execute a deploy step.
We're trying to find a step among both out-of-band and in-band steps.
In case of duplicates, out-of-band steps take priority. This property
allows having an out-of-band deploy step that calls into
a corresponding in-band step after some preparation (e.g. with
additional input).
:param task: a TaskManager object containing the node
:param step: a deploy step dictionary to execute
:raises: InstanceDeployFailure if the agent does not return a command
status
:returns: states.DEPLOYWAIT to signify the step will be completed async
"""
agent_running = task.node.driver_internal_info.get(
'agent_cached_deploy_steps')
oob_steps = self.deploy_steps
if conductor_steps.find_step(oob_steps, step):
return super(AgentDeployMixin, self).execute_deploy_step(
task, step)
elif not agent_running:
raise exception.InstanceDeployFailure(
_('Deploy step %(step)s has not been found. Available '
'out-of-band steps: %(oob)s. Agent is not running.') %
{'step': step, 'oob': oob_steps})
else:
return execute_step(task, step, 'deploy')
def _process_version_mismatch(self, task, step_type):
node = task.node
# For manual clean, the target provision state is MANAGEABLE, whereas
@ -1015,7 +905,7 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
manager_utils.notify_conductor_resume_operation(task, step_type)
@METRICS.timer('AgentDeployMixin.process_next_step')
@METRICS.timer('AgentBaseMixin.process_next_step')
def process_next_step(self, task, step_type, **kwargs):
"""Start the next clean/deploy step if the previous one is complete.
@ -1113,6 +1003,113 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
'type': step_type})
return _step_failure_handler(task, msg, step_type)
class AgentOobStepsMixin(object):
"""Mixin with out-of-band deploy steps."""
@METRICS.timer('AgentOobStepsMixin.switch_to_tenant_network')
@base.deploy_step(priority=30)
@task_manager.require_exclusive_lock
def switch_to_tenant_network(self, task):
"""Deploy step to switch the node to the tenant network.
:param task: a TaskManager object containing the node
"""
try:
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_provisioning_network(task)
task.driver.network.configure_tenant_networks(task)
except Exception as e:
msg = (_('Error changing node %(node)s to tenant networks after '
'deploy. %(cls)s: %(error)s') %
{'node': task.node.uuid, 'cls': e.__class__.__name__,
'error': e})
# NOTE(mgoddard): Don't collect logs since the node has been
# powered off.
log_and_raise_deployment_error(task, msg, collect_logs=False,
exc=e)
@METRICS.timer('AgentOobStepsMixin.boot_instance')
@base.deploy_step(priority=20)
@task_manager.require_exclusive_lock
def boot_instance(self, task):
"""Deploy step to boot the final instance.
:param task: a TaskManager object containing the node
"""
can_power_on = (states.POWER_ON in
task.driver.power.get_supported_power_states(task))
try:
if can_power_on:
manager_utils.node_power_action(task, states.POWER_ON)
else:
LOG.debug('Not trying to power on node %s that does not '
'support powering on, assuming already running',
task.node.uuid)
except Exception as e:
msg = (_('Error booting node %(node)s after deploy. '
'%(cls)s: %(error)s') %
{'node': task.node.uuid, 'cls': e.__class__.__name__,
'error': e})
# NOTE(mgoddard): Don't collect logs since the node has been
# powered off.
log_and_raise_deployment_error(task, msg, collect_logs=False,
exc=e)
class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
"""Mixin with deploy methods."""
@METRICS.timer('AgentDeployMixin.get_deploy_steps')
def get_deploy_steps(self, task):
"""Get the list of deploy steps from the agent.
:param task: a TaskManager object containing the node
:raises InstanceDeployFailure: if the deploy steps are not yet
available (cached), for example, when a node has just been
enrolled and has not been deployed yet.
:returns: A list of deploy step dictionaries
"""
steps = super(AgentDeployMixin, self).get_deploy_steps(task)[:]
ib_steps = get_steps(task, 'deploy', interface='deploy')
# NOTE(dtantsur): we allow in-band steps to be shadowed by out-of-band
# ones, see the docstring of execute_deploy_step for details.
steps += [step for step in ib_steps
# FIXME(dtantsur): nested loops are not too efficient
if not conductor_steps.find_step(steps, step)]
return steps
@METRICS.timer('AgentDeployMixin.execute_deploy_step')
def execute_deploy_step(self, task, step):
"""Execute a deploy step.
We're trying to find a step among both out-of-band and in-band steps.
In case of duplicates, out-of-band steps take priority. This property
allows having an out-of-band deploy step that calls into
a corresponding in-band step after some preparation (e.g. with
additional input).
:param task: a TaskManager object containing the node
:param step: a deploy step dictionary to execute
:raises: InstanceDeployFailure if the agent does not return a command
status
:returns: states.DEPLOYWAIT to signify the step will be completed async
"""
agent_running = task.node.driver_internal_info.get(
'agent_cached_deploy_steps')
oob_steps = self.deploy_steps
if conductor_steps.find_step(oob_steps, step):
return super(AgentDeployMixin, self).execute_deploy_step(
task, step)
elif not agent_running:
raise exception.InstanceDeployFailure(
_('Deploy step %(step)s has not been found. Available '
'out-of-band steps: %(oob)s. Agent is not running.') %
{'step': step, 'oob': oob_steps})
else:
return execute_step(task, step, 'deploy')
@METRICS.timer('AgentDeployMixin.tear_down_agent')
@base.deploy_step(priority=40)
@task_manager.require_exclusive_lock

View File

@ -48,6 +48,11 @@ DRIVER_INFO = db_utils.get_test_agent_driver_info()
DRIVER_INTERNAL_INFO = db_utils.get_test_agent_driver_internal_info()
class FakeAgentDeploy(agent_base.AgentBaseMixin, agent_base.AgentDeployMixin,
fake.FakeDeploy):
pass
class AgentDeployMixinBaseTest(db_base.DbTestCase):
def setUp(self):
@ -66,7 +71,7 @@ class AgentDeployMixinBaseTest(db_base.DbTestCase):
'default_%s_interface' % iface: impl}
self.config(**config_kwarg)
self.config(enabled_hardware_types=['fake-hardware'])
self.deploy = agent_base.AgentDeployMixin()
self.deploy = FakeAgentDeploy()
n = {
'driver': 'fake-hardware',
'instance_info': INSTANCE_INFO,
@ -1712,8 +1717,8 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
autospec=True)
@mock.patch.object(agent_base.AgentDeployMixin,
'refresh_steps', autospec=True)
@mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def _test_continue_cleaning_clean_version_mismatch(
@ -1752,8 +1757,8 @@ class AgentDeployMixinTest(AgentDeployMixinBaseTest):
autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_operation',
autospec=True)
@mock.patch.object(agent_base.AgentDeployMixin,
'refresh_steps', autospec=True)
@mock.patch.object(agent_base.AgentBaseMixin, 'refresh_steps',
autospec=True)
@mock.patch.object(agent_client.AgentClient, 'get_commands_status',
autospec=True)
def test_continue_cleaning_clean_version_mismatch_fail(
@ -1847,6 +1852,7 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
def setUp(self):
super(TestRefreshCleanSteps, self).setUp()
self.deploy = agent_base.AgentBaseMixin()
self.node.driver_internal_info['agent_url'] = 'http://127.0.0.1:9999'
self.ports = [object_utils.create_test_port(self.context,
node_id=self.node.id)]
@ -1984,10 +1990,6 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
task.ports)
class FakeAgentDeploy(agent_base.AgentDeployMixin, fake.FakeDeploy):
pass
class StepMethodsTestCase(db_base.DbTestCase):
def setUp(self):

View File

@ -1041,6 +1041,58 @@ class PXERamdiskDeployTestCase(db_base.DbTestCase):
self.assertIsNone(task.driver.deploy.deploy(task))
self.assertTrue(mock_warning.called)
@mock.patch.object(agent_base, 'get_steps', autospec=True)
def test_get_clean_steps(self, mock_get_steps):
# Test getting clean steps
mock_steps = [{'priority': 10, 'interface': 'deploy',
'step': 'erase_devices'}]
mock_get_steps.return_value = mock_steps
with task_manager.acquire(self.context, self.node.uuid) as task:
steps = task.driver.deploy.get_clean_steps(task)
mock_get_steps.assert_called_once_with(
task, 'clean', interface='deploy',
override_priorities={'erase_devices': None,
'erase_devices_metadata': None})
self.assertEqual(mock_steps, steps)
def test_get_deploy_steps(self):
# Only the default deploy step exists in the ramdisk deploy
expected = [{'argsinfo': None, 'interface': 'deploy', 'priority': 100,
'step': 'deploy'}]
with task_manager.acquire(self.context, self.node.uuid) as task:
steps = task.driver.deploy.get_deploy_steps(task)
self.assertEqual(expected, steps)
@mock.patch.object(agent_base, 'execute_step', autospec=True)
def test_execute_clean_step(self, mock_execute_step):
step = {
'priority': 10,
'interface': 'deploy',
'step': 'erase_devices',
'reboot_requested': False
}
with task_manager.acquire(self.context, self.node.uuid) as task:
result = task.driver.deploy.execute_clean_step(task, step)
self.assertIs(result, mock_execute_step.return_value)
mock_execute_step.assert_called_once_with(task, step, 'clean')
@mock.patch.object(deploy_utils, 'prepare_inband_cleaning', autospec=True)
def test_prepare_cleaning(self, prepare_inband_cleaning_mock):
prepare_inband_cleaning_mock.return_value = states.CLEANWAIT
with task_manager.acquire(self.context, self.node.uuid) as task:
self.assertEqual(
states.CLEANWAIT, task.driver.deploy.prepare_cleaning(task))
prepare_inband_cleaning_mock.assert_called_once_with(
task, manage_boot=True)
@mock.patch.object(deploy_utils, 'tear_down_inband_cleaning',
autospec=True)
def test_tear_down_cleaning(self, tear_down_cleaning_mock):
with task_manager.acquire(self.context, self.node.uuid) as task:
task.driver.deploy.tear_down_cleaning(task)
tear_down_cleaning_mock.assert_called_once_with(
task, manage_boot=True)
class PXEAnacondaDeployTestCase(db_base.DbTestCase):

View File

@ -0,0 +1,17 @@
---
upgrade:
- |
In-band cleaning has been fixed for ``ramdisk`` and ``anaconda``
deploy interfaces. If you rely on actual clean steps not running,
you need to disable cleaning instead for the relevant nodes::
baremetal node set <node> --no-automated-clean
fixes:
- |
Fixes in-band cleaning for the ``ramdisk`` and ``anaconda`` deploy
interfaces. Previously no in-band steps were fetched from the ramdisk.
other:
- |
The cleaning code has been moved from ``AgentDeployMixin`` to
``AgentBaseMixin``. Most of 3rd party deploy interfaces will need to
include both anyway.