Add CLEANWAIT state

This patch adds the CLEANWAIT state. When a node is in CLEANWAIT means
that the ramdisk is executing a clean step (async). When the node is
in CLEANING state it means that the conductor is executing a clean step
(sync).

This is the first patch of a series that aim to make nodes in CLEANWAIT
abortable. We still need a way need some way to tell if a step is
abortable; aborting steps could have negative effects such as bricking
things.

Depends-On: I195ecd90e7e4165504da5ac330cee3fc7c3039c2
Co-Authored-By: Jim Rollenhagen <jim@jimrollenhagen.com>
Partial-Bug: #1455825
Change-Id: Ic2bc4f147f68947f53d341fda5e0c8d7b594a553
This commit is contained in:
Lucas Alvares Gomes 2015-07-09 16:29:12 +01:00
parent 7f88443bce
commit b6ed09e297
12 changed files with 192 additions and 49 deletions

View File

@ -132,6 +132,13 @@ represented in target_provision_state.
CLEANING = 'cleaning'
""" Node is being automatically cleaned to prepare it for provisioning. """
CLEANWAIT = 'clean wait'
""" Node is waiting to be cleaned.
This will be the node `provision_state` while the node is waiting for
the driver to finish cleaning step.
"""
CLEANFAIL = 'clean failed'
""" Node failed cleaning. This requires operator intervention to resolve. """
@ -220,6 +227,7 @@ machine.add_state(DEPLOYFAIL, target=ACTIVE, **watchers)
# Add clean* states
machine.add_state(CLEANING, target=AVAILABLE, **watchers)
machine.add_state(CLEANWAIT, target=AVAILABLE, **watchers)
machine.add_state(CLEANFAIL, target=AVAILABLE, **watchers)
# Add delete* states
@ -278,6 +286,11 @@ machine.add_transition(CLEANING, AVAILABLE, 'done')
# If cleaning fails, wait for operator intervention
machine.add_transition(CLEANING, CLEANFAIL, 'fail')
machine.add_transition(CLEANWAIT, CLEANFAIL, 'fail')
# A deployment may also wait on external callbacks
machine.add_transition(CLEANING, CLEANWAIT, 'wait')
machine.add_transition(CLEANWAIT, CLEANING, 'resume')
# An operator may want to move a CLEANFAIL node to MANAGEABLE, to perform
# other actions like zapping

View File

@ -830,7 +830,7 @@ class ConductorManager(periodic_task.PeriodicTasks):
:param context: an admin context.
:param node_id: the id or uuid of a node.
:raises: InvalidStateRequested if the node is not in CLEANING state
:raises: InvalidStateRequested if the node is not in CLEANWAIT state
:raises: NoFreeConductorWorker when there is no free worker to start
async task
:raises: NodeLocked if node is locked by another conductor.
@ -841,15 +841,27 @@ class ConductorManager(periodic_task.PeriodicTasks):
with task_manager.acquire(context, node_id, shared=False,
purpose='node cleaning') as task:
if task.node.provision_state != states.CLEANING:
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the M
# release starts.
if task.node.provision_state not in (states.CLEANWAIT,
states.CLEANING):
raise exception.InvalidStateRequested(_(
'Cannot continue cleaning on %(node)s, node is in '
'%(state)s state, should be %(clean_state)s') %
{'node': task.node.uuid,
'state': task.node.provision_state,
'clean_state': states.CLEANING})
'clean_state': states.CLEANWAIT})
task.set_spawn_error_hook(cleaning_error_handler, task.node,
'Failed to run next clean step')
# TODO(lucasagomes): This conditional is here for backwards
# compat with previous code. Should be removed once the M
# release starts.
if task.node.provision_state == states.CLEANWAIT:
task.process_event('resume')
task.spawn_after(
self._spawn_worker,
self._do_next_clean_step,
@ -892,11 +904,20 @@ class ConductorManager(periodic_task.PeriodicTasks):
% {'node': node.uuid, 'e': e})
LOG.exception(msg)
return cleaning_error_handler(task, msg)
# TODO(lucasagomes): Should be removed once the M release starts
if prepare_result == states.CLEANING:
LOG.warning(_LW('Returning CLEANING for asynchronous prepare '
'cleaning has been deprecated. Please use '
'CLEANWAIT instead.'))
prepare_result = states.CLEANWAIT
if prepare_result == states.CLEANWAIT:
# Prepare is asynchronous, the deploy driver will need to
# set node.driver_internal_info['clean_steps'] and
# node.clean_step and then make an RPC call to
# continue_node_cleaning to start cleaning.
task.process_event('wait')
return
set_node_cleaning_steps(task)
@ -951,15 +972,23 @@ class ConductorManager(periodic_task.PeriodicTasks):
cleaning_error_handler(task, msg)
return
# Check if the step is done or not. The step should return
# states.CLEANING if the step is still being executed, or
# None if the step is done.
# TODO(lucasagomes): Should be removed once the M release starts
if result == states.CLEANING:
LOG.warning(_LW('Returning CLEANING for asynchronous clean '
'steps has been deprecated. Please use '
'CLEANWAIT instead.'))
result = states.CLEANWAIT
# Check if the step is done or not. The step should return
# states.CLEANWAIT if the step is still being executed, or
# None if the step is done.
if result == states.CLEANWAIT:
# Kill this worker, the async step will make an RPC call to
# continue_node_clean to continue cleaning
LOG.info(_LI('Clean step %(step)s on node %(node)s being '
'executed asynchronously, waiting for driver.') %
{'node': node.uuid, 'step': step})
task.process_event('wait')
return
elif result is not None:
msg = (_('While executing step %(step)s on node '
@ -1091,14 +1120,15 @@ class ConductorManager(periodic_task.PeriodicTasks):
# (through to its DB API call) so that we can eliminate our call
# and first set of checks below.
exclude_states = (states.DEPLOYWAIT, states.ENROLL)
exclude_states = (states.DEPLOYWAIT, states.CLEANWAIT, states.ENROLL)
filters = {'reserved': False, 'maintenance': False}
node_iter = self.iter_nodes(fields=['id'], filters=filters)
for (node_uuid, driver, node_id) in node_iter:
try:
# NOTE(deva): we should not acquire a lock on a node in
# DEPLOYWAIT, as this could cause an error within
# a deploy ramdisk POSTing back at the same time.
# DEPLOYWAIT/CLEANWAIT, as this could cause an
# error within a deploy ramdisk POSTing back at
# the same time.
# TODO(deva): refactor this check, because it needs to be done
# in every periodic task, not just this one.
node = objects.Node.get_by_id(context, node_id)

View File

@ -177,7 +177,7 @@ class BaseInterface(object):
A clean step should take a single argument: a TaskManager object.
A step can be executed synchronously or asynchronously. A step should
return None if the method has completed synchronously or
states.CLEANING if the step will continue to execute asynchronously.
states.CLEANWAIT if the step will continue to execute asynchronously.
If the step executes asynchronously, it should issue a call to the
'continue_node_clean' RPC, so the conductor can begin the next
clean step.
@ -185,7 +185,7 @@ class BaseInterface(object):
:param task: A TaskManager object
:param step: The clean step dictionary representing the step to execute
:returns: None if this method has completed synchronously, or
states.CLEANING if the step will continue to execute
states.CLEANWAIT if the step will continue to execute
asynchronously.
"""
return getattr(self, step['step'])(task)
@ -312,7 +312,7 @@ class DeployInterface(BaseInterface):
:param task: a TaskManager instance containing the node to act on.
:returns: If this function is going to be asynchronous, should return
`states.CLEANING`. Otherwise, should return `None`. The interface
`states.CLEANWAIT`. Otherwise, should return `None`. The interface
will need to call _get_cleaning_steps and then RPC to
continue_node_cleaning
"""
@ -859,13 +859,14 @@ class InspectInterface(object):
def clean_step(priority):
"""Decorator for cleaning and zapping steps.
If priority is greater than 0, the function will be executed as part of the
CLEANING state for any node using the interface with the decorated clean
step. During CLEANING, a list of steps will be ordered by priority for all
interfaces associated with the node, and then execute_clean_step() will be
called on each step. Steps will be executed based on priority, with the
highest priority step being called first, the next highest priority
being call next, and so on.
If priority is greater than 0, the function will be executed as part
of the CLEANING (sync) or CLEANWAIT (async) state for any node using
the interface with the decorated clean step. During the cleaning,
a list of steps will be ordered by priority for all interfaces
associated with the node, and then execute_clean_step() will be
called on each step. Steps will be executed based on priority,
with the highest priority step being called first, the next highest
priority being call next, and so on.
Decorated clean steps should take a single argument, a TaskManager object.
@ -877,7 +878,7 @@ def clean_step(priority):
Clean steps can be either synchronous or asynchronous. If the step is
synchronous, it should return `None` when finished, and the conductor will
continue on to the next step. If the step is asynchronous, the step should
return `states.CLEANING` to signal to the conductor. When the step is
return `states.CLEANWAIT` to signal to the conductor. When the step is
complete, the step should make an RPC call to `continue_node_clean` to move
to the next step in cleaning.

View File

@ -385,7 +385,7 @@ class AgentDeploy(base.DeployInterface):
:param step: a clean step dictionary to execute
:raises: NodeCleaningFailure if the agent does not return a command
status
:returns: states.CLEANING to signify the step will be completed async
:returns: states.CLEANWAIT to signify the step will be completed async
"""
return deploy_utils.agent_execute_clean_step(task, step)
@ -395,7 +395,7 @@ class AgentDeploy(base.DeployInterface):
:param task: a TaskManager object containing the node
:raises NodeCleaningFailure: if the previous cleaning ports cannot
be removed or if new cleaning ports cannot be created
:returns: states.CLEANING to signify an asynchronous prepare
:returns: states.CLEANWAIT to signify an asynchronous prepare
"""
provider = dhcp_factory.DHCPFactory()
# If we have left over ports from a previous cleaning, remove them
@ -416,7 +416,7 @@ class AgentDeploy(base.DeployInterface):
_prepare_pxe_boot(task)
_do_pxe_boot(task, ports)
# Tell the conductor we are waiting for the agent to boot.
return states.CLEANING
return states.CLEANWAIT
def tear_down_cleaning(self, task):
"""Clean up the PXE and DHCP files after cleaning.

View File

@ -263,14 +263,22 @@ class BaseAgentVendor(base.VendorInterface):
elif (node.provision_state == states.DEPLOYWAIT and
self.deploy_has_started(task)):
node.touch_provisioning()
elif (node.provision_state == states.CLEANING and
not node.clean_step):
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the M
# release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
and not node.clean_step):
# Agent booted from prepare_cleaning
LOG.debug('Node %s just booted to start cleaning.', node.uuid)
manager.set_node_cleaning_steps(task)
self._notify_conductor_resume_clean(task)
elif (node.provision_state == states.CLEANING and
node.clean_step):
# TODO(lucasagomes): CLEANING here for backwards compat
# with previous code, otherwise nodes in CLEANING when this
# is deployed would fail. Should be removed once the M
# release starts.
elif (node.provision_state in (states.CLEANWAIT, states.CLEANING)
and node.clean_step):
self.continue_cleaning(task, **kwargs)
except Exception as e:

View File

@ -959,7 +959,7 @@ def agent_execute_clean_step(task, step):
:param task: a TaskManager object containing the node
:param step: a clean step dictionary to execute
:raises: NodeCleaningFailure if the agent does not return a command status
:returns: states.CLEANING to signify the step will be completed async
:returns: states.CLEANWAIT to signify the step will be completed async
"""
client = agent_client.AgentClient()
ports = objects.Port.list_by_node_id(
@ -970,7 +970,7 @@ def agent_execute_clean_step(task, step):
'Agent on node %(node)s returned bad command result: '
'%(result)s') % {'node': task.node.uuid,
'result': result.get('command_error')})
return states.CLEANING
return states.CLEANWAIT
def agent_add_clean_params(task):

View File

@ -592,7 +592,7 @@ class IloVirtualMediaAgentDeploy(base.DeployInterface):
:param task: a TaskManager object containing the node
:param step: a clean step dictionary to execute
:returns: states.CLEANING to signify the step will be completed async
:returns: states.CLEANWAIT to signify the step will be completed async
"""
return deploy_utils.agent_execute_clean_step(task, step)
@ -614,7 +614,7 @@ class IloVirtualMediaAgentDeploy(base.DeployInterface):
_prepare_agent_vmedia_boot(task)
# Tell the conductor we are waiting for the agent to boot.
return states.CLEANING
return states.CLEANWAIT
def tear_down_cleaning(self, task):
"""Clean up the PXE and DHCP files after cleaning."""

View File

@ -1583,7 +1583,7 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
def test_continue_node_clean_worker_pool_full(self, mock_spawn):
# Test the appropriate exception is raised if the worker pool is full
prv_state = states.CLEANING
prv_state = states.CLEANWAIT
tgt_prv_state = states.AVAILABLE
node = obj_utils.create_test_node(self.context, driver='fake',
provision_state=prv_state,
@ -1599,14 +1599,11 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
self.service._worker_pool.waitall()
node.refresh()
# Make sure things were rolled back
self.assertEqual(prv_state, node.provision_state)
self.assertEqual(tgt_prv_state, node.target_provision_state)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
def test_continue_node_clean_wrong_state(self, mock_spawn):
# Test the appropriate exception is raised if node isn't already
# in CLEANING state
# in CLEANWAIT state
prv_state = states.DELETING
tgt_prv_state = states.AVAILABLE
node = obj_utils.create_test_node(self.context, driver='fake',
@ -1628,9 +1625,9 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
self.assertIsNone(node.reservation)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker')
def test_continue_node_clean(self, mock_spawn):
def _continue_node_clean(self, return_state, mock_spawn):
# test a node can continue cleaning via RPC
prv_state = states.CLEANING
prv_state = return_state
tgt_prv_state = states.AVAILABLE
driver_info = {'clean_steps': self.clean_steps}
node = obj_utils.create_test_node(self.context, driver='fake',
@ -1647,6 +1644,12 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
mock.ANY, self.clean_steps,
self.clean_steps[1])
def test_continue_node_clean(self):
self._continue_node_clean(states.CLEANWAIT)
def test_continue_node_clean_backward_compat(self):
self._continue_node_clean(states.CLEANING)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate')
def test__do_node_clean_validate_fail(self, mock_validate):
# InvalidParameterValue should be cause node to go to CLEANFAIL
@ -1717,7 +1720,7 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
self.assertEqual(states.AVAILABLE, node.target_provision_state)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
def test__do_next_clean_step_first_step_async(self, mock_execute):
def _do_next_clean_step_first_step_async(self, return_state, mock_execute):
# Execute the first async clean step on a node
node = obj_utils.create_test_node(
self.context, driver='fake',
@ -1725,7 +1728,7 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
target_provision_state=states.AVAILABLE,
last_error=None,
clean_step={})
mock_execute.return_value = states.CLEANING
mock_execute.return_value = return_state
self._start_service()
@ -1737,12 +1740,19 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
self.service._worker_pool.waitall()
node.refresh()
self.assertEqual(states.CLEANING, node.provision_state)
self.assertEqual(states.CLEANWAIT, node.provision_state)
self.assertEqual(self.clean_steps[0], node.clean_step)
mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[0])
def test_do_next_clean_step_first_step_async(self):
self._do_next_clean_step_first_step_async(states.CLEANWAIT)
def test_do_next_clean_step_first_step_async_backward_compat(self):
self._do_next_clean_step_first_step_async(states.CLEANING)
@mock.patch('ironic.drivers.modules.fake.FakePower.execute_clean_step')
def test__do_next_clean_step_continue_from_last_step(self, mock_execute):
def _do_next_clean_step_continue_from_last_step(self, return_state,
mock_execute):
# Resume an in-progress cleaning after the first async step
node = obj_utils.create_test_node(
self.context, driver='fake',
@ -1750,7 +1760,7 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
target_provision_state=states.AVAILABLE,
last_error=None,
clean_step=self.clean_steps[0])
mock_execute.return_value = states.CLEANING
mock_execute.return_value = return_state
self._start_service()
@ -1762,10 +1772,48 @@ class DoNodeCleanTestCase(_ServiceSetUpMixin, tests_db_base.DbTestCase):
self.service._worker_pool.waitall()
node.refresh()
self.assertEqual(states.CLEANING, node.provision_state)
self.assertEqual(states.CLEANWAIT, node.provision_state)
self.assertEqual(self.clean_steps[1], node.clean_step)
mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[1])
def test_do_next_clean_step_continue_from_last_step(self):
self._do_next_clean_step_continue_from_last_step(states.CLEANWAIT)
def test_do_next_clean_step_continue_from_last_step_backward_compat(self):
self._do_next_clean_step_continue_from_last_step(states.CLEANING)
@mock.patch('ironic.drivers.modules.fake.FakePower.execute_clean_step')
def _do_next_clean_step_continue_from_last_cleaning(self, return_state,
mock_execute):
# Resume an in-progress cleaning after the first async step
node = obj_utils.create_test_node(
self.context, driver='fake',
provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE,
last_error=None,
clean_step=self.clean_steps[0])
mock_execute.return_value = return_state
self._start_service()
with task_manager.acquire(
self.context, node['id'], shared=False) as task:
self.service._do_next_clean_step(task, self.clean_steps,
self.clean_steps[0])
self.service._worker_pool.waitall()
node.refresh()
self.assertEqual(states.CLEANWAIT, node.provision_state)
self.assertEqual(self.clean_steps[1], node.clean_step)
mock_execute.assert_called_once_with(mock.ANY, self.clean_steps[1])
def test_do_next_clean_step_continue_from_last_cleaning(self):
self._do_next_clean_step_continue_from_last_cleaning(states.CLEANWAIT)
def test_do_next_clean_step_continue_from_last_cleaning_backward_com(self):
self._do_next_clean_step_continue_from_last_cleaning(states.CLEANING)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step')
def test__do_next_clean_step_last_step_noop(self, mock_execute):
# Resume where last_step is the last cleaning step, should be noop

View File

@ -791,7 +791,7 @@ class IloVirtualMediaAgentDeployTestCase(db_base.DbTestCase):
shared=False) as task:
returned_state = task.driver.deploy.prepare_cleaning(task)
vmedia_boot_mock.assert_called_once_with(task)
self.assertEqual(states.CLEANING, returned_state)
self.assertEqual(states.CLEANWAIT, returned_state)
create_port_mock.assert_called_once_with(mock.ANY, task)
delete_mock.assert_called_once_with(mock.ANY, task)
self.assertEqual(task.node.driver_internal_info.get(

View File

@ -312,7 +312,7 @@ class TestAgentDeploy(db_base.DbTestCase):
create_mock.return_value = ports
with task_manager.acquire(
self.context, self.node['uuid'], shared=False) as task:
self.assertEqual(states.CLEANING,
self.assertEqual(states.CLEANWAIT,
self.driver.prepare_cleaning(task))
prepare_mock.assert_called_once_with(task)
boot_mock.assert_called_once_with(task, ports)

View File

@ -23,6 +23,7 @@ import mock
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common import states
from ironic.conductor import manager
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.drivers.modules import agent_base_vendor
@ -296,6 +297,48 @@ class TestBaseAgentVendor(db_base.DbTestCase):
'1be26c0b-03f2-4d2e-ae87-c02d7f33c123: Failed checking if deploy '
'is done. exception: LlamaException')
@mock.patch.object(manager, 'set_node_cleaning_steps', autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'_notify_conductor_resume_clean', autospec=True)
def test_heartbeat_resume_clean(self, mock_notify, mock_set_steps):
kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar'
}
self.node.clean_step = {}
for state in (states.CLEANWAIT, states.CLEANING):
self.node.provision_state = state
self.node.save()
with task_manager.acquire(
self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs)
mock_notify.assert_called_once_with(mock.ANY, task)
mock_set_steps.assert_called_once_with(task)
mock_notify.reset_mock()
mock_set_steps.reset_mock()
@mock.patch.object(agent_base_vendor.BaseAgentVendor,
'continue_cleaning', autospec=True)
def test_heartbeat_continue_cleaning(self, mock_continue):
kwargs = {
'agent_url': 'http://127.0.0.1:9999/bar'
}
self.node.clean_step = {
'priority': 10,
'interface': 'deploy',
'step': 'foo',
'reboot_requested': False
}
for state in (states.CLEANWAIT, states.CLEANING):
self.node.provision_state = state
self.node.save()
with task_manager.acquire(
self.context, self.node.uuid, shared=True) as task:
self.passthru.heartbeat(task, **kwargs)
mock_continue.assert_called_once_with(mock.ANY, task, **kwargs)
mock_continue.reset_mock()
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.BaseAgentVendor, 'reboot_to_instance',

View File

@ -1843,7 +1843,7 @@ class AgentCleaningTestCase(db_base.DbTestCase):
response = utils.agent_execute_clean_step(
task,
self.clean_steps['clean_steps']['GenericHardwareManager'][0])
self.assertEqual(states.CLEANING, response)
self.assertEqual(states.CLEANWAIT, response)
@mock.patch('ironic.objects.Port.list_by_node_id',
spec_set=types.FunctionType)
@ -1859,7 +1859,7 @@ class AgentCleaningTestCase(db_base.DbTestCase):
response = utils.agent_execute_clean_step(
task,
self.clean_steps['clean_steps']['GenericHardwareManager'][0])
self.assertEqual(states.CLEANING, response)
self.assertEqual(states.CLEANWAIT, response)
@mock.patch('ironic.objects.Port.list_by_node_id',
spec_set=types.FunctionType)
@ -1876,7 +1876,7 @@ class AgentCleaningTestCase(db_base.DbTestCase):
response = utils.agent_execute_clean_step(
task,
self.clean_steps['clean_steps']['GenericHardwareManager'][0])
self.assertEqual(states.CLEANING, response)
self.assertEqual(states.CLEANWAIT, response)
def test_agent_add_clean_params(self):
cfg.CONF.agent.agent_erase_devices_iterations = 2