Merge "Add an option to abort cleaning and deployment if node is in maintenance"

This commit is contained in:
Zuul 2019-09-21 08:50:13 +00:00 committed by Gerrit Code Review
commit 4e88793256
8 changed files with 184 additions and 14 deletions

View File

@ -1300,6 +1300,14 @@ class ConductorManager(base_manager.BaseConductorManager):
'successfully moved to AVAILABLE state.', node.uuid)
return
# NOTE(dtantsur): this is only reachable during automated cleaning,
# for manual cleaning we verify maintenance mode earlier on.
if (not CONF.conductor.allow_provisioning_in_maintenance
and node.maintenance):
msg = _('Cleaning a node in maintenance mode is not allowed')
return utils.cleaning_error_handler(task, msg,
tear_down_cleaning=False)
try:
# NOTE(ghe): Valid power and network values are needed to perform
# a cleaning.
@ -1543,7 +1551,8 @@ class ConductorManager(base_manager.BaseConductorManager):
exception.NodeLocked,
exception.InvalidParameterValue,
exception.InvalidStateRequested,
exception.UnsupportedDriverExtension)
exception.UnsupportedDriverExtension,
exception.NodeInMaintenance)
def do_provisioning_action(self, context, node_id, action):
"""RPC method to initiate certain provisioning state transitions.
@ -1556,7 +1565,7 @@ class ConductorManager(base_manager.BaseConductorManager):
:raises: InvalidParameterValue
:raises: InvalidStateRequested
:raises: NoFreeConductorWorker
:raises: NodeInMaintenance
"""
with task_manager.acquire(context, node_id, shared=False,
purpose='provision action %s'
@ -1564,6 +1573,11 @@ class ConductorManager(base_manager.BaseConductorManager):
node = task.node
if (action == states.VERBS['provide']
and node.provision_state == states.MANAGEABLE):
# NOTE(dtantsur): do this early to avoid entering cleaning.
if (not CONF.conductor.allow_provisioning_in_maintenance
and node.maintenance):
raise exception.NodeInMaintenance(op=_('providing'),
node=node.uuid)
task.process_event(
'provide',
callback=self._spawn_worker,

View File

@ -416,7 +416,9 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = msg
node.maintenance_reason = msg
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason:
node.maintenance_reason = msg
node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL:

View File

@ -178,6 +178,18 @@ opts = [
'longer. In an environment where all tenants are '
'trusted (eg, because there is only one tenant), '
'this option could be safely disabled.')),
cfg.BoolOpt('allow_provisioning_in_maintenance',
default=True,
mutable=True,
help=_('Whether to allow nodes to enter or undergo deploy or '
'cleaning when in maintenance mode. If this option is '
'set to False, and a node enters maintenance during '
'deploy or cleaning, the process will be aborted '
'after the next heartbeat. Automated cleaning or '
'making a node available will also fail. If True '
'(the default), the process will begin and will pause '
'after the node starts heartbeating. Moving it from '
'maintenance will make the process continue.')),
cfg.IntOpt('clean_callback_timeout',
default=1800,
help=_('Timeout (seconds) to wait for a callback from the '

View File

@ -296,6 +296,31 @@ class HeartbeatMixin(object):
return FASTTRACK_HEARTBEAT_ALLOWED
return HEARTBEAT_ALLOWED
def _heartbeat_in_maintenance(self, task):
node = task.node
if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
and not CONF.conductor.allow_provisioning_in_maintenance):
LOG.error('Aborting cleaning for node %s, as it is in maintenance '
'mode', node.uuid)
last_error = _('Cleaning aborted as node is in maintenance mode')
manager_utils.cleaning_error_handler(task, last_error)
elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
and not CONF.conductor.allow_provisioning_in_maintenance):
LOG.error('Aborting deployment for node %s, as it is in '
'maintenance mode', node.uuid)
last_error = _('Deploy aborted as node is in maintenance mode')
deploy_utils.set_failed_state(task, last_error, collect_logs=False)
elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
and not CONF.conductor.allow_provisioning_in_maintenance):
LOG.error('Aborting rescuing for node %s, as it is in '
'maintenance mode', node.uuid)
last_error = _('Rescue aborted as node is in maintenance mode')
manager_utils.rescuing_error_handler(task, last_error)
else:
LOG.warning('Heartbeat from node %(node)s in '
'maintenance mode; not taking any action.',
{'node': node.uuid})
@METRICS.timer('HeartbeatMixin.heartbeat')
def heartbeat(self, task, callback_url, agent_version):
"""Process a heartbeat.
@ -342,20 +367,18 @@ class HeartbeatMixin(object):
'node as on-line.', {'node': task.node.uuid})
return
if node.maintenance:
return self._heartbeat_in_maintenance(task)
# Async call backs don't set error state on their own
# TODO(jimrollenhagen) improve error messages here
msg = _('Failed checking if deploy is done.')
try:
if node.maintenance:
# this shouldn't happen often, but skip the rest if it does.
LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
'not taking any action.', {'node': node.uuid})
return
# NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
# are currently in the core deploy.deploy step. Other deploy steps
# may cause the agent to boot, but we should not trigger deployment
# at that point.
elif node.provision_state == states.DEPLOYWAIT:
if node.provision_state == states.DEPLOYWAIT:
if self.in_core_deploy_step(task):
if not self.deploy_has_started(task):
msg = _('Node failed to deploy.')

View File

@ -3161,6 +3161,31 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
mock_spawn.assert_called_with(self.service,
self.service._do_node_clean, mock.ANY)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_do_provision_action_provide_in_maintenance(self, mock_spawn):
CONF.set_override('allow_provisioning_in_maintenance', False,
group='conductor')
# test when a node is cleaned going from manageable to available
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.MANAGEABLE,
target_provision_state=None,
maintenance=True)
self._start_service()
mock_spawn.reset_mock()
exc = self.assertRaises(messaging.rpc.ExpectedException,
self.service.do_provisioning_action,
self.context, node.uuid, 'provide')
# Compare true exception hidden by @messaging.expected_exceptions
self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0])
node.refresh()
self.assertEqual(states.MANAGEABLE, node.provision_state)
self.assertIsNone(node.target_provision_state)
self.assertIsNone(node.last_error)
self.assertFalse(mock_spawn.called)
@mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
autospec=True)
def test_do_provision_action_manage(self, mock_spawn):
@ -3822,6 +3847,31 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertTrue(mock_validate.called)
self.assertIn('clean_steps', node.driver_internal_info)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_cleaning',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',
autospec=True)
def test__do_node_clean_maintenance(self, mock_prep, mock_tear_down):
CONF.set_override('allow_provisioning_in_maintenance', False,
group='conductor')
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANING,
target_provision_state=states.AVAILABLE,
maintenance=True,
maintenance_reason='Original reason')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_node_clean(task)
node.refresh()
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(states.AVAILABLE, node.target_provision_state)
self.assertIn('is not allowed', node.last_error)
self.assertTrue(node.maintenance)
self.assertEqual('Original reason', node.maintenance_reason)
self.assertFalse(mock_prep.called)
self.assertFalse(mock_tear_down.called)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',

View File

@ -986,9 +986,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
# strict typing of the node power state fields and would fail if passed
# a Mock object in constructors. A task context is also required for
# notifications.
power_attrs = {'power_state': states.POWER_OFF,
'target_power_state': states.POWER_ON}
self.node.configure_mock(**power_attrs)
self.node.configure_mock(power_state=states.POWER_OFF,
target_power_state=states.POWER_ON,
maintenance=False, maintenance_reason=None)
self.task.context = self.context
@mock.patch.object(conductor_utils, 'LOG')
@ -1116,7 +1116,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.assertEqual('clean failure', self.node.fault)
def test_abort_on_conductor_take_over_cleaning(self):
self.node.maintenance = False
self.node.provision_state = states.CLEANFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertTrue(self.node.maintenance)
@ -1128,7 +1127,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
self.node.save.assert_called_once_with()
def test_abort_on_conductor_take_over_deploying(self):
self.node.maintenance = False
self.node.provision_state = states.DEPLOYFAIL
conductor_utils.abort_on_conductor_take_over(self.task)
self.assertFalse(self.node.maintenance)

View File

@ -200,6 +200,44 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
self.assertEqual(state, task.node.provision_state)
self.assertIsNone(task.node.last_error)
self.assertEqual(0, ncrc_mock.call_count)
self.assertEqual(0, rti_mock.call_count)
self.assertEqual(0, cd_mock.call_count)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'reboot_to_instance', autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
autospec=True)
def test_heartbeat_in_maintenance_abort(self, ncrc_mock, rti_mock,
cd_mock):
CONF.set_override('allow_provisioning_in_maintenance', False,
group='conductor')
for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL),
(states.CLEANWAIT, states.CLEANFAIL),
(states.RESCUEWAIT, states.RESCUEFAIL)]:
for m in (ncrc_mock, rti_mock, cd_mock):
m.reset_mock()
self.node.provision_state = state
self.node.maintenance = True
self.node.save()
agent_url = 'url-%s' % state
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.deploy.heartbeat(task, agent_url, '3.2.0')
self.assertFalse(task.shared)
self.assertEqual(
agent_url,
task.node.driver_internal_info['agent_url'])
self.assertEqual(
'3.2.0',
task.node.driver_internal_info['agent_version'])
self.node.refresh()
self.assertEqual(expected, self.node.provision_state)
self.assertIn('aborted', self.node.last_error)
self.assertEqual(0, ncrc_mock.call_count)
self.assertEqual(0, rti_mock.call_count)
self.assertEqual(0, cd_mock.call_count)
@ -252,6 +290,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
self.assertEqual(0, rti_mock.call_count)
self.assertEqual(0, cd_mock.call_count)
@mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'reboot_to_instance', autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
autospec=True)
def test_heartbeat_noops_in_wrong_state2(self, ncrc_mock, rti_mock,
cd_mock):
CONF.set_override('allow_provisioning_in_maintenance', False,
group='conductor')
allowed = {states.DEPLOYWAIT, states.CLEANWAIT}
for state in set(states.machine.states) - allowed:
for m in (ncrc_mock, rti_mock, cd_mock):
m.reset_mock()
with task_manager.acquire(self.context, self.node.uuid,
shared=True) as task:
self.node.provision_state = state
self.deploy.heartbeat(task, 'url', '1.0.0')
self.assertTrue(task.shared)
self.assertEqual(0, ncrc_mock.call_count)
self.assertEqual(0, rti_mock.call_count)
self.assertEqual(0, cd_mock.call_count)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,
'in_core_deploy_step', autospec=True)
@mock.patch.object(agent_base_vendor.HeartbeatMixin,

View File

@ -0,0 +1,10 @@
---
fixes:
- |
Currently ironic allows entering deployment or cleaning for nodes in
maintenance mode. However, heartbeats do not cause any actions for such
nodes, thus deployment or cleaning never finish if the nodes are not moved
out of maintenance. A new configuration option
``[conductor]allow_provisioning_in_maintenance`` (defaulting to ``True``)
is added to configure this behavior. If it is set to ``False``, deployment
and cleaning will be prevented from nodes in maintenance mode.