Merge "Add an option to abort cleaning and deployment if node is in maintenance"
commit 4e88793256
@@ -1300,6 +1300,14 @@ class ConductorManager(base_manager.BaseConductorManager):
                          'successfully moved to AVAILABLE state.', node.uuid)
             return
 
+        # NOTE(dtantsur): this is only reachable during automated cleaning,
+        # for manual cleaning we verify maintenance mode earlier on.
+        if (not CONF.conductor.allow_provisioning_in_maintenance
+                and node.maintenance):
+            msg = _('Cleaning a node in maintenance mode is not allowed')
+            return utils.cleaning_error_handler(task, msg,
+                                                tear_down_cleaning=False)
+
         try:
             # NOTE(ghe): Valid power and network values are needed to perform
             # a cleaning.
@@ -1543,7 +1551,8 @@ class ConductorManager(base_manager.BaseConductorManager):
                                    exception.NodeLocked,
                                    exception.InvalidParameterValue,
                                    exception.InvalidStateRequested,
-                                   exception.UnsupportedDriverExtension)
+                                   exception.UnsupportedDriverExtension,
+                                   exception.NodeInMaintenance)
     def do_provisioning_action(self, context, node_id, action):
         """RPC method to initiate certain provisioning state transitions.
 
@@ -1556,7 +1565,7 @@ class ConductorManager(base_manager.BaseConductorManager):
         :raises: InvalidParameterValue
         :raises: InvalidStateRequested
         :raises: NoFreeConductorWorker
-
+        :raises: NodeInMaintenance
         """
         with task_manager.acquire(context, node_id, shared=False,
                                   purpose='provision action %s'
@@ -1564,6 +1573,11 @@ class ConductorManager(base_manager.BaseConductorManager):
             node = task.node
             if (action == states.VERBS['provide']
                     and node.provision_state == states.MANAGEABLE):
+                # NOTE(dtantsur): do this early to avoid entering cleaning.
+                if (not CONF.conductor.allow_provisioning_in_maintenance
+                        and node.maintenance):
+                    raise exception.NodeInMaintenance(op=_('providing'),
+                                                      node=node.uuid)
                 task.process_event(
                     'provide',
                     callback=self._spawn_worker,
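The two guards above surface the failure differently: during automated cleaning the conductor records a cleaning error and stops without raising, while the 'provide' action is rejected up front with NodeInMaintenance so the node never enters cleaning. A minimal standalone sketch of that distinction (hypothetical names, not the ironic implementation itself):

# Sketch only: illustrates the two guard styles used in the hunks above.

class NodeInMaintenance(Exception):
    """Stand-in for ironic.common.exception.NodeInMaintenance."""


def guard_automated_cleaning(allow_in_maintenance, node_maintenance,
                             record_cleaning_error):
    """Automated cleaning: record the error and stop, without raising."""
    if not allow_in_maintenance and node_maintenance:
        record_cleaning_error(
            'Cleaning a node in maintenance mode is not allowed')
        return False
    return True


def guard_provide(allow_in_maintenance, node_maintenance):
    """'provide' action: reject early so the node never enters cleaning."""
    if not allow_in_maintenance and node_maintenance:
        raise NodeInMaintenance(
            'providing is not allowed while the node is in maintenance mode')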
@@ -416,7 +416,9 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
     # for automated cleaning, it is AVAILABLE.
     manual_clean = node.target_provision_state == states.MANAGEABLE
     node.last_error = msg
-    node.maintenance_reason = msg
+    # NOTE(dtantsur): avoid overwriting existing maintenance_reason
+    if not node.maintenance_reason:
+        node.maintenance_reason = msg
     node.save()
 
     if set_fail_state and node.provision_state != states.CLEANFAIL:
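The intent of this change is to preserve any maintenance_reason that was already set (for example, by an operator) rather than overwriting it with the cleaning error message. A small sketch of the pattern, using hypothetical names:

def record_cleaning_failure(node, msg):
    # Hypothetical helper mirroring the hunk above: keep an operator-supplied
    # maintenance_reason and only fall back to the error message when empty.
    node.last_error = msg
    if not node.maintenance_reason:
        node.maintenance_reason = msg
    node.save()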
@@ -178,6 +178,18 @@ opts = [
                        'longer. In an environment where all tenants are '
                        'trusted (eg, because there is only one tenant), '
                        'this option could be safely disabled.')),
+    cfg.BoolOpt('allow_provisioning_in_maintenance',
+                default=True,
+                mutable=True,
+                help=_('Whether to allow nodes to enter or undergo deploy or '
+                       'cleaning when in maintenance mode. If this option is '
+                       'set to False, and a node enters maintenance during '
+                       'deploy or cleaning, the process will be aborted '
+                       'after the next heartbeat. Automated cleaning or '
+                       'making a node available will also fail. If True '
+                       '(the default), the process will begin and will pause '
+                       'after the node starts heartbeating. Moving it from '
+                       'maintenance will make the process continue.')),
    cfg.IntOpt('clean_callback_timeout',
               default=1800,
               help=_('Timeout (seconds) to wait for a callback from the '
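As a usage note, an operator would set ``allow_provisioning_in_maintenance = False`` in the ``[conductor]`` section of ironic.conf; since the option is marked mutable, it can also be changed on a configuration reload. A minimal oslo.config sketch of how such an option is registered and read (illustrative only, with a shortened help string):

from oslo_config import cfg

CONF = cfg.CONF
CONF.register_opts(
    [cfg.BoolOpt('allow_provisioning_in_maintenance',
                 default=True,
                 mutable=True,  # may be updated when the service reloads config
                 help='Allow deploy/cleaning for nodes in maintenance mode.')],
    group='conductor')

# Consumers then read the value through the group attribute:
if not CONF.conductor.allow_provisioning_in_maintenance:
    print('provisioning in maintenance mode is disabled')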
@@ -296,6 +296,31 @@ class HeartbeatMixin(object):
             return FASTTRACK_HEARTBEAT_ALLOWED
         return HEARTBEAT_ALLOWED
 
+    def _heartbeat_in_maintenance(self, task):
+        node = task.node
+        if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting cleaning for node %s, as it is in maintenance '
+                      'mode', node.uuid)
+            last_error = _('Cleaning aborted as node is in maintenance mode')
+            manager_utils.cleaning_error_handler(task, last_error)
+        elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting deployment for node %s, as it is in '
+                      'maintenance mode', node.uuid)
+            last_error = _('Deploy aborted as node is in maintenance mode')
+            deploy_utils.set_failed_state(task, last_error, collect_logs=False)
+        elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
+                and not CONF.conductor.allow_provisioning_in_maintenance):
+            LOG.error('Aborting rescuing for node %s, as it is in '
+                      'maintenance mode', node.uuid)
+            last_error = _('Rescue aborted as node is in maintenance mode')
+            manager_utils.rescuing_error_handler(task, last_error)
+        else:
+            LOG.warning('Heartbeat from node %(node)s in '
+                        'maintenance mode; not taking any action.',
+                        {'node': node.uuid})
+
     @METRICS.timer('HeartbeatMixin.heartbeat')
     def heartbeat(self, task, callback_url, agent_version):
         """Process a heartbeat.
@@ -342,20 +367,18 @@ class HeartbeatMixin(object):
                         'node as on-line.', {'node': task.node.uuid})
             return
 
+        if node.maintenance:
+            return self._heartbeat_in_maintenance(task)
+
         # Async call backs don't set error state on their own
         # TODO(jimrollenhagen) improve error messages here
         msg = _('Failed checking if deploy is done.')
         try:
-            if node.maintenance:
-                # this shouldn't happen often, but skip the rest if it does.
-                LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
-                          'not taking any action.', {'node': node.uuid})
-                return
             # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
             # are currently in the core deploy.deploy step. Other deploy steps
             # may cause the agent to boot, but we should not trigger deployment
             # at that point.
-            elif node.provision_state == states.DEPLOYWAIT:
+            if node.provision_state == states.DEPLOYWAIT:
                 if self.in_core_deploy_step(task):
                     if not self.deploy_has_started(task):
                         msg = _('Node failed to deploy.')
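In effect, once the option is disabled, the first heartbeat received from a node in maintenance moves it to the matching failure state: cleaning goes to CLEANFAIL, deployment to DEPLOYFAIL and rescue to RESCUEFAIL, while a heartbeat in any other provision state is still only logged as a warning. The unit tests added below exercise exactly these transitions.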
@@ -3161,6 +3161,31 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin,
         mock_spawn.assert_called_with(self.service,
                                       self.service._do_node_clean, mock.ANY)
 
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
+                autospec=True)
+    def test_do_provision_action_provide_in_maintenance(self, mock_spawn):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        # test when a node is cleaned going from manageable to available
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.MANAGEABLE,
+            target_provision_state=None,
+            maintenance=True)
+
+        self._start_service()
+        mock_spawn.reset_mock()
+        exc = self.assertRaises(messaging.rpc.ExpectedException,
+                                self.service.do_provisioning_action,
+                                self.context, node.uuid, 'provide')
+        # Compare true exception hidden by @messaging.expected_exceptions
+        self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0])
+        node.refresh()
+        self.assertEqual(states.MANAGEABLE, node.provision_state)
+        self.assertIsNone(node.target_provision_state)
+        self.assertIsNone(node.last_error)
+        self.assertFalse(mock_spawn.called)
+
     @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
                 autospec=True)
     def test_do_provision_action_manage(self, mock_spawn):
@@ -3822,6 +3847,31 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
         self.assertTrue(mock_validate.called)
         self.assertIn('clean_steps', node.driver_internal_info)
 
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_cleaning',
+                autospec=True)
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',
+                autospec=True)
+    def test__do_node_clean_maintenance(self, mock_prep, mock_tear_down):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        node = obj_utils.create_test_node(
+            self.context, driver='fake-hardware',
+            provision_state=states.CLEANING,
+            target_provision_state=states.AVAILABLE,
+            maintenance=True,
+            maintenance_reason='Original reason')
+        with task_manager.acquire(
+                self.context, node.uuid, shared=False) as task:
+            self.service._do_node_clean(task)
+        node.refresh()
+        self.assertEqual(states.CLEANFAIL, node.provision_state)
+        self.assertEqual(states.AVAILABLE, node.target_provision_state)
+        self.assertIn('is not allowed', node.last_error)
+        self.assertTrue(node.maintenance)
+        self.assertEqual('Original reason', node.maintenance_reason)
+        self.assertFalse(mock_prep.called)
+        self.assertFalse(mock_tear_down.called)
+
     @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
                 autospec=True)
     @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning',
@@ -986,9 +986,9 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         # strict typing of the node power state fields and would fail if passed
         # a Mock object in constructors. A task context is also required for
         # notifications.
-        power_attrs = {'power_state': states.POWER_OFF,
-                       'target_power_state': states.POWER_ON}
-        self.node.configure_mock(**power_attrs)
+        self.node.configure_mock(power_state=states.POWER_OFF,
+                                 target_power_state=states.POWER_ON,
+                                 maintenance=False, maintenance_reason=None)
         self.task.context = self.context
 
     @mock.patch.object(conductor_utils, 'LOG')
@@ -1116,7 +1116,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.assertEqual('clean failure', self.node.fault)
 
     def test_abort_on_conductor_take_over_cleaning(self):
-        self.node.maintenance = False
         self.node.provision_state = states.CLEANFAIL
         conductor_utils.abort_on_conductor_take_over(self.task)
         self.assertTrue(self.node.maintenance)
@@ -1128,7 +1127,6 @@ class ErrorHandlersTestCase(tests_base.TestCase):
         self.node.save.assert_called_once_with()
 
     def test_abort_on_conductor_take_over_deploying(self):
-        self.node.maintenance = False
        self.node.provision_state = states.DEPLOYFAIL
        conductor_utils.abort_on_conductor_take_over(self.task)
        self.assertFalse(self.node.maintenance)
@@ -200,6 +200,44 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
            self.assertEqual(
                '3.2.0',
                task.node.driver_internal_info['agent_version'])
+            self.assertEqual(state, task.node.provision_state)
+            self.assertIsNone(task.node.last_error)
+            self.assertEqual(0, ncrc_mock.call_count)
+            self.assertEqual(0, rti_mock.call_count)
+            self.assertEqual(0, cd_mock.call_count)
+
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
+                       autospec=True)
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
+                       'reboot_to_instance', autospec=True)
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
+                       autospec=True)
+    def test_heartbeat_in_maintenance_abort(self, ncrc_mock, rti_mock,
+                                            cd_mock):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL),
+                                (states.CLEANWAIT, states.CLEANFAIL),
+                                (states.RESCUEWAIT, states.RESCUEFAIL)]:
+            for m in (ncrc_mock, rti_mock, cd_mock):
+                m.reset_mock()
+            self.node.provision_state = state
+            self.node.maintenance = True
+            self.node.save()
+            agent_url = 'url-%s' % state
+            with task_manager.acquire(self.context, self.node.uuid,
+                                      shared=True) as task:
+                self.deploy.heartbeat(task, agent_url, '3.2.0')
+                self.assertFalse(task.shared)
+                self.assertEqual(
+                    agent_url,
+                    task.node.driver_internal_info['agent_url'])
+                self.assertEqual(
+                    '3.2.0',
+                    task.node.driver_internal_info['agent_version'])
+            self.node.refresh()
+            self.assertEqual(expected, self.node.provision_state)
+            self.assertIn('aborted', self.node.last_error)
            self.assertEqual(0, ncrc_mock.call_count)
            self.assertEqual(0, rti_mock.call_count)
            self.assertEqual(0, cd_mock.call_count)
@@ -252,6 +290,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
            self.assertEqual(0, rti_mock.call_count)
            self.assertEqual(0, cd_mock.call_count)
 
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
+                       autospec=True)
+    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
+                       'reboot_to_instance', autospec=True)
+    @mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
+                       autospec=True)
+    def test_heartbeat_noops_in_wrong_state2(self, ncrc_mock, rti_mock,
+                                             cd_mock):
+        CONF.set_override('allow_provisioning_in_maintenance', False,
+                          group='conductor')
+        allowed = {states.DEPLOYWAIT, states.CLEANWAIT}
+        for state in set(states.machine.states) - allowed:
+            for m in (ncrc_mock, rti_mock, cd_mock):
+                m.reset_mock()
+            with task_manager.acquire(self.context, self.node.uuid,
+                                      shared=True) as task:
+                self.node.provision_state = state
+                self.deploy.heartbeat(task, 'url', '1.0.0')
+                self.assertTrue(task.shared)
+            self.assertEqual(0, ncrc_mock.call_count)
+            self.assertEqual(0, rti_mock.call_count)
+            self.assertEqual(0, cd_mock.call_count)
+
    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
                       'in_core_deploy_step', autospec=True)
    @mock.patch.object(agent_base_vendor.HeartbeatMixin,
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Currently ironic allows entering deployment or cleaning for nodes in
+    maintenance mode. However, heartbeats do not cause any actions for such
+    nodes, thus deployment or cleaning never finish if the nodes are not moved
+    out of maintenance. A new configuration option
+    ``[conductor]allow_provisioning_in_maintenance`` (defaulting to ``True``)
+    is added to configure this behavior. If it is set to ``False``, deployment
+    and cleaning will be prevented for nodes in maintenance mode.
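For operators, the practical upshot is: leave the option at ``True`` to keep the previous behavior, where a node in maintenance simply pauses at the next heartbeat and resumes once maintenance is cleared; set ``allow_provisioning_in_maintenance = False`` under ``[conductor]`` to have in-progress deploys and cleanings fail at the next heartbeat instead, and to have automated cleaning and the ``provide`` action refused outright for nodes already in maintenance.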