From fcb793682dc063d6c36ce4555af018759a842ea8 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Mon, 29 Jul 2019 11:25:41 +0200 Subject: [PATCH] Add an option to abort cleaning and deployment if node is in maintenance We don't prevent cleaning to happen for nodes in maintenance mode. However, cleaning cannot succeed in this case, as we disable processing heartbeats. This change adds a new configuration option that will cause such node to enter CLEAN FAIL on the first heartbeat. The same is done for deployment and automated cleaning during providing. Finally, elevate the log level for such heartbeats from debug to warning, as it may be a sign of a problem (especially if the new option is off). Change-Id: I9f3ee44f39c448eb2609c5989acd36e7da844ef4 Story: #1563644 Task: #9171 --- ironic/conductor/manager.py | 18 +++++- ironic/conductor/utils.py | 4 +- ironic/conf/conductor.py | 12 ++++ ironic/drivers/modules/agent_base_vendor.py | 35 +++++++++-- ironic/tests/unit/conductor/test_manager.py | 50 +++++++++++++++ ironic/tests/unit/conductor/test_utils.py | 8 +-- .../drivers/modules/test_agent_base_vendor.py | 61 +++++++++++++++++++ ...cleaning-maintenance-7ae83b1e4ff992b0.yaml | 10 +++ 8 files changed, 184 insertions(+), 14 deletions(-) create mode 100644 releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index ce71cf2cc3..0e6349f942 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -1299,6 +1299,14 @@ class ConductorManager(base_manager.BaseConductorManager): 'successfully moved to AVAILABLE state.', node.uuid) return + # NOTE(dtantsur): this is only reachable during automated cleaning, + # for manual cleaning we verify maintenance mode earlier on. + if (not CONF.conductor.allow_provisioning_in_maintenance + and node.maintenance): + msg = _('Cleaning a node in maintenance mode is not allowed') + return utils.cleaning_error_handler(task, msg, + tear_down_cleaning=False) + try: # NOTE(ghe): Valid power and network values are needed to perform # a cleaning. @@ -1542,7 +1550,8 @@ class ConductorManager(base_manager.BaseConductorManager): exception.NodeLocked, exception.InvalidParameterValue, exception.InvalidStateRequested, - exception.UnsupportedDriverExtension) + exception.UnsupportedDriverExtension, + exception.NodeInMaintenance) def do_provisioning_action(self, context, node_id, action): """RPC method to initiate certain provisioning state transitions. @@ -1555,7 +1564,7 @@ class ConductorManager(base_manager.BaseConductorManager): :raises: InvalidParameterValue :raises: InvalidStateRequested :raises: NoFreeConductorWorker - + :raises: NodeInMaintenance """ with task_manager.acquire(context, node_id, shared=False, purpose='provision action %s' @@ -1563,6 +1572,11 @@ class ConductorManager(base_manager.BaseConductorManager): node = task.node if (action == states.VERBS['provide'] and node.provision_state == states.MANAGEABLE): + # NOTE(dtantsur): do this early to avoid entering cleaning. + if (not CONF.conductor.allow_provisioning_in_maintenance + and node.maintenance): + raise exception.NodeInMaintenance(op=_('providing'), + node=node.uuid) task.process_event( 'provide', callback=self._spawn_worker, diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 27f589b949..458c958b87 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -412,7 +412,9 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, # for automated cleaning, it is AVAILABLE. manual_clean = node.target_provision_state == states.MANAGEABLE node.last_error = msg - node.maintenance_reason = msg + # NOTE(dtantsur): avoid overwriting existing maintenance_reason + if not node.maintenance_reason: + node.maintenance_reason = msg node.save() if set_fail_state and node.provision_state != states.CLEANFAIL: diff --git a/ironic/conf/conductor.py b/ironic/conf/conductor.py index 1dd21355b8..3ed0fc4f45 100644 --- a/ironic/conf/conductor.py +++ b/ironic/conf/conductor.py @@ -178,6 +178,18 @@ opts = [ 'longer. In an environment where all tenants are ' 'trusted (eg, because there is only one tenant), ' 'this option could be safely disabled.')), + cfg.BoolOpt('allow_provisioning_in_maintenance', + default=True, + mutable=True, + help=_('Whether to allow nodes to enter or undergo deploy or ' + 'cleaning when in maintenance mode. If this option is ' + 'set to False, and a node enters maintenance during ' + 'deploy or cleaning, the process will be aborted ' + 'after the next heartbeat. Automated cleaning or ' + 'making a node available will also fail. If True ' + '(the default), the process will begin and will pause ' + 'after the node starts heartbeating. Moving it from ' + 'maintenance will make the process continue.')), cfg.IntOpt('clean_callback_timeout', default=1800, help=_('Timeout (seconds) to wait for a callback from the ' diff --git a/ironic/drivers/modules/agent_base_vendor.py b/ironic/drivers/modules/agent_base_vendor.py index 0fa2d45487..3a1c507827 100644 --- a/ironic/drivers/modules/agent_base_vendor.py +++ b/ironic/drivers/modules/agent_base_vendor.py @@ -296,6 +296,31 @@ class HeartbeatMixin(object): return FASTTRACK_HEARTBEAT_ALLOWED return HEARTBEAT_ALLOWED + def _heartbeat_in_maintenance(self, task): + node = task.node + if (node.provision_state in (states.CLEANING, states.CLEANWAIT) + and not CONF.conductor.allow_provisioning_in_maintenance): + LOG.error('Aborting cleaning for node %s, as it is in maintenance ' + 'mode', node.uuid) + last_error = _('Cleaning aborted as node is in maintenance mode') + manager_utils.cleaning_error_handler(task, last_error) + elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT) + and not CONF.conductor.allow_provisioning_in_maintenance): + LOG.error('Aborting deployment for node %s, as it is in ' + 'maintenance mode', node.uuid) + last_error = _('Deploy aborted as node is in maintenance mode') + deploy_utils.set_failed_state(task, last_error, collect_logs=False) + elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT) + and not CONF.conductor.allow_provisioning_in_maintenance): + LOG.error('Aborting rescuing for node %s, as it is in ' + 'maintenance mode', node.uuid) + last_error = _('Rescue aborted as node is in maintenance mode') + manager_utils.rescuing_error_handler(task, last_error) + else: + LOG.warning('Heartbeat from node %(node)s in ' + 'maintenance mode; not taking any action.', + {'node': node.uuid}) + @METRICS.timer('HeartbeatMixin.heartbeat') def heartbeat(self, task, callback_url, agent_version): """Process a heartbeat. @@ -342,20 +367,18 @@ class HeartbeatMixin(object): 'node as on-line.', {'node': task.node.uuid}) return + if node.maintenance: + return self._heartbeat_in_maintenance(task) + # Async call backs don't set error state on their own # TODO(jimrollenhagen) improve error messages here msg = _('Failed checking if deploy is done.') try: - if node.maintenance: - # this shouldn't happen often, but skip the rest if it does. - LOG.debug('Heartbeat from node %(node)s in maintenance mode; ' - 'not taking any action.', {'node': node.uuid}) - return # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we # are currently in the core deploy.deploy step. Other deploy steps # may cause the agent to boot, but we should not trigger deployment # at that point. - elif node.provision_state == states.DEPLOYWAIT: + if node.provision_state == states.DEPLOYWAIT: if self.in_core_deploy_step(task): if not self.deploy_has_started(task): msg = _('Node failed to deploy.') diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index 3a101b5d7a..902971a76e 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -3160,6 +3160,31 @@ class DoProvisioningActionTestCase(mgr_utils.ServiceSetUpMixin, mock_spawn.assert_called_with(self.service, self.service._do_node_clean, mock.ANY) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', + autospec=True) + def test_do_provision_action_provide_in_maintenance(self, mock_spawn): + CONF.set_override('allow_provisioning_in_maintenance', False, + group='conductor') + # test when a node is cleaned going from manageable to available + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.MANAGEABLE, + target_provision_state=None, + maintenance=True) + + self._start_service() + mock_spawn.reset_mock() + exc = self.assertRaises(messaging.rpc.ExpectedException, + self.service.do_provisioning_action, + self.context, node.uuid, 'provide') + # Compare true exception hidden by @messaging.expected_exceptions + self.assertEqual(exception.NodeInMaintenance, exc.exc_info[0]) + node.refresh() + self.assertEqual(states.MANAGEABLE, node.provision_state) + self.assertIsNone(node.target_provision_state) + self.assertIsNone(node.last_error) + self.assertFalse(mock_spawn.called) + @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker', autospec=True) def test_do_provision_action_manage(self, mock_spawn): @@ -3821,6 +3846,31 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertTrue(mock_validate.called) self.assertIn('clean_steps', node.driver_internal_info) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.tear_down_cleaning', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning', + autospec=True) + def test__do_node_clean_maintenance(self, mock_prep, mock_tear_down): + CONF.set_override('allow_provisioning_in_maintenance', False, + group='conductor') + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.CLEANING, + target_provision_state=states.AVAILABLE, + maintenance=True, + maintenance_reason='Original reason') + with task_manager.acquire( + self.context, node.uuid, shared=False) as task: + self.service._do_node_clean(task) + node.refresh() + self.assertEqual(states.CLEANFAIL, node.provision_state) + self.assertEqual(states.AVAILABLE, node.target_provision_state) + self.assertIn('is not allowed', node.last_error) + self.assertTrue(node.maintenance) + self.assertEqual('Original reason', node.maintenance_reason) + self.assertFalse(mock_prep.called) + self.assertFalse(mock_tear_down.called) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', autospec=True) @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_cleaning', diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 5c5fb68b31..410abb4646 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -980,9 +980,9 @@ class ErrorHandlersTestCase(tests_base.TestCase): # strict typing of the node power state fields and would fail if passed # a Mock object in constructors. A task context is also required for # notifications. - power_attrs = {'power_state': states.POWER_OFF, - 'target_power_state': states.POWER_ON} - self.node.configure_mock(**power_attrs) + self.node.configure_mock(power_state=states.POWER_OFF, + target_power_state=states.POWER_ON, + maintenance=False, maintenance_reason=None) self.task.context = self.context @mock.patch.object(conductor_utils, 'LOG') @@ -1110,7 +1110,6 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.assertEqual('clean failure', self.node.fault) def test_abort_on_conductor_take_over_cleaning(self): - self.node.maintenance = False self.node.provision_state = states.CLEANFAIL conductor_utils.abort_on_conductor_take_over(self.task) self.assertTrue(self.node.maintenance) @@ -1122,7 +1121,6 @@ class ErrorHandlersTestCase(tests_base.TestCase): self.node.save.assert_called_once_with() def test_abort_on_conductor_take_over_deploying(self): - self.node.maintenance = False self.node.provision_state = states.DEPLOYFAIL conductor_utils.abort_on_conductor_take_over(self.task) self.assertFalse(self.node.maintenance) diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index 185e063600..4ac6497d45 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -200,6 +200,44 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertEqual( '3.2.0', task.node.driver_internal_info['agent_version']) + self.assertEqual(state, task.node.provision_state) + self.assertIsNone(task.node.last_error) + self.assertEqual(0, ncrc_mock.call_count) + self.assertEqual(0, rti_mock.call_count) + self.assertEqual(0, cd_mock.call_count) + + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + def test_heartbeat_in_maintenance_abort(self, ncrc_mock, rti_mock, + cd_mock): + CONF.set_override('allow_provisioning_in_maintenance', False, + group='conductor') + for state, expected in [(states.DEPLOYWAIT, states.DEPLOYFAIL), + (states.CLEANWAIT, states.CLEANFAIL), + (states.RESCUEWAIT, states.RESCUEFAIL)]: + for m in (ncrc_mock, rti_mock, cd_mock): + m.reset_mock() + self.node.provision_state = state + self.node.maintenance = True + self.node.save() + agent_url = 'url-%s' % state + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.deploy.heartbeat(task, agent_url, '3.2.0') + self.assertFalse(task.shared) + self.assertEqual( + agent_url, + task.node.driver_internal_info['agent_url']) + self.assertEqual( + '3.2.0', + task.node.driver_internal_info['agent_version']) + self.node.refresh() + self.assertEqual(expected, self.node.provision_state) + self.assertIn('aborted', self.node.last_error) self.assertEqual(0, ncrc_mock.call_count) self.assertEqual(0, rti_mock.call_count) self.assertEqual(0, cd_mock.call_count) @@ -252,6 +290,29 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): self.assertEqual(0, rti_mock.call_count) self.assertEqual(0, cd_mock.call_count) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy', + autospec=True) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, + 'reboot_to_instance', autospec=True) + @mock.patch.object(manager_utils, 'notify_conductor_resume_clean', + autospec=True) + def test_heartbeat_noops_in_wrong_state2(self, ncrc_mock, rti_mock, + cd_mock): + CONF.set_override('allow_provisioning_in_maintenance', False, + group='conductor') + allowed = {states.DEPLOYWAIT, states.CLEANWAIT} + for state in set(states.machine.states) - allowed: + for m in (ncrc_mock, rti_mock, cd_mock): + m.reset_mock() + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + self.node.provision_state = state + self.deploy.heartbeat(task, 'url', '1.0.0') + self.assertTrue(task.shared) + self.assertEqual(0, ncrc_mock.call_count) + self.assertEqual(0, rti_mock.call_count) + self.assertEqual(0, cd_mock.call_count) + @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'in_core_deploy_step', autospec=True) @mock.patch.object(agent_base_vendor.HeartbeatMixin, diff --git a/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml b/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml new file mode 100644 index 0000000000..b5b1f71250 --- /dev/null +++ b/releasenotes/notes/cleaning-maintenance-7ae83b1e4ff992b0.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + Currently ironic allows entering deployment or cleaning for nodes in + maintenance mode. However, heartbeats do not cause any actions for such + nodes, thus deployment or cleaning never finish if the nodes are not moved + out of maintenance. A new configuration option + ``[conductor]allow_provisioning_in_maintenance`` (defaulting to ``True``) + is added to configure this behavior. If it is set to ``False``, deployment + and cleaning will be prevented from nodes in maintenance mode.