From 7b6e58a5acbab3a34928de3166633778804e5412 Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Tue, 9 Apr 2024 14:05:18 +0200 Subject: [PATCH] Fix servicing clean-up Serious issues: - Nothing powers on nodes after servicing, so they end up active and powered off in the end. - Restoring power state was done three times. Minor issues: - Function _tear_down_node_servicing is called twice causing a traceback. - Furthermore, process_event('done') is also called in another place in deploy utils. - Make sure nodes are never considered for fast-track when servicing, it prevents clean-up of virtual media devices. Change-Id: I92fd7a0009a816e93e316e4674c7509b61a474d4 (cherry picked from commit 6c8673c1b495095a0c92e0323976f3bc3834ac08) --- ironic/common/states.py | 5 +++++ ironic/conductor/servicing.py | 2 -- ironic/conductor/utils.py | 5 ++++- ironic/drivers/modules/deploy_utils.py | 14 ++++---------- ironic/tests/unit/conductor/test_utils.py | 7 +++++++ .../unit/drivers/modules/test_deploy_utils.py | 5 ++++- 6 files changed, 24 insertions(+), 14 deletions(-) diff --git a/ironic/common/states.py b/ironic/common/states.py index 868c0d143e..56558f5a52 100644 --- a/ironic/common/states.py +++ b/ironic/common/states.py @@ -252,6 +252,11 @@ SERVICEHOLD = 'service hold' """ Node is being held for direct intervention from a service step. """ +"""All Node states related to servicing.""" +SERVICING_STATES = frozenset((SERVICING, SERVICEWAIT, + SERVICEFAIL, SERVICEHOLD)) + + # NOTE(kaifeng): INSPECTING is allowed to keep backwards compatibility, # starting from API 1.39 node update is disallowed in this state. UPDATE_ALLOWED_STATES = (DEPLOYFAIL, INSPECTING, INSPECTFAIL, INSPECTWAIT, diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py index 598038a18a..2fa5d71cb8 100644 --- a/ironic/conductor/servicing.py +++ b/ironic/conductor/servicing.py @@ -86,8 +86,6 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False): return utils.servicing_error_handler(task, msg) steps = node.driver_internal_info.get('service_steps', []) - if not steps: - _tear_down_node_service(task, disable_ramdisk=disable_ramdisk) step_index = 0 if steps else None do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk) diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 6757c90065..844c516260 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -1120,7 +1120,10 @@ def fast_track_able(task): # TODO(TheJulia): Should we check the provisioning/deployment # networks match config wise? Do we care? #decisionsdecisions and task.driver.storage.should_write_image(task) - and task.node.last_error is None) + and task.node.last_error is None + # NOTE(dtantsur): Fast track makes zero sense for servicing and + # may prevent normal clean-up from happening. + and task.node.provision_state not in states.SERVICING_STATES) def value_within_timeout(value, timeout): diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py index 523567731f..0277ac3a5b 100644 --- a/ironic/drivers/modules/deploy_utils.py +++ b/ironic/drivers/modules/deploy_utils.py @@ -821,21 +821,15 @@ def tear_down_inband_service(task): task.driver.boot.clean_up_ramdisk(task) - power_state_to_restore = manager_utils.power_on_node_if_needed(task) - if not service_failure: - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) - with manager_utils.power_state_for_network_configuration(task): task.driver.network.remove_servicing_network(task) task.driver.network.configure_tenant_networks(task) - task.driver.boot.prepare_instance(task) - manager_utils.restore_power_state_if_needed( - task, power_state_to_restore) - # Change the task instead of return the state. - task.process_event('done') + task.driver.boot.prepare_instance(task) + # prepare_instance does not power on the node, the deploy interface is + # normally responsible for that. + manager_utils.node_power_action(task, states.POWER_ON) def get_image_instance_info(node): diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index e99784bd11..d5051491da 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -2200,6 +2200,13 @@ class FastTrackTestCase(db_base.DbTestCase): self.context, self.node.uuid, shared=False) as task: self.assertFalse(conductor_utils.is_fast_track(task)) + def test_is_fast_track_not_in_servicing(self, mock_get_power): + mock_get_power.return_value = states.POWER_ON + with task_manager.acquire( + self.context, self.node.uuid, shared=False) as task: + task.node.provision_state = states.SERVICING + self.assertFalse(conductor_utils.is_fast_track(task)) + class GetNodeNextStepsTestCase(db_base.DbTestCase): def setUp(self): diff --git a/ironic/tests/unit/drivers/modules/test_deploy_utils.py b/ironic/tests/unit/drivers/modules/test_deploy_utils.py index f9026055c8..846fecab11 100644 --- a/ironic/tests/unit/drivers/modules/test_deploy_utils.py +++ b/ironic/tests/unit/drivers/modules/test_deploy_utils.py @@ -1071,7 +1071,10 @@ class AgentMethodsTestCase(db_base.DbTestCase): with task_manager.acquire( self.context, self.node.uuid, shared=False) as task: utils.tear_down_inband_service(task) - power_mock.assert_called_once_with(task, states.POWER_OFF) + power_mock.assert_has_calls([ + mock.call(task, states.POWER_OFF), + mock.call(task, states.POWER_ON), + ]) remove_service_network_mock.assert_called_once_with( task.driver.network, task) clean_up_ramdisk_mock.assert_called_once_with(