Fix servicing clean-up

Serious issues:
- Nothing powers on nodes after servicing, so they end up active and
  powered off in the end.
- Restoring power state was done three times.

Minor issues:
- Function _tear_down_node_servicing is called twice causing a traceback.
- Furthermore, process_event('done') is also called in another place
  in deploy utils.
- Make sure nodes are never considered for fast-track when servicing, it
  prevents clean-up of virtual media devices.

Change-Id: I92fd7a0009a816e93e316e4674c7509b61a474d4
This commit is contained in:
Dmitry Tantsur 2024-04-09 14:05:18 +02:00
parent 22aa29b864
commit 6c8673c1b4
No known key found for this signature in database
GPG Key ID: 315B2AF9FD216C60
6 changed files with 24 additions and 14 deletions

View File

@ -252,6 +252,11 @@ SERVICEHOLD = 'service hold'
""" Node is being held for direct intervention from a service step. """
"""All Node states related to servicing."""
SERVICING_STATES = frozenset((SERVICING, SERVICEWAIT,
SERVICEFAIL, SERVICEHOLD))
# NOTE(kaifeng): INSPECTING is allowed to keep backwards compatibility,
# starting from API 1.39 node update is disallowed in this state.
UPDATE_ALLOWED_STATES = (DEPLOYFAIL, INSPECTING, INSPECTFAIL, INSPECTWAIT,

View File

@ -86,8 +86,6 @@ def do_node_service(task, service_steps=None, disable_ramdisk=False):
return utils.servicing_error_handler(task, msg)
steps = node.driver_internal_info.get('service_steps', [])
if not steps:
_tear_down_node_service(task, disable_ramdisk=disable_ramdisk)
step_index = 0 if steps else None
do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk)

View File

@ -1120,7 +1120,10 @@ def fast_track_able(task):
# TODO(TheJulia): Should we check the provisioning/deployment
# networks match config wise? Do we care? #decisionsdecisions
and task.driver.storage.should_write_image(task)
and task.node.last_error is None)
and task.node.last_error is None
# NOTE(dtantsur): Fast track makes zero sense for servicing and
# may prevent normal clean-up from happening.
and task.node.provision_state not in states.SERVICING_STATES)
def value_within_timeout(value, timeout):

View File

@ -821,21 +821,15 @@ def tear_down_inband_service(task):
task.driver.boot.clean_up_ramdisk(task)
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
if not service_failure:
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
with manager_utils.power_state_for_network_configuration(task):
task.driver.network.remove_servicing_network(task)
task.driver.network.configure_tenant_networks(task)
task.driver.boot.prepare_instance(task)
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)
# Change the task instead of return the state.
task.process_event('done')
task.driver.boot.prepare_instance(task)
# prepare_instance does not power on the node, the deploy interface is
# normally responsible for that.
manager_utils.node_power_action(task, states.POWER_ON)
def get_image_instance_info(node):

View File

@ -2200,6 +2200,13 @@ class FastTrackTestCase(db_base.DbTestCase):
self.context, self.node.uuid, shared=False) as task:
self.assertFalse(conductor_utils.is_fast_track(task))
def test_is_fast_track_not_in_servicing(self, mock_get_power):
mock_get_power.return_value = states.POWER_ON
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
task.node.provision_state = states.SERVICING
self.assertFalse(conductor_utils.is_fast_track(task))
class GetNodeNextStepsTestCase(db_base.DbTestCase):
def setUp(self):

View File

@ -1071,7 +1071,10 @@ class AgentMethodsTestCase(db_base.DbTestCase):
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
utils.tear_down_inband_service(task)
power_mock.assert_called_once_with(task, states.POWER_OFF)
power_mock.assert_has_calls([
mock.call(task, states.POWER_OFF),
mock.call(task, states.POWER_ON),
])
remove_service_network_mock.assert_called_once_with(
task.driver.network, task)
clean_up_ramdisk_mock.assert_called_once_with(