Merge "Do not tear down node upon cleaning failure"

This commit is contained in:
Zuul 2019-08-15 21:14:32 +00:00 committed by Gerrit Code Review
commit 028ab71d3f
4 changed files with 33 additions and 10 deletions

@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
set_fail_state=True): set_fail_state=True):
"""Put a failed node in CLEANFAIL and maintenance.""" """Put a failed node in CLEANFAIL and maintenance."""
node = task.node node = task.node
node.fault = faults.CLEAN_FAILURE
node.maintenance = True
if tear_down_cleaning: if tear_down_cleaning:
try: try:
@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# for automated cleaning, it is AVAILABLE. # for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = msg node.last_error = msg
node.maintenance = True
node.maintenance_reason = msg node.maintenance_reason = msg
node.fault = faults.CLEAN_FAILURE
node.save() node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL: if set_fail_state and node.provision_state != states.CLEANFAIL:

@ -32,6 +32,7 @@ from oslo_utils import strutils
import six import six
from ironic.common import exception from ironic.common import exception
from ironic.common import faults
from ironic.common.glance_service import service_utils from ironic.common.glance_service import service_utils
from ironic.common.i18n import _ from ironic.common.i18n import _
from ironic.common import image_service from ironic.common import image_service
@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
"""Tears down the environment setup for in-band cleaning. """Tears down the environment setup for in-band cleaning.
This method does the following: This method does the following:
1. Powers off the bare metal node. 1. Powers off the bare metal node (unless the node is fast
2. If 'manage_boot' parameter is set to true, it also tracked or there was a cleaning failure).
calls the 'clean_up_ramdisk' method of boot interface to clean up 2. If 'manage_boot' parameter is set to true, it also calls
the environment that was set for booting agent ramdisk. the 'clean_up_ramdisk' method of boot interface to clean
up the environment that was set for booting agent ramdisk.
3. Deletes the cleaning ports which were setup as part 3. Deletes the cleaning ports which were setup as part
of cleaning. of cleaning.
@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
removed. removed.
""" """
fast_track = manager_utils.is_fast_track(task) fast_track = manager_utils.is_fast_track(task)
if not fast_track:
node = task.node
cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
if not (fast_track or cleaning_failure):
manager_utils.node_power_action(task, states.POWER_OFF) manager_utils.node_power_action(task, states.POWER_OFF)
if manage_boot: if manage_boot:
@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True):
power_state_to_restore = manager_utils.power_on_node_if_needed(task) power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.remove_cleaning_network(task) task.driver.network.remove_cleaning_network(task)
if not fast_track: if not (fast_track or cleaning_failure):
manager_utils.restore_power_state_if_needed( manager_utils.restore_power_state_if_needed(
task, power_state_to_restore) task, power_state_to_restore)

@ -31,6 +31,7 @@ from testtools import matchers
from ironic.common import boot_devices from ironic.common import boot_devices
from ironic.common import exception from ironic.common import exception
from ironic.common import faults
from ironic.common import image_service from ironic.common import image_service
from ironic.common import states from ironic.common import states
from ironic.common import utils as common_utils from ironic.common import utils as common_utils
@ -1761,12 +1762,14 @@ class AgentMethodsTestCase(db_base.DbTestCase):
def _test_tear_down_inband_cleaning( def _test_tear_down_inband_cleaning(
self, power_mock, remove_cleaning_network_mock, self, power_mock, remove_cleaning_network_mock,
clean_up_ramdisk_mock, is_fast_track_mock, clean_up_ramdisk_mock, is_fast_track_mock,
manage_boot=True, fast_track=False): manage_boot=True, fast_track=False, cleaning_error=False):
is_fast_track_mock.return_value = fast_track is_fast_track_mock.return_value = fast_track
with task_manager.acquire( with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task: self.context, self.node.uuid, shared=False) as task:
if cleaning_error:
task.node.fault = faults.CLEAN_FAILURE
utils.tear_down_inband_cleaning(task, manage_boot=manage_boot) utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
if not fast_track: if not (fast_track or cleaning_error):
power_mock.assert_called_once_with(task, states.POWER_OFF) power_mock.assert_called_once_with(task, states.POWER_OFF)
else: else:
self.assertFalse(power_mock.called) self.assertFalse(power_mock.called)
@ -1786,6 +1789,9 @@ class AgentMethodsTestCase(db_base.DbTestCase):
def test_tear_down_inband_cleaning_fast_track(self): def test_tear_down_inband_cleaning_fast_track(self):
self._test_tear_down_inband_cleaning(fast_track=True) self._test_tear_down_inband_cleaning(fast_track=True)
def test_tear_down_inband_cleaning_cleaning_error(self):
self._test_tear_down_inband_cleaning(cleaning_error=True)
def test_build_agent_options_conf(self): def test_build_agent_options_conf(self):
self.config(api_url='https://api-url', group='conductor') self.config(api_url='https://api-url', group='conductor')
options = utils.build_agent_options(self.node) options = utils.build_agent_options(self.node)

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a bug where ironic would shut a node down upon cleaning failure.
Now, the node stays powered on (as documented and intended).
upgrade:
- |
When a failure occurs during cleaning, nodes will no longer be shut down. The
behaviour was changed to prevent harm and to allow for admin intervention
when sensitive operations, such as firmware upgrades, are performed and fail
during cleaning.