Merge "Do not tear down node upon cleaning failure"

This commit is contained in:
Zuul 2019-08-15 21:14:32 +00:00 committed by Gerrit Code Review
commit 028ab71d3f
4 changed files with 33 additions and 10 deletions

@ -385,6 +385,8 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
set_fail_state=True):
"""Put a failed node in CLEANFAIL and maintenance."""
node = task.node
node.fault = faults.CLEAN_FAILURE
node.maintenance = True
if tear_down_cleaning:
try:
@ -410,9 +412,7 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True,
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = msg
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.CLEAN_FAILURE
node.save()
if set_fail_state and node.provision_state != states.CLEANFAIL:

@ -32,6 +32,7 @@ from oslo_utils import strutils
import six
from ironic.common import exception
from ironic.common import faults
from ironic.common.glance_service import service_utils
from ironic.common.i18n import _
from ironic.common import image_service
@ -935,10 +936,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
"""Tears down the environment setup for in-band cleaning.
This method does the following:
1. Powers off the bare metal node.
2. If 'manage_boot' parameter is set to true, it also
calls the 'clean_up_ramdisk' method of boot interface to clean up
the environment that was set for booting agent ramdisk.
1. Powers off the bare metal node (unless the node is fast
tracked or there was a cleaning failure).
2. If 'manage_boot' parameter is set to true, it also calls
the 'clean_up_ramdisk' method of boot interface to clean
up the environment that was set for booting agent ramdisk.
3. Deletes the cleaning ports which were setup as part
of cleaning.
@ -950,7 +952,11 @@ def tear_down_inband_cleaning(task, manage_boot=True):
removed.
"""
fast_track = manager_utils.is_fast_track(task)
if not fast_track:
node = task.node
cleaning_failure = (node.fault == faults.CLEAN_FAILURE)
if not (fast_track or cleaning_failure):
manager_utils.node_power_action(task, states.POWER_OFF)
if manage_boot:
@ -958,7 +964,7 @@ def tear_down_inband_cleaning(task, manage_boot=True):
power_state_to_restore = manager_utils.power_on_node_if_needed(task)
task.driver.network.remove_cleaning_network(task)
if not fast_track:
if not (fast_track or cleaning_failure):
manager_utils.restore_power_state_if_needed(
task, power_state_to_restore)

@ -31,6 +31,7 @@ from testtools import matchers
from ironic.common import boot_devices
from ironic.common import exception
from ironic.common import faults
from ironic.common import image_service
from ironic.common import states
from ironic.common import utils as common_utils
@ -1761,12 +1762,14 @@ class AgentMethodsTestCase(db_base.DbTestCase):
def _test_tear_down_inband_cleaning(
self, power_mock, remove_cleaning_network_mock,
clean_up_ramdisk_mock, is_fast_track_mock,
manage_boot=True, fast_track=False):
manage_boot=True, fast_track=False, cleaning_error=False):
is_fast_track_mock.return_value = fast_track
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:
if cleaning_error:
task.node.fault = faults.CLEAN_FAILURE
utils.tear_down_inband_cleaning(task, manage_boot=manage_boot)
if not fast_track:
if not (fast_track or cleaning_error):
power_mock.assert_called_once_with(task, states.POWER_OFF)
else:
self.assertFalse(power_mock.called)
@ -1786,6 +1789,9 @@ class AgentMethodsTestCase(db_base.DbTestCase):
# Fast-track case: runs the shared _test_tear_down_inband_cleaning helper
# with fast_track=True, for which the helper asserts that the power mock
# was not called during tear down.
def test_tear_down_inband_cleaning_fast_track(self):
self._test_tear_down_inband_cleaning(fast_track=True)
# Cleaning-failure case: runs the shared _test_tear_down_inband_cleaning
# helper with cleaning_error=True, which sets node.fault to
# faults.CLEAN_FAILURE before tear down and asserts that the power mock
# was not called.
def test_tear_down_inband_cleaning_cleaning_error(self):
self._test_tear_down_inband_cleaning(cleaning_error=True)
def test_build_agent_options_conf(self):
self.config(api_url='https://api-url', group='conductor')
options = utils.build_agent_options(self.node)

@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a bug where ironic would shut a node down upon cleaning failure.
Now, the node stays powered on (as documented and intended).
upgrade:
- |
When a failure occurs during cleaning, nodes will no longer be shut down. The
behaviour was changed to prevent harm and to allow for admin intervention
when sensitive operations, such as firmware upgrades, are performed and fail
during cleaning.