Handle HTTP 400 and 409 race condition in Redfish power operations

Treat HTTP 400 and 409 errors as success when the node is already
in the target power state, preventing deployment failures from
race conditions between power state change completion and state
verification timeout.

Also refresh system state to get current power state from BMC
instead of using potentially stale cached data.

Assisted-By: Claude Sonnet 4.5

Change-Id: Id66ff9c70a9dd6969e3ac7fc74328dfc6e0431bd
Signed-off-by: Riccardo Pittau <elfosardo@gmail.com>
(cherry picked from commit 3e778ef954)
This commit is contained in:
Riccardo Pittau
2025-10-29 16:51:28 +01:00
parent a54c628f7b
commit f2a4a741c2
3 changed files with 239 additions and 0 deletions

View File

@@ -117,6 +117,47 @@ class RedfishPower(base.PowerInterface):
try:
_set_power_state(task, system, power_state, timeout=timeout)
except sushy.exceptions.HTTPError as e:
# Handle HTTP 400 (BadRequest) and 409 (Conflict) errors where
# the BMC indicates the node is already in the desired state
if e.status_code in (400, 409):
target_state = TARGET_STATE_MAP.get(power_state, power_state)
# Refresh system state to get current power state from BMC
# instead of using potentially stale cached data
try:
system.refresh()
except Exception as refresh_error:
LOG.warning('Failed to refresh system state for node '
'%(node)s: %(error)s',
{'node': task.node.uuid,
'error': refresh_error})
current_state = GET_POWER_STATE_MAP.get(system.power_state)
# If current state is undetermined, check for hints in the
# error message.
if current_state is None:
error_msg = str(e).lower()
if (target_state == states.POWER_OFF
and ('host is powered off' in error_msg
or 'already powered off' in error_msg)):
current_state = states.POWER_OFF
elif (target_state == states.POWER_ON
and ('host is powered on' in error_msg
or 'already powered on' in error_msg)):
current_state = states.POWER_ON
if current_state == target_state:
LOG.info('Node %(node)s power operation (%(requested)s) '
'failed with HTTP %(code)s, but node is already '
'in the expected final state (%(final)s). '
'Treating as successful. Error was: %(error)s',
{'node': task.node.uuid, 'requested': power_state,
'code': e.status_code, 'final': target_state,
'error': e})
return # Success - already in desired final state
raise
except sushy.exceptions.SushyError as e:
error_msg = (_('Setting power state to %(state)s failed for node '
'%(node)s. Error: %(error)s') %

View File

@@ -166,6 +166,193 @@ class RedfishPowerTestCase(db_base.DbTestCase):
sushy.RESET_ON)
mock_get_system.assert_called_once_with(task.node)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state_race_condition_handling(
self, mock_get_system):
fake_system = mock_get_system.return_value
mock_response = mock.Mock(status_code=400)
mock_response.json.return_value = {
'error': {
'message': 'Host is powered OFF. Power On host and try again',
}
}
fake_system.reset_system.side_effect = (
sushy.exceptions.BadRequestError(
method='POST', url='test', response=mock_response))
success_scenarios = [
(
states.SOFT_POWER_OFF,
states.POWER_OFF,
sushy.RESET_GRACEFUL_SHUTDOWN,
),
(
states.SOFT_REBOOT,
states.POWER_ON,
sushy.RESET_GRACEFUL_RESTART
),
(states.POWER_OFF, states.POWER_OFF, sushy.RESET_FORCE_OFF),
(states.POWER_ON, states.POWER_ON, sushy.RESET_ON),
(states.REBOOT, states.POWER_ON, sushy.RESET_FORCE_RESTART),
(states.POWER_OFF, None, sushy.RESET_FORCE_OFF),
]
failure_scenarios = [
(states.POWER_OFF, states.POWER_ON),
(states.POWER_ON, states.POWER_OFF),
]
for target_state, final_state, expected_reset in success_scenarios:
fake_system.power_state = None
if final_state == states.POWER_OFF:
fake_system.power_state = sushy.SYSTEM_POWER_STATE_OFF
elif final_state == states.POWER_ON:
fake_system.power_state = sushy.SYSTEM_POWER_STATE_ON
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
task.driver.power.set_power_state(task, target_state)
fake_system.reset_system.assert_called_with(expected_reset)
fake_system.reset_mock()
for target_state, actual_state in failure_scenarios:
fake_system.power_state = None
if actual_state == states.POWER_OFF:
fake_system.power_state = sushy.SYSTEM_POWER_STATE_OFF
elif actual_state == states.POWER_ON:
fake_system.power_state = sushy.SYSTEM_POWER_STATE_ON
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
self.assertRaises(
sushy.exceptions.BadRequestError,
task.driver.power.set_power_state, task, target_state)
fake_system.reset_mock()
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state_conflict_error_handling(
self, mock_get_system):
"""Test HTTP 409 Conflict error handling when node already in state."""
fake_system = mock_get_system.return_value
# Simulate HTTP 409 Conflict error with "already powered off" message
mock_response_off = mock.Mock(status_code=409)
mock_response_off.json.return_value = {
'error': {
'message': 'Server is already powered OFF.',
}
}
error_409_off = sushy.exceptions.HTTPError(
method='POST', url='test', response=mock_response_off)
# Simulate HTTP 409 Conflict error with "already powered on" message
mock_response_on = mock.Mock(status_code=409)
mock_response_on.json.return_value = {
'error': {
'message': 'Server is already powered ON.',
}
}
error_409_on = sushy.exceptions.HTTPError(
method='POST', url='test', response=mock_response_on)
# Test scenarios where node is already in target state (should succeed)
success_scenarios = [
(
states.POWER_OFF,
sushy.SYSTEM_POWER_STATE_OFF,
error_409_off,
sushy.RESET_FORCE_OFF,
),
(
states.SOFT_POWER_OFF,
sushy.SYSTEM_POWER_STATE_OFF,
error_409_off,
sushy.RESET_GRACEFUL_SHUTDOWN,
),
(
states.POWER_ON,
sushy.SYSTEM_POWER_STATE_ON,
error_409_on,
sushy.RESET_ON,
),
]
for (target_state, power_state, error,
expected_reset) in success_scenarios:
fake_system.reset_system.side_effect = error
fake_system.power_state = power_state
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
# Should succeed without raising exception
task.driver.power.set_power_state(task, target_state)
fake_system.reset_system.assert_called_with(expected_reset)
# Verify that refresh() was called to get current state
fake_system.refresh.assert_called_once()
fake_system.reset_mock()
# Test scenarios where node is NOT in target state (should fail)
failure_scenarios = [
(states.POWER_OFF, sushy.SYSTEM_POWER_STATE_ON, error_409_off),
(states.POWER_ON, sushy.SYSTEM_POWER_STATE_OFF, error_409_on),
]
for target_state, power_state, error in failure_scenarios:
fake_system.reset_system.side_effect = error
fake_system.power_state = power_state
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
# Should raise HTTPError since state doesn't match
self.assertRaises(
sushy.exceptions.HTTPError,
task.driver.power.set_power_state, task, target_state)
fake_system.reset_mock()
@mock.patch.object(redfish_power.LOG, 'warning', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_set_power_state_conflict_error_refresh_fails(
self, mock_get_system, mock_log):
"""Test HTTP 409 handling when refresh() fails but error msg helps."""
fake_system = mock_get_system.return_value
# Simulate HTTP 409 Conflict error
mock_response = mock.Mock(status_code=409)
mock_response.json.return_value = {
'error': {
'message': 'Server is already powered OFF.',
}
}
error_409 = sushy.exceptions.HTTPError(
method='POST', url='test', response=mock_response)
fake_system.reset_system.side_effect = error_409
# Make refresh() raise an exception
fake_system.refresh.side_effect = sushy.exceptions.ConnectionError(
'Connection failed')
# Set power_state to None to force fallback to error message
fake_system.power_state = None
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
# Should succeed by falling back to error message parsing
task.driver.power.set_power_state(task, states.POWER_OFF)
fake_system.reset_system.assert_called_with(
sushy.RESET_FORCE_OFF)
# Verify that refresh() was attempted
fake_system.refresh.assert_called_once()
# Verify that warning was logged about refresh failure
self.assertTrue(mock_log.called)
log_msg = mock_log.call_args[0][0]
self.assertIn('Failed to refresh system state', log_msg)
@mock.patch.object(redfish_mgmt.RedfishManagement, 'restore_boot_device',
autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)

View File

@@ -0,0 +1,11 @@
---
fixes:
- |
Fixes a race condition where the Redfish power interface could fail with
HTTP 400 (BadRequest) or HTTP 409 (Conflict) errors during power
operations. These errors are now treated as success when the node is
already in the target power state (either on or off), preventing
deployment failures when power state changes complete after Ironic's
state verification times out.
Also refresh system state to get current power state from BMC
instead of using potentially stale cached data.