novadriver: Fix race for instance task_state

Stop using cached instance state and fetch real-time data from Nova
to avoid stale power_state and task_state values.

Add _check_and_wait_for_task_state() helper that polls with exponential
backoff (up to 20s) for task_state to clear, with a 4s stability check
to handle rapid state transitions. Returns 503 when instance is busy,
signaling clients to retry.

This eliminates ConflictException errors during power operations and
properly handles in-progress rebuilds and power state transitions.

Assisted-By: Claude Code/claude-4.5-sonnet
Change-Id: Iedf39aceb8c260903e00e77f6a919fd65e6a98d6
Signed-off-by: Harald Jensås <hjensas@redhat.com>
This commit is contained in:
Harald Jensås
2025-10-21 21:18:29 +02:00
parent ebef351f51
commit ad220ed98a
3 changed files with 271 additions and 17 deletions

View File

@@ -0,0 +1,10 @@
---
fixes:
- |
Fixes OpenStack Nova driver to properly handle instance ``task_state``
conflicts and race conditions during power state operations. The driver
now waits for instances to be ready before attempting power actions or
rebuilds, preventing ``ConflictExceptions`` when instances are busy with
tasks like ``powering-off``, ``rebuilding``, or ``spawning``. When the
instance is busy, the driver now returns ``HTTP 503`` with ``SYS518``
error code to allow clients to retry.

View File

@@ -176,7 +176,9 @@ class OpenStackDriver(AbstractSystemsDriver):
if power state can't be determined
"""
try:
# Get instance (uses cache) then fetch fresh state
instance = self._get_instance(identity)
instance.fetch(self._cc.compute)
except error.FishyError:
return
@@ -186,6 +188,67 @@ class OpenStackDriver(AbstractSystemsDriver):
return 'Off'
def _check_and_wait_for_task_state(self, instance, max_wait=20,
initial_wait=2, stability_wait=4):
"""Wait for instance task_state to clear and become stable
Polls with exponential backoff until task_state is None, then performs
a stability check to ensure it remains None.
:param instance: Current instance object
:param max_wait: Maximum time to wait in seconds (default 20)
:param initial_wait: Initial wait interval (exponential backoff)
:param stability_wait: Time to wait after task_state=None to ensure
it stays None (catches rapid state transitions)
:returns: Tuple of (is_ready, instance) where is_ready is True if
task_state is None and stable, False if still busy
"""
waited = 0
wait_interval = initial_wait
max_interval = 5 # Cap exponential backoff at 5 seconds
while waited < max_wait:
sleep_time = min(wait_interval, max_wait - waited)
time.sleep(sleep_time)
waited += sleep_time
# Still busy, fetch new state and apply exponential backoff
if instance.task_state is not None:
wait_interval = min(wait_interval * 2, max_interval)
instance.fetch(self._cc.compute)
continue
# This catches Nova doing rapid state transitions like:
# rebuilding -> None -> powering-off -> None
self._logger.debug(
'Instance %(identity)s task_state cleared after '
'%(waited)s seconds, verifying stability' %
{'identity': instance.id, 'waited': waited})
# Cap stability wait to not exceed max_wait
stability_sleep = min(stability_wait, max_wait - waited)
time.sleep(stability_sleep)
waited += stability_sleep
instance.fetch(self._cc.compute)
# Task state changed again, reset backoff and continue waiting
if instance.task_state is not None:
self._logger.debug(
'Instance %(identity)s task_state changed to '
'%(task_state)s during stability check, '
'continuing to wait' %
{'identity': instance.id,
'task_state': instance.task_state})
wait_interval = initial_wait
continue
# Task state is stable at None, return success
return True, instance
# Task still running after max_wait
return False, instance
def set_power_state(self, identity, state):
"""Set computer system power state
@@ -197,7 +260,29 @@ class OpenStackDriver(AbstractSystemsDriver):
:raises: `error.FishyError` if power state can't be set
"""
# Get instance (uses cache) then fetch fresh state
instance = self._get_instance(identity)
instance.fetch(self._cc.compute)
# Log current instance state for diagnostics
self._logger.debug(
'set_power_state called: identity=%(identity)s, '
'requested_state=%(state)s, current_power_state=%(power_state)s, '
'current_status=%(status)s, current_task_state=%(task_state)s' %
{'identity': identity, 'state': state,
'power_state': instance.power_state,
'status': instance.status,
'task_state': instance.task_state})
# Check if instance is ready before attempting any operations
# Prevents power actions and rebuilds during busy states
# (e.g. powering-off, powering-on, rebuilding, spawning)
is_ready, instance = self._check_and_wait_for_task_state(instance)
if not is_ready:
msg = ('SYS518: Cloud instance is busy, task_state: %s'
% instance.task_state)
raise error.FishyError(msg, 503)
delayed_media_eject = instance.metadata.get('sushy-tools-delay-eject')
if delayed_media_eject:
@@ -210,11 +295,9 @@ class OpenStackDriver(AbstractSystemsDriver):
False, self._rebuild_with_blank_image, identity)
self._remove_delayed_eject_metadata(identity)
if instance.task_state is not None:
# SYS518 is used here to trick openstack/sushy to do retries.
# iDRAC uses SYS518 when a previous task is still running.
msg = ('SYS518: Cloud instance is busy, task_state: %s'
% instance.task_state)
# Always raise 503 after triggering rebuild to force client retry
# This avoids race conditions and prevents exceeding 60s timeout
msg = ('SYS518: Cloud instance rebuilding, retry required')
raise error.FishyError(msg, 503)
if state in ('On', 'ForceOn'):

View File

@@ -64,6 +64,7 @@ class NovaDriverTestCase(base.BaseTestCase):
server = mock.Mock(id=self.uuid,
power_state=1)
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
power_state = self.test_driver.get_power_state(self.uuid)
@@ -73,67 +74,82 @@ class NovaDriverTestCase(base.BaseTestCase):
server = mock.Mock(id=self.uuid,
power_state=0)
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
power_state = self.test_driver.get_power_state(self.uuid)
self.assertEqual('Off', power_state)
def test_set_power_state_on(self):
@mock.patch('time.sleep')
def test_set_power_state_on(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=0, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(self.uuid, 'On')
compute = self.nova_mock.return_value.compute
compute.start_server.assert_called_once_with(self.uuid)
def test_set_power_state_forceon(self):
@mock.patch('time.sleep')
def test_set_power_state_forceon(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=0, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(self.uuid, 'ForceOn')
compute = self.nova_mock.return_value.compute
compute.start_server.assert_called_once_with(self.uuid)
def test_set_power_state_forceoff(self):
@mock.patch('time.sleep')
def test_set_power_state_forceoff(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(self.uuid, 'ForceOff')
compute = self.nova_mock.return_value.compute
compute.stop_server.assert_called_once_with(self.uuid)
def test_set_power_state_gracefulshutdown(self):
@mock.patch('time.sleep')
def test_set_power_state_gracefulshutdown(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(self.uuid, 'GracefulShutdown')
compute = self.nova_mock.return_value.compute
compute.stop_server.assert_called_once_with(self.uuid)
def test_set_power_state_gracefulrestart(self):
@mock.patch('time.sleep')
def test_set_power_state_gracefulrestart(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(self.uuid, 'GracefulRestart')
compute = self.nova_mock.return_value.compute
compute.reboot_server.assert_called_once_with(
self.uuid, reboot_type='SOFT')
def test_set_power_state_forcerestart(self):
@mock.patch('time.sleep')
def test_set_power_state_forcerestart(self, mock_sleep):
server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
metadata={})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
self.test_driver.set_power_state(
self.uuid, 'ForceRestart')
compute = self.nova_mock.return_value.compute
compute.reboot_server.assert_called_once_with(
self.uuid, reboot_type='HARD')
def test_set_power_state_raises_SYS518(self):
server = mock.Mock(
@mock.patch('time.sleep')
def test_set_power_state_raises_SYS518(self, mock_sleep):
server_busy = mock.Mock(
id=self.uuid, power_state=1, task_state='rebuilding',
metadata={})
self.nova_mock.return_value.get_server.return_value = server
self.nova_mock.return_value.get_server.return_value = server_busy
server_busy.fetch.return_value = None
e = self.assertRaises(
error.FishyError, self.test_driver.set_power_state,
self.uuid, 'SOFT')
@@ -141,18 +157,53 @@ class NovaDriverTestCase(base.BaseTestCase):
'SYS518: Cloud instance is busy, task_state: rebuilding',
str(e))
@mock.patch('time.sleep')
@mock.patch.object(OpenStackDriver, "_rebuild_with_blank_image",
autospec=True)
def test_set_power_state_delayed_eject(self, mock_rebuild_blank):
def test_set_power_state_delayed_eject(self, mock_rebuild_blank,
mock_sleep):
"""Test that 503 is always raised after triggering rebuild"""
server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
metadata={'sushy-tools-delay-eject': 'true'})
self.nova_mock.return_value.get_server.return_value = server
self.test_driver.set_power_state(self.uuid, 'ForceOff')
server.fetch.return_value = None
# Should raise 503 after triggering rebuild
e = self.assertRaises(
error.FishyError, self.test_driver.set_power_state,
self.uuid, 'ForceOff')
self.assertIn('SYS518: Cloud instance rebuilding', str(e))
# Rebuild should have been triggered
mock_rebuild_blank.assert_called_once_with(self.test_driver, self.uuid)
self._cc.delete_server_metadata.assert_called_once_with(
self.uuid, ['sushy-tools-delay-eject'])
# Power state change should NOT have been attempted
compute = self.nova_mock.return_value.compute
compute.stop_server.assert_called_once_with(self.uuid)
compute.stop_server.assert_not_called()
@mock.patch('time.sleep')
@mock.patch.object(OpenStackDriver, "_rebuild_with_blank_image",
autospec=True)
def test_set_power_state_delayed_eject_busy(self, mock_rebuild_blank,
mock_sleep):
"""Test that rebuild is not attempted when instance is busy"""
server = mock.Mock(id=self.uuid, power_state=4,
task_state='powering-off',
metadata={'sushy-tools-delay-eject': 'true'})
self.nova_mock.return_value.get_server.return_value = server
server.fetch.return_value = None
# Should raise 503 before attempting rebuild due to powering-off state
e = self.assertRaises(
error.FishyError, self.test_driver.set_power_state,
self.uuid, 'On')
self.assertIn('SYS518: Cloud instance is busy', str(e))
self.assertIn('powering-off', str(e))
# Rebuild should NOT have been attempted
mock_rebuild_blank.assert_not_called()
def test_get_boot_device(self):
server = mock.Mock(id=self.uuid)
@@ -581,3 +632,113 @@ class NovaDriverTestCase(base.BaseTestCase):
self.uuid, 'aaa-bbb')
self.assertEqual(
'Server rebuild attempt resulted in status ERROR', str(e))
@mock.patch('time.sleep')
def test_check_and_wait_task_state_immediately_ready(self, mock_sleep):
"""Test when instance is immediately ready (task_state=None)"""
server = mock.Mock(id=self.uuid, task_state=None)
# fetch() updates instance in-place, doesn't change task_state
server.fetch.return_value = None
is_ready, result_instance = (
self.test_driver._check_and_wait_for_task_state(server))
self.assertTrue(is_ready)
self.assertEqual(server, result_instance)
# Should sleep for initial_wait (2s) + stability_wait (4s)
self.assertEqual(2, mock_sleep.call_count)
mock_sleep.assert_any_call(2) # initial_wait
mock_sleep.assert_any_call(4) # stability_wait
# fetch() called once after stability check
server.fetch.assert_called_once_with(self._cc.compute)
@mock.patch('time.sleep')
def test_check_and_wait_task_state_becomes_ready(self, mock_sleep):
"""Test when instance becomes ready after polling"""
server = mock.Mock(id=self.uuid, task_state='rebuilding')
# fetch() updates: busy -> ready -> ready (stability check)
fetch_states = ['rebuilding', None, None]
fetch_call_count = [0]
def update_task_state(compute):
server.task_state = fetch_states[fetch_call_count[0]]
fetch_call_count[0] += 1
server.fetch.side_effect = update_task_state
is_ready, result_instance = (
self.test_driver._check_and_wait_for_task_state(server))
self.assertTrue(is_ready)
self.assertEqual(server, result_instance)
# Sleeps: 2s (busy), 4s (fetch->ready), 2s (check), 4s (stability)
self.assertEqual(4, mock_sleep.call_count)
@mock.patch('time.sleep')
def test_check_and_wait_task_state_timeout(self, mock_sleep):
"""Test when instance stays busy and times out"""
server = mock.Mock(id=self.uuid, task_state='rebuilding')
# fetch() keeps task_state as 'rebuilding'
server.fetch.return_value = None
is_ready, result_instance = (
self.test_driver._check_and_wait_for_task_state(
server, max_wait=3))
self.assertFalse(is_ready)
self.assertEqual(server, result_instance)
# Should have tried to poll multiple times within max_wait
self.assertGreater(mock_sleep.call_count, 0)
@mock.patch('time.sleep')
def test_check_and_wait_task_state_stability_check_fails(self,
mock_sleep):
"""Test when stability check detects a state change"""
server = mock.Mock(id=self.uuid, task_state=None)
# fetch() updates: ready->busy (stability), busy->ready, ready
fetch_states = ['powering-off', 'powering-off', None, None]
fetch_call_count = [0]
def update_task_state(compute):
server.task_state = fetch_states[fetch_call_count[0]]
fetch_call_count[0] += 1
server.fetch.side_effect = update_task_state
is_ready, result_instance = (
self.test_driver._check_and_wait_for_task_state(server))
self.assertTrue(is_ready)
self.assertEqual(server, result_instance)
# Should have multiple sleeps due to stability check retry
self.assertGreater(mock_sleep.call_count, 2)
@mock.patch('time.sleep')
def test_check_and_wait_task_state_exponential_backoff(self, mock_sleep):
"""Test exponential backoff behavior"""
server = mock.Mock(id=self.uuid, task_state='rebuilding')
# fetch() updates: busy -> ready -> ready (stability)
fetch_states = ['rebuilding', None, None]
fetch_call_count = [0]
def update_task_state(compute):
server.task_state = fetch_states[fetch_call_count[0]]
fetch_call_count[0] += 1
server.fetch.side_effect = update_task_state
is_ready, result_instance = (
self.test_driver._check_and_wait_for_task_state(server))
self.assertTrue(is_ready)
# Check that exponential backoff was used
calls = [call[0][0] for call in mock_sleep.call_args_list]
# Should see: 2s (initial), 4s (2*2), 5s (capped), 4s (stability)
self.assertEqual(4, len(calls))
self.assertEqual(2, calls[0]) # initial_wait
self.assertEqual(4, calls[1]) # exponential (2*2)
self.assertEqual(5, calls[2]) # capped at max_interval (4*2=8->5)
self.assertEqual(4, calls[3]) # stability_wait