novadriver: Fix race for instance task_state
Stop using cached instance state and fetch real-time data from Nova to
avoid stale power_state and task_state values. Add a
_check_and_wait_for_task_state() helper that polls with exponential
backoff (up to 20s) for task_state to clear, with a 4s stability check
to handle rapid state transitions. Return 503 when the instance is
busy, signaling clients to retry.

This eliminates ConflictException errors during power operations and
properly handles in-progress rebuilds and power state transitions.

Assisted-By: Claude Code/claude-4.5-sonnet
Change-Id: Iedf39aceb8c260903e00e77f6a919fd65e6a98d6
Signed-off-by: Harald Jensås <hjensas@redhat.com>
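For reference, a minimal sketch (not part of the change) of the worst-case sleep schedule the parameters above imply when the instance never becomes ready; the helper added in the diff below folds the same arithmetic into its polling loop:

# Illustrative only: reproduces the backoff arithmetic described above
# (2s initial interval, doubling, capped at 5s, bounded by a 20s budget).
def backoff_schedule(max_wait=20, initial_wait=2, max_interval=5):
    waited, interval, sleeps = 0, initial_wait, []
    while waited < max_wait:
        sleep_time = min(interval, max_wait - waited)
        sleeps.append(sleep_time)
        waited += sleep_time
        interval = min(interval * 2, max_interval)
    return sleeps


print(backoff_schedule())  # [2, 4, 5, 5, 4] -> 20 seconds total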
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Fixes OpenStack Nova driver to properly handle instance ``task_state``
+    conflicts and race conditions during power state operations. The driver
+    now waits for instances to be ready before attempting power actions or
+    rebuilds, preventing ``ConflictException`` errors when instances are busy
+    with tasks like ``powering-off``, ``rebuilding``, or ``spawning``. When
+    the instance is busy, the driver now returns ``HTTP 503`` with the
+    ``SYS518`` error code to allow clients to retry.
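As a side note, a minimal sketch of the retry loop a client is expected to run when it receives the 503 described in this note; the Redfish Reset URL, function name, and timings here are illustrative assumptions, not part of this change:

import time

import requests


def reset_system(base_url, system_id, reset_type='On', retries=10, delay=5):
    # Assumes the standard Redfish ComputerSystem.Reset action exposed by the
    # sushy-tools emulator; adjust URL/auth for a real deployment.
    url = ('%s/redfish/v1/Systems/%s/Actions/ComputerSystem.Reset'
           % (base_url, system_id))
    for _ in range(retries):
        resp = requests.post(url, json={'ResetType': reset_type}, timeout=30)
        if resp.status_code != 503:
            resp.raise_for_status()
            return resp
        # Busy (SYS518): wait and retry, as the driver now expects clients to
        time.sleep(delay)
    raise RuntimeError('System still busy after %d attempts' % retries)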
@@ -176,7 +176,9 @@ class OpenStackDriver(AbstractSystemsDriver):
             if power state can't be determined
         """
         try:
+            # Get instance (uses cache) then fetch fresh state
             instance = self._get_instance(identity)
+            instance.fetch(self._cc.compute)

         except error.FishyError:
             return
@@ -186,6 +188,67 @@ class OpenStackDriver(AbstractSystemsDriver):

         return 'Off'

+    def _check_and_wait_for_task_state(self, instance, max_wait=20,
+                                       initial_wait=2, stability_wait=4):
+        """Wait for instance task_state to clear and become stable
+
+        Polls with exponential backoff until task_state is None, then performs
+        a stability check to ensure it remains None.
+
+        :param instance: Current instance object
+        :param max_wait: Maximum time to wait in seconds (default 20)
+        :param initial_wait: Initial wait interval (exponential backoff)
+        :param stability_wait: Time to wait after task_state=None to ensure
+            it stays None (catches rapid state transitions)
+        :returns: Tuple of (is_ready, instance) where is_ready is True if
+            task_state is None and stable, False if still busy
+        """
+        waited = 0
+        wait_interval = initial_wait
+        max_interval = 5  # Cap exponential backoff at 5 seconds
+
+        while waited < max_wait:
+            sleep_time = min(wait_interval, max_wait - waited)
+            time.sleep(sleep_time)
+            waited += sleep_time
+
+            # Still busy, fetch new state and apply exponential backoff
+            if instance.task_state is not None:
+                wait_interval = min(wait_interval * 2, max_interval)
+                instance.fetch(self._cc.compute)
+                continue
+
+            # This catches Nova doing rapid state transitions like:
+            # rebuilding -> None -> powering-off -> None
+            self._logger.debug(
+                'Instance %(identity)s task_state cleared after '
+                '%(waited)s seconds, verifying stability' %
+                {'identity': instance.id, 'waited': waited})
+
+            # Cap stability wait to not exceed max_wait
+            stability_sleep = min(stability_wait, max_wait - waited)
+            time.sleep(stability_sleep)
+            waited += stability_sleep
+
+            instance.fetch(self._cc.compute)
+
+            # Task state changed again, reset backoff and continue waiting
+            if instance.task_state is not None:
+                self._logger.debug(
+                    'Instance %(identity)s task_state changed to '
+                    '%(task_state)s during stability check, '
+                    'continuing to wait' %
+                    {'identity': instance.id,
+                     'task_state': instance.task_state})
+                wait_interval = initial_wait
+                continue
+
+            # Task state is stable at None, return success
+            return True, instance
+
+        # Task still running after max_wait
+        return False, instance
+
     def set_power_state(self, identity, state):
         """Set computer system power state

@@ -197,7 +260,29 @@ class OpenStackDriver(AbstractSystemsDriver):
         :raises: `error.FishyError` if power state can't be set

         """
+        # Get instance (uses cache) then fetch fresh state
         instance = self._get_instance(identity)
+        instance.fetch(self._cc.compute)
+
+        # Log current instance state for diagnostics
+        self._logger.debug(
+            'set_power_state called: identity=%(identity)s, '
+            'requested_state=%(state)s, current_power_state=%(power_state)s, '
+            'current_status=%(status)s, current_task_state=%(task_state)s' %
+            {'identity': identity, 'state': state,
+             'power_state': instance.power_state,
+             'status': instance.status,
+             'task_state': instance.task_state})
+
+        # Check if instance is ready before attempting any operations
+        # Prevents power actions and rebuilds during busy states
+        # (e.g. powering-off, powering-on, rebuilding, spawning)
+        is_ready, instance = self._check_and_wait_for_task_state(instance)
+        if not is_ready:
+            msg = ('SYS518: Cloud instance is busy, task_state: %s'
+                   % instance.task_state)
+            raise error.FishyError(msg, 503)
+
         delayed_media_eject = instance.metadata.get('sushy-tools-delay-eject')

         if delayed_media_eject:
@@ -210,11 +295,9 @@ class OpenStackDriver(AbstractSystemsDriver):
                 False, self._rebuild_with_blank_image, identity)
             self._remove_delayed_eject_metadata(identity)

-        if instance.task_state is not None:
-            # SYS518 is used here to trick openstack/sushy to do retries.
-            # iDRAC uses SYS518 when a previous task is still running.
-            msg = ('SYS518: Cloud instance is busy, task_state: %s'
-                   % instance.task_state)
+            # Always raise 503 after triggering rebuild to force client retry
+            # This avoids race conditions and prevents exceeding 60s timeout
+            msg = ('SYS518: Cloud instance rebuilding, retry required')
             raise error.FishyError(msg, 503)

         if state in ('On', 'ForceOn'):
@@ -64,6 +64,7 @@ class NovaDriverTestCase(base.BaseTestCase):
         server = mock.Mock(id=self.uuid,
                            power_state=1)
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None

         power_state = self.test_driver.get_power_state(self.uuid)

@@ -73,67 +74,82 @@ class NovaDriverTestCase(base.BaseTestCase):
         server = mock.Mock(id=self.uuid,
                            power_state=0)
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None

         power_state = self.test_driver.get_power_state(self.uuid)

         self.assertEqual('Off', power_state)

-    def test_set_power_state_on(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_on(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=0, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(self.uuid, 'On')
         compute = self.nova_mock.return_value.compute
         compute.start_server.assert_called_once_with(self.uuid)

-    def test_set_power_state_forceon(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_forceon(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=0, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(self.uuid, 'ForceOn')
         compute = self.nova_mock.return_value.compute
         compute.start_server.assert_called_once_with(self.uuid)

-    def test_set_power_state_forceoff(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_forceoff(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(self.uuid, 'ForceOff')
         compute = self.nova_mock.return_value.compute
         compute.stop_server.assert_called_once_with(self.uuid)

-    def test_set_power_state_gracefulshutdown(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_gracefulshutdown(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(self.uuid, 'GracefulShutdown')
         compute = self.nova_mock.return_value.compute
         compute.stop_server.assert_called_once_with(self.uuid)

-    def test_set_power_state_gracefulrestart(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_gracefulrestart(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(self.uuid, 'GracefulRestart')
         compute = self.nova_mock.return_value.compute
         compute.reboot_server.assert_called_once_with(
             self.uuid, reboot_type='SOFT')

-    def test_set_power_state_forcerestart(self):
+    @mock.patch('time.sleep')
+    def test_set_power_state_forcerestart(self, mock_sleep):
         server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
                            metadata={})
         self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
         self.test_driver.set_power_state(
             self.uuid, 'ForceRestart')
         compute = self.nova_mock.return_value.compute
         compute.reboot_server.assert_called_once_with(
             self.uuid, reboot_type='HARD')

-    def test_set_power_state_raises_SYS518(self):
-        server = mock.Mock(
+    @mock.patch('time.sleep')
+    def test_set_power_state_raises_SYS518(self, mock_sleep):
+        server_busy = mock.Mock(
             id=self.uuid, power_state=1, task_state='rebuilding',
             metadata={})
-        self.nova_mock.return_value.get_server.return_value = server
+        self.nova_mock.return_value.get_server.return_value = server_busy
+        server_busy.fetch.return_value = None
         e = self.assertRaises(
             error.FishyError, self.test_driver.set_power_state,
             self.uuid, 'SOFT')
@@ -141,18 +157,53 @@ class NovaDriverTestCase(base.BaseTestCase):
             'SYS518: Cloud instance is busy, task_state: rebuilding',
             str(e))

+    @mock.patch('time.sleep')
     @mock.patch.object(OpenStackDriver, "_rebuild_with_blank_image",
                        autospec=True)
-    def test_set_power_state_delayed_eject(self, mock_rebuild_blank):
+    def test_set_power_state_delayed_eject(self, mock_rebuild_blank,
+                                           mock_sleep):
+        """Test that 503 is always raised after triggering rebuild"""
         server = mock.Mock(id=self.uuid, power_state=1, task_state=None,
                            metadata={'sushy-tools-delay-eject': 'true'})
         self.nova_mock.return_value.get_server.return_value = server
-        self.test_driver.set_power_state(self.uuid, 'ForceOff')
+        server.fetch.return_value = None
+
+        # Should raise 503 after triggering rebuild
+        e = self.assertRaises(
+            error.FishyError, self.test_driver.set_power_state,
+            self.uuid, 'ForceOff')
+        self.assertIn('SYS518: Cloud instance rebuilding', str(e))
+
+        # Rebuild should have been triggered
         mock_rebuild_blank.assert_called_once_with(self.test_driver, self.uuid)
         self._cc.delete_server_metadata.assert_called_once_with(
             self.uuid, ['sushy-tools-delay-eject'])
+
+        # Power state change should NOT have been attempted
         compute = self.nova_mock.return_value.compute
-        compute.stop_server.assert_called_once_with(self.uuid)
+        compute.stop_server.assert_not_called()

+    @mock.patch('time.sleep')
+    @mock.patch.object(OpenStackDriver, "_rebuild_with_blank_image",
+                       autospec=True)
+    def test_set_power_state_delayed_eject_busy(self, mock_rebuild_blank,
+                                                mock_sleep):
+        """Test that rebuild is not attempted when instance is busy"""
+        server = mock.Mock(id=self.uuid, power_state=4,
+                           task_state='powering-off',
+                           metadata={'sushy-tools-delay-eject': 'true'})
+        self.nova_mock.return_value.get_server.return_value = server
+        server.fetch.return_value = None
+
+        # Should raise 503 before attempting rebuild due to powering-off state
+        e = self.assertRaises(
+            error.FishyError, self.test_driver.set_power_state,
+            self.uuid, 'On')
+        self.assertIn('SYS518: Cloud instance is busy', str(e))
+        self.assertIn('powering-off', str(e))
+
+        # Rebuild should NOT have been attempted
+        mock_rebuild_blank.assert_not_called()
+
     def test_get_boot_device(self):
         server = mock.Mock(id=self.uuid)
@@ -581,3 +632,113 @@ class NovaDriverTestCase(base.BaseTestCase):
             self.uuid, 'aaa-bbb')
         self.assertEqual(
             'Server rebuild attempt resulted in status ERROR', str(e))
+
+    @mock.patch('time.sleep')
+    def test_check_and_wait_task_state_immediately_ready(self, mock_sleep):
+        """Test when instance is immediately ready (task_state=None)"""
+        server = mock.Mock(id=self.uuid, task_state=None)
+        # fetch() updates instance in-place, doesn't change task_state
+        server.fetch.return_value = None
+
+        is_ready, result_instance = (
+            self.test_driver._check_and_wait_for_task_state(server))
+
+        self.assertTrue(is_ready)
+        self.assertEqual(server, result_instance)
+        # Should sleep for initial_wait (2s) + stability_wait (4s)
+        self.assertEqual(2, mock_sleep.call_count)
+        mock_sleep.assert_any_call(2)  # initial_wait
+        mock_sleep.assert_any_call(4)  # stability_wait
+        # fetch() called once after stability check
+        server.fetch.assert_called_once_with(self._cc.compute)
+
+    @mock.patch('time.sleep')
+    def test_check_and_wait_task_state_becomes_ready(self, mock_sleep):
+        """Test when instance becomes ready after polling"""
+        server = mock.Mock(id=self.uuid, task_state='rebuilding')
+
+        # fetch() updates: busy -> ready -> ready (stability check)
+        fetch_states = ['rebuilding', None, None]
+        fetch_call_count = [0]
+
+        def update_task_state(compute):
+            server.task_state = fetch_states[fetch_call_count[0]]
+            fetch_call_count[0] += 1
+
+        server.fetch.side_effect = update_task_state
+
+        is_ready, result_instance = (
+            self.test_driver._check_and_wait_for_task_state(server))
+
+        self.assertTrue(is_ready)
+        self.assertEqual(server, result_instance)
+        # Sleeps: 2s (busy), 4s (fetch->ready), 2s (check), 4s (stability)
+        self.assertEqual(4, mock_sleep.call_count)
+
+    @mock.patch('time.sleep')
+    def test_check_and_wait_task_state_timeout(self, mock_sleep):
+        """Test when instance stays busy and times out"""
+        server = mock.Mock(id=self.uuid, task_state='rebuilding')
+        # fetch() keeps task_state as 'rebuilding'
+        server.fetch.return_value = None
+
+        is_ready, result_instance = (
+            self.test_driver._check_and_wait_for_task_state(
+                server, max_wait=3))
+
+        self.assertFalse(is_ready)
+        self.assertEqual(server, result_instance)
+        # Should have tried to poll multiple times within max_wait
+        self.assertGreater(mock_sleep.call_count, 0)
+
+    @mock.patch('time.sleep')
+    def test_check_and_wait_task_state_stability_check_fails(self,
+                                                             mock_sleep):
+        """Test when stability check detects a state change"""
+        server = mock.Mock(id=self.uuid, task_state=None)
+
+        # fetch() updates: ready->busy (stability), busy->ready, ready
+        fetch_states = ['powering-off', 'powering-off', None, None]
+        fetch_call_count = [0]
+
+        def update_task_state(compute):
+            server.task_state = fetch_states[fetch_call_count[0]]
+            fetch_call_count[0] += 1
+
+        server.fetch.side_effect = update_task_state
+
+        is_ready, result_instance = (
+            self.test_driver._check_and_wait_for_task_state(server))
+
+        self.assertTrue(is_ready)
+        self.assertEqual(server, result_instance)
+        # Should have multiple sleeps due to stability check retry
+        self.assertGreater(mock_sleep.call_count, 2)
+
+    @mock.patch('time.sleep')
+    def test_check_and_wait_task_state_exponential_backoff(self, mock_sleep):
+        """Test exponential backoff behavior"""
+        server = mock.Mock(id=self.uuid, task_state='rebuilding')
+
+        # fetch() updates: busy -> ready -> ready (stability)
+        fetch_states = ['rebuilding', None, None]
+        fetch_call_count = [0]
+
+        def update_task_state(compute):
+            server.task_state = fetch_states[fetch_call_count[0]]
+            fetch_call_count[0] += 1
+
+        server.fetch.side_effect = update_task_state
+
+        is_ready, result_instance = (
+            self.test_driver._check_and_wait_for_task_state(server))
+
+        self.assertTrue(is_ready)
+        # Check that exponential backoff was used
+        calls = [call[0][0] for call in mock_sleep.call_args_list]
+        # Should see: 2s (initial), 4s (2*2), 5s (capped), 4s (stability)
+        self.assertEqual(4, len(calls))
+        self.assertEqual(2, calls[0])  # initial_wait
+        self.assertEqual(4, calls[1])  # exponential (2*2)
+        self.assertEqual(5, calls[2])  # capped at max_interval (4*2=8->5)
+        self.assertEqual(4, calls[3])  # stability_wait