Retry calls to nova API when connection errors are detected

Currently, when a call to the Nova API fails because of a temporary
connection issue, Watcher actions fail.

This patch adds retries to Nova API calls when connection issues
are detected. The retries can be configured with new parameters in the
nova section:

- http_retries (default is 3 retries)
- http_retry_interval (default is 2 seconds).

Note that, in order to implement the retries in all the nova_helper
methods, in some cases I am adding new elementary methods which are
called by the more complex ones.

Closes-Bug: #2133934

Change-Id: I587920f9e287cec2df3489b13c4ef78013de1876
Signed-off-by: Alfredo Moralejo <amoralej@redhat.com>
This commit is contained in:
Alfredo Moralejo
2025-12-12 10:51:28 +01:00
parent 64993ae6ad
commit 1d32e734f3
4 changed files with 367 additions and 25 deletions

View File

@@ -0,0 +1,7 @@
---
fixes:
- |
Resolve failures caused by temporary connection errors to Nova by adding
configurable retries to Nova API calls when connection-related errors are
detected. The retries can be configured via ``[nova] http_retries`` (default
is 3 retries) and ``[nova] http_retry_interval`` (default is 2 seconds).

View File

@@ -16,8 +16,10 @@
# limitations under the License.
#
import functools
import time
from keystoneauth1 import exceptions as ksa_exc
from novaclient import api_versions
from oslo_log import log
@@ -32,6 +34,27 @@ LOG = log.getLogger(__name__)
CONF = conf.CONF
def nova_retries(call):
    """Decorator retrying ``call`` when a keystoneauth ConnectionError occurs.

    The retry budget (``CONF.nova.http_retries``) and the pause between
    attempts (``CONF.nova.http_retry_interval``) are read each time the
    wrapped callable is invoked. The callable is attempted up to
    ``http_retries + 1`` times; once the budget is spent, the last
    ``ConnectionError`` is re-raised. Any other exception propagates
    immediately.

    :param call: callable to wrap
    :returns: the wrapped callable
    """
    @functools.wraps(call)
    def wrapper(*args, **kwargs):
        max_retries = CONF.nova.http_retries
        pause = CONF.nova.http_retry_interval
        attempt = 0
        while attempt <= max_retries:
            try:
                return call(*args, **kwargs)
            except ksa_exc.ConnectionError as err:
                LOG.warning('Error connecting to Nova service: %s', err)
                if attempt >= max_retries:
                    # Budget exhausted: report and let the caller see the
                    # original connection error.
                    LOG.error(
                        'Failed to connect to Nova service after %d attempts',
                        max_retries + 1)
                    raise
                LOG.warning('Retrying connection to Nova service')
                time.sleep(pause)
            attempt += 1
    return wrapper
class NovaHelper:
def __init__(self, osc=None):
@@ -53,6 +76,7 @@ class NovaHelper:
api_versions.APIVersion(version_str='2.96'))
return self._is_pinned_az_available
@nova_retries
def get_compute_node_list(self):
hypervisors = self.nova.hypervisors.list()
# filter out baremetal nodes from hypervisors
@@ -60,6 +84,7 @@ class NovaHelper:
node.hypervisor_type != 'ironic']
return compute_nodes
@nova_retries
def get_compute_node_by_name(self, node_name, servers=False,
detailed=False):
"""Search for a hypervisor (compute node) by hypervisor_hostname
@@ -95,6 +120,7 @@ class NovaHelper:
LOG.exception(exc)
raise exception.ComputeNodeNotFound(name=node_hostname)
@nova_retries
def get_compute_node_by_uuid(self, node_uuid):
"""Get compute node by uuid
@@ -103,6 +129,7 @@ class NovaHelper:
"""
return self.nova.hypervisors.get(node_uuid)
@nova_retries
def get_instance_list(self, filters=None, marker=None, limit=-1):
"""List servers for all tenants with details.
@@ -125,31 +152,72 @@ class NovaHelper:
marker=marker,
limit=limit)
@nova_retries
def get_instance_by_uuid(self, instance_uuid):
    """Return the servers matching ``instance_uuid`` across all tenants.

    :param instance_uuid: value used for the ``uuid`` search option
    :returns: a list built from the ``servers.list`` result
    """
    search_opts = {"all_tenants": True, "uuid": instance_uuid}
    return list(self.nova.servers.list(search_opts=search_opts))
@nova_retries
def get_flavor_list(self):
    """Return the result of ``flavors.list`` called with ``is_public=None``."""
    return self.nova.flavors.list(is_public=None)
@nova_retries
def get_flavor(self, flavor):
    """Look up a single flavor through ``self.nova.flavors.get``.

    :param flavor: identifier handed unchanged to ``flavors.get``
    :returns: whatever ``flavors.get`` returns
    """
    flavor_manager = self.nova.flavors
    return flavor_manager.get(flavor)
@nova_retries
def get_aggregate_list(self):
    """Return the result of ``self.nova.aggregates.list()``."""
    aggregate_manager = self.nova.aggregates
    return aggregate_manager.list()
@nova_retries
def get_aggregate_detail(self, aggregate_id):
    """Fetch one aggregate through ``self.nova.aggregates.get``.

    :param aggregate_id: identifier handed unchanged to ``aggregates.get``
    :returns: whatever ``aggregates.get`` returns
    """
    aggregate_manager = self.nova.aggregates
    return aggregate_manager.get(aggregate_id)
@nova_retries
def get_service_list(self):
    """Return the services listed for the ``nova-compute`` binary."""
    service_manager = self.nova.services
    return service_manager.list(binary='nova-compute')
@nova_retries
def find_instance(self, instance_id):
    """Fetch a server through ``self.nova.servers.get``.

    :param instance_id: identifier handed unchanged to ``servers.get``
    :returns: whatever ``servers.get`` returns
    """
    server_manager = self.nova.servers
    return server_manager.get(instance_id)
@nova_retries
def nova_start_instance(self, instance_id):
    """Issue ``servers.start`` for ``instance_id`` and return its result."""
    server_manager = self.nova.servers
    return server_manager.start(instance_id)
@nova_retries
def nova_stop_instance(self, instance_id):
    """Issue ``servers.stop`` for ``instance_id`` and return its result."""
    server_manager = self.nova.servers
    return server_manager.stop(instance_id)
@nova_retries
def instance_resize(self, instance, flavor_id):
    """Call ``instance.resize`` with the given flavor and return its result.

    :param instance: object exposing a ``resize(flavor=...)`` method
    :param flavor_id: value passed as the ``flavor`` keyword
    """
    resize_result = instance.resize(flavor=flavor_id)
    return resize_result
@nova_retries
def instance_confirm_resize(self, instance):
    """Call ``instance.confirm_resize()`` and return its result.

    :param instance: object exposing a ``confirm_resize()`` method
    """
    confirmation = instance.confirm_resize()
    return confirmation
@nova_retries
def instance_live_migrate(self, instance, dest_hostname):
    """Call ``instance.live_migrate`` towards ``dest_hostname``.

    :param instance: object exposing a ``live_migrate(host=...)`` method
    :param dest_hostname: value passed as the ``host`` keyword
    :returns: whatever ``live_migrate`` returns
    """
    # block_migration is deliberately not passed: since nova API version
    # 2.25 (Mitaka release) its default value is None, which is mapped
    # to 'auto'.
    migration_result = instance.live_migrate(host=dest_hostname)
    return migration_result
@nova_retries
def instance_migrate(self, instance, dest_hostname):
    """Call ``instance.migrate`` towards ``dest_hostname``.

    :param instance: object exposing a ``migrate(host=...)`` method
    :param dest_hostname: value passed as the ``host`` keyword
    :returns: whatever ``migrate`` returns
    """
    migration_result = instance.migrate(host=dest_hostname)
    return migration_result
@nova_retries
def live_migration_abort(self, instance_id, migration_id):
    """Request the abort of a live migration.

    :param instance_id: value passed as the ``server`` keyword
    :param migration_id: value passed as the ``migration`` keyword
    :returns: whatever ``server_migrations.live_migration_abort`` returns
    """
    migration_manager = self.nova.server_migrations
    return migration_manager.live_migration_abort(
        server=instance_id, migration=migration_id)
def confirm_resize(self, instance, previous_status, retry=60):
instance.confirm_resize()
instance = self.nova.servers.get(instance.id)
self.instance_confirm_resize(instance)
instance = self.find_instance(instance.id)
while instance.status != previous_status and retry:
instance = self.nova.servers.get(instance.id)
instance = self.find_instance(instance.id)
retry -= 1
time.sleep(1)
if instance.status == previous_status:
@@ -217,12 +285,12 @@ class NovaHelper:
{'instance': instance_id, 'host': host_name})
previous_status = getattr(instance, 'status')
instance.migrate(host=dest_hostname)
instance = self.nova.servers.get(instance_id)
self.instance_migrate(instance, dest_hostname)
instance = self.find_instance(instance_id)
while (getattr(instance, 'status') not in
["VERIFY_RESIZE", "ERROR"] and retry):
instance = self.nova.servers.get(instance.id)
instance = self.find_instance(instance.id)
time.sleep(interval)
retry -= 1
new_hostname = getattr(instance, 'OS-EXT-SRV-ATTR:host')
@@ -270,9 +338,9 @@ class NovaHelper:
flavor_id = None
try:
flavor_id = self.nova.flavors.get(flavor).id
flavor_id = self.get_flavor(flavor).id
except nvexceptions.NotFound:
flavor_id = [f.id for f in self.nova.flavors.list() if
flavor_id = [f.id for f in self.get_flavor_list() if
f.name == flavor][0]
except nvexceptions.ClientException as e:
LOG.debug("Nova client exception occurred while resizing "
@@ -291,11 +359,11 @@ class NovaHelper:
"Instance %(id)s is in '%(status)s' status.",
{'id': instance_id, 'status': instance_status})
instance.resize(flavor=flavor_id)
self.instance_resize(instance, flavor_id)
while getattr(instance,
'OS-EXT-STS:vm_state') != 'resized' \
and retry:
instance = self.nova.servers.get(instance.id)
instance = self.find_instance(instance.id)
LOG.debug('Waiting the resize of %s to %s', instance, flavor_id)
time.sleep(interval)
retry -= 1
@@ -304,7 +372,7 @@ class NovaHelper:
if instance_status != 'VERIFY_RESIZE':
return False
instance.confirm_resize()
self.instance_confirm_resize(instance)
LOG.debug("Resizing succeeded : instance %s is now on flavor "
"'%s'.", instance_id, flavor_id)
@@ -346,17 +414,15 @@ class NovaHelper:
"Instance %(instance)s found on host '%(host)s'.",
{'instance': instance_id, 'host': host_name})
# From nova api version 2.25(Mitaka release), the default value of
# block_migration is None which is mapped to 'auto'.
instance.live_migrate(host=dest_hostname)
self.instance_live_migrate(instance, dest_hostname)
instance = self.nova.servers.get(instance_id)
instance = self.find_instance(instance_id)
# NOTE: If destination host is not specified for live migration
# let nova scheduler choose the destination host.
if dest_hostname is None:
while (instance.status not in ['ACTIVE', 'ERROR'] and retry):
instance = self.nova.servers.get(instance.id)
instance = self.find_instance(instance.id)
LOG.debug('Waiting the migration of %s', instance.id)
time.sleep(interval)
retry -= 1
@@ -374,7 +440,7 @@ class NovaHelper:
while getattr(instance,
'OS-EXT-SRV-ATTR:host') != dest_hostname \
and retry:
instance = self.nova.servers.get(instance.id)
instance = self.find_instance(instance.id)
if not getattr(instance, 'OS-EXT-STS:task_state'):
LOG.debug("Instance task state: %s is null", instance_id)
break
@@ -401,8 +467,7 @@ class NovaHelper:
if migration:
migration_id = getattr(migration[0], "id")
try:
self.nova.server_migrations.live_migration_abort(
server=instance_id, migration=migration_id)
self.live_migration_abort(instance_id, migration_id)
except exception as e:
# Note: Does not return from here, as abort request can't be
# accepted but migration still going on.
@@ -412,7 +477,7 @@ class NovaHelper:
"No running migrations found for instance %s", instance_id)
while retry:
instance = self.nova.servers.get(instance_id)
instance = self.find_instance(instance_id)
if (getattr(instance, 'OS-EXT-STS:task_state') is None and
getattr(instance, 'status') in ['ACTIVE', 'ERROR']):
break
@@ -432,6 +497,7 @@ class NovaHelper:
raise Exception("Live migration execution and abort both failed "
f"for the instance {instance_id}")
@nova_retries
def enable_service_nova_compute(self, hostname):
if (api_versions.APIVersion(version_str=CONF.nova_client.api_version) <
api_versions.APIVersion(version_str='2.53')):
@@ -445,6 +511,7 @@ class NovaHelper:
return status
@nova_retries
def disable_service_nova_compute(self, hostname, reason=None):
if (api_versions.APIVersion(version_str=CONF.nova_client.api_version) <
api_versions.APIVersion(version_str='2.53')):
@@ -477,7 +544,7 @@ class NovaHelper:
LOG.debug("Instance has been stopped: %s", instance_id)
return True
else:
self.nova.servers.stop(instance_id)
self.nova_stop_instance(instance_id)
if self.wait_for_instance_state(instance, "stopped", 8, 10):
LOG.debug("Instance %s stopped.", instance_id)
@@ -501,7 +568,7 @@ class NovaHelper:
LOG.debug("Instance has already been started: %s", instance_id)
return True
else:
self.nova.servers.start(instance_id)
self.nova_start_instance(instance_id)
if self.wait_for_instance_state(instance, "active", 8, 10):
LOG.debug("Instance %s started.", instance_id)
@@ -525,13 +592,14 @@ class NovaHelper:
while getattr(server, 'OS-EXT-STS:vm_state') != state and retry:
time.sleep(sleep)
server = self.nova.servers.get(server)
server = self.find_instance(server)
retry -= 1
return getattr(server, 'OS-EXT-STS:vm_state') == state
def get_hostname(self, instance):
return str(getattr(instance, 'OS-EXT-SRV-ATTR:host'))
@nova_retries
def get_running_migration(self, instance_id):
    """Return ``server_migrations.list`` for the given server.

    :param instance_id: value passed as the ``server`` keyword
    """
    migration_manager = self.nova.server_migrations
    return migration_manager.list(server=instance_id)

View File

@@ -38,6 +38,17 @@ NOVA_OPTS = [
help='Interval in seconds to check the status in Nova VM '
'migrations (value is float). Default value is 5.0 '
'seconds.'),
cfg.IntOpt('http_retries',
default=3,
min=1,
help='Maximum number of retries for HTTP requests to the Nova '
'service when connection errors occur. Default is 3'),
cfg.FloatOpt('http_retry_interval',
default=2.0,
min=0.1,
help='Interval in seconds to retry HTTP requests to the Nova '
'service when connection errors occur. Default is 2 '
'seconds.'),
]

View File

@@ -20,6 +20,7 @@ import fixtures
import time
from unittest import mock
from keystoneauth1 import exceptions as ksa_exc
from novaclient import api_versions
import novaclient.exceptions as nvexceptions
@@ -961,10 +962,12 @@ class TestNovaHelper(base.TestCase):
self.assertFalse(result)
@mock.patch.object(servers.Server, 'confirm_resize', autospec=True)
def test_confirm_resize(self, mock_confirm_resize, mock_cinder, mock_nova):
def test_confirm_resize(self, mock_confirm_resize, mock_cinder,
mock_nova):
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
self.fake_nova_find_list(nova_util, fake_find=instance, fake_list=None)
self.fake_nova_find_list(nova_util, fake_find=instance,
fake_list=None)
# verify that the method will return True when the status of instance
# is not in the expected status.
@@ -1001,3 +1004,256 @@ class TestNovaHelper(base.TestCase):
self.assertEqual(1, len(compute_nodes))
self.assertEqual(hypervisor1_name,
compute_nodes[0].hypervisor_hostname)
def test_find_instance(self, mock_cinder, mock_nova):
# Happy path: find_instance returns the server after exactly one
# servers.get call and never sleeps.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
self.fake_nova_find_list(nova_util, fake_find=instance,
fake_list=None)
nova_util.nova.servers.get.return_value = instance
result = nova_util.find_instance(self.instance_uuid)
self.assertEqual(1, nova_util.nova.servers.get.call_count)
self.mock_sleep.assert_not_called()
self.assertEqual(instance, result)
def test_find_instance_retries(self, mock_cinder, mock_nova):
# One ConnectionError followed by a success: find_instance must call
# servers.get twice, sleep once in between and return the server.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
self.fake_nova_find_list(nova_util, fake_find=instance,
fake_list=None)
nova_util.nova.servers.get.side_effect = [
ksa_exc.ConnectionError("Connection failed"),
instance
]
result = nova_util.find_instance(self.instance_uuid)
self.assertEqual(2, nova_util.nova.servers.get.call_count)
self.assertEqual(1, self.mock_sleep.call_count)
self.assertEqual(instance, result)
def test_find_instance_retries_exhausts_retries(self, mock_cinder,
mock_nova):
# Every servers.get call raises ConnectionError, so find_instance must
# re-raise it once the retry budget is spent.
# NOTE(review): the expected 4 calls / 3 sleeps presumably rely on the
# default [nova] http_retries=3, since no flags override is set here.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
self.fake_nova_find_list(nova_util, fake_find=instance,
fake_list=None)
nova_util.nova.servers.get.side_effect = ksa_exc.ConnectionError(
"Connection failed")
self.assertRaises(ksa_exc.ConnectionError,
nova_util.find_instance, self.instance_uuid)
self.assertEqual(4, nova_util.nova.servers.get.call_count)
self.assertEqual(3, self.mock_sleep.call_count)
def test_nova_start_instance(self, mock_cinder, mock_nova):
# nova_start_instance must forward the instance id to servers.start
# exactly once.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
nova_util.nova_start_instance(instance.id)
nova_util.nova.servers.start.assert_called_once_with(instance.id)
def test_nova_stop_instance(self, mock_cinder, mock_nova):
# nova_stop_instance must forward the instance id to servers.stop
# exactly once.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
nova_util.nova_stop_instance(instance.id)
nova_util.nova.servers.stop.assert_called_once_with(instance.id)
@mock.patch.object(servers.Server, 'resize', autospec=True)
def test_instance_resize(self, mock_resize, mock_cinder, mock_nova):
# instance_resize must call Server.resize once with flavor=<given value>
# and hand back a truthy result.
# NOTE(review): the result here is the patched resize's return value,
# so assertTrue only checks that something truthy is returned.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
flavor_name = "m1.small"
result = nova_util.instance_resize(instance, flavor_name)
mock_resize.assert_called_once_with(instance, flavor=flavor_name)
self.assertTrue(result)
@mock.patch.object(servers.Server, 'confirm_resize', autospec=True)
def test_instance_confirm_resize(self, mock_confirm_resize, mock_cinder,
mock_nova):
# instance_confirm_resize must call Server.confirm_resize exactly once on
# the given instance.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
nova_util.instance_confirm_resize(instance)
mock_confirm_resize.assert_called_once_with(instance)
@mock.patch.object(servers.Server, 'live_migrate', autospec=True)
def test_instance_live_migrate(self, mock_live_migrate, mock_cinder,
mock_nova):
# instance_live_migrate must call Server.live_migrate once, passing the
# destination as the host keyword.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
dest_hostname = "dest_hostname"
nova_util.instance_live_migrate(instance, dest_hostname)
mock_live_migrate.assert_called_once_with(
instance, host="dest_hostname")
@mock.patch.object(servers.Server, 'migrate', autospec=True)
def test_instance_migrate(self, mock_migrate, mock_cinder, mock_nova):
# instance_migrate must call Server.migrate once, passing the destination
# as the host keyword.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
dest_hostname = "dest_hostname"
nova_util.instance_migrate(instance, dest_hostname)
mock_migrate.assert_called_once_with(instance, host="dest_hostname")
def test_live_migration_abort(self, mock_cinder, mock_nova):
# live_migration_abort must forward the ids to
# server_migrations.live_migration_abort as server= / migration=.
nova_util = nova_helper.NovaHelper()
instance = self.fake_server(self.instance_uuid)
nova_util.live_migration_abort(instance.id, 1)
nova_util.nova.server_migrations.live_migration_abort.\
assert_called_once_with(server=instance.id, migration=1)
class TestNovaRetries(base.TestCase):
"""Test suite for the nova_retries decorator."""
def setUp(self):
super().setUp()
# Replace time.sleep with a mock so the waits between retries are
# recorded instead of calling the real time.sleep.
self.mock_sleep = self.useFixture(
fixtures.MockPatchObject(time, 'sleep')).mock
def test_nova_retries_success_on_first_attempt(self):
"""Test that decorator returns result when function succeeds."""
@nova_helper.nova_retries
def mock_function():
return "success"
result = mock_function()
self.assertEqual("success", result)
self.mock_sleep.assert_not_called()
def test_nova_retries_success_after_retries(self):
"""Test that decorator retries and succeeds after ConnectionError."""
self.flags(http_retries=3, http_retry_interval=2, group='nova')
call_count = {'count': 0}
@nova_helper.nova_retries
def mock_function():
call_count['count'] += 1
if call_count['count'] < 3:
raise ksa_exc.ConnectionError("Connection failed")
return "success"
result = mock_function()
self.assertEqual("success", result)
self.assertEqual(3, call_count['count'])
# Should have slept 2 times (before attempts 2 and 3)
self.assertEqual(2, self.mock_sleep.call_count)
# Verify sleep was called with correct interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 2)
def test_nova_retries_exhausts_retries(self):
"""Test that decorator re-raises after exhausting retries."""
self.flags(http_retries=3, http_retry_interval=1, group='nova')
call_count = {'count': 0}
@nova_helper.nova_retries
def mock_function():
call_count['count'] += 1
raise ksa_exc.ConnectionError("Connection failed")
self.assertRaises(ksa_exc.ConnectionError, mock_function)
# Should have tried 4 times (1 initial attempt + 3 retries)
self.assertEqual(4, call_count['count'])
# Should have slept 3 times (once before each retry)
self.assertEqual(3, self.mock_sleep.call_count)
# Verify sleep was called with correct interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 1)
def test_nova_retries_with_custom_retry_interval(self):
"""Test that decorator uses configured retry interval."""
self.flags(http_retries=4, http_retry_interval=5, group='nova')
call_count = {'count': 0}
@nova_helper.nova_retries
def mock_function():
call_count['count'] += 1
raise ksa_exc.ConnectionError("Connection failed")
self.assertRaises(ksa_exc.ConnectionError, mock_function)
# Should have tried 5 times (1 initial attempt + 4 retries)
self.assertEqual(5, call_count['count'])
# Should have slept 4 times (once before each retry)
self.assertEqual(4, self.mock_sleep.call_count)
# Verify sleep was called with 5 second interval
for call in self.mock_sleep.call_args_list:
self.assertEqual(call[0][0], 5)
def test_nova_retries_with_function_args(self):
"""Test that decorator preserves function arguments and return."""
@nova_helper.nova_retries
def mock_function(arg1, arg2, kwarg1=None):
return f"{arg1}-{arg2}-{kwarg1}"
result = mock_function("a", "b", kwarg1="c")
self.assertEqual("a-b-c", result)
self.mock_sleep.assert_not_called()
def test_nova_retries_propagates_other_exceptions(self):
"""Test that decorator doesn't catch non-ConnectionError exception."""
@nova_helper.nova_retries
def mock_function():
raise ValueError("Some other error")
self.assertRaises(ValueError, mock_function)
self.mock_sleep.assert_not_called()
@mock.patch.object(nova_helper, 'LOG')
def test_nova_retries_logging_on_retry(self, mock_log):
"""Test that decorator logs warnings during retries."""
self.flags(http_retries=3, http_retry_interval=1, group='nova')
call_count = {'count': 0}
@nova_helper.nova_retries
def mock_function():
call_count['count'] += 1
if call_count['count'] < 2:
raise ksa_exc.ConnectionError("Connection failed")
return "success"
mock_function()
# Should have logged warning about connection error
self.assertTrue(mock_log.warning.called)
# Check that retry message was logged
warning_calls = [call for call in mock_log.warning.call_args_list
if 'Retrying connection' in str(call)]
self.assertEqual(1, len(warning_calls))
@mock.patch.object(nova_helper, 'LOG')
def test_nova_retries_logging_on_final_failure(self, mock_log):
"""Test that decorator logs error when all retries are exhausted."""
self.flags(http_retries=2, http_retry_interval=1, group='nova')
@nova_helper.nova_retries
def mock_function():
raise ksa_exc.ConnectionError("Connection failed")
self.assertRaises(ksa_exc.ConnectionError, mock_function)
# Should have logged error about final failure
self.assertTrue(mock_log.error.called)
error_calls = [call for call in mock_log.error.call_args_list
if 'Failed to connect' in str(call)]
self.assertEqual(1, len(error_calls))
def test_nova_retries_single_retry_config(self):
"""Test decorator behavior with single retry configured."""
self.flags(http_retries=1, http_retry_interval=1, group='nova')
call_count = {'count': 0}
@nova_helper.nova_retries
def mock_function():
call_count['count'] += 1
raise ksa_exc.ConnectionError("Connection failed")
self.assertRaises(ksa_exc.ConnectionError, mock_function)
# Should have tried twice (1 initial attempt + 1 retry)
self.assertEqual(2, call_count['count'])
# Should have slept once
self.assertEqual(1, self.mock_sleep.call_count)